Merge branch 'stable-3.2' into pandora-3.2
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/* Send a locally generated IPv6 packet: run the LOCAL_OUT hook and,
 * unless netfilter stole or dropped the skb, hand it to the output path.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int err = __ip6_local_out(skb);

	/* A verdict of 1 means the hook chain let the packet through
	 * without consuming it.
	 */
	return likely(err == 1) ? dst_output(skb) : err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
/* dev_loopback_xmit for use with netfilter.
 *
 * Re-injects a cloned, locally destined multicast packet into the
 * receive path as if it had just arrived on the wire.  Used as the
 * okfn of the POST_ROUTING hook in ip6_finish_output2().
 */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* The data never left the host, so no checksum verification is needed. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}
98
/* Final transmit step: loop back a copy of multicast packets to local
 * listeners when required, then resolve the neighbour for the route and
 * emit the packet.  Consumes the skb; returns the neighbour-output
 * result, 0 on a looped-back-only packet, or -EINVAL with the skb freed
 * when no neighbour is attached to the dst.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local members of the group: either the
		 * multicast-routing socket wants it (and it was not already
		 * forwarded), or a local interface has joined the group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* Hop limit 0 means the sender only wanted the local
			 * delivery above — do not put it on the wire.
			 */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* The neighbour reference is only valid under RCU here. */
	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
/* dst_output entry point for IPv6: drop everything if IPv6 is
 * administratively disabled on the output device, otherwise run the
 * NF_INET_POST_ROUTING hook (skipped for rerouted packets, which
 * already traversed it) and continue with ip6_finish_output().
 */
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
177
/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 *
 *	Prepends the extension headers from @opt (reallocating headroom if
 *	the skb does not have enough) and the IPv6 header itself, then hands
 *	the packet to the NF_INET_LOCAL_OUT hook / dst_output().  When the
 *	packet exceeds the path MTU and must not be fragmented locally
 *	(!local_df, not GSO), an ICMPV6_PKT_TOOBIG is sent back to ourselves
 *	and -EMSGSIZE is returned.  The skb is consumed in all cases.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	/* first_hop may be rewritten by a routing header in opt below. */
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			/* Re-charge the new skb to the sending socket. */
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	/* Hop limit: socket setting if present, else the route default. */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* First 32 bits: version 6, traffic class, flow label. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big and not allowed to fragment: notify our own stack so the
	 * upper layer (e.g. TCP) can adjust its MSS / path MTU.
	 */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
265
/*
 *	To avoid extra problems ND packets are send through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

/* Build a bare IPv6 header for a neighbour-discovery packet: version 6,
 * zero traffic class / flow label, next header @proto, payload length
 * @len, hop limit taken from the socket.  Always returns 0.
 */
int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	/* Version 6, zero traffic class and flow label. */
	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
298
/* Deliver a router-alert packet to every raw socket registered on
 * ip6_ra_chain for alert value @sel (and not bound to another device).
 * All matches but the last receive clones; the last gets the original
 * skb, so a return of 1 means the skb was consumed.  Returns 0 when no
 * socket matched and the caller still owns the skb.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Hand the previous match a clone; keep the
			 * original for the final matching socket.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
327
/* Decide how to treat a packet whose destination we proxy (pneigh):
 * returns 1 for unicast NDISC messages that must go to local input,
 * -1 for link-local destinations (signalled and to be dropped by the
 * caller), and 0 to forward normally.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	/* Skip any extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is in the
		 * linear area before reading it.
		 */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
378
/* NF_INET_FORWARD okfn: continue a forwarded packet to the output path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
383
/* Forward a received IPv6 packet: validate it, honour router-alert
 * options, apply the proxy-NDP and xfrm policy checks, emit a redirect
 * when the packet leaves through the interface it arrived on, decrement
 * the hop limit and pass the packet through the NF_INET_FORWARD hook to
 * ip6_forward_finish().  Consumes the skb on every path.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	/* Forwarding must be enabled in this netns. */
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* LRO-merged frames must not be forwarded. */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Only packets unicast to this host at L2 are forwarded. */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without ane WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		/* ptr[2..3] hold the 16-bit router-alert value. */
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm may have rerouted the packet; re-read the dst. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	/* Routers never fragment; tell the sender to shrink the packet. */
	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow may have reallocated the header. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
540
/* Propagate per-packet metadata from the original skb to a freshly
 * built fragment, so the fragment is routed, scheduled and filtered
 * identically to the packet it came from.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* The fragment takes its own reference on the shared dst. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
561
/* Walk the extension-header chain to find the offset (from the network
 * header) at which a fragment header must be inserted — i.e. past the
 * per-hop headers (hop-by-hop, routing, and destination options that
 * either carry a Home Address option or precede a routing header).
 * On return *nexthdr points at the nexthdr byte the caller will patch
 * to NEXTHDR_FRAGMENT.
 *
 * Returns the insertion offset, or -EINVAL when the chain is truncated
 * or would exceed IPV6_MAXPLEN.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	unsigned int offset = sizeof(struct ipv6hdr);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset <= packet_len) {
		struct ipv6_opt_hdr *exthdr;
		unsigned int len;

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			/* A dest-opts header carrying a Home Address option
			 * stays in front of the fragment header: keep walking.
			 */
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default :
			return offset;
		}

		/* Bounds-check before reading the next extension header. */
		if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
			return -EINVAL;

		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
		len = ipv6_optlen(exthdr);
		if (len + offset >= IPV6_MAXPLEN)
			return -EINVAL;
		offset += len;
		*nexthdr = &exthdr->nexthdr;
	}

	return -EINVAL;
}
606
/* Choose the fragment identification for this route: jhash the route's
 * destination and source prefixes with a lazily initialized random
 * seed, then reserve an id from the shared counter table via
 * ip_idents_reserve().
 */
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static u32 ip6_idents_hashrnd __read_mostly;
	static bool hashrnd_initialized = false;
	u32 hash, id;

	/* NOTE(review): this lazy init is unsynchronized — two first
	 * callers may both write the seed. Looks benign (the seed stays
	 * random either way), but confirm against upstream intent.
	 */
	if (unlikely(!hashrnd_initialized)) {
		hashrnd_initialized = true;
		get_random_bytes(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
	}
	hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd);
	hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash);

	id = ip_idents_reserve(hash, 1);
	fhdr->identification = htonl(id);
}
623
624 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
625 {
626         struct sk_buff *frag;
627         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
628         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629         struct ipv6hdr *tmp_hdr;
630         struct frag_hdr *fh;
631         unsigned int mtu, hlen, left, len;
632         int hroom, troom;
633         __be32 frag_id = 0;
634         int ptr, offset = 0, err=0;
635         u8 *prevhdr, nexthdr = 0;
636         struct net *net = dev_net(skb_dst(skb)->dev);
637
638         err = ip6_find_1stfragopt(skb, &prevhdr);
639         if (err < 0)
640                 goto fail;
641         hlen = err;
642         nexthdr = *prevhdr;
643
644         mtu = ip6_skb_dst_mtu(skb);
645
646         /* We must not fragment if the socket is set to force MTU discovery
647          * or if the skb it not generated by a local socket.
648          */
649         if (!skb->local_df && skb->len > mtu) {
650                 skb->dev = skb_dst(skb)->dev;
651                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
652                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
653                               IPSTATS_MIB_FRAGFAILS);
654                 kfree_skb(skb);
655                 return -EMSGSIZE;
656         }
657
658         if (np && np->frag_size < mtu) {
659                 if (np->frag_size)
660                         mtu = np->frag_size;
661         }
662         mtu -= hlen + sizeof(struct frag_hdr);
663
664         if (skb_has_frag_list(skb)) {
665                 int first_len = skb_pagelen(skb);
666                 struct sk_buff *frag2;
667
668                 if (first_len - hlen > mtu ||
669                     ((first_len - hlen) & 7) ||
670                     skb_cloned(skb))
671                         goto slow_path;
672
673                 skb_walk_frags(skb, frag) {
674                         /* Correct geometry. */
675                         if (frag->len > mtu ||
676                             ((frag->len & 7) && frag->next) ||
677                             skb_headroom(frag) < hlen)
678                                 goto slow_path_clean;
679
680                         /* Partially cloned skb? */
681                         if (skb_shared(frag))
682                                 goto slow_path_clean;
683
684                         BUG_ON(frag->sk);
685                         if (skb->sk) {
686                                 frag->sk = skb->sk;
687                                 frag->destructor = sock_wfree;
688                         }
689                         skb->truesize -= frag->truesize;
690                 }
691
692                 err = 0;
693                 offset = 0;
694                 frag = skb_shinfo(skb)->frag_list;
695                 skb_frag_list_init(skb);
696                 /* BUILD HEADER */
697
698                 *prevhdr = NEXTHDR_FRAGMENT;
699                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
700                 if (!tmp_hdr) {
701                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
702                                       IPSTATS_MIB_FRAGFAILS);
703                         return -ENOMEM;
704                 }
705
706                 __skb_pull(skb, hlen);
707                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
708                 __skb_push(skb, hlen);
709                 skb_reset_network_header(skb);
710                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
711
712                 ipv6_select_ident(fh, rt);
713                 fh->nexthdr = nexthdr;
714                 fh->reserved = 0;
715                 fh->frag_off = htons(IP6_MF);
716                 frag_id = fh->identification;
717
718                 first_len = skb_pagelen(skb);
719                 skb->data_len = first_len - skb_headlen(skb);
720                 skb->len = first_len;
721                 ipv6_hdr(skb)->payload_len = htons(first_len -
722                                                    sizeof(struct ipv6hdr));
723
724                 dst_hold(&rt->dst);
725
726                 for (;;) {
727                         /* Prepare header of the next frame,
728                          * before previous one went down. */
729                         if (frag) {
730                                 frag->ip_summed = CHECKSUM_NONE;
731                                 skb_reset_transport_header(frag);
732                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
733                                 __skb_push(frag, hlen);
734                                 skb_reset_network_header(frag);
735                                 memcpy(skb_network_header(frag), tmp_hdr,
736                                        hlen);
737                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
738                                 fh->nexthdr = nexthdr;
739                                 fh->reserved = 0;
740                                 fh->frag_off = htons(offset);
741                                 if (frag->next != NULL)
742                                         fh->frag_off |= htons(IP6_MF);
743                                 fh->identification = frag_id;
744                                 ipv6_hdr(frag)->payload_len =
745                                                 htons(frag->len -
746                                                       sizeof(struct ipv6hdr));
747                                 ip6_copy_metadata(frag, skb);
748                         }
749
750                         err = output(skb);
751                         if(!err)
752                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
753                                               IPSTATS_MIB_FRAGCREATES);
754
755                         if (err || !frag)
756                                 break;
757
758                         skb = frag;
759                         frag = skb->next;
760                         skb->next = NULL;
761                 }
762
763                 kfree(tmp_hdr);
764
765                 if (err == 0) {
766                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
767                                       IPSTATS_MIB_FRAGOKS);
768                         dst_release(&rt->dst);
769                         return 0;
770                 }
771
772                 while (frag) {
773                         skb = frag->next;
774                         kfree_skb(frag);
775                         frag = skb;
776                 }
777
778                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
779                               IPSTATS_MIB_FRAGFAILS);
780                 dst_release(&rt->dst);
781                 return err;
782
783 slow_path_clean:
784                 skb_walk_frags(skb, frag2) {
785                         if (frag2 == frag)
786                                 break;
787                         frag2->sk = NULL;
788                         frag2->destructor = NULL;
789                         skb->truesize += frag2->truesize;
790                 }
791         }
792
793 slow_path:
794         left = skb->len - hlen;         /* Space per frame */
795         ptr = hlen;                     /* Where to start from */
796
797         /*
798          *      Fragment the datagram.
799          */
800
801         *prevhdr = NEXTHDR_FRAGMENT;
802         hroom = LL_RESERVED_SPACE(rt->dst.dev);
803         troom = rt->dst.dev->needed_tailroom;
804
805         /*
806          *      Keep copying data until we run out.
807          */
808         while(left > 0) {
809                 len = left;
810                 /* IF: it doesn't fit, use 'mtu' - the data space left */
811                 if (len > mtu)
812                         len = mtu;
813                 /* IF: we are not sending up to and including the packet end
814                    then align the next start on an eight byte boundary */
815                 if (len < left) {
816                         len &= ~7;
817                 }
818                 /*
819                  *      Allocate buffer.
820                  */
821
822                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
823                                       hroom + troom, GFP_ATOMIC)) == NULL) {
824                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
825                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
826                                       IPSTATS_MIB_FRAGFAILS);
827                         err = -ENOMEM;
828                         goto fail;
829                 }
830
831                 /*
832                  *      Set up data on packet
833                  */
834
835                 ip6_copy_metadata(frag, skb);
836                 skb_reserve(frag, hroom);
837                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
838                 skb_reset_network_header(frag);
839                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
840                 frag->transport_header = (frag->network_header + hlen +
841                                           sizeof(struct frag_hdr));
842
843                 /*
844                  *      Charge the memory for the fragment to any owner
845                  *      it might possess
846                  */
847                 if (skb->sk)
848                         skb_set_owner_w(frag, skb->sk);
849
850                 /*
851                  *      Copy the packet header into the new buffer.
852                  */
853                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
854
855                 /*
856                  *      Build fragment header.
857                  */
858                 fh->nexthdr = nexthdr;
859                 fh->reserved = 0;
860                 if (!frag_id) {
861                         ipv6_select_ident(fh, rt);
862                         frag_id = fh->identification;
863                 } else
864                         fh->identification = frag_id;
865
866                 /*
867                  *      Copy a block of the IP datagram.
868                  */
869                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
870                         BUG();
871                 left -= len;
872
873                 fh->frag_off = htons(offset);
874                 if (left > 0)
875                         fh->frag_off |= htons(IP6_MF);
876                 ipv6_hdr(frag)->payload_len = htons(frag->len -
877                                                     sizeof(struct ipv6hdr));
878
879                 ptr += len;
880                 offset += len;
881
882                 /*
883                  *      Put this fragment into the sending queue.
884                  */
885                 err = output(frag);
886                 if (err)
887                         goto fail;
888
889                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890                               IPSTATS_MIB_FRAGCREATES);
891         }
892         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
893                       IPSTATS_MIB_FRAGOKS);
894         kfree_skb(skb);
895         return err;
896
897 fail:
898         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
899                       IPSTATS_MIB_FRAGFAILS);
900         kfree_skb(skb);
901         return err;
902 }
903
904 static inline int ip6_rt_check(const struct rt6key *rt_key,
905                                const struct in6_addr *fl_addr,
906                                const struct in6_addr *addr_cache)
907 {
908         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
909                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
910 }
911
912 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
913                                           struct dst_entry *dst,
914                                           const struct flowi6 *fl6)
915 {
916         struct ipv6_pinfo *np = inet6_sk(sk);
917         struct rt6_info *rt;
918
919         if (!dst)
920                 goto out;
921
922         if (dst->ops->family != AF_INET6) {
923                 dst_release(dst);
924                 return NULL;
925         }
926
927         rt = (struct rt6_info *)dst;
928         /* Yes, checking route validity in not connected
929          * case is not very simple. Take into account,
930          * that we do not support routing by source, TOS,
931          * and MSG_DONTROUTE            --ANK (980726)
932          *
933          * 1. ip6_rt_check(): If route was host route,
934          *    check that cached destination is current.
935          *    If it is network route, we still may
936          *    check its validity using saved pointer
937          *    to the last used address: daddr_cache.
938          *    We do not want to save whole address now,
939          *    (because main consumer of this service
940          *    is tcp, which has not this problem),
941          *    so that the last trick works only on connected
942          *    sockets.
943          * 2. oif also should be the same.
944          */
945         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
946 #ifdef CONFIG_IPV6_SUBTREES
947             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
948 #endif
949             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
950                 dst_release(dst);
951                 dst = NULL;
952         }
953
954 out:
955         return dst;
956 }
957
/*
 * ip6_dst_lookup_tail - common tail of the dst lookup helpers.
 *
 * If *dst is NULL a fresh route lookup is performed; a source address is
 * selected when the flow does not carry one.  On failure the dst reference
 * is dropped, *dst is reset to NULL and a negative errno is returned.
 * On success returns 0 with *dst holding a referenced entry.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	/* Flow has no source address yet: pick one for this route. */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		/* Unlock before the ifaddr lookup; n is not used below. */
		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Zero daddr so the lookup yields the default route. */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1032
1033 /**
1034  *      ip6_dst_lookup - perform route lookup on flow
1035  *      @sk: socket which provides route info
1036  *      @dst: pointer to dst_entry * for result
1037  *      @fl6: flow to lookup
1038  *
1039  *      This function performs a route lookup on the given flow.
1040  *
1041  *      It returns zero on success, or a standard errno code on error.
1042  */
1043 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1044 {
1045         *dst = NULL;
1046         return ip6_dst_lookup_tail(sk, dst, fl6);
1047 }
1048 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1049
1050 /**
1051  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1052  *      @sk: socket which provides route info
1053  *      @fl6: flow to lookup
1054  *      @final_dst: final destination address for ipsec lookup
1055  *      @can_sleep: we are in a sleepable context
1056  *
1057  *      This function performs a route lookup on the given flow.
1058  *
1059  *      It returns a valid dst pointer on success, or a pointer encoded
1060  *      error code.
1061  */
1062 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1063                                       const struct in6_addr *final_dst,
1064                                       bool can_sleep)
1065 {
1066         struct dst_entry *dst = NULL;
1067         int err;
1068
1069         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1070         if (err)
1071                 return ERR_PTR(err);
1072         if (final_dst)
1073                 ipv6_addr_copy(&fl6->daddr, final_dst);
1074         if (can_sleep)
1075                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1076
1077         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1078 }
1079 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1080
1081 /**
1082  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1083  *      @sk: socket which provides the dst cache and route info
1084  *      @fl6: flow to lookup
1085  *      @final_dst: final destination address for ipsec lookup
1086  *      @can_sleep: we are in a sleepable context
1087  *
1088  *      This function performs a route lookup on the given flow with the
1089  *      possibility of using the cached route in the socket if it is valid.
1090  *      It will take the socket dst lock when operating on the dst cache.
1091  *      As a result, this function can only be used in process context.
1092  *
1093  *      It returns a valid dst pointer on success, or a pointer encoded
1094  *      error code.
1095  */
1096 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1097                                          const struct in6_addr *final_dst,
1098                                          bool can_sleep)
1099 {
1100         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1101         int err;
1102
1103         dst = ip6_sk_dst_check(sk, dst, fl6);
1104
1105         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1106         if (err)
1107                 return ERR_PTR(err);
1108         if (final_dst)
1109                 ipv6_addr_copy(&fl6->daddr, final_dst);
1110         if (can_sleep)
1111                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1112
1113         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1114 }
1115 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1116
/*
 * ip6_ufo_append_data - append data for a UFO-capable device.
 *
 * Builds (or extends) a single skb covering the whole UDP datagram and
 * lets the device segment it (SKB_GSO_UDP).  Payload beyond the headers
 * is attached as page frags via skb_append_datato_frags().
 * Returns 0 on success or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int exthdrlen, int transhdrlen, int mtu,
			unsigned int flags, struct rt6_info *rt)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		struct frag_hdr fhdr;

		/* NOTE(review): the extra 20 bytes of headroom look like
		 * slack for option headers — TODO confirm.
		 */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_set_network_header(skb, exthdrlen);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* Device computes the checksum during segmentation. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		/* Pick one fragment id now; the device reuses it for all
		 * fragments it emits for this datagram. */
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1169
1170 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1171                                                gfp_t gfp)
1172 {
1173         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1174 }
1175
1176 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1177                                                 gfp_t gfp)
1178 {
1179         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1180 }
1181
1182 static void ip6_append_data_mtu(unsigned int *mtu,
1183                                 int *maxfraglen,
1184                                 unsigned int fragheaderlen,
1185                                 struct sk_buff *skb,
1186                                 struct rt6_info *rt,
1187                                 unsigned int orig_mtu)
1188 {
1189         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1190                 if (skb == NULL) {
1191                         /* first fragment, reserve header_len */
1192                         *mtu = orig_mtu - rt->dst.header_len;
1193
1194                 } else {
1195                         /*
1196                          * this fragment is not first, the headers
1197                          * space is regarded as data space.
1198                          */
1199                         *mtu = orig_mtu;
1200                 }
1201                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1202                               + fragheaderlen - sizeof(struct frag_hdr);
1203         }
1204 }
1205
/*
 * ip6_append_data - queue data on the socket's cork for later transmission.
 *
 * Appends @length bytes (fetched via @getfrag) to the per-socket write
 * queue, building fragment-sized skbs as needed.  The first call on an
 * empty queue sets up the cork state (options, route, mtu); subsequent
 * calls reuse it.  Data is flushed by ip6_push_pending_frames().
 * Returns 0 on success or a negative errno; on error the bytes counted
 * into cork->length for this call are backed out.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking: duplicate the caller's options into
		 * np->cork so they survive across append calls.
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Choose the mtu: device mtu when probing PMTU, otherwise
		 * the dst mtu; for non-tunnel routes use the path dst. */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Queue not empty: reuse the cork state set up earlier. */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	/* Total datagram payload must fit in the 16-bit payload_len. */
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	/* IPV6_DONTFRAG: report the mtu instead of fragmenting. */
	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
					   sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	/* Hand off to the UFO path when the device can segment for us. */
	if (((length > mtu) ||
	     (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) && !dst_xfrm(&rt->dst) &&
	    (sk->sk_type == SOCK_DGRAM)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			/* First skb may block; later ones are best-effort
			 * bounded by 2 * sk_sndbuf. */
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-byte-alignment overhang from the
				 * previous skb into this one. */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append into page fragments, reusing
			 * the socket's current send page when possible. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Back out the bytes we claimed for this call and account the drop. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1572
1573 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1574 {
1575         if (np->cork.opt) {
1576                 kfree(np->cork.opt->dst0opt);
1577                 kfree(np->cork.opt->dst1opt);
1578                 kfree(np->cork.opt->hopopt);
1579                 kfree(np->cork.opt->srcrt);
1580                 kfree(np->cork.opt);
1581                 np->cork.opt = NULL;
1582         }
1583
1584         if (inet->cork.base.dst) {
1585                 dst_release(inet->cork.base.dst);
1586                 inet->cork.base.dst = NULL;
1587                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1588         }
1589         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1590 }
1591
/*
 * ip6_push_pending_frames - build the final packet from corked data and send it
 * @sk: socket whose sk_write_queue holds the skbs queued by ip6_append_data()
 *
 * Collapses every queued skb into the first one (the rest become its
 * frag_list), prepends any extension headers and the IPv6 header using
 * the flow/cork state saved at ip6_append_data() time, updates SNMP
 * counters and hands the packet to ip6_local_out().  The cork state is
 * always released before returning, on both success and error paths.
 *
 * Returns 0 on success (or when nothing was queued), otherwise a
 * negative errno from the output path.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	/* Nothing queued: just release the cork state and succeed. */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain every remaining queued skb onto the head skb's frag_list,
	 * stripping their duplicate network headers and folding their
	 * len/truesize accounting into the head skb.  Ownership moves to
	 * the head skb, so each fragment drops its socket reference.
	 */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* A routing header (pushed below) may rewrite fl6->daddr; keep a
	 * private copy of the final destination for the IPv6 header.
	 */
	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* First 32-bit word: version 6, traffic class, flow label. */
	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* A positive return is a congestion notification; map it
		 * to an errno (or success) with net_xmit_errno().
		 */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1675
1676 void ip6_flush_pending_frames(struct sock *sk)
1677 {
1678         struct sk_buff *skb;
1679
1680         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1681                 if (skb_dst(skb))
1682                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1683                                       IPSTATS_MIB_OUTDISCARDS);
1684                 kfree_skb(skb);
1685         }
1686
1687         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1688 }