ipv6: some ipv6 statistic counters failed to disable bh
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
int ip6_local_out(struct sk_buff *skb)
{
	int ret = __ip6_local_out(skb);

	/*
	 * A return value of 1 from the netfilter hook means the packet
	 * was neither stolen nor dropped, so hand it to the output
	 * path ourselves; any other value is passed back unchanged.
	 */
	return likely(ret == 1) ? dst_output(skb) : ret;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
/* dev_loopback_xmit for use with netfilter.
 *
 * Used (via NF_HOOK) by ip6_finish_output2() to loop a clone of an
 * outgoing multicast packet back into the local receive path.
 * Resets the header offsets so the skb looks freshly received, marks
 * it as loopback traffic, and queues it with netif_rx_ni().
 */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* Locally generated: checksum already known to be good. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));	/* caller must have attached a dst */

	netif_rx_ni(newskb);
	return 0;
}
98
/*
 * Final step of the IPv6 output path: optionally loop multicast back
 * to the local stack, then resolve the dst's neighbour and transmit
 * through it.  Consumes the skb; returns the neighbour output result,
 * 0 for a purely-local multicast delivery, or -EINVAL when no
 * neighbour entry is attached to the dst.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to ourselves when the sending socket
		 * has multicast loopback enabled and either a multicast
		 * router socket wants the packet (and it was not already
		 * forwarded) or this host is a member of the destination
		 * group on the egress device. */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* With hop_limit 0 only the looped-back copy
			 * survives; the original is never put on the
			 * wire. */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* The neighbour attached to the dst is RCU-protected; keep the
	 * read-side lock held across the transmit. */
	rcu_read_lock();
	neigh = dst_get_neighbour(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS(dev_net(dst->dev),
		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
/*
 * Entry point of the IPv6 output path after routing.  Discards the
 * packet (counting OUTDISCARDS) when IPv6 is administratively
 * disabled on the egress device; otherwise traverses POST_ROUTING
 * and continues in ip6_finish_output() -- unless the skb is marked
 * IP6SKB_REROUTED, in which case the hook is skipped.
 */
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
177
/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 *
 *	Prepends the fixed IPv6 header -- and any extension headers
 *	described by @opt -- to the transport payload already in @skb,
 *	then sends the packet through the LOCAL_OUT netfilter hook.
 *	Consumes the skb on every path.  Returns the netfilter verdict
 *	on success, -ENOBUFS when headroom reallocation fails, or
 *	-EMSGSIZE when the packet exceeds the path MTU and must not be
 *	sent (an ICMPV6_PKT_TOOBIG is delivered to our own stack).
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			/* Not enough headroom for the extension headers:
			 * reallocate, then transfer ownership to the new
			 * skb and free the old one. */
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		/* Push fragmentable options first, then the per-hop
		 * (non-fragmentable) ones; first_hop may be rewritten
		 * by a routing header. */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	/* Hop limit: socket setting when available, else route default. */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* First 32-bit word: version 6, traffic class, flow label. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	/* Send when the packet fits, the socket disabled the local DF
	 * check, or GSO will segment it below the MTU later. */
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big for the path MTU: report it to our own stack so the
	 * socket learns the MTU, and drop the packet. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are send through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
/*
 * Build a minimal IPv6 header for a neighbour-discovery packet:
 * version 6 with zero traffic class and flow label, payload length
 * @len, next header @proto, hop limit taken from the socket, and the
 * given source/destination addresses.  Always returns 0.
 */
int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	/* First word: version 6, no traffic class, no flow label. */
	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}
298
/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered on ip6_ra_chain whose selector equals @sel and whose
 * device binding (if any) matches the receiving device.  All matching
 * sockets but the last receive a clone; the last one consumes the
 * original skb.
 *
 * Returns 1 when the skb was delivered (caller must not touch it
 * again), 0 when no listener matched (caller keeps ownership).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				/* Previous match gets a clone; a failed
				 * clone is silently skipped (best effort). */
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
327
/*
 * Decide how to handle a packet whose destination address this host
 * proxies (a pneigh entry exists).  Returns:
 *   1  -- pass the packet to the local input path (it is a unicast
 *         neighbour-discovery message for the proxied address),
 *   0  -- continue normal forwarding,
 *  -1  -- discard: the destination is link-local, which a proxying
 *         router cannot forward (dst_link_failure() has been called).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	/* Skip any extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
378
/* okfn for the FORWARD hook: hand the skb to the dst output path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
383
384 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
385 {
386         if (skb->len <= mtu || skb->local_df)
387                 return false;
388
389         if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
390                 return false;
391
392         return true;
393 }
394
395 int ip6_forward(struct sk_buff *skb)
396 {
397         struct dst_entry *dst = skb_dst(skb);
398         struct ipv6hdr *hdr = ipv6_hdr(skb);
399         struct inet6_skb_parm *opt = IP6CB(skb);
400         struct net *net = dev_net(dst->dev);
401         struct neighbour *n;
402         u32 mtu;
403
404         if (net->ipv6.devconf_all->forwarding == 0)
405                 goto error;
406
407         if (skb_warn_if_lro(skb))
408                 goto drop;
409
410         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
411                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
412                 goto drop;
413         }
414
415         if (skb->pkt_type != PACKET_HOST)
416                 goto drop;
417
418         skb_forward_csum(skb);
419
420         /*
421          *      We DO NOT make any processing on
422          *      RA packets, pushing them to user level AS IS
423          *      without ane WARRANTY that application will be able
424          *      to interpret them. The reason is that we
425          *      cannot make anything clever here.
426          *
427          *      We are not end-node, so that if packet contains
428          *      AH/ESP, we cannot make anything.
429          *      Defragmentation also would be mistake, RA packets
430          *      cannot be fragmented, because there is no warranty
431          *      that different fragments will go along one path. --ANK
432          */
433         if (opt->ra) {
434                 u8 *ptr = skb_network_header(skb) + opt->ra;
435                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
436                         return 0;
437         }
438
439         /*
440          *      check and decrement ttl
441          */
442         if (hdr->hop_limit <= 1) {
443                 /* Force OUTPUT device used as source address */
444                 skb->dev = dst->dev;
445                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
446                 IP6_INC_STATS_BH(net,
447                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
448
449                 kfree_skb(skb);
450                 return -ETIMEDOUT;
451         }
452
453         /* XXX: idev->cnf.proxy_ndp? */
454         if (net->ipv6.devconf_all->proxy_ndp &&
455             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
456                 int proxied = ip6_forward_proxy_check(skb);
457                 if (proxied > 0)
458                         return ip6_input(skb);
459                 else if (proxied < 0) {
460                         IP6_INC_STATS(net, ip6_dst_idev(dst),
461                                       IPSTATS_MIB_INDISCARDS);
462                         goto drop;
463                 }
464         }
465
466         if (!xfrm6_route_forward(skb)) {
467                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
468                 goto drop;
469         }
470         dst = skb_dst(skb);
471
472         /* IPv6 specs say nothing about it, but it is clear that we cannot
473            send redirects to source routed frames.
474            We don't send redirects to frames decapsulated from IPsec.
475          */
476         n = dst_get_neighbour(dst);
477         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
478                 struct in6_addr *target = NULL;
479                 struct rt6_info *rt;
480
481                 /*
482                  *      incoming and outgoing devices are the same
483                  *      send a redirect.
484                  */
485
486                 rt = (struct rt6_info *) dst;
487                 if ((rt->rt6i_flags & RTF_GATEWAY))
488                         target = (struct in6_addr*)&n->primary_key;
489                 else
490                         target = &hdr->daddr;
491
492                 if (!rt->rt6i_peer)
493                         rt6_bind_peer(rt, 1);
494
495                 /* Limit redirects both by destination (here)
496                    and by source (inside ndisc_send_redirect)
497                  */
498                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
499                         ndisc_send_redirect(skb, n, target);
500         } else {
501                 int addrtype = ipv6_addr_type(&hdr->saddr);
502
503                 /* This check is security critical. */
504                 if (addrtype == IPV6_ADDR_ANY ||
505                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
506                         goto error;
507                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
508                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
509                                     ICMPV6_NOT_NEIGHBOUR, 0);
510                         goto error;
511                 }
512         }
513
514         mtu = dst_mtu(dst);
515         if (mtu < IPV6_MIN_MTU)
516                 mtu = IPV6_MIN_MTU;
517
518         if (ip6_pkt_too_big(skb, mtu)) {
519                 /* Again, force OUTPUT device used as source address */
520                 skb->dev = dst->dev;
521                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
522                 IP6_INC_STATS_BH(net,
523                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
524                 IP6_INC_STATS_BH(net,
525                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
526                 kfree_skb(skb);
527                 return -EMSGSIZE;
528         }
529
530         if (skb_cow(skb, dst->dev->hard_header_len)) {
531                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
532                 goto drop;
533         }
534
535         hdr = ipv6_hdr(skb);
536
537         /* Mangling hops number delayed to point after skb COW */
538
539         hdr->hop_limit--;
540
541         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
542         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
543                        ip6_forward_finish);
544
545 error:
546         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
547 drop:
548         kfree_skb(skb);
549         return -EINVAL;
550 }
551
/*
 * Copy the per-packet metadata from @from to the fragment @to so the
 * fragment is routed, classified and filtered like the original.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Replace any stale dst with a fresh reference to the original's. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	/* Netfilter conntrack/trace state travels with each fragment. */
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
572
/*
 * Find the offset (from the network header) at which a fragment
 * header must be inserted: just past the extension headers that
 * belong in the unfragmentable part (hop-by-hop, routing, and any
 * destination-options header preceding a routing header or carrying
 * a Home Address option).  On return *nexthdr points at the nexthdr
 * byte the caller will overwrite with NEXTHDR_FRAGMENT.
 *
 * NOTE(review): ipv6_optlen(exthdr) is trusted without verifying it
 * keeps offset inside packet_len -- presumably safe because callers
 * pass locally generated, well-formed packets; confirm for any new
 * caller.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			/* A dest-opts header carrying a Home Address
			 * option stays in the unfragmentable part. */
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			/* Dest opts after a routing header start the
			 * fragmentable part. */
			if (found_rhdr)
				return offset;
			break;
		default :
			return offset;
		}

		/* Advance past this extension header. */
		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
611
/*
 * Choose the 32-bit fragment identification for @fhdr.  When the
 * route has an inet_peer attached (and DST_NOPEER is not set) the
 * per-peer id sequence is used; otherwise fall back to a global
 * lock-free counter.  The fallback never produces 0, which callers
 * (see ip6_fragment's slow path) use to mean "no id chosen yet".
 */
void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	/* Global counter: cmpxchg loop instead of a lock; skip 0. */
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}
636
637 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
638 {
639         struct sk_buff *frag;
640         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
641         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
642         struct ipv6hdr *tmp_hdr;
643         struct frag_hdr *fh;
644         unsigned int mtu, hlen, left, len;
645         __be32 frag_id = 0;
646         int ptr, offset = 0, err=0;
647         u8 *prevhdr, nexthdr = 0;
648         struct net *net = dev_net(skb_dst(skb)->dev);
649
650         hlen = ip6_find_1stfragopt(skb, &prevhdr);
651         nexthdr = *prevhdr;
652
653         mtu = ip6_skb_dst_mtu(skb);
654
655         /* We must not fragment if the socket is set to force MTU discovery
656          * or if the skb it not generated by a local socket.
657          */
658         if (!skb->local_df && skb->len > mtu) {
659                 skb->dev = skb_dst(skb)->dev;
660                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
661                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
662                               IPSTATS_MIB_FRAGFAILS);
663                 kfree_skb(skb);
664                 return -EMSGSIZE;
665         }
666
667         if (np && np->frag_size < mtu) {
668                 if (np->frag_size)
669                         mtu = np->frag_size;
670         }
671         mtu -= hlen + sizeof(struct frag_hdr);
672
673         if (skb_has_frag_list(skb)) {
674                 int first_len = skb_pagelen(skb);
675                 struct sk_buff *frag2;
676
677                 if (first_len - hlen > mtu ||
678                     ((first_len - hlen) & 7) ||
679                     skb_cloned(skb))
680                         goto slow_path;
681
682                 skb_walk_frags(skb, frag) {
683                         /* Correct geometry. */
684                         if (frag->len > mtu ||
685                             ((frag->len & 7) && frag->next) ||
686                             skb_headroom(frag) < hlen)
687                                 goto slow_path_clean;
688
689                         /* Partially cloned skb? */
690                         if (skb_shared(frag))
691                                 goto slow_path_clean;
692
693                         BUG_ON(frag->sk);
694                         if (skb->sk) {
695                                 frag->sk = skb->sk;
696                                 frag->destructor = sock_wfree;
697                         }
698                         skb->truesize -= frag->truesize;
699                 }
700
701                 err = 0;
702                 offset = 0;
703                 frag = skb_shinfo(skb)->frag_list;
704                 skb_frag_list_init(skb);
705                 /* BUILD HEADER */
706
707                 *prevhdr = NEXTHDR_FRAGMENT;
708                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
709                 if (!tmp_hdr) {
710                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
711                                       IPSTATS_MIB_FRAGFAILS);
712                         return -ENOMEM;
713                 }
714
715                 __skb_pull(skb, hlen);
716                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
717                 __skb_push(skb, hlen);
718                 skb_reset_network_header(skb);
719                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
720
721                 ipv6_select_ident(fh, rt);
722                 fh->nexthdr = nexthdr;
723                 fh->reserved = 0;
724                 fh->frag_off = htons(IP6_MF);
725                 frag_id = fh->identification;
726
727                 first_len = skb_pagelen(skb);
728                 skb->data_len = first_len - skb_headlen(skb);
729                 skb->len = first_len;
730                 ipv6_hdr(skb)->payload_len = htons(first_len -
731                                                    sizeof(struct ipv6hdr));
732
733                 dst_hold(&rt->dst);
734
735                 for (;;) {
736                         /* Prepare header of the next frame,
737                          * before previous one went down. */
738                         if (frag) {
739                                 frag->ip_summed = CHECKSUM_NONE;
740                                 skb_reset_transport_header(frag);
741                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
742                                 __skb_push(frag, hlen);
743                                 skb_reset_network_header(frag);
744                                 memcpy(skb_network_header(frag), tmp_hdr,
745                                        hlen);
746                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
747                                 fh->nexthdr = nexthdr;
748                                 fh->reserved = 0;
749                                 fh->frag_off = htons(offset);
750                                 if (frag->next != NULL)
751                                         fh->frag_off |= htons(IP6_MF);
752                                 fh->identification = frag_id;
753                                 ipv6_hdr(frag)->payload_len =
754                                                 htons(frag->len -
755                                                       sizeof(struct ipv6hdr));
756                                 ip6_copy_metadata(frag, skb);
757                         }
758
759                         err = output(skb);
760                         if(!err)
761                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
762                                               IPSTATS_MIB_FRAGCREATES);
763
764                         if (err || !frag)
765                                 break;
766
767                         skb = frag;
768                         frag = skb->next;
769                         skb->next = NULL;
770                 }
771
772                 kfree(tmp_hdr);
773
774                 if (err == 0) {
775                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
776                                       IPSTATS_MIB_FRAGOKS);
777                         dst_release(&rt->dst);
778                         return 0;
779                 }
780
781                 while (frag) {
782                         skb = frag->next;
783                         kfree_skb(frag);
784                         frag = skb;
785                 }
786
787                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
788                               IPSTATS_MIB_FRAGFAILS);
789                 dst_release(&rt->dst);
790                 return err;
791
792 slow_path_clean:
793                 skb_walk_frags(skb, frag2) {
794                         if (frag2 == frag)
795                                 break;
796                         frag2->sk = NULL;
797                         frag2->destructor = NULL;
798                         skb->truesize += frag2->truesize;
799                 }
800         }
801
802 slow_path:
803         left = skb->len - hlen;         /* Space per frame */
804         ptr = hlen;                     /* Where to start from */
805
806         /*
807          *      Fragment the datagram.
808          */
809
810         *prevhdr = NEXTHDR_FRAGMENT;
811
812         /*
813          *      Keep copying data until we run out.
814          */
815         while(left > 0) {
816                 len = left;
817                 /* IF: it doesn't fit, use 'mtu' - the data space left */
818                 if (len > mtu)
819                         len = mtu;
820                 /* IF: we are not sending up to and including the packet end
821                    then align the next start on an eight byte boundary */
822                 if (len < left) {
823                         len &= ~7;
824                 }
825                 /*
826                  *      Allocate buffer.
827                  */
828
829                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
830                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
831                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
832                                       IPSTATS_MIB_FRAGFAILS);
833                         err = -ENOMEM;
834                         goto fail;
835                 }
836
837                 /*
838                  *      Set up data on packet
839                  */
840
841                 ip6_copy_metadata(frag, skb);
842                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
843                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
844                 skb_reset_network_header(frag);
845                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
846                 frag->transport_header = (frag->network_header + hlen +
847                                           sizeof(struct frag_hdr));
848
849                 /*
850                  *      Charge the memory for the fragment to any owner
851                  *      it might possess
852                  */
853                 if (skb->sk)
854                         skb_set_owner_w(frag, skb->sk);
855
856                 /*
857                  *      Copy the packet header into the new buffer.
858                  */
859                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
860
861                 /*
862                  *      Build fragment header.
863                  */
864                 fh->nexthdr = nexthdr;
865                 fh->reserved = 0;
866                 if (!frag_id) {
867                         ipv6_select_ident(fh, rt);
868                         frag_id = fh->identification;
869                 } else
870                         fh->identification = frag_id;
871
872                 /*
873                  *      Copy a block of the IP datagram.
874                  */
875                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
876                         BUG();
877                 left -= len;
878
879                 fh->frag_off = htons(offset);
880                 if (left > 0)
881                         fh->frag_off |= htons(IP6_MF);
882                 ipv6_hdr(frag)->payload_len = htons(frag->len -
883                                                     sizeof(struct ipv6hdr));
884
885                 ptr += len;
886                 offset += len;
887
888                 /*
889                  *      Put this fragment into the sending queue.
890                  */
891                 err = output(frag);
892                 if (err)
893                         goto fail;
894
895                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
896                               IPSTATS_MIB_FRAGCREATES);
897         }
898         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
899                       IPSTATS_MIB_FRAGOKS);
900         kfree_skb(skb);
901         return err;
902
903 fail:
904         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
905                       IPSTATS_MIB_FRAGFAILS);
906         kfree_skb(skb);
907         return err;
908 }
909
910 static inline int ip6_rt_check(const struct rt6key *rt_key,
911                                const struct in6_addr *fl_addr,
912                                const struct in6_addr *addr_cache)
913 {
914         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
915                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
916 }
917
918 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
919                                           struct dst_entry *dst,
920                                           const struct flowi6 *fl6)
921 {
922         struct ipv6_pinfo *np = inet6_sk(sk);
923         struct rt6_info *rt;
924
925         if (!dst)
926                 goto out;
927
928         if (dst->ops->family != AF_INET6) {
929                 dst_release(dst);
930                 return NULL;
931         }
932
933         rt = (struct rt6_info *)dst;
934         /* Yes, checking route validity in not connected
935          * case is not very simple. Take into account,
936          * that we do not support routing by source, TOS,
937          * and MSG_DONTROUTE            --ANK (980726)
938          *
939          * 1. ip6_rt_check(): If route was host route,
940          *    check that cached destination is current.
941          *    If it is network route, we still may
942          *    check its validity using saved pointer
943          *    to the last used address: daddr_cache.
944          *    We do not want to save whole address now,
945          *    (because main consumer of this service
946          *    is tcp, which has not this problem),
947          *    so that the last trick works only on connected
948          *    sockets.
949          * 2. oif also should be the same.
950          */
951         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
952 #ifdef CONFIG_IPV6_SUBTREES
953             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
954 #endif
955             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
956                 dst_release(dst);
957                 dst = NULL;
958         }
959
960 out:
961         return dst;
962 }
963
/* Core of the dst lookup: resolve a route for @fl6 into *@dst (performing
 * a fresh ip6_route_output() only if the caller did not supply one), pick
 * a source address if the flow has none, and optionally redirect to the
 * default router's dst for optimistic-DAD addresses.  On failure the dst
 * reference is dropped, *@dst is cleared and a negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	/* Only look up a route when the caller handed us no dst. */
	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	/* ip6_route_output() never returns NULL; errors are carried in
	 * the dst itself. */
	if ((err = (*dst)->error))
		goto out_err_release;

	/* Flow has no source address yet: derive one from the route,
	 * honouring the socket's source-address preferences. */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		/* Neighbour pointer is not used past this point, so the
		 * RCU read side can end before the sleeping calls below. */
		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Zeroed daddr makes the lookup resolve via the
			 * default route (the nexthop router). */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	/* NOTE(review): _BH stats variant is used here per the "failed to
	 * disable bh" fix this tree carries — confirm against the SNMP
	 * macro semantics of this kernel version before changing. */
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1038
1039 /**
1040  *      ip6_dst_lookup - perform route lookup on flow
1041  *      @sk: socket which provides route info
1042  *      @dst: pointer to dst_entry * for result
1043  *      @fl6: flow to lookup
1044  *
1045  *      This function performs a route lookup on the given flow.
1046  *
1047  *      It returns zero on success, or a standard errno code on error.
1048  */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	/* Start from a clean slate so the tail helper always performs a
	 * fresh routing lookup rather than validating a cached dst. */
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1055
1056 /**
1057  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1058  *      @sk: socket which provides route info
1059  *      @fl6: flow to lookup
1060  *      @final_dst: final destination address for ipsec lookup
1061  *      @can_sleep: we are in a sleepable context
1062  *
1063  *      This function performs a route lookup on the given flow.
1064  *
1065  *      It returns a valid dst pointer on success, or a pointer encoded
1066  *      error code.
1067  */
1068 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1069                                       const struct in6_addr *final_dst,
1070                                       bool can_sleep)
1071 {
1072         struct dst_entry *dst = NULL;
1073         int err;
1074
1075         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1076         if (err)
1077                 return ERR_PTR(err);
1078         if (final_dst)
1079                 ipv6_addr_copy(&fl6->daddr, final_dst);
1080         if (can_sleep)
1081                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1082
1083         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1084 }
1085 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1086
1087 /**
1088  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1089  *      @sk: socket which provides the dst cache and route info
1090  *      @fl6: flow to lookup
1091  *      @final_dst: final destination address for ipsec lookup
1092  *      @can_sleep: we are in a sleepable context
1093  *
1094  *      This function performs a route lookup on the given flow with the
1095  *      possibility of using the cached route in the socket if it is valid.
1096  *      It will take the socket dst lock when operating on the dst cache.
1097  *      As a result, this function can only be used in process context.
1098  *
1099  *      It returns a valid dst pointer on success, or a pointer encoded
1100  *      error code.
1101  */
1102 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1103                                          const struct in6_addr *final_dst,
1104                                          bool can_sleep)
1105 {
1106         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1107         int err;
1108
1109         dst = ip6_sk_dst_check(sk, dst, fl6);
1110
1111         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1112         if (err)
1113                 return ERR_PTR(err);
1114         if (final_dst)
1115                 ipv6_addr_copy(&fl6->daddr, final_dst);
1116         if (can_sleep)
1117                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1118
1119         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1120 }
1121 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1122
/* Append datagram payload for UFO (UDP fragmentation offload) capable
 * devices: build one single large skb carrying the whole UDP datagram and
 * let the NIC (or the GSO layer) fragment it.  Returns 0 on success or a
 * negative errno from allocation / copy-in.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu,unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		struct frag_hdr fhdr;

		/* First call for this datagram: allocate the head skb.
		 * The extra 20 bytes of slack matches the IPv4 UFO path —
		 * TODO confirm the rationale for this constant. */
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* Checksum will be completed by hardware/GSO. */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		/* All fragments emitted later must share one fragment ID. */
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	/* Tack the user payload onto the (new or existing) tail skb as
	 * page fragments. */
	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
1176
1177 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1178                                                gfp_t gfp)
1179 {
1180         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1181 }
1182
1183 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1184                                                 gfp_t gfp)
1185 {
1186         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1187 }
1188
1189 static void ip6_append_data_mtu(unsigned int *mtu,
1190                                 int *maxfraglen,
1191                                 unsigned int fragheaderlen,
1192                                 struct sk_buff *skb,
1193                                 struct rt6_info *rt,
1194                                 unsigned int orig_mtu)
1195 {
1196         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1197                 if (skb == NULL) {
1198                         /* first fragment, reserve header_len */
1199                         *mtu = orig_mtu - rt->dst.header_len;
1200
1201                 } else {
1202                         /*
1203                          * this fragment is not first, the headers
1204                          * space is regarded as data space.
1205                          */
1206                         *mtu = orig_mtu;
1207                 }
1208                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1209                               + fragheaderlen - sizeof(struct frag_hdr);
1210         }
1211 }
1212
/*
 *	ip6_append_data - append user data to the socket's pending
 *	(corked) write queue, splitting it into MTU-sized skbs that
 *	ip6_push_pending_frames() will later turn into packets.
 *	@getfrag copies user data into kernel buffers.  Returns 0 or a
 *	negative errno; on error the queued length is rolled back and
 *	OUTDISCARDS is bumped.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	/* MSG_PROBE: path-MTU probing only, queue nothing. */
	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			/* Deep-copy the tx options so they survive until
			 * the cork is flushed; ip6_cork_release() frees
			 * all of these (partial copies included). */
			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		/* The cork holds a reference on the route until flush. */
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Pick the MTU: device MTU when probing PMTU, otherwise
		 * the (path) dst MTU; XFRM tunnels use the outer dst. */
		if (rt->dst.flags & DST_XFRM_TUNNEL)
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		else
			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
		/* A user-requested frag_size may only shrink the MTU. */
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		/* Fragmentable extension headers ride in front of the
		 * transport header of the first fragment only. */
		exthdrlen = (opt ? opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	} else {
		/* Subsequent call while corked: reuse the cached route,
		 * flow and options; headers were accounted already. */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header: IPv6 header + non-fragmentable extension
	 * headers; maxfraglen additionally reserves the fragment header
	 * and rounds the payload down to a multiple of 8. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	/* Enforce the 64 KiB IPv6 payload limit (no jumbograms here). */
	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	/* IPV6_DONTFRAG: report the MTU instead of fragmenting. */
	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
					   sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	skb = skb_peek_tail(&sk->sk_write_queue);
	cork->length += length;
	/* Hand oversized UDP datagrams to the UFO path if the device
	 * can offload fragmentation. */
	if (((length > mtu) ||
	     (skb && skb_has_frags(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length,
					  hh_len, fragheaderlen,
					  transhdrlen, mtu, flags, rt);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (skb == NULL || skb_prev == NULL)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			/* Without scatter-gather, MSG_MORE needs a full-MTU
			 * buffer so later appends can fill the tail. */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First skb of the datagram: may block on
				 * sndbuf space (unless MSG_DONTWAIT). */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Later fragments: non-blocking, allow up to
				 * twice sndbuf before giving up. */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-byte-alignment overhang of the
				 * previous skb into this one, keeping the
				 * running checksums consistent. */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Header accounting applies to the first skb only. */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* Linear copy into the skb's tail room. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append into the per-socket send
			 * page, adding page fragments to the skb. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				/* Current page exhausted: start a new one. */
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Roll back the un-queued remainder before reporting failure. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1577
1578 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1579 {
1580         if (np->cork.opt) {
1581                 kfree(np->cork.opt->dst0opt);
1582                 kfree(np->cork.opt->dst1opt);
1583                 kfree(np->cork.opt->hopopt);
1584                 kfree(np->cork.opt->srcrt);
1585                 kfree(np->cork.opt);
1586                 np->cork.opt = NULL;
1587         }
1588
1589         if (inet->cork.base.dst) {
1590                 dst_release(inet->cork.base.dst);
1591                 inet->cork.base.dst = NULL;
1592                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1593         }
1594         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1595 }
1596
/*
 * ip6_push_pending_frames - finalize and transmit all data queued on
 * sk->sk_write_queue by ip6_append_data().
 *
 * Coalesces the queued skbs into one skb (trailing skbs become the
 * frag_list of the first), pushes any corked extension headers, builds
 * the IPv6 header from the corked flow/options state, and hands the
 * packet to ip6_local_out().
 *
 * Returns 0 on success or a negative errno; cork state is released in
 * all cases, including the empty-queue case.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	/* Nothing queued: just release cork state and report success. */
	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/*
	 * Chain every remaining queued skb onto the first skb's frag_list,
	 * stripping their (duplicate) network headers and accounting their
	 * length/truesize into the head skb. Ownership moves to the head
	 * skb, so the per-fragment destructor/sk links are cleared.
	 */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/*
	 * Remember the flow's destination before option push: a routing
	 * header (srcrt) rewrites the wire daddr via ipv6_push_nfrag_opts.
	 */
	ipv6_addr_copy(final_dst, &fl6->daddr);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Version 6, traffic class from cork, flow label from the flow. */
	*(__be32*)hdr = fl6->flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	/*
	 * Process context here, so the non-_BH stats variants (which
	 * disable bottom halves themselves) are the correct ones.
	 */
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive values are qdisc congestion codes; remap them. */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1680
1681 void ip6_flush_pending_frames(struct sock *sk)
1682 {
1683         struct sk_buff *skb;
1684
1685         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1686                 if (skb_dst(skb))
1687                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1688                                       IPSTATS_MIB_OUTDISCARDS);
1689                 kfree_skb(skb);
1690         }
1691
1692         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1693 }