Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
[pandora-kernel.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
/* Forward declaration: ip6_fragment() is defined further down in this file
 * but is already called by ip6_finish_output() above its definition.
 */
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 *	Send a locally generated packet.
 *
 *	Runs __ip6_local_out(); only when that returns 1 is the skb handed
 *	on to dst_output().  Any other value is returned to the caller
 *	unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int rc = __ip6_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101         struct dst_entry *dst = skb_dst(skb);
102         struct net_device *dev = dst->dev;
103         struct neighbour *neigh;
104
105         skb->protocol = htons(ETH_P_IPV6);
106         skb->dev = dev;
107
108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112                     ((mroute6_socket(dev_net(dev), skb) &&
113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115                                          &ipv6_hdr(skb)->saddr))) {
116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118                         /* Do not check for IFF_ALLMULTI; multicast routing
119                            is not supported in any case.
120                          */
121                         if (newskb)
122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123                                         newskb, NULL, newskb->dev,
124                                         ip6_dev_loopback_xmit);
125
126                         if (ipv6_hdr(skb)->hop_limit == 0) {
127                                 IP6_INC_STATS(dev_net(dev), idev,
128                                               IPSTATS_MIB_OUTDISCARDS);
129                                 kfree_skb(skb);
130                                 return 0;
131                         }
132                 }
133
134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135                                 skb->len);
136         }
137
138         neigh = dst_get_neighbour(dst);
139         if (neigh)
140                 return neigh_output(neigh, skb);
141
142         IP6_INC_STATS_BH(dev_net(dst->dev),
143                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144         kfree_skb(skb);
145         return -EINVAL;
146 }
147
148 static int ip6_finish_output(struct sk_buff *skb)
149 {
150         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151             dst_allfrag(skb_dst(skb)))
152                 return ip6_fragment(skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(skb);
155 }
156
157 int ip6_output(struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161         if (unlikely(idev->cnf.disable_ipv6)) {
162                 IP6_INC_STATS(dev_net(dev), idev,
163                               IPSTATS_MIB_OUTDISCARDS);
164                 kfree_skb(skb);
165                 return 0;
166         }
167
168         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
169                             ip6_finish_output,
170                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 }
172
173 /*
174  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
175  */
176
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
178              struct ipv6_txoptions *opt)
179 {
180         struct net *net = sock_net(sk);
181         struct ipv6_pinfo *np = inet6_sk(sk);
182         struct in6_addr *first_hop = &fl6->daddr;
183         struct dst_entry *dst = skb_dst(skb);
184         struct ipv6hdr *hdr;
185         u8  proto = fl6->flowi6_proto;
186         int seg_len = skb->len;
187         int hlimit = -1;
188         int tclass = 0;
189         u32 mtu;
190
191         if (opt) {
192                 unsigned int head_room;
193
194                 /* First: exthdrs may take lots of space (~8K for now)
195                    MAX_HEADER is not enough.
196                  */
197                 head_room = opt->opt_nflen + opt->opt_flen;
198                 seg_len += head_room;
199                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
200
201                 if (skb_headroom(skb) < head_room) {
202                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
203                         if (skb2 == NULL) {
204                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205                                               IPSTATS_MIB_OUTDISCARDS);
206                                 kfree_skb(skb);
207                                 return -ENOBUFS;
208                         }
209                         kfree_skb(skb);
210                         skb = skb2;
211                         skb_set_owner_w(skb, sk);
212                 }
213                 if (opt->opt_flen)
214                         ipv6_push_frag_opts(skb, opt, &proto);
215                 if (opt->opt_nflen)
216                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
217         }
218
219         skb_push(skb, sizeof(struct ipv6hdr));
220         skb_reset_network_header(skb);
221         hdr = ipv6_hdr(skb);
222
223         /*
224          *      Fill in the IPv6 header
225          */
226         if (np) {
227                 tclass = np->tclass;
228                 hlimit = np->hop_limit;
229         }
230         if (hlimit < 0)
231                 hlimit = ip6_dst_hoplimit(dst);
232
233         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
234
235         hdr->payload_len = htons(seg_len);
236         hdr->nexthdr = proto;
237         hdr->hop_limit = hlimit;
238
239         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
240         ipv6_addr_copy(&hdr->daddr, first_hop);
241
242         skb->priority = sk->sk_priority;
243         skb->mark = sk->sk_mark;
244
245         mtu = dst_mtu(dst);
246         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248                               IPSTATS_MIB_OUT, skb->len);
249                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250                                dst->dev, dst_output);
251         }
252
253         if (net_ratelimit())
254                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
255         skb->dev = dst->dev;
256         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
258         kfree_skb(skb);
259         return -EMSGSIZE;
260 }
261
262 EXPORT_SYMBOL(ip6_xmit);
263
264 /*
265  *      To avoid extra problems ND packets are send through this
266  *      routine. It's code duplication but I really want to avoid
267  *      extra checks since ipv6_build_header is used by TCP (which
268  *      is for us performance critical)
269  */
270
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272                const struct in6_addr *saddr, const struct in6_addr *daddr,
273                int proto, int len)
274 {
275         struct ipv6_pinfo *np = inet6_sk(sk);
276         struct ipv6hdr *hdr;
277
278         skb->protocol = htons(ETH_P_IPV6);
279         skb->dev = dev;
280
281         skb_reset_network_header(skb);
282         skb_put(skb, sizeof(struct ipv6hdr));
283         hdr = ipv6_hdr(skb);
284
285         *(__be32*)hdr = htonl(0x60000000);
286
287         hdr->payload_len = htons(len);
288         hdr->nexthdr = proto;
289         hdr->hop_limit = np->hop_limit;
290
291         ipv6_addr_copy(&hdr->saddr, saddr);
292         ipv6_addr_copy(&hdr->daddr, daddr);
293
294         return 0;
295 }
296
297 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
298 {
299         struct ip6_ra_chain *ra;
300         struct sock *last = NULL;
301
302         read_lock(&ip6_ra_lock);
303         for (ra = ip6_ra_chain; ra; ra = ra->next) {
304                 struct sock *sk = ra->sk;
305                 if (sk && ra->sel == sel &&
306                     (!sk->sk_bound_dev_if ||
307                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
308                         if (last) {
309                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
310                                 if (skb2)
311                                         rawv6_rcv(last, skb2);
312                         }
313                         last = sk;
314                 }
315         }
316
317         if (last) {
318                 rawv6_rcv(last, skb);
319                 read_unlock(&ip6_ra_lock);
320                 return 1;
321         }
322         read_unlock(&ip6_ra_lock);
323         return 0;
324 }
325
326 static int ip6_forward_proxy_check(struct sk_buff *skb)
327 {
328         struct ipv6hdr *hdr = ipv6_hdr(skb);
329         u8 nexthdr = hdr->nexthdr;
330         int offset;
331
332         if (ipv6_ext_hdr(nexthdr)) {
333                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
334                 if (offset < 0)
335                         return 0;
336         } else
337                 offset = sizeof(struct ipv6hdr);
338
339         if (nexthdr == IPPROTO_ICMPV6) {
340                 struct icmp6hdr *icmp6;
341
342                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
343                                          offset + 1 - skb->data)))
344                         return 0;
345
346                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
347
348                 switch (icmp6->icmp6_type) {
349                 case NDISC_ROUTER_SOLICITATION:
350                 case NDISC_ROUTER_ADVERTISEMENT:
351                 case NDISC_NEIGHBOUR_SOLICITATION:
352                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
353                 case NDISC_REDIRECT:
354                         /* For reaction involving unicast neighbor discovery
355                          * message destined to the proxied address, pass it to
356                          * input function.
357                          */
358                         return 1;
359                 default:
360                         break;
361                 }
362         }
363
364         /*
365          * The proxying router can't forward traffic sent to a link-local
366          * address, so signal the sender and discard the packet. This
367          * behavior is clarified by the MIPv6 specification.
368          */
369         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
370                 dst_link_failure(skb);
371                 return -1;
372         }
373
374         return 0;
375 }
376
/*
 *	Completion callback of the NF_INET_FORWARD hook in ip6_forward():
 *	simply hands the packet to dst_output() and returns its result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int rc = dst_output(skb);

	return rc;
}
381
/*
 *      ip6_forward - forward a received IPv6 packet towards its next hop.
 *
 *      Steps, in the order they are performed below:
 *        - refuse if forwarding is disabled in devconf_all, if
 *          skb_warn_if_lro() flags the skb, if the xfrm forward policy
 *          check fails, or if the packet is not PACKET_HOST;
 *        - router-alert packets are offered to ip6_call_ra_chain();
 *        - hop limit <= 1 triggers an ICMPv6 time-exceeded error;
 *        - packets for a proxied neighbour are classified by
 *          ip6_forward_proxy_check();
 *        - xfrm6_route_forward() is consulted;
 *        - either a redirect is sent (same in/out device) or the source
 *          address is validated;
 *        - packets above the dst MTU (minimum IPV6_MIN_MTU) that are not
 *          GSO get an ICMPv6 packet-too-big error;
 *        - finally the hop limit is decremented on a private copy of the
 *          header and the packet goes through the NF_INET_FORWARD hook
 *          to ip6_forward_finish().
 *
 *      Returns 0 when a router-alert socket took the packet, the result
 *      of ip6_input() for proxied neighbour discovery, -ETIMEDOUT,
 *      -EMSGSIZE or -EINVAL on the failure paths (the skb is freed on
 *      each of them), otherwise the result of NF_HOOK().
 */
382 int ip6_forward(struct sk_buff *skb)
383 {
384         struct dst_entry *dst = skb_dst(skb);
385         struct ipv6hdr *hdr = ipv6_hdr(skb);
386         struct inet6_skb_parm *opt = IP6CB(skb);
387         struct net *net = dev_net(dst->dev);
388         struct neighbour *n;
389         u32 mtu;
390
391         if (net->ipv6.devconf_all->forwarding == 0)
392                 goto error;
393
394         if (skb_warn_if_lro(skb))
395                 goto drop;
396
397         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
398                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
399                 goto drop;
400         }
401
            /* Only packets addressed to this host at link level are forwarded. */
402         if (skb->pkt_type != PACKET_HOST)
403                 goto drop;
404
405         skb_forward_csum(skb);
406
407         /*
408          *      We DO NOT make any processing on
409          *      RA packets, pushing them to user level AS IS
410          *      without any WARRANTY that application will be able
411          *      to interpret them. The reason is that we
412          *      cannot make anything clever here.
413          *
414          *      We are not end-node, so that if packet contains
415          *      AH/ESP, we cannot make anything.
416          *      Defragmentation also would be mistake, RA packets
417          *      cannot be fragmented, because there is no warranty
418          *      that different fragments will go along one path. --ANK
419          */
420         if (opt->ra) {
                    /* opt->ra is the offset of the router alert option from
                     * the network header; bytes 2 and 3 of the option form
                     * the 16-bit selector passed to the RA chain.
                     */
421                 u8 *ptr = skb_network_header(skb) + opt->ra;
422                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
423                         return 0;
424         }
425
426         /*
427          *      check and decrement ttl
428          */
429         if (hdr->hop_limit <= 1) {
430                 /* Force OUTPUT device used as source address */
431                 skb->dev = dst->dev;
432                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
433                 IP6_INC_STATS_BH(net,
434                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
435
436                 kfree_skb(skb);
437                 return -ETIMEDOUT;
438         }
439
440         /* XXX: idev->cnf.proxy_ndp? */
441         if (net->ipv6.devconf_all->proxy_ndp &&
442             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                    /* > 0: deliver locally, < 0: discard, 0: keep forwarding. */
443                 int proxied = ip6_forward_proxy_check(skb);
444                 if (proxied > 0)
445                         return ip6_input(skb);
446                 else if (proxied < 0) {
447                         IP6_INC_STATS(net, ip6_dst_idev(dst),
448                                       IPSTATS_MIB_INDISCARDS);
449                         goto drop;
450                 }
451         }
452
453         if (!xfrm6_route_forward(skb)) {
454                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
455                 goto drop;
456         }
            /* Reload the dst; presumably xfrm6_route_forward() may have
             * replaced it - TODO confirm.
             */
457         dst = skb_dst(skb);
458
459         /* IPv6 specs say nothing about it, but it is clear that we cannot
460            send redirects to source routed frames.
461            We don't send redirects to frames decapsulated from IPsec.
462          */
463         n = dst_get_neighbour(dst);
464         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
465                 struct in6_addr *target = NULL;
466                 struct rt6_info *rt;
467
468                 /*
469                  *      incoming and outgoing devices are the same
470                  *      send a redirect.
471                  */
472
473                 rt = (struct rt6_info *) dst;
                    /* Via a gateway the redirect target is the neighbour's
                     * key, otherwise the destination address itself.
                     */
474                 if ((rt->rt6i_flags & RTF_GATEWAY))
475                         target = (struct in6_addr*)&n->primary_key;
476                 else
477                         target = &hdr->daddr;
478
479                 if (!rt->rt6i_peer)
480                         rt6_bind_peer(rt, 1);
481
482                 /* Limit redirects both by destination (here)
483                    and by source (inside ndisc_send_redirect)
484                  */
485                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486                         ndisc_send_redirect(skb, n, target);
487         } else {
488                 int addrtype = ipv6_addr_type(&hdr->saddr);
489
490                 /* This check is security critical. */
491                 if (addrtype == IPV6_ADDR_ANY ||
492                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493                         goto error;
494                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496                                     ICMPV6_NOT_NEIGHBOUR, 0);
497                         goto error;
498                 }
499         }
500
            /* Never work with an MTU below the IPv6 minimum. */
501         mtu = dst_mtu(dst);
502         if (mtu < IPV6_MIN_MTU)
503                 mtu = IPV6_MIN_MTU;
504
505         if (skb->len > mtu && !skb_is_gso(skb)) {
506                 /* Again, force OUTPUT device used as source address */
507                 skb->dev = dst->dev;
508                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509                 IP6_INC_STATS_BH(net,
510                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511                 IP6_INC_STATS_BH(net,
512                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513                 kfree_skb(skb);
514                 return -EMSGSIZE;
515         }
516
517         if (skb_cow(skb, dst->dev->hard_header_len)) {
518                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519                 goto drop;
520         }
521
            /* Re-read the header pointer; presumably skb_cow() may have
             * moved the packet data - TODO confirm.
             */
522         hdr = ipv6_hdr(skb);
523
524         /* Mangling hops number delayed to point after skb COW */
525
526         hdr->hop_limit--;
527
528         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530                        ip6_forward_finish);
531
            /* "error" additionally accounts an input address error and then
             * falls through to "drop", which frees the skb.
             */
532 error:
533         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535         kfree_skb(skb);
536         return -EINVAL;
537 }
538
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541         to->pkt_type = from->pkt_type;
542         to->priority = from->priority;
543         to->protocol = from->protocol;
544         skb_dst_drop(to);
545         skb_dst_set(to, dst_clone(skb_dst(from)));
546         to->dev = from->dev;
547         to->mark = from->mark;
548
549 #ifdef CONFIG_NET_SCHED
550         to->tc_index = from->tc_index;
551 #endif
552         nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555         to->nf_trace = from->nf_trace;
556 #endif
557         skb_copy_secmark(to, from);
558 }
559
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 {
562         u16 offset = sizeof(struct ipv6hdr);
563         struct ipv6_opt_hdr *exthdr =
564                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565         unsigned int packet_len = skb->tail - skb->network_header;
566         int found_rhdr = 0;
567         *nexthdr = &ipv6_hdr(skb)->nexthdr;
568
569         while (offset + 1 <= packet_len) {
570
571                 switch (**nexthdr) {
572
573                 case NEXTHDR_HOP:
574                         break;
575                 case NEXTHDR_ROUTING:
576                         found_rhdr = 1;
577                         break;
578                 case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581                                 break;
582 #endif
583                         if (found_rhdr)
584                                 return offset;
585                         break;
586                 default :
587                         return offset;
588                 }
589
590                 offset += ipv6_optlen(exthdr);
591                 *nexthdr = &exthdr->nexthdr;
592                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593                                                  offset);
594         }
595
596         return offset;
597 }
598
599 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
600 {
601         static atomic_t ipv6_fragmentation_id;
602         int old, new;
603
604         if (rt) {
605                 struct inet_peer *peer;
606
607                 if (!rt->rt6i_peer)
608                         rt6_bind_peer(rt, 1);
609                 peer = rt->rt6i_peer;
610                 if (peer) {
611                         fhdr->identification = htonl(inet_getid(peer, 0));
612                         return;
613                 }
614         }
615         do {
616                 old = atomic_read(&ipv6_fragmentation_id);
617                 new = old + 1;
618                 if (!new)
619                         new = 1;
620         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
621         fhdr->identification = htonl(new);
622 }
623
624 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
625 {
626         struct sk_buff *frag;
627         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
628         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629         struct ipv6hdr *tmp_hdr;
630         struct frag_hdr *fh;
631         unsigned int mtu, hlen, left, len;
632         __be32 frag_id = 0;
633         int ptr, offset = 0, err=0;
634         u8 *prevhdr, nexthdr = 0;
635         struct net *net = dev_net(skb_dst(skb)->dev);
636
637         hlen = ip6_find_1stfragopt(skb, &prevhdr);
638         nexthdr = *prevhdr;
639
640         mtu = ip6_skb_dst_mtu(skb);
641
642         /* We must not fragment if the socket is set to force MTU discovery
643          * or if the skb it not generated by a local socket.
644          */
645         if (!skb->local_df && skb->len > mtu) {
646                 skb->dev = skb_dst(skb)->dev;
647                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
648                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
649                               IPSTATS_MIB_FRAGFAILS);
650                 kfree_skb(skb);
651                 return -EMSGSIZE;
652         }
653
654         if (np && np->frag_size < mtu) {
655                 if (np->frag_size)
656                         mtu = np->frag_size;
657         }
658         mtu -= hlen + sizeof(struct frag_hdr);
659
660         if (skb_has_frag_list(skb)) {
661                 int first_len = skb_pagelen(skb);
662                 struct sk_buff *frag2;
663
664                 if (first_len - hlen > mtu ||
665                     ((first_len - hlen) & 7) ||
666                     skb_cloned(skb))
667                         goto slow_path;
668
669                 skb_walk_frags(skb, frag) {
670                         /* Correct geometry. */
671                         if (frag->len > mtu ||
672                             ((frag->len & 7) && frag->next) ||
673                             skb_headroom(frag) < hlen)
674                                 goto slow_path_clean;
675
676                         /* Partially cloned skb? */
677                         if (skb_shared(frag))
678                                 goto slow_path_clean;
679
680                         BUG_ON(frag->sk);
681                         if (skb->sk) {
682                                 frag->sk = skb->sk;
683                                 frag->destructor = sock_wfree;
684                         }
685                         skb->truesize -= frag->truesize;
686                 }
687
688                 err = 0;
689                 offset = 0;
690                 frag = skb_shinfo(skb)->frag_list;
691                 skb_frag_list_init(skb);
692                 /* BUILD HEADER */
693
694                 *prevhdr = NEXTHDR_FRAGMENT;
695                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
696                 if (!tmp_hdr) {
697                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
698                                       IPSTATS_MIB_FRAGFAILS);
699                         return -ENOMEM;
700                 }
701
702                 __skb_pull(skb, hlen);
703                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
704                 __skb_push(skb, hlen);
705                 skb_reset_network_header(skb);
706                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
707
708                 ipv6_select_ident(fh, rt);
709                 fh->nexthdr = nexthdr;
710                 fh->reserved = 0;
711                 fh->frag_off = htons(IP6_MF);
712                 frag_id = fh->identification;
713
714                 first_len = skb_pagelen(skb);
715                 skb->data_len = first_len - skb_headlen(skb);
716                 skb->len = first_len;
717                 ipv6_hdr(skb)->payload_len = htons(first_len -
718                                                    sizeof(struct ipv6hdr));
719
720                 dst_hold(&rt->dst);
721
722                 for (;;) {
723                         /* Prepare header of the next frame,
724                          * before previous one went down. */
725                         if (frag) {
726                                 frag->ip_summed = CHECKSUM_NONE;
727                                 skb_reset_transport_header(frag);
728                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
729                                 __skb_push(frag, hlen);
730                                 skb_reset_network_header(frag);
731                                 memcpy(skb_network_header(frag), tmp_hdr,
732                                        hlen);
733                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
734                                 fh->nexthdr = nexthdr;
735                                 fh->reserved = 0;
736                                 fh->frag_off = htons(offset);
737                                 if (frag->next != NULL)
738                                         fh->frag_off |= htons(IP6_MF);
739                                 fh->identification = frag_id;
740                                 ipv6_hdr(frag)->payload_len =
741                                                 htons(frag->len -
742                                                       sizeof(struct ipv6hdr));
743                                 ip6_copy_metadata(frag, skb);
744                         }
745
746                         err = output(skb);
747                         if(!err)
748                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
749                                               IPSTATS_MIB_FRAGCREATES);
750
751                         if (err || !frag)
752                                 break;
753
754                         skb = frag;
755                         frag = skb->next;
756                         skb->next = NULL;
757                 }
758
759                 kfree(tmp_hdr);
760
761                 if (err == 0) {
762                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
763                                       IPSTATS_MIB_FRAGOKS);
764                         dst_release(&rt->dst);
765                         return 0;
766                 }
767
768                 while (frag) {
769                         skb = frag->next;
770                         kfree_skb(frag);
771                         frag = skb;
772                 }
773
774                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
775                               IPSTATS_MIB_FRAGFAILS);
776                 dst_release(&rt->dst);
777                 return err;
778
779 slow_path_clean:
780                 skb_walk_frags(skb, frag2) {
781                         if (frag2 == frag)
782                                 break;
783                         frag2->sk = NULL;
784                         frag2->destructor = NULL;
785                         skb->truesize += frag2->truesize;
786                 }
787         }
788
789 slow_path:
790         left = skb->len - hlen;         /* Space per frame */
791         ptr = hlen;                     /* Where to start from */
792
793         /*
794          *      Fragment the datagram.
795          */
796
797         *prevhdr = NEXTHDR_FRAGMENT;
798
799         /*
800          *      Keep copying data until we run out.
801          */
802         while(left > 0) {
803                 len = left;
804                 /* IF: it doesn't fit, use 'mtu' - the data space left */
805                 if (len > mtu)
806                         len = mtu;
807                 /* IF: we are not sending up to and including the packet end
808                    then align the next start on an eight byte boundary */
809                 if (len < left) {
810                         len &= ~7;
811                 }
812                 /*
813                  *      Allocate buffer.
814                  */
815
816                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
817                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
818                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
819                                       IPSTATS_MIB_FRAGFAILS);
820                         err = -ENOMEM;
821                         goto fail;
822                 }
823
824                 /*
825                  *      Set up data on packet
826                  */
827
828                 ip6_copy_metadata(frag, skb);
829                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
830                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
831                 skb_reset_network_header(frag);
832                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
833                 frag->transport_header = (frag->network_header + hlen +
834                                           sizeof(struct frag_hdr));
835
836                 /*
837                  *      Charge the memory for the fragment to any owner
838                  *      it might possess
839                  */
840                 if (skb->sk)
841                         skb_set_owner_w(frag, skb->sk);
842
843                 /*
844                  *      Copy the packet header into the new buffer.
845                  */
846                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
847
848                 /*
849                  *      Build fragment header.
850                  */
851                 fh->nexthdr = nexthdr;
852                 fh->reserved = 0;
853                 if (!frag_id) {
854                         ipv6_select_ident(fh, rt);
855                         frag_id = fh->identification;
856                 } else
857                         fh->identification = frag_id;
858
859                 /*
860                  *      Copy a block of the IP datagram.
861                  */
862                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
863                         BUG();
864                 left -= len;
865
866                 fh->frag_off = htons(offset);
867                 if (left > 0)
868                         fh->frag_off |= htons(IP6_MF);
869                 ipv6_hdr(frag)->payload_len = htons(frag->len -
870                                                     sizeof(struct ipv6hdr));
871
872                 ptr += len;
873                 offset += len;
874
875                 /*
876                  *      Put this fragment into the sending queue.
877                  */
878                 err = output(frag);
879                 if (err)
880                         goto fail;
881
882                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
883                               IPSTATS_MIB_FRAGCREATES);
884         }
885         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886                       IPSTATS_MIB_FRAGOKS);
887         kfree_skb(skb);
888         return err;
889
890 fail:
891         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
892                       IPSTATS_MIB_FRAGFAILS);
893         kfree_skb(skb);
894         return err;
895 }
896
897 static inline int ip6_rt_check(const struct rt6key *rt_key,
898                                const struct in6_addr *fl_addr,
899                                const struct in6_addr *addr_cache)
900 {
901         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
902                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
903 }
904
905 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
906                                           struct dst_entry *dst,
907                                           const struct flowi6 *fl6)
908 {
909         struct ipv6_pinfo *np = inet6_sk(sk);
910         struct rt6_info *rt = (struct rt6_info *)dst;
911
912         if (!dst)
913                 goto out;
914
915         /* Yes, checking route validity in not connected
916          * case is not very simple. Take into account,
917          * that we do not support routing by source, TOS,
918          * and MSG_DONTROUTE            --ANK (980726)
919          *
920          * 1. ip6_rt_check(): If route was host route,
921          *    check that cached destination is current.
922          *    If it is network route, we still may
923          *    check its validity using saved pointer
924          *    to the last used address: daddr_cache.
925          *    We do not want to save whole address now,
926          *    (because main consumer of this service
927          *    is tcp, which has not this problem),
928          *    so that the last trick works only on connected
929          *    sockets.
930          * 2. oif also should be the same.
931          */
932         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933 #ifdef CONFIG_IPV6_SUBTREES
934             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935 #endif
936             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
937                 dst_release(dst);
938                 dst = NULL;
939         }
940
941 out:
942         return dst;
943 }
944
945 static int ip6_dst_lookup_tail(struct sock *sk,
946                                struct dst_entry **dst, struct flowi6 *fl6)
947 {
948         struct net *net = sock_net(sk);
949 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
950         struct neighbour *n;
951 #endif
952         int err;
953
954         if (*dst == NULL)
955                 *dst = ip6_route_output(net, sk, fl6);
956
957         if ((err = (*dst)->error))
958                 goto out_err_release;
959
960         if (ipv6_addr_any(&fl6->saddr)) {
961                 struct rt6_info *rt = (struct rt6_info *) *dst;
962                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
963                                           sk ? inet6_sk(sk)->srcprefs : 0,
964                                           &fl6->saddr);
965                 if (err)
966                         goto out_err_release;
967         }
968
969 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
970         /*
971          * Here if the dst entry we've looked up
972          * has a neighbour entry that is in the INCOMPLETE
973          * state and the src address from the flow is
974          * marked as OPTIMISTIC, we release the found
975          * dst entry and replace it instead with the
976          * dst entry of the nexthop router
977          */
978         n = dst_get_neighbour(*dst);
979         if (n && !(n->nud_state & NUD_VALID)) {
980                 struct inet6_ifaddr *ifp;
981                 struct flowi6 fl_gw6;
982                 int redirect;
983
984                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
985                                       (*dst)->dev, 1);
986
987                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
988                 if (ifp)
989                         in6_ifa_put(ifp);
990
991                 if (redirect) {
992                         /*
993                          * We need to get the dst entry for the
994                          * default router instead
995                          */
996                         dst_release(*dst);
997                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
998                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
999                         *dst = ip6_route_output(net, sk, &fl_gw6);
1000                         if ((err = (*dst)->error))
1001                                 goto out_err_release;
1002                 }
1003         }
1004 #endif
1005
1006         return 0;
1007
1008 out_err_release:
1009         if (err == -ENETUNREACH)
1010                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1011         dst_release(*dst);
1012         *dst = NULL;
1013         return err;
1014 }
1015
1016 /**
1017  *      ip6_dst_lookup - perform route lookup on flow
1018  *      @sk: socket which provides route info
1019  *      @dst: pointer to dst_entry * for result
1020  *      @fl6: flow to lookup
1021  *
1022  *      This function performs a route lookup on the given flow.
1023  *
1024  *      It returns zero on success, or a standard errno code on error.
1025  */
1026 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1027 {
1028         *dst = NULL;
1029         return ip6_dst_lookup_tail(sk, dst, fl6);
1030 }
1031 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1032
1033 /**
1034  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1035  *      @sk: socket which provides route info
1036  *      @fl6: flow to lookup
1037  *      @final_dst: final destination address for ipsec lookup
1038  *      @can_sleep: we are in a sleepable context
1039  *
1040  *      This function performs a route lookup on the given flow.
1041  *
1042  *      It returns a valid dst pointer on success, or a pointer encoded
1043  *      error code.
1044  */
1045 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1046                                       const struct in6_addr *final_dst,
1047                                       bool can_sleep)
1048 {
1049         struct dst_entry *dst = NULL;
1050         int err;
1051
1052         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1053         if (err)
1054                 return ERR_PTR(err);
1055         if (final_dst)
1056                 ipv6_addr_copy(&fl6->daddr, final_dst);
1057         if (can_sleep)
1058                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1059
1060         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1061 }
1062 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1063
1064 /**
1065  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1066  *      @sk: socket which provides the dst cache and route info
1067  *      @fl6: flow to lookup
1068  *      @final_dst: final destination address for ipsec lookup
1069  *      @can_sleep: we are in a sleepable context
1070  *
1071  *      This function performs a route lookup on the given flow with the
1072  *      possibility of using the cached route in the socket if it is valid.
1073  *      It will take the socket dst lock when operating on the dst cache.
1074  *      As a result, this function can only be used in process context.
1075  *
1076  *      It returns a valid dst pointer on success, or a pointer encoded
1077  *      error code.
1078  */
1079 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1080                                          const struct in6_addr *final_dst,
1081                                          bool can_sleep)
1082 {
1083         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1084         int err;
1085
1086         dst = ip6_sk_dst_check(sk, dst, fl6);
1087
1088         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1089         if (err)
1090                 return ERR_PTR(err);
1091         if (final_dst)
1092                 ipv6_addr_copy(&fl6->daddr, final_dst);
1093         if (can_sleep)
1094                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1095
1096         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1097 }
1098 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1099
1100 static inline int ip6_ufo_append_data(struct sock *sk,
1101                         int getfrag(void *from, char *to, int offset, int len,
1102                         int odd, struct sk_buff *skb),
1103                         void *from, int length, int hh_len, int fragheaderlen,
1104                         int transhdrlen, int mtu,unsigned int flags,
1105                         struct rt6_info *rt)
1106
1107 {
1108         struct sk_buff *skb;
1109         int err;
1110
1111         /* There is support for UDP large send offload by network
1112          * device, so create one single skb packet containing complete
1113          * udp datagram
1114          */
1115         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1116                 skb = sock_alloc_send_skb(sk,
1117                         hh_len + fragheaderlen + transhdrlen + 20,
1118                         (flags & MSG_DONTWAIT), &err);
1119                 if (skb == NULL)
1120                         return -ENOMEM;
1121
1122                 /* reserve space for Hardware header */
1123                 skb_reserve(skb, hh_len);
1124
1125                 /* create space for UDP/IP header */
1126                 skb_put(skb,fragheaderlen + transhdrlen);
1127
1128                 /* initialize network header pointer */
1129                 skb_reset_network_header(skb);
1130
1131                 /* initialize protocol header pointer */
1132                 skb->transport_header = skb->network_header + fragheaderlen;
1133
1134                 skb->ip_summed = CHECKSUM_PARTIAL;
1135                 skb->csum = 0;
1136         }
1137
1138         err = skb_append_datato_frags(sk,skb, getfrag, from,
1139                                       (length - transhdrlen));
1140         if (!err) {
1141                 struct frag_hdr fhdr;
1142
1143                 /* Specify the length of each IPv6 datagram fragment.
1144                  * It has to be a multiple of 8.
1145                  */
1146                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1147                                              sizeof(struct frag_hdr)) & ~7;
1148                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1149                 ipv6_select_ident(&fhdr, rt);
1150                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1151                 __skb_queue_tail(&sk->sk_write_queue, skb);
1152
1153                 return 0;
1154         }
1155         /* There is not enough support do UPD LSO,
1156          * so follow normal path
1157          */
1158         kfree_skb(skb);
1159
1160         return err;
1161 }
1162
1163 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1164                                                gfp_t gfp)
1165 {
1166         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1167 }
1168
1169 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1170                                                 gfp_t gfp)
1171 {
1172         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1173 }
1174
1175 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1176         int offset, int len, int odd, struct sk_buff *skb),
1177         void *from, int length, int transhdrlen,
1178         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1179         struct rt6_info *rt, unsigned int flags, int dontfrag)
1180 {
1181         struct inet_sock *inet = inet_sk(sk);
1182         struct ipv6_pinfo *np = inet6_sk(sk);
1183         struct inet_cork *cork;
1184         struct sk_buff *skb;
1185         unsigned int maxfraglen, fragheaderlen;
1186         int exthdrlen;
1187         int hh_len;
1188         int mtu;
1189         int copy;
1190         int err;
1191         int offset = 0;
1192         int csummode = CHECKSUM_NONE;
1193         __u8 tx_flags = 0;
1194
1195         if (flags&MSG_PROBE)
1196                 return 0;
1197         cork = &inet->cork.base;
1198         if (skb_queue_empty(&sk->sk_write_queue)) {
1199                 /*
1200                  * setup for corking
1201                  */
1202                 if (opt) {
1203                         if (WARN_ON(np->cork.opt))
1204                                 return -EINVAL;
1205
1206                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1207                         if (unlikely(np->cork.opt == NULL))
1208                                 return -ENOBUFS;
1209
1210                         np->cork.opt->tot_len = opt->tot_len;
1211                         np->cork.opt->opt_flen = opt->opt_flen;
1212                         np->cork.opt->opt_nflen = opt->opt_nflen;
1213
1214                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1215                                                             sk->sk_allocation);
1216                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1217                                 return -ENOBUFS;
1218
1219                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1220                                                             sk->sk_allocation);
1221                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1222                                 return -ENOBUFS;
1223
1224                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1225                                                            sk->sk_allocation);
1226                         if (opt->hopopt && !np->cork.opt->hopopt)
1227                                 return -ENOBUFS;
1228
1229                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1230                                                             sk->sk_allocation);
1231                         if (opt->srcrt && !np->cork.opt->srcrt)
1232                                 return -ENOBUFS;
1233
1234                         /* need source address above miyazawa*/
1235                 }
1236                 dst_hold(&rt->dst);
1237                 cork->dst = &rt->dst;
1238                 inet->cork.fl.u.ip6 = *fl6;
1239                 np->cork.hop_limit = hlimit;
1240                 np->cork.tclass = tclass;
1241                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1242                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1243                 if (np->frag_size < mtu) {
1244                         if (np->frag_size)
1245                                 mtu = np->frag_size;
1246                 }
1247                 cork->fragsize = mtu;
1248                 if (dst_allfrag(rt->dst.path))
1249                         cork->flags |= IPCORK_ALLFRAG;
1250                 cork->length = 0;
1251                 sk->sk_sndmsg_page = NULL;
1252                 sk->sk_sndmsg_off = 0;
1253                 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1254                             rt->rt6i_nfheader_len;
1255                 length += exthdrlen;
1256                 transhdrlen += exthdrlen;
1257         } else {
1258                 rt = (struct rt6_info *)cork->dst;
1259                 fl6 = &inet->cork.fl.u.ip6;
1260                 opt = np->cork.opt;
1261                 transhdrlen = 0;
1262                 exthdrlen = 0;
1263                 mtu = cork->fragsize;
1264         }
1265
1266         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1267
1268         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1269                         (opt ? opt->opt_nflen : 0);
1270         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1271
1272         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1273                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1274                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1275                         return -EMSGSIZE;
1276                 }
1277         }
1278
1279         /* For UDP, check if TX timestamp is enabled */
1280         if (sk->sk_type == SOCK_DGRAM) {
1281                 err = sock_tx_timestamp(sk, &tx_flags);
1282                 if (err)
1283                         goto error;
1284         }
1285
1286         /*
1287          * Let's try using as much space as possible.
1288          * Use MTU if total length of the message fits into the MTU.
1289          * Otherwise, we need to reserve fragment header and
1290          * fragment alignment (= 8-15 octects, in total).
1291          *
1292          * Note that we may need to "move" the data from the tail of
1293          * of the buffer to the new fragment when we split
1294          * the message.
1295          *
1296          * FIXME: It may be fragmented into multiple chunks
1297          *        at once if non-fragmentable extension headers
1298          *        are too large.
1299          * --yoshfuji
1300          */
1301
1302         cork->length += length;
1303         if (length > mtu) {
1304                 int proto = sk->sk_protocol;
1305                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1306                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1307                         return -EMSGSIZE;
1308                 }
1309
1310                 if (proto == IPPROTO_UDP &&
1311                     (rt->dst.dev->features & NETIF_F_UFO)) {
1312
1313                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1314                                                   hh_len, fragheaderlen,
1315                                                   transhdrlen, mtu, flags, rt);
1316                         if (err)
1317                                 goto error;
1318                         return 0;
1319                 }
1320         }
1321
1322         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1323                 goto alloc_new_skb;
1324
1325         while (length > 0) {
1326                 /* Check if the remaining data fits into current packet. */
1327                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1328                 if (copy < length)
1329                         copy = maxfraglen - skb->len;
1330
1331                 if (copy <= 0) {
1332                         char *data;
1333                         unsigned int datalen;
1334                         unsigned int fraglen;
1335                         unsigned int fraggap;
1336                         unsigned int alloclen;
1337                         struct sk_buff *skb_prev;
1338 alloc_new_skb:
1339                         skb_prev = skb;
1340
1341                         /* There's no room in the current skb */
1342                         if (skb_prev)
1343                                 fraggap = skb_prev->len - maxfraglen;
1344                         else
1345                                 fraggap = 0;
1346
1347                         /*
1348                          * If remaining data exceeds the mtu,
1349                          * we know we need more fragment(s).
1350                          */
1351                         datalen = length + fraggap;
1352                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1353                                 datalen = maxfraglen - fragheaderlen;
1354
1355                         fraglen = datalen + fragheaderlen;
1356                         if ((flags & MSG_MORE) &&
1357                             !(rt->dst.dev->features&NETIF_F_SG))
1358                                 alloclen = mtu;
1359                         else
1360                                 alloclen = datalen + fragheaderlen;
1361
1362                         /*
1363                          * The last fragment gets additional space at tail.
1364                          * Note: we overallocate on fragments with MSG_MODE
1365                          * because we have no idea if we're the last one.
1366                          */
1367                         if (datalen == length + fraggap)
1368                                 alloclen += rt->dst.trailer_len;
1369
1370                         /*
1371                          * We just reserve space for fragment header.
1372                          * Note: this may be overallocation if the message
1373                          * (without MSG_MORE) fits into the MTU.
1374                          */
1375                         alloclen += sizeof(struct frag_hdr);
1376
1377                         if (transhdrlen) {
1378                                 skb = sock_alloc_send_skb(sk,
1379                                                 alloclen + hh_len,
1380                                                 (flags & MSG_DONTWAIT), &err);
1381                         } else {
1382                                 skb = NULL;
1383                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1384                                     2 * sk->sk_sndbuf)
1385                                         skb = sock_wmalloc(sk,
1386                                                            alloclen + hh_len, 1,
1387                                                            sk->sk_allocation);
1388                                 if (unlikely(skb == NULL))
1389                                         err = -ENOBUFS;
1390                                 else {
1391                                         /* Only the initial fragment
1392                                          * is time stamped.
1393                                          */
1394                                         tx_flags = 0;
1395                                 }
1396                         }
1397                         if (skb == NULL)
1398                                 goto error;
1399                         /*
1400                          *      Fill in the control structures
1401                          */
1402                         skb->ip_summed = csummode;
1403                         skb->csum = 0;
1404                         /* reserve for fragmentation */
1405                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1406
1407                         if (sk->sk_type == SOCK_DGRAM)
1408                                 skb_shinfo(skb)->tx_flags = tx_flags;
1409
1410                         /*
1411                          *      Find where to start putting bytes
1412                          */
1413                         data = skb_put(skb, fraglen);
1414                         skb_set_network_header(skb, exthdrlen);
1415                         data += fragheaderlen;
1416                         skb->transport_header = (skb->network_header +
1417                                                  fragheaderlen);
1418                         if (fraggap) {
1419                                 skb->csum = skb_copy_and_csum_bits(
1420                                         skb_prev, maxfraglen,
1421                                         data + transhdrlen, fraggap, 0);
1422                                 skb_prev->csum = csum_sub(skb_prev->csum,
1423                                                           skb->csum);
1424                                 data += fraggap;
1425                                 pskb_trim_unique(skb_prev, maxfraglen);
1426                         }
1427                         copy = datalen - transhdrlen - fraggap;
1428                         if (copy < 0) {
1429                                 err = -EINVAL;
1430                                 kfree_skb(skb);
1431                                 goto error;
1432                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1433                                 err = -EFAULT;
1434                                 kfree_skb(skb);
1435                                 goto error;
1436                         }
1437
1438                         offset += copy;
1439                         length -= datalen - fraggap;
1440                         transhdrlen = 0;
1441                         exthdrlen = 0;
1442                         csummode = CHECKSUM_NONE;
1443
1444                         /*
1445                          * Put the packet on the pending queue
1446                          */
1447                         __skb_queue_tail(&sk->sk_write_queue, skb);
1448                         continue;
1449                 }
1450
1451                 if (copy > length)
1452                         copy = length;
1453
1454                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1455                         unsigned int off;
1456
1457                         off = skb->len;
1458                         if (getfrag(from, skb_put(skb, copy),
1459                                                 offset, copy, off, skb) < 0) {
1460                                 __skb_trim(skb, off);
1461                                 err = -EFAULT;
1462                                 goto error;
1463                         }
1464                 } else {
1465                         int i = skb_shinfo(skb)->nr_frags;
1466                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1467                         struct page *page = sk->sk_sndmsg_page;
1468                         int off = sk->sk_sndmsg_off;
1469                         unsigned int left;
1470
1471                         if (page && (left = PAGE_SIZE - off) > 0) {
1472                                 if (copy >= left)
1473                                         copy = left;
1474                                 if (page != frag->page) {
1475                                         if (i == MAX_SKB_FRAGS) {
1476                                                 err = -EMSGSIZE;
1477                                                 goto error;
1478                                         }
1479                                         get_page(page);
1480                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1481                                         frag = &skb_shinfo(skb)->frags[i];
1482                                 }
1483                         } else if(i < MAX_SKB_FRAGS) {
1484                                 if (copy > PAGE_SIZE)
1485                                         copy = PAGE_SIZE;
1486                                 page = alloc_pages(sk->sk_allocation, 0);
1487                                 if (page == NULL) {
1488                                         err = -ENOMEM;
1489                                         goto error;
1490                                 }
1491                                 sk->sk_sndmsg_page = page;
1492                                 sk->sk_sndmsg_off = 0;
1493
1494                                 skb_fill_page_desc(skb, i, page, 0, 0);
1495                                 frag = &skb_shinfo(skb)->frags[i];
1496                         } else {
1497                                 err = -EMSGSIZE;
1498                                 goto error;
1499                         }
1500                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1501                                 err = -EFAULT;
1502                                 goto error;
1503                         }
1504                         sk->sk_sndmsg_off += copy;
1505                         frag->size += copy;
1506                         skb->len += copy;
1507                         skb->data_len += copy;
1508                         skb->truesize += copy;
1509                         atomic_add(copy, &sk->sk_wmem_alloc);
1510                 }
1511                 offset += copy;
1512                 length -= copy;
1513         }
1514         return 0;
1515 error:
1516         cork->length -= length;
1517         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1518         return err;
1519 }
1520
1521 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1522 {
1523         if (np->cork.opt) {
1524                 kfree(np->cork.opt->dst0opt);
1525                 kfree(np->cork.opt->dst1opt);
1526                 kfree(np->cork.opt->hopopt);
1527                 kfree(np->cork.opt->srcrt);
1528                 kfree(np->cork.opt);
1529                 np->cork.opt = NULL;
1530         }
1531
1532         if (inet->cork.base.dst) {
1533                 dst_release(inet->cork.base.dst);
1534                 inet->cork.base.dst = NULL;
1535                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1536         }
1537         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1538 }
1539
1540 int ip6_push_pending_frames(struct sock *sk)
1541 {
1542         struct sk_buff *skb, *tmp_skb;
1543         struct sk_buff **tail_skb;
1544         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1545         struct inet_sock *inet = inet_sk(sk);
1546         struct ipv6_pinfo *np = inet6_sk(sk);
1547         struct net *net = sock_net(sk);
1548         struct ipv6hdr *hdr;
1549         struct ipv6_txoptions *opt = np->cork.opt;
1550         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1551         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1552         unsigned char proto = fl6->flowi6_proto;
1553         int err = 0;
1554
1555         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1556                 goto out;
1557         tail_skb = &(skb_shinfo(skb)->frag_list);
1558
1559         /* move skb->data to ip header from ext header */
1560         if (skb->data < skb_network_header(skb))
1561                 __skb_pull(skb, skb_network_offset(skb));
1562         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1563                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1564                 *tail_skb = tmp_skb;
1565                 tail_skb = &(tmp_skb->next);
1566                 skb->len += tmp_skb->len;
1567                 skb->data_len += tmp_skb->len;
1568                 skb->truesize += tmp_skb->truesize;
1569                 tmp_skb->destructor = NULL;
1570                 tmp_skb->sk = NULL;
1571         }
1572
1573         /* Allow local fragmentation. */
1574         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1575                 skb->local_df = 1;
1576
1577         ipv6_addr_copy(final_dst, &fl6->daddr);
1578         __skb_pull(skb, skb_network_header_len(skb));
1579         if (opt && opt->opt_flen)
1580                 ipv6_push_frag_opts(skb, opt, &proto);
1581         if (opt && opt->opt_nflen)
1582                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1583
1584         skb_push(skb, sizeof(struct ipv6hdr));
1585         skb_reset_network_header(skb);
1586         hdr = ipv6_hdr(skb);
1587
1588         *(__be32*)hdr = fl6->flowlabel |
1589                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1590
1591         hdr->hop_limit = np->cork.hop_limit;
1592         hdr->nexthdr = proto;
1593         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1594         ipv6_addr_copy(&hdr->daddr, final_dst);
1595
1596         skb->priority = sk->sk_priority;
1597         skb->mark = sk->sk_mark;
1598
1599         skb_dst_set(skb, dst_clone(&rt->dst));
1600         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1601         if (proto == IPPROTO_ICMPV6) {
1602                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1603
1604                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1605                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1606         }
1607
1608         err = ip6_local_out(skb);
1609         if (err) {
1610                 if (err > 0)
1611                         err = net_xmit_errno(err);
1612                 if (err)
1613                         goto error;
1614         }
1615
1616 out:
1617         ip6_cork_release(inet, np);
1618         return err;
1619 error:
1620         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1621         goto out;
1622 }
1623
1624 void ip6_flush_pending_frames(struct sock *sk)
1625 {
1626         struct sk_buff *skb;
1627
1628         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1629                 if (skb_dst(skb))
1630                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1631                                       IPSTATS_MIB_OUTDISCARDS);
1632                 kfree_skb(skb);
1633         }
1634
1635         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1636 }