[INET]: Introduce inet_sk_rebuild_header
net/ipv4/ip_output.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case the packet is not accepted
 *                                      by the output firewall rules)
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after a year in coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit path
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
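
/*
 * Usage sketch (illustrative only): any code that rewrites IP header
 * fields must recompute the checksum before the packet is sent, e.g.
 * after rewriting the TTL by hand (forwarding uses the incremental
 * ip_decrease_ttl() helper instead):
 *
 *      iph->ttl = 64;
 *      ip_send_check(iph);
 */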

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}

/*
 *      Add an IP header to an skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          u32 saddr, u32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        if (opt)
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
        else
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);
        skb->nh.iph   = iph;

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

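        /* Fast path: use the cached, pre-built link-layer header if one
         * exists.  The cache keeps the header right-aligned inside an
         * HH_DATA_MOD-aligned buffer, so the whole aligned block is
         * copied in front of the packet data and only the real header
         * length is then pushed.
         */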
        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}

static int ip_finish_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                       ip_finish_output2);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that have returned after forwarding; they will be
                   dropped by ip_mr_input() in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input() at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with TTL 0 must not go beyond the host */

                if (skb->nh.iph->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        if (skb->len > dst_mtu(&rt->u.dst))
                return ip_fragment(skb, ip_finish_output);
        else
                return ip_finish_output(skb);
}

int ip_output(struct sk_buff *skb)
{
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
                return ip_fragment(skb, ip_finish_output);
        else
                return ip_finish_output(skb);
}
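
/*
 * The NF_IP_LOCAL_OUT hooks above complete into dst_output(), which
 * calls through skb->dst->output; the IPv4 routing code normally points
 * that at ip_output() for unicast routes and at ip_mc_output() for
 * multicast and broadcast routes, so both paths funnel into
 * ip_finish_output().
 */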

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *) skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                u32 daddr;

                /* Use the correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the retransmit mechanism of the
                         * transport layer will keep trying until the route
                         * appears or the connection times out.
                         */
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
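        /* One 16-bit store fills in version (4), ihl (5) and TOS: after
         * htons() the first octet on the wire is 0x45, the second TOS. */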
        *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        skb->nh.iph   = iph;
        /* The transport layer sets skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        nf_conntrack_put(to->nfct);
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each one an IP header plus a block of the data
 *      of the original IP data part) that will still fit into a single
 *      device frame, and queue such frames for sending.
 */
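
/*
 * Worked example (illustrative numbers only): with a 1500 byte device
 * MTU and a 20 byte header (ihl == 5), each fragment can carry at most
 * 1480 data bytes, and every fragment but the last must carry a
 * multiple of 8 bytes.  A 4000 byte payload thus splits into fragments
 * of 1480, 1480 and 1040 bytes at byte offsets 0, 1480 and 2960,
 * i.e. frag_off values 0, 185 and 370 (offsets are stored in 8-byte
 * units).
 */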

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        int not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = skb->nh.iph;

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(dst_mtu(&rt->u.dst)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Set up the starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */

        /* When a frag_list is given, use it.  First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited.  In that case fall back
         * to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying as soon as we see the
         * first bad fragment.
         */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                /* Everything is OK.  Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, iph, hlen);
                                iph = frag->nh.iph;
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete the checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
        /* For bridged IP traffic encapsulated inside e.g. a VLAN header,
         * we need to make room for the encapsulating header. */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
        mtu -= nf_bridge_pad(skb);
#else
        ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                memcpy(skb2->nh.raw, skb->data, hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3));

                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and make it ONCE
                 * on the initial skb, so that all the following fragments
                 * will inherit fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC: if we are fragmenting a fragment that is
                 *      not the last fragment, then keep the MF bit set on
                 *      each fragment.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}

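/*
 * ip_generic_getfrag() is the getfrag callback that iovec-based callers
 * of ip_append_data() pass in: it copies 'len' bytes starting at
 * 'offset' from the iovec behind 'from' into 'to', accumulating a
 * partial checksum into skb->csum unless the hardware will checksum
 * the frame (CHECKSUM_HW).
 */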
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_HW) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                unsigned int csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}

static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        unsigned int csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data.  Each piece will be held on the socket
 *      until ip_push_pending_frames() is called.  Each piece can be a page
 *      or non-page data.
 *
 *      Transport protocols other than UDP - e.g. raw sockets - can
 *      potentially use this interface as well.  (A usage sketch follows
 *      ip_flush_pending_frames() below.)
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we
         * wish it not to be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
            !exthdrlen)
                csummode = CHECKSUM_HW;

        inet->cork.length += length;

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chain of
         * skbs; each segment is an IP fragment ready for sending to the
         * network once the appropriate IP header has been added.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If the remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;
                        /* The last fragment gets additional space at the
                         * tail.  Note: with MSG_MORE we overallocate on
                         * fragments, because we have no idea which
                         * fragment will be the last.
                         */
                        if (datalen == length)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL) {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from,
                                    page_address(frag->page) + frag->page_offset + frag->size,
                                    offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;

        while (size > 0) {
                int i;

                /* Check if the remaining data fits into current packet. */
                len = mtu - skb->len;
                if (len < size)
                        len = maxfraglen - skb->len;
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        char *data;
                        struct iphdr *iph;
                        int alloclen;

                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fragheaderlen + fraggap);
                        skb->nh.iph = iph = (struct iphdr *)data;
                        data += fragheaderlen;
                        skb->h.raw = data;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                skb_trim(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        unsigned int csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push it out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        int df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO),
         * we allow the frame generated here to be fragmented.  It will
         * come out no matter how transforms change the size of the packet.
         */
        if (inet->pmtudisc != IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc == IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        if (!df) {
                __ip_select_ident(iph, &rt->u.dst, 0);
        } else {
                iph->id = htons(inet->id++);
        }
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, not yet fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (inet->cork.opt) {
                kfree(inet->cork.opt);
                inet->cork.opt = NULL;
        }
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        if (inet->cork.opt) {
                kfree(inet->cork.opt);
                inet->cork.opt = NULL;
        }
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}
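
/*
 * Usage sketch (illustrative only; the function name and the fixed
 * udphdr transport-header length are assumptions, and error handling
 * is trimmed): how a datagram transport such as UDP might drive the
 * corking API above.  The flow is assumed to be routed into 'rt'
 * already; the #if 0 keeps this sketch out of the build.
 */
#if 0
static int example_sendmsg(struct sock *sk, struct msghdr *msg,
                           size_t len, struct rtable *rt)
{
        struct ipcm_cookie ipc;
        int err;

        ipc.addr = inet_sk(sk)->daddr;  /* destination chosen earlier */
        ipc.opt  = NULL;                /* no per-call IP options */

        lock_sock(sk);
        err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len,
                             sizeof(struct udphdr), &ipc, rt,
                             msg->msg_flags);
        if (err)
                ip_flush_pending_frames(sk);            /* drop what was queued */
        else if (!(msg->msg_flags & MSG_MORE))
                err = ip_push_pending_frames(sk);       /* build header, send */
        release_sock(sk);
        return err;
}
#endif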

/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        unsigned int csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      So far it is used only to send TCP resets; ICMP should use this
 *      function too.
 *
 *      Must run single-threaded per socket because it uses the sock
 *      structure to pass arguments.
 *
 *      LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        u32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(skb->nh.iph->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = skb->h.th->dest,
                                                 .dport = skb->h.th->source } },
                                    .proto = sk->sk_protocol };
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.  Note that
           it relies on the fact that this function is called with BHs
           locally disabled and that sk cannot already be locked.
         */
        bh_lock_sock(sk);
        inet->tos = skb->nh.iph->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = skb->nh.iph->protocol;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);