[NETFILTER]: Revert nf_reset change
net/ipv4/ip_output.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              The Internet Protocol (IP) output module.
7  *
8  * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Donald Becker, <becker@super.org>
13  *              Alan Cox, <Alan.Cox@linux.org>
14  *              Richard Underwood
15  *              Stefan Becker, <stefanb@yello.ping.de>
16  *              Jorge Cwik, <jorge@laser.satlink.net>
17  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
18  *              Hirokazu Takahashi, <taka@valinux.co.jp>
19  *
20  *      See ip_input.c for original log
21  *
22  *      Fixes:
23  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
24  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
25  *              Bradford Johnson:       Fix faulty handling of some frames when 
26  *                                      no route is found.
27  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
28  *                                      (in case if packet not accepted by
29  *                                      output firewall rules)
30  *              Mike McLagan    :       Routing by source
31  *              Alexey Kuznetsov:       use new route cache
32  *              Andi Kleen:             Fix broken PMTU recovery and remove
33  *                                      some redundant tests.
34  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
35  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
36  *              Andi Kleen      :       Split fast and slow ip_build_xmit path 
37  *                                      for decreased register pressure on x86 
38  *                                      and more readability.
39  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
40  *                                      silently drop skb instead of failing with -EPERM.
41  *              Detlev Wengorz  :       Copy protocol for fragments.
42  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
43  *                                      datagrams.
44  *              Hirokazu Takahashi:     sendfile() on UDP works now.
45  */
46
47 #include <asm/uaccess.h>
48 #include <asm/system.h>
49 #include <linux/module.h>
50 #include <linux/types.h>
51 #include <linux/kernel.h>
52 #include <linux/sched.h>
53 #include <linux/mm.h>
54 #include <linux/string.h>
55 #include <linux/errno.h>
56 #include <linux/config.h>
57
58 #include <linux/socket.h>
59 #include <linux/sockios.h>
60 #include <linux/in.h>
61 #include <linux/inet.h>
62 #include <linux/netdevice.h>
63 #include <linux/etherdevice.h>
64 #include <linux/proc_fs.h>
65 #include <linux/stat.h>
66 #include <linux/init.h>
67
68 #include <net/snmp.h>
69 #include <net/ip.h>
70 #include <net/protocol.h>
71 #include <net/route.h>
72 #include <net/tcp.h>
73 #include <net/udp.h>
74 #include <linux/skbuff.h>
75 #include <net/sock.h>
76 #include <net/arp.h>
77 #include <net/icmp.h>
78 #include <net/raw.h>
79 #include <net/checksum.h>
80 #include <net/inetpeer.h>
82 #include <linux/igmp.h>
83 #include <linux/netfilter_ipv4.h>
84 #include <linux/netfilter_bridge.h>
85 #include <linux/mroute.h>
86 #include <linux/netlink.h>
87
88 /*
89  *      Shall we try to damage output packets if routing dev changes?
90  */
91
92 int sysctl_ip_dynaddr;
93 int sysctl_ip_default_ttl = IPDEFTTL;
94
95 /* Generate a checksum for an outgoing IP datagram. */
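/* The checksum covers only the IP header itself (ihl 32-bit words), so it
 * must be recomputed whenever any header field changes; ip_fragment() below,
 * for instance, calls this again after rewriting tot_len and frag_off for
 * every fragment it produces.
 */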
96 __inline__ void ip_send_check(struct iphdr *iph)
97 {
98         iph->check = 0;
99         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
100 }
101
102 /* dev_loopback_xmit for use with netfilter. */
103 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
104 {
105         newskb->mac.raw = newskb->data;
106         __skb_pull(newskb, newskb->nh.raw - newskb->data);
107         newskb->pkt_type = PACKET_LOOPBACK;
108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
109         BUG_TRAP(newskb->dst);
110         netif_rx(newskb);
111         return 0;
112 }
113
114 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
115 {
116         int ttl = inet->uc_ttl;
117
118         if (ttl < 0)
119                 ttl = dst_metric(dst, RTAX_HOPLIMIT);
120         return ttl;
121 }
122
123 /* 
124  *              Add an ip header to a skbuff and send it out.
125  *
126  */
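/*
 * Usage sketch: the caller hands over a bare payload and expects the complete
 * IP header to be built here; the routing decision must already be attached
 * via skb->dst.  TCP's SYN-ACK transmit path is a typical caller of this
 * helper.
 */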
127 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
128                           u32 saddr, u32 daddr, struct ip_options *opt)
129 {
130         struct inet_sock *inet = inet_sk(sk);
131         struct rtable *rt = (struct rtable *)skb->dst;
132         struct iphdr *iph;
133
134         /* Build the IP header. */
135         if (opt)
136                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
137         else
138                 iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
139
140         iph->version  = 4;
141         iph->ihl      = 5;
142         iph->tos      = inet->tos;
143         if (ip_dont_fragment(sk, &rt->u.dst))
144                 iph->frag_off = htons(IP_DF);
145         else
146                 iph->frag_off = 0;
147         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
148         iph->daddr    = rt->rt_dst;
149         iph->saddr    = rt->rt_src;
150         iph->protocol = sk->sk_protocol;
151         iph->tot_len  = htons(skb->len);
152         ip_select_ident(iph, &rt->u.dst, sk);
153         skb->nh.iph   = iph;
154
155         if (opt && opt->optlen) {
156                 iph->ihl += opt->optlen>>2;
157                 ip_options_build(skb, opt, daddr, rt, 0);
158         }
159         ip_send_check(iph);
160
161         skb->priority = sk->sk_priority;
162
163         /* Send it out. */
164         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
165                        dst_output);
166 }
167
168 static inline int ip_finish_output2(struct sk_buff *skb)
169 {
170         struct dst_entry *dst = skb->dst;
171         struct hh_cache *hh = dst->hh;
172         struct net_device *dev = dst->dev;
173         int hh_len = LL_RESERVED_SPACE(dev);
174
175         /* Be paranoid, rather than too clever. */
176         if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
177                 struct sk_buff *skb2;
178
179                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
180                 if (skb2 == NULL) {
181                         kfree_skb(skb);
182                         return -ENOMEM;
183                 }
184                 if (skb->sk)
185                         skb_set_owner_w(skb2, skb->sk);
186                 kfree_skb(skb);
187                 skb = skb2;
188         }
189
190         if (hh) {
191                 int hh_alen;
192
193                 read_lock_bh(&hh->hh_lock);
194                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
195                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
196                 read_unlock_bh(&hh->hh_lock);
197                 skb_push(skb, hh->hh_len);
198                 return hh->hh_output(skb);
199         } else if (dst->neighbour)
200                 return dst->neighbour->output(skb);
201
202         if (net_ratelimit())
203                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
204         kfree_skb(skb);
205         return -EINVAL;
206 }
207
208 int ip_finish_output(struct sk_buff *skb)
209 {
210         struct net_device *dev = skb->dst->dev;
211
212         skb->dev = dev;
213         skb->protocol = htons(ETH_P_IP);
214
215         return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
216                        ip_finish_output2);
217 }
218
219 int ip_mc_output(struct sk_buff *skb)
220 {
221         struct sock *sk = skb->sk;
222         struct rtable *rt = (struct rtable*)skb->dst;
223         struct net_device *dev = rt->u.dst.dev;
224
225         /*
226          *      If the indicated interface is up and running, send the packet.
227          */
228         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
229
230         skb->dev = dev;
231         skb->protocol = htons(ETH_P_IP);
232
233         /*
234          *      Multicasts are looped back for other local users
235          */
236
237         if (rt->rt_flags&RTCF_MULTICAST) {
238                 if ((!sk || inet_sk(sk)->mc_loop)
239 #ifdef CONFIG_IP_MROUTE
240                 /* Small optimization: do not loop back non-local frames
241                    which returned after forwarding; they will be dropped
242                    by ip_mr_input in any case.
243                    Note that local frames are looped back to be delivered
244                    to local recipients.
245
246                    This check is duplicated in ip_mr_input at the moment.
247                  */
248                     && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
249 #endif
250                 ) {
251                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
252                         if (newskb)
253                                 NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
254                                         newskb->dev, 
255                                         ip_dev_loopback_xmit);
256                 }
257
258                 /* Multicasts with ttl 0 must not go beyond the host */
259
260                 if (skb->nh.iph->ttl == 0) {
261                         kfree_skb(skb);
262                         return 0;
263                 }
264         }
265
266         if (rt->rt_flags&RTCF_BROADCAST) {
267                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
268                 if (newskb)
269                         NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
270                                 newskb->dev, ip_dev_loopback_xmit);
271         }
272
273         if (skb->len > dst_mtu(&rt->u.dst))
274                 return ip_fragment(skb, ip_finish_output);
275         else
276                 return ip_finish_output(skb);
277 }
278
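/*
 * dst_output(), used as the okfn for the NF_IP_LOCAL_OUT hook throughout this
 * file, simply calls skb->dst->output().  For IPv4 the routing code normally
 * points that at ip_output() below for unicast destinations and at
 * ip_mc_output() above for multicast ones.
 */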
279 int ip_output(struct sk_buff *skb)
280 {
281         IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
282
283         if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
284                 return ip_fragment(skb, ip_finish_output);
285         else
286                 return ip_finish_output(skb);
287 }
288
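/*
 * Connection-oriented transports (TCP, and SCTP with a pre-routed skb->dst)
 * send their segments through here.  A minimal sketch of the calling pattern,
 * assuming the skb already carries its transport header:
 *
 *	skb->dst = NULL;              - or a dst obtained earlier, as SCTP does
 *	err = ip_queue_xmit(skb, 0);  - ipfragok == 0 lets DF be set when PMTU
 *	                                discovery applies; non-zero allows local
 *	                                fragmentation regardless
 */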
289 int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
290 {
291         struct sock *sk = skb->sk;
292         struct inet_sock *inet = inet_sk(sk);
293         struct ip_options *opt = inet->opt;
294         struct rtable *rt;
295         struct iphdr *iph;
296
297         /* Skip all of this if the packet is already routed,
298          * e.g. by something like SCTP.
299          */
300         rt = (struct rtable *) skb->dst;
301         if (rt != NULL)
302                 goto packet_routed;
303
304         /* Make sure we can route this packet. */
305         rt = (struct rtable *)__sk_dst_check(sk, 0);
306         if (rt == NULL) {
307                 u32 daddr;
308
309                 /* Use correct destination address if we have options. */
310                 daddr = inet->daddr;
311                 if(opt && opt->srr)
312                         daddr = opt->faddr;
313
314                 {
315                         struct flowi fl = { .oif = sk->sk_bound_dev_if,
316                                             .nl_u = { .ip4_u =
317                                                       { .daddr = daddr,
318                                                         .saddr = inet->saddr,
319                                                         .tos = RT_CONN_FLAGS(sk) } },
320                                             .proto = sk->sk_protocol,
321                                             .uli_u = { .ports =
322                                                        { .sport = inet->sport,
323                                                          .dport = inet->dport } } };
324
325                         /* If this fails, the retransmit mechanism of the transport layer
326                          * will keep trying until a route appears or the connection
327                          * times itself out.
328                          */
329                         if (ip_route_output_flow(&rt, &fl, sk, 0))
330                                 goto no_route;
331                 }
332                 __sk_dst_set(sk, &rt->u.dst);
333                 tcp_v4_setup_caps(sk, &rt->u.dst);
334         }
335         skb->dst = dst_clone(&rt->u.dst);
336
337 packet_routed:
338         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
339                 goto no_route;
340
341         /* OK, we know where to send it, allocate and build IP header. */
342         iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
343         *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
344         iph->tot_len = htons(skb->len);
345         if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
346                 iph->frag_off = htons(IP_DF);
347         else
348                 iph->frag_off = 0;
349         iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
350         iph->protocol = sk->sk_protocol;
351         iph->saddr    = rt->rt_src;
352         iph->daddr    = rt->rt_dst;
353         skb->nh.iph   = iph;
354         /* The transport layer sets skb->h.foo itself. */
355
356         if (opt && opt->optlen) {
357                 iph->ihl += opt->optlen >> 2;
358                 ip_options_build(skb, opt, inet->daddr, rt, 0);
359         }
360
361         ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
362
363         /* Add an IP checksum. */
364         ip_send_check(iph);
365
366         skb->priority = sk->sk_priority;
367
368         return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
369                        dst_output);
370
371 no_route:
372         IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
373         kfree_skb(skb);
374         return -EHOSTUNREACH;
375 }
376
377
378 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
379 {
380         to->pkt_type = from->pkt_type;
381         to->priority = from->priority;
382         to->protocol = from->protocol;
383         dst_release(to->dst);
384         to->dst = dst_clone(from->dst);
385         to->dev = from->dev;
386
387         /* Copy the flags to each fragment. */
388         IPCB(to)->flags = IPCB(from)->flags;
389
390 #ifdef CONFIG_NET_SCHED
391         to->tc_index = from->tc_index;
392 #endif
393 #ifdef CONFIG_NETFILTER
394         to->nfmark = from->nfmark;
395         to->nfcache = from->nfcache;
396         /* Connection association is same as pre-frag packet */
397         nf_conntrack_put(to->nfct);
398         to->nfct = from->nfct;
399         nf_conntrack_get(to->nfct);
400         to->nfctinfo = from->nfctinfo;
401 #ifdef CONFIG_BRIDGE_NETFILTER
402         nf_bridge_put(to->nf_bridge);
403         to->nf_bridge = from->nf_bridge;
404         nf_bridge_get(to->nf_bridge);
405 #endif
406 #endif
407 }
408
409 /*
410  *      This IP datagram is too large to be sent in one piece.  Break it up into
411  *      smaller pieces (each of a size equal to the IP header plus
412  *      a block of the data of the original IP datagram) that will still fit in a
413  *      single device frame, and queue such frames for sending.
414  */
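/*
 * A worked example (assuming a header without options): with an MTU of 1500
 * the data space per fragment is 1500 - 20 = 1480 bytes, already a multiple
 * of 8.  A 4000-byte payload therefore yields three fragments carrying 1480,
 * 1480 and 1040 bytes, with frag_off values of 0, 185 and 370 (offsets are
 * counted in 8-byte units) and IP_MF set on all but the last fragment.
 */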
415
416 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
417 {
418         struct iphdr *iph;
419         int raw = 0;
420         int ptr;
421         struct net_device *dev;
422         struct sk_buff *skb2;
423         unsigned int mtu, hlen, left, len, ll_rs;
424         int offset;
425         int not_last_frag;
426         struct rtable *rt = (struct rtable*)skb->dst;
427         int err = 0;
428
429         dev = rt->u.dst.dev;
430
431         /*
432          *      Point into the IP datagram header.
433          */
434
435         iph = skb->nh.iph;
436
437         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
438                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
439                           htonl(dst_mtu(&rt->u.dst)));
440                 kfree_skb(skb);
441                 return -EMSGSIZE;
442         }
443
444         /*
445          *      Setup starting values.
446          */
447
448         hlen = iph->ihl * 4;
449         mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */
450
451         /* When a frag_list is given, use it. First, check its validity:
452          * some transformers could create a wrong frag_list or break an existing
453          * one; that is not prohibited. In such a case fall back to copying.
454          *
455          * LATER: this step can be merged into the real generation of fragments;
456          * we can switch to copying when we see the first bad fragment.
457          */
458         if (skb_shinfo(skb)->frag_list) {
459                 struct sk_buff *frag;
460                 int first_len = skb_pagelen(skb);
461
462                 if (first_len - hlen > mtu ||
463                     ((first_len - hlen) & 7) ||
464                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
465                     skb_cloned(skb))
466                         goto slow_path;
467
468                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
469                         /* Correct geometry. */
470                         if (frag->len > mtu ||
471                             ((frag->len & 7) && frag->next) ||
472                             skb_headroom(frag) < hlen)
473                             goto slow_path;
474
475                         /* Partially cloned skb? */
476                         if (skb_shared(frag))
477                                 goto slow_path;
478
479                         BUG_ON(frag->sk);
480                         if (skb->sk) {
481                                 sock_hold(skb->sk);
482                                 frag->sk = skb->sk;
483                                 frag->destructor = sock_wfree;
484                                 skb->truesize -= frag->truesize;
485                         }
486                 }
487
488                 /* Everything is OK. Generate! */
489
490                 err = 0;
491                 offset = 0;
492                 frag = skb_shinfo(skb)->frag_list;
493                 skb_shinfo(skb)->frag_list = NULL;
494                 skb->data_len = first_len - skb_headlen(skb);
495                 skb->len = first_len;
496                 iph->tot_len = htons(first_len);
497                 iph->frag_off = htons(IP_MF);
498                 ip_send_check(iph);
499
500                 for (;;) {
501                         /* Prepare the header of the next fragment
502                          * before the previous one goes out. */
503                         if (frag) {
504                                 frag->ip_summed = CHECKSUM_NONE;
505                                 frag->h.raw = frag->data;
506                                 frag->nh.raw = __skb_push(frag, hlen);
507                                 memcpy(frag->nh.raw, iph, hlen);
508                                 iph = frag->nh.iph;
509                                 iph->tot_len = htons(frag->len);
510                                 ip_copy_metadata(frag, skb);
511                                 if (offset == 0)
512                                         ip_options_fragment(frag);
513                                 offset += skb->len - hlen;
514                                 iph->frag_off = htons(offset>>3);
515                                 if (frag->next != NULL)
516                                         iph->frag_off |= htons(IP_MF);
517                                 /* Ready, complete checksum */
518                                 ip_send_check(iph);
519                         }
520
521                         err = output(skb);
522
523                         if (err || !frag)
524                                 break;
525
526                         skb = frag;
527                         frag = skb->next;
528                         skb->next = NULL;
529                 }
530
531                 if (err == 0) {
532                         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
533                         return 0;
534                 }
535
536                 while (frag) {
537                         skb = frag->next;
538                         kfree_skb(frag);
539                         frag = skb;
540                 }
541                 IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
542                 return err;
543         }
544
545 slow_path:
546         left = skb->len - hlen;         /* Space per frame */
547         ptr = raw + hlen;               /* Where to start from */
548
549 #ifdef CONFIG_BRIDGE_NETFILTER
550         /* for bridged IP traffic encapsulated inside e.g. a VLAN header,
551          * we need to make room for the encapsulating header */
552         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
553         mtu -= nf_bridge_pad(skb);
554 #else
555         ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
556 #endif
557         /*
558          *      Fragment the datagram.
559          */
560
561         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
562         not_last_frag = iph->frag_off & htons(IP_MF);
563
564         /*
565          *      Keep copying data until we run out.
566          */
567
568         while(left > 0) {
569                 len = left;
570                 /* IF: it doesn't fit, use 'mtu' - the data space left */
571                 if (len > mtu)
572                         len = mtu;
573                 /* IF: we are not sending up to and including the packet end
574                    then align the next start on an eight byte boundary */
575                 if (len < left) {
576                         len &= ~7;
577                 }
578                 /*
579                  *      Allocate buffer.
580                  */
581
582                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
583                         NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
584                         err = -ENOMEM;
585                         goto fail;
586                 }
587
588                 /*
589                  *      Set up data on packet
590                  */
591
592                 ip_copy_metadata(skb2, skb);
593                 skb_reserve(skb2, ll_rs);
594                 skb_put(skb2, len + hlen);
595                 skb2->nh.raw = skb2->data;
596                 skb2->h.raw = skb2->data + hlen;
597
598                 /*
599                  *      Charge the memory for the fragment to any owner
600                  *      it might possess
601                  */
602
603                 if (skb->sk)
604                         skb_set_owner_w(skb2, skb->sk);
605
606                 /*
607                  *      Copy the packet header into the new buffer.
608                  */
609
610                 memcpy(skb2->nh.raw, skb->data, hlen);
611
612                 /*
613                  *      Copy a block of the IP datagram.
614                  */
615                 if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
616                         BUG();
617                 left -= len;
618
619                 /*
620                  *      Fill in the new header fields.
621                  */
622                 iph = skb2->nh.iph;
623                 iph->frag_off = htons((offset >> 3));
624
625                 /* ANK: dirty, but effective trick. Upgrade options only if
626                  * the segment to be fragmented was THE FIRST (otherwise,
627                  * options are already fixed) and make it ONCE
628                  * on the initial skb, so that all the following fragments
629                  * will inherit fixed options.
630                  */
631                 if (offset == 0)
632                         ip_options_fragment(skb);
633
634                 /*
635                  *      Added AC : If we are fragmenting a fragment that's not the
636                  *                 last fragment then keep MF set on each fragment
637                  */
638                 if (left > 0 || not_last_frag)
639                         iph->frag_off |= htons(IP_MF);
640                 ptr += len;
641                 offset += len;
642
643                 /*
644                  *      Put this fragment into the sending queue.
645                  */
646
647                 IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
648
649                 iph->tot_len = htons(len + hlen);
650
651                 ip_send_check(iph);
652
653                 err = output(skb2);
654                 if (err)
655                         goto fail;
656         }
657         kfree_skb(skb);
658         IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
659         return err;
660
661 fail:
662         kfree_skb(skb); 
663         IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
664         return err;
665 }
666
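/*
 * getfrag() callback used by ip_append_data(): copy 'len' bytes starting at
 * 'offset' of the user iovec described by 'from' into 'to'.  When the skb is
 * not going to be checksummed in hardware, the checksum of the copied block
 * is folded into skb->csum, with 'odd' giving the byte parity of the
 * destination offset.
 */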
667 int
668 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
669 {
670         struct iovec *iov = from;
671
672         if (skb->ip_summed == CHECKSUM_HW) {
673                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
674                         return -EFAULT;
675         } else {
676                 unsigned int csum = 0;
677                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
678                         return -EFAULT;
679                 skb->csum = csum_block_add(skb->csum, csum, odd);
680         }
681         return 0;
682 }
683
684 static inline unsigned int
685 csum_page(struct page *page, int offset, int copy)
686 {
687         char *kaddr;
688         unsigned int csum;
689         kaddr = kmap(page);
690         csum = csum_partial(kaddr + offset, copy, 0);
691         kunmap(page);
692         return csum;
693 }
694
695 /*
696  *      ip_append_data() and ip_append_page() can make one large IP datagram
697  *      from many pieces of data. Each piece will be held on the socket
698  *      until ip_push_pending_frames() is called. Each piece can be a page
699  *      or non-page data.
700  *
701  *      Besides UDP, other transport protocols - e.g. raw sockets - can
702  *      potentially use this interface.
703  *
704  *      LATER: length must be adjusted by pad at tail, when it is required.
705  */
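/*
 * A minimal sketch of the expected calling pattern (the names 'corked', 'rt'
 * and 'ipc' stand for state a real caller such as the UDP send path already
 * holds):
 *
 *	err = ip_append_data(sk, getfrag, from, len, transhdrlen,
 *			     &ipc, rt, msg->msg_flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip_push_pending_frames(sk);
 *
 * While the socket stays corked (UDP_CORK) or MSG_MORE is set, the push is
 * deferred and further ip_append_data() calls keep growing the same datagram.
 */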
706 int ip_append_data(struct sock *sk,
707                    int getfrag(void *from, char *to, int offset, int len,
708                                int odd, struct sk_buff *skb),
709                    void *from, int length, int transhdrlen,
710                    struct ipcm_cookie *ipc, struct rtable *rt,
711                    unsigned int flags)
712 {
713         struct inet_sock *inet = inet_sk(sk);
714         struct sk_buff *skb;
715
716         struct ip_options *opt = NULL;
717         int hh_len;
718         int exthdrlen;
719         int mtu;
720         int copy;
721         int err;
722         int offset = 0;
723         unsigned int maxfraglen, fragheaderlen;
724         int csummode = CHECKSUM_NONE;
725
726         if (flags&MSG_PROBE)
727                 return 0;
728
729         if (skb_queue_empty(&sk->sk_write_queue)) {
730                 /*
731                  * setup for corking.
732                  */
733                 opt = ipc->opt;
734                 if (opt) {
735                         if (inet->cork.opt == NULL) {
736                                 inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
737                                 if (unlikely(inet->cork.opt == NULL))
738                                         return -ENOBUFS;
739                         }
740                         memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
741                         inet->cork.flags |= IPCORK_OPT;
742                         inet->cork.addr = ipc->addr;
743                 }
744                 dst_hold(&rt->u.dst);
745                 inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
746                 inet->cork.rt = rt;
747                 inet->cork.length = 0;
748                 sk->sk_sndmsg_page = NULL;
749                 sk->sk_sndmsg_off = 0;
750                 if ((exthdrlen = rt->u.dst.header_len) != 0) {
751                         length += exthdrlen;
752                         transhdrlen += exthdrlen;
753                 }
754         } else {
755                 rt = inet->cork.rt;
756                 if (inet->cork.flags & IPCORK_OPT)
757                         opt = inet->cork.opt;
758
759                 transhdrlen = 0;
760                 exthdrlen = 0;
761                 mtu = inet->cork.fragsize;
762         }
763         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
764
765         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
766         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
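        /* Example with hypothetical numbers: for mtu = 1500 and no options,
         * fragheaderlen = 20 and maxfraglen = ((1500 - 20) & ~7) + 20 = 1500;
         * with 12 bytes of options, fragheaderlen = 32 and maxfraglen becomes
         * ((1500 - 32) & ~7) + 32 = 1496, so the data carried by every
         * non-final fragment stays a multiple of 8 bytes as IP requires.
         */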
767
768         if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
769                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
770                 return -EMSGSIZE;
771         }
772
773         /*
774          * transhdrlen > 0 means that this is the first fragment and we wish
775          * it not to be fragmented later.
776          */
777         if (transhdrlen &&
778             length + fragheaderlen <= mtu &&
779             rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
780             !exthdrlen)
781                 csummode = CHECKSUM_HW;
782
783         inet->cork.length += length;
784
785         /* So, what's going on in the loop below?
786          *
787          * We use the calculated fragment length to generate a chain of skbs;
788          * each segment is an IP fragment ready for sending to the network
789          * once the appropriate IP header has been added.
790          */
791
792         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
793                 goto alloc_new_skb;
794
795         while (length > 0) {
796                 /* Check if the remaining data fits into current packet. */
797                 copy = mtu - skb->len;
798                 if (copy < length)
799                         copy = maxfraglen - skb->len;
800                 if (copy <= 0) {
801                         char *data;
802                         unsigned int datalen;
803                         unsigned int fraglen;
804                         unsigned int fraggap;
805                         unsigned int alloclen;
806                         struct sk_buff *skb_prev;
807 alloc_new_skb:
808                         skb_prev = skb;
809                         if (skb_prev)
810                                 fraggap = skb_prev->len - maxfraglen;
811                         else
812                                 fraggap = 0;
813
814                         /*
815                          * If remaining data exceeds the mtu,
816                          * we know we need more fragment(s).
817                          */
818                         datalen = length + fraggap;
819                         if (datalen > mtu - fragheaderlen)
820                                 datalen = maxfraglen - fragheaderlen;
821                         fraglen = datalen + fragheaderlen;
822
823                         if ((flags & MSG_MORE) && 
824                             !(rt->u.dst.dev->features&NETIF_F_SG))
825                                 alloclen = mtu;
826                         else
827                                 alloclen = datalen + fragheaderlen;
828
829                         /* The last fragment gets additional space at tail.
830                          * Note that with MSG_MORE we overallocate on fragments,
831                          * because we have no idea which fragment will be
832                          * the last.
833                          */
834                         if (datalen == length)
835                                 alloclen += rt->u.dst.trailer_len;
836
837                         if (transhdrlen) {
838                                 skb = sock_alloc_send_skb(sk, 
839                                                 alloclen + hh_len + 15,
840                                                 (flags & MSG_DONTWAIT), &err);
841                         } else {
842                                 skb = NULL;
843                                 if (atomic_read(&sk->sk_wmem_alloc) <=
844                                     2 * sk->sk_sndbuf)
845                                         skb = sock_wmalloc(sk, 
846                                                            alloclen + hh_len + 15, 1,
847                                                            sk->sk_allocation);
848                                 if (unlikely(skb == NULL))
849                                         err = -ENOBUFS;
850                         }
851                         if (skb == NULL)
852                                 goto error;
853
854                         /*
855                          *      Fill in the control structures
856                          */
857                         skb->ip_summed = csummode;
858                         skb->csum = 0;
859                         skb_reserve(skb, hh_len);
860
861                         /*
862                          *      Find where to start putting bytes.
863                          */
864                         data = skb_put(skb, fraglen);
865                         skb->nh.raw = data + exthdrlen;
866                         data += fragheaderlen;
867                         skb->h.raw = data + exthdrlen;
868
869                         if (fraggap) {
870                                 skb->csum = skb_copy_and_csum_bits(
871                                         skb_prev, maxfraglen,
872                                         data + transhdrlen, fraggap, 0);
873                                 skb_prev->csum = csum_sub(skb_prev->csum,
874                                                           skb->csum);
875                                 data += fraggap;
876                                 skb_trim(skb_prev, maxfraglen);
877                         }
878
879                         copy = datalen - transhdrlen - fraggap;
880                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
881                                 err = -EFAULT;
882                                 kfree_skb(skb);
883                                 goto error;
884                         }
885
886                         offset += copy;
887                         length -= datalen - fraggap;
888                         transhdrlen = 0;
889                         exthdrlen = 0;
890                         csummode = CHECKSUM_NONE;
891
892                         /*
893                          * Put the packet on the pending queue.
894                          */
895                         __skb_queue_tail(&sk->sk_write_queue, skb);
896                         continue;
897                 }
898
899                 if (copy > length)
900                         copy = length;
901
902                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
903                         unsigned int off;
904
905                         off = skb->len;
906                         if (getfrag(from, skb_put(skb, copy), 
907                                         offset, copy, off, skb) < 0) {
908                                 __skb_trim(skb, off);
909                                 err = -EFAULT;
910                                 goto error;
911                         }
912                 } else {
913                         int i = skb_shinfo(skb)->nr_frags;
914                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
915                         struct page *page = sk->sk_sndmsg_page;
916                         int off = sk->sk_sndmsg_off;
917                         unsigned int left;
918
919                         if (page && (left = PAGE_SIZE - off) > 0) {
920                                 if (copy >= left)
921                                         copy = left;
922                                 if (page != frag->page) {
923                                         if (i == MAX_SKB_FRAGS) {
924                                                 err = -EMSGSIZE;
925                                                 goto error;
926                                         }
927                                         get_page(page);
928                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
929                                         frag = &skb_shinfo(skb)->frags[i];
930                                 }
931                         } else if (i < MAX_SKB_FRAGS) {
932                                 if (copy > PAGE_SIZE)
933                                         copy = PAGE_SIZE;
934                                 page = alloc_pages(sk->sk_allocation, 0);
935                                 if (page == NULL)  {
936                                         err = -ENOMEM;
937                                         goto error;
938                                 }
939                                 sk->sk_sndmsg_page = page;
940                                 sk->sk_sndmsg_off = 0;
941
942                                 skb_fill_page_desc(skb, i, page, 0, 0);
943                                 frag = &skb_shinfo(skb)->frags[i];
944                                 skb->truesize += PAGE_SIZE;
945                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
946                         } else {
947                                 err = -EMSGSIZE;
948                                 goto error;
949                         }
950                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
951                                 err = -EFAULT;
952                                 goto error;
953                         }
954                         sk->sk_sndmsg_off += copy;
955                         frag->size += copy;
956                         skb->len += copy;
957                         skb->data_len += copy;
958                 }
959                 offset += copy;
960                 length -= copy;
961         }
962
963         return 0;
964
965 error:
966         inet->cork.length -= length;
967         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
968         return err; 
969 }
970
971 ssize_t ip_append_page(struct sock *sk, struct page *page,
972                        int offset, size_t size, int flags)
973 {
974         struct inet_sock *inet = inet_sk(sk);
975         struct sk_buff *skb;
976         struct rtable *rt;
977         struct ip_options *opt = NULL;
978         int hh_len;
979         int mtu;
980         int len;
981         int err;
982         unsigned int maxfraglen, fragheaderlen, fraggap;
983
984         if (inet->hdrincl)
985                 return -EPERM;
986
987         if (flags&MSG_PROBE)
988                 return 0;
989
990         if (skb_queue_empty(&sk->sk_write_queue))
991                 return -EINVAL;
992
993         rt = inet->cork.rt;
994         if (inet->cork.flags & IPCORK_OPT)
995                 opt = inet->cork.opt;
996
997         if (!(rt->u.dst.dev->features&NETIF_F_SG))
998                 return -EOPNOTSUPP;
999
1000         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1001         mtu = inet->cork.fragsize;
1002
1003         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1004         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1005
1006         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1007                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
1008                 return -EMSGSIZE;
1009         }
1010
1011         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1012                 return -EINVAL;
1013
1014         inet->cork.length += size;
1015
1016         while (size > 0) {
1017                 int i;
1018
1019                 /* Check if the remaining data fits into current packet. */
1020                 len = mtu - skb->len;
1021                 if (len < size)
1022                         len = maxfraglen - skb->len;
1023                 if (len <= 0) {
1024                         struct sk_buff *skb_prev;
1025                         char *data;
1026                         struct iphdr *iph;
1027                         int alloclen;
1028
1029                         skb_prev = skb;
1030                         if (skb_prev)
1031                                 fraggap = skb_prev->len - maxfraglen;
1032                         else
1033                                 fraggap = 0;
1034
1035                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1036                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1037                         if (unlikely(!skb)) {
1038                                 err = -ENOBUFS;
1039                                 goto error;
1040                         }
1041
1042                         /*
1043                          *      Fill in the control structures
1044                          */
1045                         skb->ip_summed = CHECKSUM_NONE;
1046                         skb->csum = 0;
1047                         skb_reserve(skb, hh_len);
1048
1049                         /*
1050                          *      Find where to start putting bytes.
1051                          */
1052                         data = skb_put(skb, fragheaderlen + fraggap);
1053                         skb->nh.iph = iph = (struct iphdr *)data;
1054                         data += fragheaderlen;
1055                         skb->h.raw = data;
1056
1057                         if (fraggap) {
1058                                 skb->csum = skb_copy_and_csum_bits(
1059                                         skb_prev, maxfraglen,
1060                                         data, fraggap, 0);
1061                                 skb_prev->csum = csum_sub(skb_prev->csum,
1062                                                           skb->csum);
1063                                 skb_trim(skb_prev, maxfraglen);
1064                         }
1065
1066                         /*
1067                          * Put the packet on the pending queue.
1068                          */
1069                         __skb_queue_tail(&sk->sk_write_queue, skb);
1070                         continue;
1071                 }
1072
1073                 i = skb_shinfo(skb)->nr_frags;
1074                 if (len > size)
1075                         len = size;
1076                 if (skb_can_coalesce(skb, i, page, offset)) {
1077                         skb_shinfo(skb)->frags[i-1].size += len;
1078                 } else if (i < MAX_SKB_FRAGS) {
1079                         get_page(page);
1080                         skb_fill_page_desc(skb, i, page, offset, len);
1081                 } else {
1082                         err = -EMSGSIZE;
1083                         goto error;
1084                 }
1085
1086                 if (skb->ip_summed == CHECKSUM_NONE) {
1087                         unsigned int csum;
1088                         csum = csum_page(page, offset, len);
1089                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1090                 }
1091
1092                 skb->len += len;
1093                 skb->data_len += len;
1094                 offset += len;
1095                 size -= len;
1096         }
1097         return 0;
1098
1099 error:
1100         inet->cork.length -= size;
1101         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1102         return err;
1103 }
1104
1105 /*
1106  *      Combine all pending IP fragments on the socket into one IP datagram
1107  *      and push them out.
1108  */
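/*
 * The skbs queued by ip_append_data()/ip_append_page() are dequeued here, the
 * tail ones are chained onto frag_list of the first, a single IP header is
 * built, and the result is handed to the NF_IP_LOCAL_OUT hook.  If the
 * combined datagram exceeds the path MTU, dst_output()/ip_output() ends up in
 * ip_fragment(), which can then take the frag_list fast path prepared here.
 * From the socket user's side this is what releasing UDP_CORK, or the last
 * send without MSG_MORE, triggers.
 */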
1109 int ip_push_pending_frames(struct sock *sk)
1110 {
1111         struct sk_buff *skb, *tmp_skb;
1112         struct sk_buff **tail_skb;
1113         struct inet_sock *inet = inet_sk(sk);
1114         struct ip_options *opt = NULL;
1115         struct rtable *rt = inet->cork.rt;
1116         struct iphdr *iph;
1117         int df = 0;
1118         __u8 ttl;
1119         int err = 0;
1120
1121         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1122                 goto out;
1123         tail_skb = &(skb_shinfo(skb)->frag_list);
1124
1125         /* move skb->data to ip header from ext header */
1126         if (skb->data < skb->nh.raw)
1127                 __skb_pull(skb, skb->nh.raw - skb->data);
1128         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1129                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1130                 *tail_skb = tmp_skb;
1131                 tail_skb = &(tmp_skb->next);
1132                 skb->len += tmp_skb->len;
1133                 skb->data_len += tmp_skb->len;
1134                 skb->truesize += tmp_skb->truesize;
1135                 __sock_put(tmp_skb->sk);
1136                 tmp_skb->destructor = NULL;
1137                 tmp_skb->sk = NULL;
1138         }
1139
1140         /* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO), we allow
1141          * the frame generated here to be fragmented. No matter how transforms
1142          * change the size of the packet, it will go out.
1143          */
1144         if (inet->pmtudisc != IP_PMTUDISC_DO)
1145                 skb->local_df = 1;
1146
1147         /* DF bit is set when we want to see DF on outgoing frames.
1148          * If local_df is set too, we still allow this frame to be fragmented
1149          * locally. */
1150         if (inet->pmtudisc == IP_PMTUDISC_DO ||
1151             (skb->len <= dst_mtu(&rt->u.dst) &&
1152              ip_dont_fragment(sk, &rt->u.dst)))
1153                 df = htons(IP_DF);
1154
1155         if (inet->cork.flags & IPCORK_OPT)
1156                 opt = inet->cork.opt;
1157
1158         if (rt->rt_type == RTN_MULTICAST)
1159                 ttl = inet->mc_ttl;
1160         else
1161                 ttl = ip_select_ttl(inet, &rt->u.dst);
1162
1163         iph = (struct iphdr *)skb->data;
1164         iph->version = 4;
1165         iph->ihl = 5;
1166         if (opt) {
1167                 iph->ihl += opt->optlen>>2;
1168                 ip_options_build(skb, opt, inet->cork.addr, rt, 0);
1169         }
1170         iph->tos = inet->tos;
1171         iph->tot_len = htons(skb->len);
1172         iph->frag_off = df;
1173         if (!df) {
1174                 __ip_select_ident(iph, &rt->u.dst, 0);
1175         } else {
1176                 iph->id = htons(inet->id++);
1177         }
1178         iph->ttl = ttl;
1179         iph->protocol = sk->sk_protocol;
1180         iph->saddr = rt->rt_src;
1181         iph->daddr = rt->rt_dst;
1182         ip_send_check(iph);
1183
1184         skb->priority = sk->sk_priority;
1185         skb->dst = dst_clone(&rt->u.dst);
1186
1187         /* Netfilter gets the whole, not yet fragmented skb. */
1188         err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
1189                       skb->dst->dev, dst_output);
1190         if (err) {
1191                 if (err > 0)
1192                         err = inet->recverr ? net_xmit_errno(err) : 0;
1193                 if (err)
1194                         goto error;
1195         }
1196
1197 out:
1198         inet->cork.flags &= ~IPCORK_OPT;
1199         if (inet->cork.opt) {
1200                 kfree(inet->cork.opt);
1201                 inet->cork.opt = NULL;
1202         }
1203         if (inet->cork.rt) {
1204                 ip_rt_put(inet->cork.rt);
1205                 inet->cork.rt = NULL;
1206         }
1207         return err;
1208
1209 error:
1210         IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1211         goto out;
1212 }
1213
1214 /*
1215  *      Throw away all pending data on the socket.
1216  */
1217 void ip_flush_pending_frames(struct sock *sk)
1218 {
1219         struct inet_sock *inet = inet_sk(sk);
1220         struct sk_buff *skb;
1221
1222         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
1223                 kfree_skb(skb);
1224
1225         inet->cork.flags &= ~IPCORK_OPT;
1226         if (inet->cork.opt) {
1227                 kfree(inet->cork.opt);
1228                 inet->cork.opt = NULL;
1229         }
1230         if (inet->cork.rt) {
1231                 ip_rt_put(inet->cork.rt);
1232                 inet->cork.rt = NULL;
1233         }
1234 }
1235
1236
1237 /*
1238  *      Fetch data from kernel space and fill in checksum if needed.
1239  */
1240 static int ip_reply_glue_bits(void *dptr, char *to, int offset, 
1241                               int len, int odd, struct sk_buff *skb)
1242 {
1243         unsigned int csum;
1244
1245         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1246         skb->csum = csum_block_add(skb->csum, csum, odd);
1247         return 0;  
1248 }
1249
1250 /* 
1251  *      Generic function to send a packet as reply to another packet.
1252  *      Used to send TCP resets so far. ICMP should use this function too.
1253  *
1254  *      Should run single threaded per socket because it uses the sock 
1255  *      structure to pass arguments.
1256  *
1257  *      LATER: switch from ip_build_xmit to ip_append_*
1258  */
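/*
 * A minimal sketch of how the TCP reset/ACK reply path drives this function;
 * 'rep' (the reply header being sent) and 'ctl_sk' (the control socket) are
 * illustrative names:
 *
 *	struct ip_reply_arg arg;
 *
 *	arg.iov[0].iov_base = &rep;
 *	arg.iov[0].iov_len  = sizeof(rep);
 *	arg.csum = csum_partial((char *)&rep, sizeof(rep), 0);
 *	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 *	ip_send_reply(ctl_sk, skb, &arg, sizeof(rep));
 *
 * csumoffset is counted in 16-bit words from skb->h.raw, matching the
 * csum_fold() fixup done below once ip_append_data() has queued the reply.
 */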
1259 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1260                    unsigned int len)
1261 {
1262         struct inet_sock *inet = inet_sk(sk);
1263         struct {
1264                 struct ip_options       opt;
1265                 char                    data[40];
1266         } replyopts;
1267         struct ipcm_cookie ipc;
1268         u32 daddr;
1269         struct rtable *rt = (struct rtable*)skb->dst;
1270
1271         if (ip_options_echo(&replyopts.opt, skb))
1272                 return;
1273
1274         daddr = ipc.addr = rt->rt_src;
1275         ipc.opt = NULL;
1276
1277         if (replyopts.opt.optlen) {
1278                 ipc.opt = &replyopts.opt;
1279
1280                 if (ipc.opt->srr)
1281                         daddr = replyopts.opt.faddr;
1282         }
1283
1284         {
1285                 struct flowi fl = { .nl_u = { .ip4_u =
1286                                               { .daddr = daddr,
1287                                                 .saddr = rt->rt_spec_dst,
1288                                                 .tos = RT_TOS(skb->nh.iph->tos) } },
1289                                     /* Not quite clean, but right. */
1290                                     .uli_u = { .ports =
1291                                                { .sport = skb->h.th->dest,
1292                                                  .dport = skb->h.th->source } },
1293                                     .proto = sk->sk_protocol };
1294                 if (ip_route_output_key(&rt, &fl))
1295                         return;
1296         }
1297
1298         /* And let IP do all the hard work.
1299
1300            This chunk is not reentrant, hence the spinlock.
1301            Note that it relies on the fact that this function is called
1302            with BHs locally disabled and that sk cannot already be locked.
1303          */
1304         bh_lock_sock(sk);
1305         inet->tos = skb->nh.iph->tos;
1306         sk->sk_priority = skb->priority;
1307         sk->sk_protocol = skb->nh.iph->protocol;
1308         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1309                        &ipc, rt, MSG_DONTWAIT);
1310         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1311                 if (arg->csumoffset >= 0)
1312                         *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
1313                 skb->ip_summed = CHECKSUM_NONE;
1314                 ip_push_pending_frames(sk);
1315         }
1316
1317         bh_unlock_sock(sk);
1318
1319         ip_rt_put(rt);
1320 }
1321
1322 void __init ip_init(void)
1323 {
1324         ip_rt_init();
1325         inet_initpeers();
1326
1327 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1328         igmp_mc_proc_init();
1329 #endif
1330 }
1331
1332 EXPORT_SYMBOL(ip_finish_output);
1333 EXPORT_SYMBOL(ip_fragment);
1334 EXPORT_SYMBOL(ip_generic_getfrag);
1335 EXPORT_SYMBOL(ip_queue_xmit);
1336 EXPORT_SYMBOL(ip_send_check);
1337
1338 #ifdef CONFIG_SYSCTL
1339 EXPORT_SYMBOL(sysctl_ip_default_ttl);
1340 #endif