[SOCK]: Introduce sk_setup_caps
net/ipv4/ip_output.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              The Internet Protocol (IP) output module.
 *
 * Version:     $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Donald Becker, <becker@super.org>
 *              Alan Cox, <Alan.Cox@linux.org>
 *              Richard Underwood
 *              Stefan Becker, <stefanb@yello.ping.de>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *      See ip_input.c for original log
 *
 *      Fixes:
 *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
 *              Mike Kilburn    :       htons() missing in ip_build_xmit.
 *              Bradford Johnson:       Fix faulty handling of some frames when
 *                                      no route is found.
 *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
 *                                      (in case a packet is not accepted by
 *                                      output firewall rules).
 *              Mike McLagan    :       Routing by source
 *              Alexey Kuznetsov:       use new route cache
 *              Andi Kleen:             Fix broken PMTU recovery and remove
 *                                      some redundant tests.
 *              Vitaly E. Lavrov:       Transparent proxy revived after a year in a coma.
 *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
 *              Andi Kleen      :       Split fast and slow ip_build_xmit paths
 *                                      for decreased register pressure on x86
 *                                      and more readability.
 *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
 *                                      silently drop skb instead of failing with -EPERM.
 *              Detlev Wengorz  :       Copy protocol for fragments.
 *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
 *                                      datagrams.
 *              Hirokazu Takahashi:     sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/config.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

/*
 *      Shall we try to damage output packets if routing dev changes?
 */

int sysctl_ip_dynaddr;
int sysctl_ip_default_ttl = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
        iph->check = 0;
        iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
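
/*
 * Illustrative sketch (ours, not part of the original file): what
 * ip_fast_csum() computes above, written out portably.  It is the
 * RFC 1071 16-bit one's-complement sum over the header, taken with
 * iph->check already zeroed, as ip_send_check() does.  The function
 * name and the open-coded loop are ours, not kernel API.
 */
static inline u16 example_ip_csum(const u16 *hdr, unsigned int ihl)
{
        u32 sum = 0;
        unsigned int i;

        /* ihl counts 32-bit words, so there are ihl * 2 16-bit words. */
        for (i = 0; i < ihl * 2; i++)
                sum += hdr[i];
        /* Fold the carries back into the low 16 bits. */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        /* With iph->check zeroed first, example_ip_csum((u16 *)iph,
         * iph->ihl) should match the value ip_send_check() stores.
         */
        return (u16)~sum;
}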

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
        newskb->mac.raw = newskb->data;
        __skb_pull(newskb, newskb->nh.raw - newskb->data);
        newskb->pkt_type = PACKET_LOOPBACK;
        newskb->ip_summed = CHECKSUM_UNNECESSARY;
        BUG_TRAP(newskb->dst);
        netif_rx(newskb);
        return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
        int ttl = inet->uc_ttl;

        if (ttl < 0)
                ttl = dst_metric(dst, RTAX_HOPLIMIT);
        return ttl;
}

/*
 *              Add an IP header to an skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
                          u32 saddr, u32 daddr, struct ip_options *opt)
{
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = (struct rtable *)skb->dst;
        struct iphdr *iph;

        /* Build the IP header. */
        if (opt)
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr) + opt->optlen);
        else
                iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));

        iph->version  = 4;
        iph->ihl      = 5;
        iph->tos      = inet->tos;
        if (ip_dont_fragment(sk, &rt->u.dst))
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->daddr    = rt->rt_dst;
        iph->saddr    = rt->rt_src;
        iph->protocol = sk->sk_protocol;
        iph->tot_len  = htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, sk);
        skb->nh.iph   = iph;

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, daddr, rt, 0);
        }
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        /* Send it out. */
        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);
}

static inline int ip_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb->dst;
        struct hh_cache *hh = dst->hh;
        struct net_device *dev = dst->dev;
        int hh_len = LL_RESERVED_SPACE(dev);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
                struct sk_buff *skb2;

                skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
                if (skb2 == NULL) {
                        kfree_skb(skb);
                        return -ENOMEM;
                }
                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);
                kfree_skb(skb);
                skb = skb2;
        }

        if (hh) {
                int hh_alen;

                read_lock_bh(&hh->hh_lock);
                hh_alen = HH_DATA_ALIGN(hh->hh_len);
                memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
                read_unlock_bh(&hh->hh_lock);
                skb_push(skb, hh->hh_len);
                return hh->hh_output(skb);
        } else if (dst->neighbour)
                return dst->neighbour->output(skb);

        if (net_ratelimit())
                printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
        kfree_skb(skb);
        return -EINVAL;
}
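
/*
 * Worked example (ours, not from the original source): on Ethernet,
 * dev->hard_header_len is 14, which LL_RESERVED_SPACE() rounds up to
 * what we believe is a 16-byte boundary, so an skb reaching
 * ip_finish_output2() with less than 16 bytes of headroom is
 * reallocated before the link-layer header is pushed in front of the
 * IP header.
 */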

static int ip_finish_output(struct sk_buff *skb)
{
        struct net_device *dev = skb->dst->dev;

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
                       ip_finish_output2);
}

int ip_mc_output(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        struct rtable *rt = (struct rtable*)skb->dst;
        struct net_device *dev = rt->u.dst.dev;

        /*
         *      If the indicated interface is up and running, send the packet.
         */
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        /*
         *      Multicasts are looped back for other local users
         */

        if (rt->rt_flags&RTCF_MULTICAST) {
                if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
                /* Small optimization: do not loop back non-local frames
                   that were returned after forwarding; ip_mr_input will
                   drop them in any case.
                   Note that local frames are looped back to be delivered
                   to local recipients.

                   This check is duplicated in ip_mr_input at the moment.
                 */
                    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
                ) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                        if (newskb)
                                NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                        newskb->dev,
                                        ip_dev_loopback_xmit);
                }

                /* Multicasts with ttl 0 must not go beyond the host */

                if (skb->nh.iph->ttl == 0) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        if (rt->rt_flags&RTCF_BROADCAST) {
                struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
                if (newskb)
                        NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
                                newskb->dev, ip_dev_loopback_xmit);
        }

        if (skb->len > dst_mtu(&rt->u.dst))
                return ip_fragment(skb, ip_finish_output);
        else
                return ip_finish_output(skb);
}

int ip_output(struct sk_buff *skb)
{
        IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

        if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
                return ip_fragment(skb, ip_finish_output);
        else
                return ip_finish_output(skb);
}
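
/*
 * Note (ours): a nonzero skb_shinfo(skb)->tso_size marks a frame the
 * device will segment itself (TSO), which is why ip_output() above
 * deliberately skips software fragmentation for such frames even when
 * skb->len exceeds the route MTU.
 */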

int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
        struct sock *sk = skb->sk;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = inet->opt;
        struct rtable *rt;
        struct iphdr *iph;

        /* Skip all of this if the packet is already routed,
         * e.g. by something like SCTP.
         */
        rt = (struct rtable *) skb->dst;
        if (rt != NULL)
                goto packet_routed;

        /* Make sure we can route this packet. */
        rt = (struct rtable *)__sk_dst_check(sk, 0);
        if (rt == NULL) {
                u32 daddr;

                /* Use correct destination address if we have options. */
                daddr = inet->daddr;
                if (opt && opt->srr)
                        daddr = opt->faddr;

                {
                        struct flowi fl = { .oif = sk->sk_bound_dev_if,
                                            .nl_u = { .ip4_u =
                                                      { .daddr = daddr,
                                                        .saddr = inet->saddr,
                                                        .tos = RT_CONN_FLAGS(sk) } },
                                            .proto = sk->sk_protocol,
                                            .uli_u = { .ports =
                                                       { .sport = inet->sport,
                                                         .dport = inet->dport } } };

                        /* If this fails, the retransmit mechanism of the
                         * transport layer will keep trying until a route
                         * appears or the connection times out.
                         */
                        if (ip_route_output_flow(&rt, &fl, sk, 0))
                                goto no_route;
                }
                sk_setup_caps(sk, &rt->u.dst);
        }
        skb->dst = dst_clone(&rt->u.dst);

packet_routed:
        if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
                goto no_route;

        /* OK, we know where to send it, allocate and build IP header. */
        iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
        *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
        iph->tot_len = htons(skb->len);
        if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
                iph->frag_off = htons(IP_DF);
        else
                iph->frag_off = 0;
        iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
        iph->protocol = sk->sk_protocol;
        iph->saddr    = rt->rt_src;
        iph->daddr    = rt->rt_dst;
        skb->nh.iph   = iph;
        /* The transport layer sets skb->h.foo itself. */

        if (opt && opt->optlen) {
                iph->ihl += opt->optlen >> 2;
                ip_options_build(skb, opt, inet->daddr, rt, 0);
        }

        ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);

        /* Add an IP checksum. */
        ip_send_check(iph);

        skb->priority = sk->sk_priority;

        return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
                       dst_output);

no_route:
        IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EHOSTUNREACH;
}
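
/*
 * Hedged sketch (ours): the typical call pattern a connected transport
 * protocol uses with ip_queue_xmit().  The caller has attached the skb
 * to its socket (skb->sk) and built its transport header with skb->h
 * set; ip_queue_xmit() prepends the IP header and, if skb->dst is not
 * yet set, routes the packet via the socket's cached route.  The
 * function name is hypothetical.
 */
static int example_transport_xmit(struct sk_buff *skb)
{
        /* Second argument 0: do not permit local fragmentation, i.e.
         * honour PMTU discovery and set DF when appropriate.
         */
        return ip_queue_xmit(skb, 0);
}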


static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        dst_release(to->dst);
        to->dst = dst_clone(from->dst);
        to->dev = from->dev;

        /* Copy the flags to each fragment. */
        IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
#ifdef CONFIG_NETFILTER
        to->nfmark = from->nfmark;
        /* Connection association is same as pre-frag packet */
        nf_conntrack_put(to->nfct);
        to->nfct = from->nfct;
        nf_conntrack_get(to->nfct);
        to->nfctinfo = from->nfctinfo;
#ifdef CONFIG_BRIDGE_NETFILTER
        nf_bridge_put(to->nf_bridge);
        to->nf_bridge = from->nf_bridge;
        nf_bridge_get(to->nf_bridge);
#endif
#endif
}

/*
 *      This IP datagram is too large to be sent in one piece.  Break it up
 *      into smaller pieces (each of a size equal to the IP header plus a
 *      block of the data of the original IP data part) that will still fit
 *      in a single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
        struct iphdr *iph;
        int raw = 0;
        int ptr;
        struct net_device *dev;
        struct sk_buff *skb2;
        unsigned int mtu, hlen, left, len, ll_rs;
        int offset;
        int not_last_frag;
        struct rtable *rt = (struct rtable*)skb->dst;
        int err = 0;

        dev = rt->u.dst.dev;

        /*
         *      Point into the IP datagram header.
         */

        iph = skb->nh.iph;

        if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
                          htonl(dst_mtu(&rt->u.dst)));
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        /*
         *      Setup starting values.
         */

        hlen = iph->ihl * 4;
        mtu = dst_mtu(&rt->u.dst) - hlen;       /* Size of data space */

        /* When frag_list is given, use it.  First, check its validity:
         * some transformers could create a wrong frag_list or break an
         * existing one; that is not prohibited.  In such a case, fall
         * back to copying.
         *
         * LATER: this step can be merged into the real generation of
         * fragments; we can switch to copying when we see the first
         * bad fragment.
         */
        if (skb_shinfo(skb)->frag_list) {
                struct sk_buff *frag;
                int first_len = skb_pagelen(skb);

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
                    skb_cloned(skb))
                        goto slow_path;

                for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                            goto slow_path;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                sock_hold(skb->sk);
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                                skb->truesize -= frag->truesize;
                        }
                }

                /* Everything is OK. Generate! */

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_shinfo(skb)->frag_list = NULL;
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                iph->tot_len = htons(first_len);
                iph->frag_off = htons(IP_MF);
                ip_send_check(iph);

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                frag->h.raw = frag->data;
                                frag->nh.raw = __skb_push(frag, hlen);
                                memcpy(frag->nh.raw, iph, hlen);
                                iph = frag->nh.iph;
                                iph->tot_len = htons(frag->len);
                                ip_copy_metadata(frag, skb);
                                if (offset == 0)
                                        ip_options_fragment(frag);
                                offset += skb->len - hlen;
                                iph->frag_off = htons(offset>>3);
                                if (frag->next != NULL)
                                        iph->frag_off |= htons(IP_MF);
                                /* Ready, complete checksum */
                                ip_send_check(iph);
                        }

                        err = output(skb);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                if (err == 0) {
                        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }
                IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
                return err;
        }

slow_path:
        left = skb->len - hlen;         /* Space per frame */
        ptr = raw + hlen;               /* Where to start from */

#ifdef CONFIG_BRIDGE_NETFILTER
        /* For bridged IP traffic encapsulated inside e.g. a vlan header,
         * we need to make room for the encapsulating header. */
        ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
        mtu -= nf_bridge_pad(skb);
#else
        ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
#endif
        /*
         *      Fragment the datagram.
         */

        offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
        not_last_frag = iph->frag_off & htons(IP_MF);

        /*
         *      Keep copying data until we run out.
         */

        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet
                   end, then align the next start on an eight-byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip_copy_metadata(skb2, skb);
                skb_reserve(skb2, ll_rs);
                skb_put(skb2, len + hlen);
                skb2->nh.raw = skb2->data;
                skb2->h.raw = skb2->data + hlen;

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */

                if (skb->sk)
                        skb_set_owner_w(skb2, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */

                memcpy(skb2->nh.raw, skb->data, hlen);

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
                        BUG();
                left -= len;

                /*
                 *      Fill in the new header fields.
                 */
                iph = skb2->nh.iph;
                iph->frag_off = htons((offset >> 3));
                /* ANK: dirty, but effective trick. Upgrade options only if
                 * the segment to be fragmented was THE FIRST (otherwise,
                 * options are already fixed) and do it ONCE on the initial
                 * skb, so that all the following fragments will inherit
                 * the fixed options.
                 */
                if (offset == 0)
                        ip_options_fragment(skb);

                /*
                 *      Added AC: if we are fragmenting a fragment that's
                 *      not the last fragment, then keep MF set on each
                 *      fragment.
                 */
                if (left > 0 || not_last_frag)
                        iph->frag_off |= htons(IP_MF);
                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */

                IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);

                iph->tot_len = htons(len + hlen);

                ip_send_check(iph);

                err = output(skb2);
                if (err)
                        goto fail;
        }
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
        return err;

fail:
        kfree_skb(skb);
        IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
        return err;
}
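
/*
 * Worked example (ours, not from the original source) of the slow-path
 * geometry above: a 4020-byte datagram (20-byte header, 4000 bytes of
 * data) over a 1500-byte MTU gives mtu = 1480 bytes of data space,
 * already a multiple of 8, so iph->frag_off advances by 1480 / 8 = 185
 * units per fragment:
 *
 *      fragment 0: offset   0, 1480 data bytes, MF set
 *      fragment 1: offset 185, 1480 data bytes, MF set
 *      fragment 2: offset 370, 1040 data bytes, MF clear
 */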

int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
        struct iovec *iov = from;

        if (skb->ip_summed == CHECKSUM_HW) {
                if (memcpy_fromiovecend(to, iov, offset, len) < 0)
                        return -EFAULT;
        } else {
                unsigned int csum = 0;
                if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
                        return -EFAULT;
                skb->csum = csum_block_add(skb->csum, csum, odd);
        }
        return 0;
}
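
/*
 * Note (ours): when the device can checksum in hardware (CHECKSUM_HW
 * here), ip_generic_getfrag() only copies the user iovec; otherwise it
 * folds a partial checksum into skb->csum as it copies, so the
 * transport layer never has to walk the data twice.
 */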

static inline unsigned int
csum_page(struct page *page, int offset, int copy)
{
        char *kaddr;
        unsigned int csum;
        kaddr = kmap(page);
        csum = csum_partial(kaddr + offset, copy, 0);
        kunmap(page);
        return csum;
}

/*
 *      ip_append_data() and ip_append_page() can make one large IP datagram
 *      from many pieces of data.  Each piece will be held on the socket
 *      until ip_push_pending_frames() is called.  Each piece can be a page
 *      or non-page data.
 *
 *      Besides UDP, other transport protocols - e.g. raw sockets - can
 *      potentially use this interface.
 *
 *      LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
                   int getfrag(void *from, char *to, int offset, int len,
                               int odd, struct sk_buff *skb),
                   void *from, int length, int transhdrlen,
                   struct ipcm_cookie *ipc, struct rtable *rt,
                   unsigned int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        struct ip_options *opt = NULL;
        int hh_len;
        int exthdrlen;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        unsigned int maxfraglen, fragheaderlen;
        int csummode = CHECKSUM_NONE;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking.
                 */
                opt = ipc->opt;
                if (opt) {
                        if (inet->cork.opt == NULL) {
                                inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
                                if (unlikely(inet->cork.opt == NULL))
                                        return -ENOBUFS;
                        }
                        memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
                        inet->cork.flags |= IPCORK_OPT;
                        inet->cork.addr = ipc->addr;
                }
                dst_hold(&rt->u.dst);
                inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
                inet->cork.rt = rt;
                inet->cork.length = 0;
                sk->sk_sndmsg_page = NULL;
                sk->sk_sndmsg_off = 0;
                if ((exthdrlen = rt->u.dst.header_len) != 0) {
                        length += exthdrlen;
                        transhdrlen += exthdrlen;
                }
        } else {
                rt = inet->cork.rt;
                if (inet->cork.flags & IPCORK_OPT)
                        opt = inet->cork.opt;

                transhdrlen = 0;
                exthdrlen = 0;
                mtu = inet->cork.fragsize;
        }
        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
                return -EMSGSIZE;
        }

        /*
         * transhdrlen > 0 means that this is the first fragment and we
         * wish it not to be fragmented later.
         */
        if (transhdrlen &&
            length + fragheaderlen <= mtu &&
            rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
            !exthdrlen)
                csummode = CHECKSUM_HW;

        inet->cork.length += length;

        /* So, what's going on in the loop below?
         *
         * We use the calculated fragment length to generate a chained skb;
         * each of its segments is an IP fragment ready for sending to the
         * network after adding the appropriate IP header.
         */

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                goto alloc_new_skb;

        while (length > 0) {
                /* Check if the remaining data fits into current packet. */
                copy = mtu - skb->len;
                if (copy < length)
                        copy = maxfraglen - skb->len;
                if (copy <= 0) {
                        char *data;
                        unsigned int datalen;
                        unsigned int fraglen;
                        unsigned int fraggap;
                        unsigned int alloclen;
                        struct sk_buff *skb_prev;
alloc_new_skb:
                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        /*
                         * If remaining data exceeds the mtu,
                         * we know we need more fragment(s).
                         */
                        datalen = length + fraggap;
                        if (datalen > mtu - fragheaderlen)
                                datalen = maxfraglen - fragheaderlen;
                        fraglen = datalen + fragheaderlen;

                        if ((flags & MSG_MORE) &&
                            !(rt->u.dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
                        else
                                alloclen = datalen + fragheaderlen;

                        /* The last fragment gets additional space at tail.
                         * Note, with MSG_MORE we overallocate on fragments,
                         * because we have no idea what fragment will be
                         * the last.
                         */
                        if (datalen == length)
                                alloclen += rt->u.dst.trailer_len;

                        if (transhdrlen) {
                                skb = sock_alloc_send_skb(sk,
                                                alloclen + hh_len + 15,
                                                (flags & MSG_DONTWAIT), &err);
                        } else {
                                skb = NULL;
                                if (atomic_read(&sk->sk_wmem_alloc) <=
                                    2 * sk->sk_sndbuf)
                                        skb = sock_wmalloc(sk,
                                                           alloclen + hh_len + 15, 1,
                                                           sk->sk_allocation);
                                if (unlikely(skb == NULL))
                                        err = -ENOBUFS;
                        }
                        if (skb == NULL)
                                goto error;

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = csummode;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fraglen);
                        skb->nh.raw = data + exthdrlen;
                        data += fragheaderlen;
                        skb->h.raw = data + exthdrlen;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data + transhdrlen, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                data += fraggap;
                                skb_trim(skb_prev, maxfraglen);
                        }

                        copy = datalen - transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
                                goto error;
                        }

                        offset += copy;
                        length -= datalen - fraggap;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                if (copy > length)
                        copy = length;

                if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
                        unsigned int off;

                        off = skb->len;
                        if (getfrag(from, skb_put(skb, copy),
                                        offset, copy, off, skb) < 0) {
                                __skb_trim(skb, off);
                                err = -EFAULT;
                                goto error;
                        }
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
                        struct page *page = sk->sk_sndmsg_page;
                        int off = sk->sk_sndmsg_off;
                        unsigned int left;

                        if (page && (left = PAGE_SIZE - off) > 0) {
                                if (copy >= left)
                                        copy = left;
                                if (page != frag->page) {
                                        if (i == MAX_SKB_FRAGS) {
                                                err = -EMSGSIZE;
                                                goto error;
                                        }
                                        get_page(page);
                                        skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
                                        frag = &skb_shinfo(skb)->frags[i];
                                }
                        } else if (i < MAX_SKB_FRAGS) {
                                if (copy > PAGE_SIZE)
                                        copy = PAGE_SIZE;
                                page = alloc_pages(sk->sk_allocation, 0);
                                if (page == NULL)  {
                                        err = -ENOMEM;
                                        goto error;
                                }
                                sk->sk_sndmsg_page = page;
                                sk->sk_sndmsg_off = 0;

                                skb_fill_page_desc(skb, i, page, 0, 0);
                                frag = &skb_shinfo(skb)->frags[i];
                                skb->truesize += PAGE_SIZE;
                                atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
                        } else {
                                err = -EMSGSIZE;
                                goto error;
                        }
                        if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
                                err = -EFAULT;
                                goto error;
                        }
                        sk->sk_sndmsg_off += copy;
                        frag->size += copy;
                        skb->len += copy;
                        skb->data_len += copy;
                }
                offset += copy;
                length -= copy;
        }

        return 0;

error:
        inet->cork.length -= length;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}
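
/*
 * Worked example (ours) of the fragment geometry computed above: with a
 * 1500-byte MTU and a plain 20-byte header, fragheaderlen = 20 and
 * maxfraglen = ((1500 - 20) & ~7) + 20 = 1500, since 1480 is already a
 * multiple of 8; with 12 bytes of IP options, fragheaderlen = 32 and
 * maxfraglen = ((1500 - 32) & ~7) + 32 = 1496, keeping each fragment's
 * data length a multiple of 8 as required for nonzero frag_off values.
 */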

ssize_t ip_append_page(struct sock *sk, struct page *page,
                       int offset, size_t size, int flags)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
        struct rtable *rt;
        struct ip_options *opt = NULL;
        int hh_len;
        int mtu;
        int len;
        int err;
        unsigned int maxfraglen, fragheaderlen, fraggap;

        if (inet->hdrincl)
                return -EPERM;

        if (flags&MSG_PROBE)
                return 0;

        if (skb_queue_empty(&sk->sk_write_queue))
                return -EINVAL;

        rt = inet->cork.rt;
        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (!(rt->u.dst.dev->features&NETIF_F_SG))
                return -EOPNOTSUPP;

        hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
        mtu = inet->cork.fragsize;

        fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

        if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
                ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
                return -EMSGSIZE;
        }

        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
                return -EINVAL;

        inet->cork.length += size;

        while (size > 0) {
                int i;

                /* Check if the remaining data fits into current packet. */
                len = mtu - skb->len;
                if (len < size)
                        len = maxfraglen - skb->len;
                if (len <= 0) {
                        struct sk_buff *skb_prev;
                        char *data;
                        struct iphdr *iph;
                        int alloclen;

                        skb_prev = skb;
                        if (skb_prev)
                                fraggap = skb_prev->len - maxfraglen;
                        else
                                fraggap = 0;

                        alloclen = fragheaderlen + hh_len + fraggap + 15;
                        skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
                        if (unlikely(!skb)) {
                                err = -ENOBUFS;
                                goto error;
                        }

                        /*
                         *      Fill in the control structures
                         */
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                        skb_reserve(skb, hh_len);

                        /*
                         *      Find where to start putting bytes.
                         */
                        data = skb_put(skb, fragheaderlen + fraggap);
                        skb->nh.iph = iph = (struct iphdr *)data;
                        data += fragheaderlen;
                        skb->h.raw = data;

                        if (fraggap) {
                                skb->csum = skb_copy_and_csum_bits(
                                        skb_prev, maxfraglen,
                                        data, fraggap, 0);
                                skb_prev->csum = csum_sub(skb_prev->csum,
                                                          skb->csum);
                                skb_trim(skb_prev, maxfraglen);
                        }

                        /*
                         * Put the packet on the pending queue.
                         */
                        __skb_queue_tail(&sk->sk_write_queue, skb);
                        continue;
                }

                i = skb_shinfo(skb)->nr_frags;
                if (len > size)
                        len = size;
                if (skb_can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += len;
                } else if (i < MAX_SKB_FRAGS) {
                        get_page(page);
                        skb_fill_page_desc(skb, i, page, offset, len);
                } else {
                        err = -EMSGSIZE;
                        goto error;
                }

                if (skb->ip_summed == CHECKSUM_NONE) {
                        unsigned int csum;
                        csum = csum_page(page, offset, len);
                        skb->csum = csum_block_add(skb->csum, csum, skb->len);
                }

                skb->len += len;
                skb->data_len += len;
                offset += len;
                size -= len;
        }
        return 0;

error:
        inet->cork.length -= size;
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        return err;
}

/*
 *      Combine all pending IP fragments on the socket into one IP datagram
 *      and push it out.
 */
int ip_push_pending_frames(struct sock *sk)
{
        struct sk_buff *skb, *tmp_skb;
        struct sk_buff **tail_skb;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_options *opt = NULL;
        struct rtable *rt = inet->cork.rt;
        struct iphdr *iph;
        int df = 0;
        __u8 ttl;
        int err = 0;

        if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
                goto out;
        tail_skb = &(skb_shinfo(skb)->frag_list);

        /* move skb->data to ip header from ext header */
        if (skb->data < skb->nh.raw)
                __skb_pull(skb, skb->nh.raw - skb->data);
        while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
                __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
                *tail_skb = tmp_skb;
                tail_skb = &(tmp_skb->next);
                skb->len += tmp_skb->len;
                skb->data_len += tmp_skb->len;
                skb->truesize += tmp_skb->truesize;
                __sock_put(tmp_skb->sk);
                tmp_skb->destructor = NULL;
                tmp_skb->sk = NULL;
        }

        /* Unless the user demanded real PMTU discovery (IP_PMTUDISC_DO),
         * we allow the frame generated here to be fragmented.  No matter
         * how transforms change the size of the packet, it will come out.
         */
        if (inet->pmtudisc != IP_PMTUDISC_DO)
                skb->local_df = 1;

        /* The DF bit is set when we want to see DF on outgoing frames.
         * If local_df is set too, we still allow this frame to be
         * fragmented locally. */
        if (inet->pmtudisc == IP_PMTUDISC_DO ||
            (skb->len <= dst_mtu(&rt->u.dst) &&
             ip_dont_fragment(sk, &rt->u.dst)))
                df = htons(IP_DF);

        if (inet->cork.flags & IPCORK_OPT)
                opt = inet->cork.opt;

        if (rt->rt_type == RTN_MULTICAST)
                ttl = inet->mc_ttl;
        else
                ttl = ip_select_ttl(inet, &rt->u.dst);

        iph = (struct iphdr *)skb->data;
        iph->version = 4;
        iph->ihl = 5;
        if (opt) {
                iph->ihl += opt->optlen>>2;
                ip_options_build(skb, opt, inet->cork.addr, rt, 0);
        }
        iph->tos = inet->tos;
        iph->tot_len = htons(skb->len);
        iph->frag_off = df;
        if (!df) {
                __ip_select_ident(iph, &rt->u.dst, 0);
        } else {
                iph->id = htons(inet->id++);
        }
        iph->ttl = ttl;
        iph->protocol = sk->sk_protocol;
        iph->saddr = rt->rt_src;
        iph->daddr = rt->rt_dst;
        ip_send_check(iph);

        skb->priority = sk->sk_priority;
        skb->dst = dst_clone(&rt->u.dst);

        /* Netfilter gets the whole, not yet fragmented skb. */
        err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL,
                      skb->dst->dev, dst_output);
        if (err) {
                if (err > 0)
                        err = inet->recverr ? net_xmit_errno(err) : 0;
                if (err)
                        goto error;
        }

out:
        inet->cork.flags &= ~IPCORK_OPT;
        if (inet->cork.opt) {
                kfree(inet->cork.opt);
                inet->cork.opt = NULL;
        }
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
        return err;

error:
        IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
        goto out;
}
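
/*
 * Hedged sketch (ours) of the corking pattern ip_append_data() and
 * ip_push_pending_frames() are designed for, roughly as a datagram
 * protocol would use it.  The surrounding setup (ipc, rt, iov, length,
 * "corked") is assumed to be in place; the function name is
 * hypothetical and error handling is minimal.
 */
static int example_corked_send(struct sock *sk, struct iovec *iov,
                               int length, struct ipcm_cookie *ipc,
                               struct rtable *rt, int corked)
{
        int err;

        /* Queue the data; MSG_MORE keeps the datagram open so later
         * calls can append to it before anything hits the wire.
         */
        err = ip_append_data(sk, ip_generic_getfrag, iov, length, 0,
                             ipc, rt, corked ? MSG_MORE : 0);
        if (err)
                ip_flush_pending_frames(sk);
        else if (!corked)
                err = ip_push_pending_frames(sk);
        return err;
}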

/*
 *      Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;

        while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
                kfree_skb(skb);

        inet->cork.flags &= ~IPCORK_OPT;
        if (inet->cork.opt) {
                kfree(inet->cork.opt);
                inet->cork.opt = NULL;
        }
        if (inet->cork.rt) {
                ip_rt_put(inet->cork.rt);
                inet->cork.rt = NULL;
        }
}


/*
 *      Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
                              int len, int odd, struct sk_buff *skb)
{
        unsigned int csum;

        csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
        skb->csum = csum_block_add(skb->csum, csum, odd);
        return 0;
}

/*
 *      Generic function to send a packet as a reply to another packet.
 *      Used to send TCP resets so far.  ICMP should use this function too.
 *
 *      Should run single-threaded per socket because it uses the sock
 *      structure to pass arguments.
 *
 *      LATER: switch from ip_build_xmit to ip_append_*
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
                   unsigned int len)
{
        struct inet_sock *inet = inet_sk(sk);
        struct {
                struct ip_options       opt;
                char                    data[40];
        } replyopts;
        struct ipcm_cookie ipc;
        u32 daddr;
        struct rtable *rt = (struct rtable*)skb->dst;

        if (ip_options_echo(&replyopts.opt, skb))
                return;

        daddr = ipc.addr = rt->rt_src;
        ipc.opt = NULL;

        if (replyopts.opt.optlen) {
                ipc.opt = &replyopts.opt;

                if (ipc.opt->srr)
                        daddr = replyopts.opt.faddr;
        }

        {
                struct flowi fl = { .nl_u = { .ip4_u =
                                              { .daddr = daddr,
                                                .saddr = rt->rt_spec_dst,
                                                .tos = RT_TOS(skb->nh.iph->tos) } },
                                    /* Not quite clean, but right. */
                                    .uli_u = { .ports =
                                               { .sport = skb->h.th->dest,
                                                 .dport = skb->h.th->source } },
                                    .proto = sk->sk_protocol };
                if (ip_route_output_key(&rt, &fl))
                        return;
        }

        /* And let IP do all the hard work.

           This chunk is not reentrant, hence the spinlock.  Note that
           it relies on the fact that this function is called with BH
           disabled locally and that sk cannot already be spinlocked.
         */
        bh_lock_sock(sk);
        inet->tos = skb->nh.iph->tos;
        sk->sk_priority = skb->priority;
        sk->sk_protocol = skb->nh.iph->protocol;
        ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
                       &ipc, rt, MSG_DONTWAIT);
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
                if (arg->csumoffset >= 0)
                        *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
                skb->ip_summed = CHECKSUM_NONE;
                ip_push_pending_frames(sk);
        }

        bh_unlock_sock(sk);

        ip_rt_put(rt);
}

void __init ip_init(void)
{
        ip_rt_init();
        inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
        igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_fragment);
EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);