ipv4: Create and use route lookup helpers.
[linux-3.10.git] / net / ipv4 / ip_output.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              The Internet Protocol (IP) output module.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Donald Becker, <becker@super.org>
11  *              Alan Cox, <Alan.Cox@linux.org>
12  *              Richard Underwood
13  *              Stefan Becker, <stefanb@yello.ping.de>
14  *              Jorge Cwik, <jorge@laser.satlink.net>
15  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
16  *              Hirokazu Takahashi, <taka@valinux.co.jp>
17  *
18  *      See ip_input.c for original log
19  *
20  *      Fixes:
21  *              Alan Cox        :       Missing nonblock feature in ip_build_xmit.
22  *              Mike Kilburn    :       htons() missing in ip_build_xmit.
23  *              Bradford Johnson:       Fix faulty handling of some frames when
24  *                                      no route is found.
25  *              Alexander Demenshin:    Missing sk/skb free in ip_queue_xmit
26  *                                      (in case if packet not accepted by
27  *                                      output firewall rules)
28  *              Mike McLagan    :       Routing by source
29  *              Alexey Kuznetsov:       use new route cache
30  *              Andi Kleen:             Fix broken PMTU recovery and remove
31  *                                      some redundant tests.
32  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
33  *              Andi Kleen      :       Replace ip_reply with ip_send_reply.
34  *              Andi Kleen      :       Split fast and slow ip_build_xmit path
35  *                                      for decreased register pressure on x86
36  *                                      and more readability.
37  *              Marc Boucher    :       When call_out_firewall returns FW_QUEUE,
38  *                                      silently drop skb instead of failing with -EPERM.
39  *              Detlev Wengorz  :       Copy protocol for fragments.
40  *              Hirokazu Takahashi:     HW checksumming for outgoing UDP
41  *                                      datagrams.
42  *              Hirokazu Takahashi:     sendfile() on UDP works now.
43  */
44
45 #include <asm/uaccess.h>
46 #include <asm/system.h>
47 #include <linux/module.h>
48 #include <linux/types.h>
49 #include <linux/kernel.h>
50 #include <linux/mm.h>
51 #include <linux/string.h>
52 #include <linux/errno.h>
53 #include <linux/highmem.h>
54 #include <linux/slab.h>
55
56 #include <linux/socket.h>
57 #include <linux/sockios.h>
58 #include <linux/in.h>
59 #include <linux/inet.h>
60 #include <linux/netdevice.h>
61 #include <linux/etherdevice.h>
62 #include <linux/proc_fs.h>
63 #include <linux/stat.h>
64 #include <linux/init.h>
65
66 #include <net/snmp.h>
67 #include <net/ip.h>
68 #include <net/protocol.h>
69 #include <net/route.h>
70 #include <net/xfrm.h>
71 #include <linux/skbuff.h>
72 #include <net/sock.h>
73 #include <net/arp.h>
74 #include <net/icmp.h>
75 #include <net/checksum.h>
76 #include <net/inetpeer.h>
77 #include <linux/igmp.h>
78 #include <linux/netfilter_ipv4.h>
79 #include <linux/netfilter_bridge.h>
80 #include <linux/mroute.h>
81 #include <linux/netlink.h>
82 #include <linux/tcp.h>
83
/*
 * Initialised to IPDEFTTL and exported to modules.  Judging by the name this
 * is the default TTL for outgoing IPv4 packets; its readers are outside this
 * view, so confirm against them before relying on that.
 */
84 int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
85 EXPORT_SYMBOL(sysctl_ip_default_ttl);
86
87 /* Generate a checksum for an outgoing IP datagram. */
88 __inline__ void ip_send_check(struct iphdr *iph)
89 {
90         iph->check = 0;
91         iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
92 }
93 EXPORT_SYMBOL(ip_send_check);
94
95 int __ip_local_out(struct sk_buff *skb)
96 {
97         struct iphdr *iph = ip_hdr(skb);
98
99         iph->tot_len = htons(skb->len);
100         ip_send_check(iph);
101         return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
102                        skb_dst(skb)->dev, dst_output);
103 }
104
/*
 * Send a locally generated packet: run __ip_local_out() and, when it
 * returns 1, continue with dst_output().
 *
 * Returns the value of dst_output() in that case, otherwise the value
 * returned by __ip_local_out() unchanged.
 */
int ip_local_out(struct sk_buff *skb)
{
	int rc = __ip_local_out(skb);

	/* Anything other than 1 is passed straight back to the caller. */
	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip_local_out);
116
117 /* dev_loopback_xmit for use with netfilter. */
118 static int ip_dev_loopback_xmit(struct sk_buff *newskb)
119 {
120         skb_reset_mac_header(newskb);
121         __skb_pull(newskb, skb_network_offset(newskb));
122         newskb->pkt_type = PACKET_LOOPBACK;
123         newskb->ip_summed = CHECKSUM_UNNECESSARY;
124         WARN_ON(!skb_dst(newskb));
125         netif_rx_ni(newskb);
126         return 0;
127 }
128
129 static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
130 {
131         int ttl = inet->uc_ttl;
132
133         if (ttl < 0)
134                 ttl = ip4_dst_hoplimit(dst);
135         return ttl;
136 }
137
138 /*
139  *              Add an ip header to a skbuff and send it out.
140  *
141  */
142 int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
143                           __be32 saddr, __be32 daddr, struct ip_options *opt)
144 {
145         struct inet_sock *inet = inet_sk(sk);
146         struct rtable *rt = skb_rtable(skb);
147         struct iphdr *iph;
148
149         /* Build the IP header. */
150         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
151         skb_reset_network_header(skb);
152         iph = ip_hdr(skb);
153         iph->version  = 4;
154         iph->ihl      = 5;
155         iph->tos      = inet->tos;
156         if (ip_dont_fragment(sk, &rt->dst))
157                 iph->frag_off = htons(IP_DF);
158         else
159                 iph->frag_off = 0;
160         iph->ttl      = ip_select_ttl(inet, &rt->dst);
161         iph->daddr    = rt->rt_dst;
162         iph->saddr    = rt->rt_src;
163         iph->protocol = sk->sk_protocol;
164         ip_select_ident(iph, &rt->dst, sk);
165
166         if (opt && opt->optlen) {
167                 iph->ihl += opt->optlen>>2;
168                 ip_options_build(skb, opt, daddr, rt, 0);
169         }
170
171         skb->priority = sk->sk_priority;
172         skb->mark = sk->sk_mark;
173
174         /* Send it out. */
175         return ip_local_out(skb);
176 }
177 EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
178
179 static inline int ip_finish_output2(struct sk_buff *skb)
180 {
181         struct dst_entry *dst = skb_dst(skb);
182         struct rtable *rt = (struct rtable *)dst;
183         struct net_device *dev = dst->dev;
184         unsigned int hh_len = LL_RESERVED_SPACE(dev);
185
186         if (rt->rt_type == RTN_MULTICAST) {
187                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
188         } else if (rt->rt_type == RTN_BROADCAST)
189                 IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
190
191         /* Be paranoid, rather than too clever. */
192         if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
193                 struct sk_buff *skb2;
194
195                 skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
196                 if (skb2 == NULL) {
197                         kfree_skb(skb);
198                         return -ENOMEM;
199                 }
200                 if (skb->sk)
201                         skb_set_owner_w(skb2, skb->sk);
202                 kfree_skb(skb);
203                 skb = skb2;
204         }
205
206         if (dst->hh)
207                 return neigh_hh_output(dst->hh, skb);
208         else if (dst->neighbour)
209                 return dst->neighbour->output(skb);
210
211         if (net_ratelimit())
212                 printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
213         kfree_skb(skb);
214         return -EINVAL;
215 }
216
217 static inline int ip_skb_dst_mtu(struct sk_buff *skb)
218 {
219         struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
220
221         return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
222                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
223 }
224
225 static int ip_finish_output(struct sk_buff *skb)
226 {
227 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
228         /* Policy lookup after SNAT yielded a new policy */
229         if (skb_dst(skb)->xfrm != NULL) {
230                 IPCB(skb)->flags |= IPSKB_REROUTED;
231                 return dst_output(skb);
232         }
233 #endif
234         if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
235                 return ip_fragment(skb, ip_finish_output2);
236         else
237                 return ip_finish_output2(skb);
238 }
239
240 int ip_mc_output(struct sk_buff *skb)
241 {
242         struct sock *sk = skb->sk;
243         struct rtable *rt = skb_rtable(skb);
244         struct net_device *dev = rt->dst.dev;
245
246         /*
247          *      If the indicated interface is up and running, send the packet.
248          */
249         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
250
251         skb->dev = dev;
252         skb->protocol = htons(ETH_P_IP);
253
254         /*
255          *      Multicasts are looped back for other local users
256          */
257
258         if (rt->rt_flags&RTCF_MULTICAST) {
259                 if (sk_mc_loop(sk)
260 #ifdef CONFIG_IP_MROUTE
261                 /* Small optimization: do not loopback not local frames,
262                    which returned after forwarding; they will be  dropped
263                    by ip_mr_input in any case.
264                    Note, that local frames are looped back to be delivered
265                    to local recipients.
266
267                    This check is duplicated in ip_mr_input at the moment.
268                  */
269                     &&
270                     ((rt->rt_flags & RTCF_LOCAL) ||
271                      !(IPCB(skb)->flags & IPSKB_FORWARDED))
272 #endif
273                    ) {
274                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
275                         if (newskb)
276                                 NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
277                                         newskb, NULL, newskb->dev,
278                                         ip_dev_loopback_xmit);
279                 }
280
281                 /* Multicasts with ttl 0 must not go beyond the host */
282
283                 if (ip_hdr(skb)->ttl == 0) {
284                         kfree_skb(skb);
285                         return 0;
286                 }
287         }
288
289         if (rt->rt_flags&RTCF_BROADCAST) {
290                 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
291                 if (newskb)
292                         NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
293                                 NULL, newskb->dev, ip_dev_loopback_xmit);
294         }
295
296         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
297                             skb->dev, ip_finish_output,
298                             !(IPCB(skb)->flags & IPSKB_REROUTED));
299 }
300
301 int ip_output(struct sk_buff *skb)
302 {
303         struct net_device *dev = skb_dst(skb)->dev;
304
305         IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
306
307         skb->dev = dev;
308         skb->protocol = htons(ETH_P_IP);
309
310         return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
311                             ip_finish_output,
312                             !(IPCB(skb)->flags & IPSKB_REROUTED));
313 }
314
315 int ip_queue_xmit(struct sk_buff *skb)
316 {
317         struct sock *sk = skb->sk;
318         struct inet_sock *inet = inet_sk(sk);
319         struct ip_options *opt = inet->opt;
320         struct rtable *rt;
321         struct iphdr *iph;
322         int res;
323
324         /* Skip all of this if the packet is already routed,
325          * f.e. by something like SCTP.
326          */
327         rcu_read_lock();
328         rt = skb_rtable(skb);
329         if (rt != NULL)
330                 goto packet_routed;
331
332         /* Make sure we can route this packet. */
333         rt = (struct rtable *)__sk_dst_check(sk, 0);
334         if (rt == NULL) {
335                 __be32 daddr;
336
337                 /* Use correct destination address if we have options. */
338                 daddr = inet->inet_daddr;
339                 if(opt && opt->srr)
340                         daddr = opt->faddr;
341
342                 /* If this fails, retransmit mechanism of transport layer will
343                  * keep trying until route appears or the connection times
344                  * itself out.
345                  */
346                 rt = ip_route_output_ports(sock_net(sk), sk,
347                                            daddr, inet->inet_saddr,
348                                            inet->inet_dport,
349                                            inet->inet_sport,
350                                            sk->sk_protocol,
351                                            RT_CONN_FLAGS(sk),
352                                            sk->sk_bound_dev_if);
353                 if (IS_ERR(rt))
354                         goto no_route;
355                 sk_setup_caps(sk, &rt->dst);
356         }
357         skb_dst_set_noref(skb, &rt->dst);
358
359 packet_routed:
360         if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
361                 goto no_route;
362
363         /* OK, we know where to send it, allocate and build IP header. */
364         skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
365         skb_reset_network_header(skb);
366         iph = ip_hdr(skb);
367         *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
368         if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
369                 iph->frag_off = htons(IP_DF);
370         else
371                 iph->frag_off = 0;
372         iph->ttl      = ip_select_ttl(inet, &rt->dst);
373         iph->protocol = sk->sk_protocol;
374         iph->saddr    = rt->rt_src;
375         iph->daddr    = rt->rt_dst;
376         /* Transport layer set skb->h.foo itself. */
377
378         if (opt && opt->optlen) {
379                 iph->ihl += opt->optlen >> 2;
380                 ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
381         }
382
383         ip_select_ident_more(iph, &rt->dst, sk,
384                              (skb_shinfo(skb)->gso_segs ?: 1) - 1);
385
386         skb->priority = sk->sk_priority;
387         skb->mark = sk->sk_mark;
388
389         res = ip_local_out(skb);
390         rcu_read_unlock();
391         return res;
392
393 no_route:
394         rcu_read_unlock();
395         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
396         kfree_skb(skb);
397         return -EHOSTUNREACH;
398 }
399 EXPORT_SYMBOL(ip_queue_xmit);
400
401
402 static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
403 {
404         to->pkt_type = from->pkt_type;
405         to->priority = from->priority;
406         to->protocol = from->protocol;
407         skb_dst_drop(to);
408         skb_dst_copy(to, from);
409         to->dev = from->dev;
410         to->mark = from->mark;
411
412         /* Copy the flags to each fragment. */
413         IPCB(to)->flags = IPCB(from)->flags;
414
415 #ifdef CONFIG_NET_SCHED
416         to->tc_index = from->tc_index;
417 #endif
418         nf_copy(to, from);
419 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
420     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
421         to->nf_trace = from->nf_trace;
422 #endif
423 #if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
424         to->ipvs_property = from->ipvs_property;
425 #endif
426         skb_copy_secmark(to, from);
427 }
428
429 /*
430  *      This IP datagram is too large to be sent in one piece.  Break it up into
431  *      smaller pieces (each of size equal to IP header plus
432  *      a block of the data of the original IP data part) that will yet fit in a
433  *      single device frame, and queue such a frame for sending.
434  */
435
/*
 * ip_fragment - split an oversized IPv4 datagram and pass each piece to @output
 * @skb:    datagram to fragment; its route (skb_rtable) supplies the output
 *          device and the MTU
 * @output: transmit callback, called once per fragment
 *
 * Two strategies are used.  If @skb carries a frag_list whose geometry
 * passes the checks below, the head and every list member are sent in place,
 * each list member receiving a copy of the IP header.  Otherwise every
 * fragment is a newly allocated skb filled by copying, and @skb itself is
 * freed at the end.
 *
 * Returns 0 on success.  Returns -EMSGSIZE (after sending an ICMP
 * "fragmentation needed" error and freeing @skb) when DF is set and
 * skb->local_df is clear, -ENOMEM when a fragment cannot be allocated, or
 * the first non-zero value returned by @output.
 */
436 int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
437 {
438         struct iphdr *iph;
439         int ptr;
440         struct net_device *dev;
441         struct sk_buff *skb2;
442         unsigned int mtu, hlen, left, len, ll_rs;
443         int offset;
444         __be16 not_last_frag;
445         struct rtable *rt = skb_rtable(skb);
446         int err = 0;
447
448         dev = rt->dst.dev;
449
450         /*
451          *      Point into the IP datagram header.
452          */
453
454         iph = ip_hdr(skb);
455
        /* DF is set and local fragmentation was not allowed: report and drop. */
456         if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
457                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
458                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
459                           htonl(ip_skb_dst_mtu(skb)));
460                 kfree_skb(skb);
461                 return -EMSGSIZE;
462         }
463
464         /*
465          *      Setup starting values.
466          */
467
        /* From here on 'mtu' is the payload room per fragment (header excluded). */
468         hlen = iph->ihl * 4;
469         mtu = dst_mtu(&rt->dst) - hlen; /* Size of data space */
470 #ifdef CONFIG_BRIDGE_NETFILTER
471         if (skb->nf_bridge)
472                 mtu -= nf_bridge_mtu_reduction(skb);
473 #endif
474         IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
475
476         /* When frag_list is given, use it. First, check its validity:
477          * some transformers could create wrong frag_list or break existing
478          * one, it is not prohibited. In this case fall back to copying.
479          *
480          * LATER: this step can be merged to real generation of fragments,
481          * we can switch to copy when see the first bad fragment.
482          */
483         if (skb_has_frag_list(skb)) {
484                 struct sk_buff *frag, *frag2;
485                 int first_len = skb_pagelen(skb);
486
                /*
                 * The head must fit, carry a multiple of 8 payload bytes,
                 * not already be a fragment itself and not be cloned.
                 */
487                 if (first_len - hlen > mtu ||
488                     ((first_len - hlen) & 7) ||
489                     (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
490                     skb_cloned(skb))
491                         goto slow_path;
492
493                 skb_walk_frags(skb, frag) {
494                         /* Correct geometry. */
495                         if (frag->len > mtu ||
496                             ((frag->len & 7) && frag->next) ||
497                             skb_headroom(frag) < hlen)
498                                 goto slow_path_clean;
499
500                         /* Partially cloned skb? */
501                         if (skb_shared(frag))
502                                 goto slow_path_clean;
503
                        /*
                         * Charge the list member to the head's socket and
                         * take its truesize out of the head; slow_path_clean
                         * reverses this if a later member fails the checks.
                         */
504                         BUG_ON(frag->sk);
505                         if (skb->sk) {
506                                 frag->sk = skb->sk;
507                                 frag->destructor = sock_wfree;
508                         }
509                         skb->truesize -= frag->truesize;
510                 }
511
512                 /* Everything is OK. Generate! */
513
                /*
                 * Detach the list; the head becomes the first fragment,
                 * trimmed to first_len with MF set.
                 */
514                 err = 0;
515                 offset = 0;
516                 frag = skb_shinfo(skb)->frag_list;
517                 skb_frag_list_init(skb);
518                 skb->data_len = first_len - skb_headlen(skb);
519                 skb->len = first_len;
520                 iph->tot_len = htons(first_len);
521                 iph->frag_off = htons(IP_MF);
522                 ip_send_check(iph);
523
524                 for (;;) {
525                         /* Prepare header of the next frame,
526                          * before previous one went down. */
527                         if (frag) {
528                                 frag->ip_summed = CHECKSUM_NONE;
529                                 skb_reset_transport_header(frag);
530                                 __skb_push(frag, hlen);
531                                 skb_reset_network_header(frag);
532                                 memcpy(skb_network_header(frag), iph, hlen);
533                                 iph = ip_hdr(frag);
534                                 iph->tot_len = htons(frag->len);
535                                 ip_copy_metadata(frag, skb);
536                                 if (offset == 0)
537                                         ip_options_fragment(frag);
538                                 offset += skb->len - hlen;
539                                 iph->frag_off = htons(offset>>3);
540                                 if (frag->next != NULL)
541                                         iph->frag_off |= htons(IP_MF);
542                                 /* Ready, complete checksum */
543                                 ip_send_check(iph);
544                         }
545
                        /* Send the current fragment; 'frag', if any, is already prepared. */
546                         err = output(skb);
547
548                         if (!err)
549                                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
550                         if (err || !frag)
551                                 break;
552
553                         skb = frag;
554                         frag = skb->next;
555                         skb->next = NULL;
556                 }
557
558                 if (err == 0) {
559                         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
560                         return 0;
561                 }
562
                /* output failed: free the list members that were never passed to it. */
563                 while (frag) {
564                         skb = frag->next;
565                         kfree_skb(frag);
566                         frag = skb;
567                 }
568                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
569                 return err;
570
                /*
                 * A list member failed the checks: undo the sk, destructor
                 * and truesize changes made to the members visited before
                 * it, then fall through to the copying path.
                 */
571 slow_path_clean:
572                 skb_walk_frags(skb, frag2) {
573                         if (frag2 == frag)
574                                 break;
575                         frag2->sk = NULL;
576                         frag2->destructor = NULL;
577                         skb->truesize += frag2->truesize;
578                 }
579         }
580
        /* Copying path: every fragment is built in a freshly allocated skb. */
581 slow_path:
582         left = skb->len - hlen;         /* Space per frame */
583         ptr = hlen;             /* Where to start from */
584
585         /* for bridged IP traffic encapsulated inside f.e. a vlan header,
586          * we need to make room for the encapsulating header
587          */
588         ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, nf_bridge_pad(skb));
589
590         /*
591          *      Fragment the datagram.
592          */
593
        /*
         * @skb may itself be a fragment: start from its own offset and
         * remember whether it had MF set.
         */
594         offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
595         not_last_frag = iph->frag_off & htons(IP_MF);
596
597         /*
598          *      Keep copying data until we run out.
599          */
600
601         while (left > 0) {
602                 len = left;
603                 /* IF: it doesn't fit, use 'mtu' - the data space left */
604                 if (len > mtu)
605                         len = mtu;
606                 /* IF: we are not sending upto and including the packet end
607                    then align the next start on an eight byte boundary */
608                 if (len < left) {
609                         len &= ~7;
610                 }
611                 /*
612                  *      Allocate buffer.
613                  */
614
615                 if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
616                         NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
617                         err = -ENOMEM;
618                         goto fail;
619                 }
620
621                 /*
622                  *      Set up data on packet
623                  */
624
625                 ip_copy_metadata(skb2, skb);
626                 skb_reserve(skb2, ll_rs);
627                 skb_put(skb2, len + hlen);
628                 skb_reset_network_header(skb2);
629                 skb2->transport_header = skb2->network_header + hlen;
630
631                 /*
632                  *      Charge the memory for the fragment to any owner
633                  *      it might possess
634                  */
635
636                 if (skb->sk)
637                         skb_set_owner_w(skb2, skb->sk);
638
639                 /*
640                  *      Copy the packet header into the new buffer.
641                  */
642
643                 skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
644
645                 /*
646                  *      Copy a block of the IP datagram.
647                  */
648                 if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
649                         BUG();
650                 left -= len;
651
652                 /*
653                  *      Fill in the new header fields.
654                  */
655                 iph = ip_hdr(skb2);
656                 iph->frag_off = htons((offset >> 3));
657
658                 /* ANK: dirty, but effective trick. Upgrade options only if
659                  * the segment to be fragmented was THE FIRST (otherwise,
660                  * options are already fixed) and make it ONCE
661                  * on the initial skb, so that all the following fragments
662                  * will inherit fixed options.
663                  */
664                 if (offset == 0)
665                         ip_options_fragment(skb);
666
667                 /*
668                  *      Added AC : If we are fragmenting a fragment that's not the
669                  *                 last fragment then keep MF on each bit
670                  */
671                 if (left > 0 || not_last_frag)
672                         iph->frag_off |= htons(IP_MF);
673                 ptr += len;
674                 offset += len;
675
676                 /*
677                  *      Put this fragment into the sending queue.
678                  */
679                 iph->tot_len = htons(len + hlen);
680
681                 ip_send_check(iph);
682
683                 err = output(skb2);
684                 if (err)
685                         goto fail;
686
687                 IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
688         }
        /* All data was copied out; the original datagram is no longer needed. */
689         kfree_skb(skb);
690         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
691         return err;
692
        /* Allocation or output failure on the copying path: drop the original. */
693 fail:
694         kfree_skb(skb);
695         IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
696         return err;
697 }
698 EXPORT_SYMBOL(ip_fragment);
699
700 int
701 ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
702 {
703         struct iovec *iov = from;
704
705         if (skb->ip_summed == CHECKSUM_PARTIAL) {
706                 if (memcpy_fromiovecend(to, iov, offset, len) < 0)
707                         return -EFAULT;
708         } else {
709                 __wsum csum = 0;
710                 if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
711                         return -EFAULT;
712                 skb->csum = csum_block_add(skb->csum, csum, odd);
713         }
714         return 0;
715 }
716 EXPORT_SYMBOL(ip_generic_getfrag);
717
718 static inline __wsum
719 csum_page(struct page *page, int offset, int copy)
720 {
721         char *kaddr;
722         __wsum csum;
723         kaddr = kmap(page);
724         csum = csum_partial(kaddr + offset, copy, 0);
725         kunmap(page);
726         return csum;
727 }
728
729 static inline int ip_ufo_append_data(struct sock *sk,
730                         struct sk_buff_head *queue,
731                         int getfrag(void *from, char *to, int offset, int len,
732                                int odd, struct sk_buff *skb),
733                         void *from, int length, int hh_len, int fragheaderlen,
734                         int transhdrlen, int mtu, unsigned int flags)
735 {
736         struct sk_buff *skb;
737         int err;
738
739         /* There is support for UDP fragmentation offload by network
740          * device, so create one single skb packet containing complete
741          * udp datagram
742          */
743         if ((skb = skb_peek_tail(queue)) == NULL) {
744                 skb = sock_alloc_send_skb(sk,
745                         hh_len + fragheaderlen + transhdrlen + 20,
746                         (flags & MSG_DONTWAIT), &err);
747
748                 if (skb == NULL)
749                         return err;
750
751                 /* reserve space for Hardware header */
752                 skb_reserve(skb, hh_len);
753
754                 /* create space for UDP/IP header */
755                 skb_put(skb, fragheaderlen + transhdrlen);
756
757                 /* initialize network header pointer */
758                 skb_reset_network_header(skb);
759
760                 /* initialize protocol header pointer */
761                 skb->transport_header = skb->network_header + fragheaderlen;
762
763                 skb->ip_summed = CHECKSUM_PARTIAL;
764                 skb->csum = 0;
765
766                 /* specify the length of each IP datagram fragment */
767                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
768                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
769                 __skb_queue_tail(queue, skb);
770         }
771
772         return skb_append_datato_frags(sk, skb, getfrag, from,
773                                        (length - transhdrlen));
774 }
775
776 static int __ip_append_data(struct sock *sk, struct sk_buff_head *queue,
777                             struct inet_cork *cork,
778                             int getfrag(void *from, char *to, int offset,
779                                         int len, int odd, struct sk_buff *skb),
780                             void *from, int length, int transhdrlen,
781                             unsigned int flags)
782 {
783         struct inet_sock *inet = inet_sk(sk);
784         struct sk_buff *skb;
785
786         struct ip_options *opt = cork->opt;
787         int hh_len;
788         int exthdrlen;
789         int mtu;
790         int copy;
791         int err;
792         int offset = 0;
793         unsigned int maxfraglen, fragheaderlen;
794         int csummode = CHECKSUM_NONE;
795         struct rtable *rt = (struct rtable *)cork->dst;
796
797         exthdrlen = transhdrlen ? rt->dst.header_len : 0;
798         length += exthdrlen;
799         transhdrlen += exthdrlen;
800         mtu = cork->fragsize;
801
802         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
803
804         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
805         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
806
807         if (cork->length + length > 0xFFFF - fragheaderlen) {
808                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
809                                mtu-exthdrlen);
810                 return -EMSGSIZE;
811         }
812
813         /*
814          * transhdrlen > 0 means that this is the first fragment and we wish
815          * it won't be fragmented in the future.
816          */
817         if (transhdrlen &&
818             length + fragheaderlen <= mtu &&
819             rt->dst.dev->features & NETIF_F_V4_CSUM &&
820             !exthdrlen)
821                 csummode = CHECKSUM_PARTIAL;
822
823         skb = skb_peek_tail(queue);
824
825         cork->length += length;
826         if (((length > mtu) || (skb && skb_is_gso(skb))) &&
827             (sk->sk_protocol == IPPROTO_UDP) &&
828             (rt->dst.dev->features & NETIF_F_UFO)) {
829                 err = ip_ufo_append_data(sk, queue, getfrag, from, length,
830                                          hh_len, fragheaderlen, transhdrlen,
831                                          mtu, flags);
832                 if (err)
833                         goto error;
834                 return 0;
835         }
836
837         /* So, what's going on in the loop below?
838          *
839          * We use calculated fragment length to generate chained skb,
840          * each of segments is IP fragment ready for sending to network after
841          * adding appropriate IP header.
842          */
843
844         if (!skb)
845                 goto alloc_new_skb;
846
847         while (length > 0) {
848                 /* Check if the remaining data fits into current packet. */
849                 copy = mtu - skb->len;
850                 if (copy < length)
851                         copy = maxfraglen - skb->len;
852                 if (copy <= 0) {
853                         char *data;
854                         unsigned int datalen;
855                         unsigned int fraglen;
856                         unsigned int fraggap;
857                         unsigned int alloclen;
858                         struct sk_buff *skb_prev;
859 alloc_new_skb:
860                         skb_prev = skb;
861                         if (skb_prev)
862                                 fraggap = skb_prev->len - maxfraglen;
863                         else
864                                 fraggap = 0;
865
866                         /*
867                          * If remaining data exceeds the mtu,
868                          * we know we need more fragment(s).
869                          */
870                         datalen = length + fraggap;
871                         if (datalen > mtu - fragheaderlen)
872                                 datalen = maxfraglen - fragheaderlen;
873                         fraglen = datalen + fragheaderlen;
874
875                         if ((flags & MSG_MORE) &&
876                             !(rt->dst.dev->features&NETIF_F_SG))
877                                 alloclen = mtu;
878                         else
879                                 alloclen = fraglen;
880
881                         /* The last fragment gets additional space at tail.
882                          * Note, with MSG_MORE we overallocate on fragments,
883                          * because we have no idea what fragment will be
884                          * the last.
885                          */
886                         if (datalen == length + fraggap) {
887                                 alloclen += rt->dst.trailer_len;
888                                 /* make sure mtu is not reached */
889                                 if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
890                                         datalen -= ALIGN(rt->dst.trailer_len, 8);
891                         }
892                         if (transhdrlen) {
893                                 skb = sock_alloc_send_skb(sk,
894                                                 alloclen + hh_len + 15,
895                                                 (flags & MSG_DONTWAIT), &err);
896                         } else {
897                                 skb = NULL;
898                                 if (atomic_read(&sk->sk_wmem_alloc) <=
899                                     2 * sk->sk_sndbuf)
900                                         skb = sock_wmalloc(sk,
901                                                            alloclen + hh_len + 15, 1,
902                                                            sk->sk_allocation);
903                                 if (unlikely(skb == NULL))
904                                         err = -ENOBUFS;
905                                 else
906                                         /* only the initial fragment is
907                                            time stamped */
908                                         cork->tx_flags = 0;
909                         }
910                         if (skb == NULL)
911                                 goto error;
912
913                         /*
914                          *      Fill in the control structures
915                          */
916                         skb->ip_summed = csummode;
917                         skb->csum = 0;
918                         skb_reserve(skb, hh_len);
919                         skb_shinfo(skb)->tx_flags = cork->tx_flags;
920
921                         /*
922                          *      Find where to start putting bytes.
923                          */
924                         data = skb_put(skb, fraglen);
925                         skb_set_network_header(skb, exthdrlen);
926                         skb->transport_header = (skb->network_header +
927                                                  fragheaderlen);
928                         data += fragheaderlen;
929
930                         if (fraggap) {
931                                 skb->csum = skb_copy_and_csum_bits(
932                                         skb_prev, maxfraglen,
933                                         data + transhdrlen, fraggap, 0);
934                                 skb_prev->csum = csum_sub(skb_prev->csum,
935                                                           skb->csum);
936                                 data += fraggap;
937                                 pskb_trim_unique(skb_prev, maxfraglen);
938                         }
939
940                         copy = datalen - transhdrlen - fraggap;
941                         if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
942                                 err = -EFAULT;
943                                 kfree_skb(skb);
944                                 goto error;
945                         }
946
947                         offset += copy;
948                         length -= datalen - fraggap;
949                         transhdrlen = 0;
950                         exthdrlen = 0;
951                         csummode = CHECKSUM_NONE;
952
953                         /*
954                          * Put the packet on the pending queue.
955                          */
956                         __skb_queue_tail(queue, skb);
957                         continue;
958                 }
959
960                 if (copy > length)
961                         copy = length;
962
963                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
964                         unsigned int off;
965
966                         off = skb->len;
967                         if (getfrag(from, skb_put(skb, copy),
968                                         offset, copy, off, skb) < 0) {
969                                 __skb_trim(skb, off);
970                                 err = -EFAULT;
971                                 goto error;
972                         }
973                 } else {
974                         int i = skb_shinfo(skb)->nr_frags;
975                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
976                         struct page *page = cork->page;
977                         int off = cork->off;
978                         unsigned int left;
979
980                         if (page && (left = PAGE_SIZE - off) > 0) {
981                                 if (copy >= left)
982                                         copy = left;
983                                 if (page != frag->page) {
984                                         if (i == MAX_SKB_FRAGS) {
985                                                 err = -EMSGSIZE;
986                                                 goto error;
987                                         }
988                                         get_page(page);
989                                         skb_fill_page_desc(skb, i, page, off, 0);
990                                         frag = &skb_shinfo(skb)->frags[i];
991                                 }
992                         } else if (i < MAX_SKB_FRAGS) {
993                                 if (copy > PAGE_SIZE)
994                                         copy = PAGE_SIZE;
995                                 page = alloc_pages(sk->sk_allocation, 0);
996                                 if (page == NULL)  {
997                                         err = -ENOMEM;
998                                         goto error;
999                                 }
1000                                 cork->page = page;
1001                                 cork->off = 0;
1002
1003                                 skb_fill_page_desc(skb, i, page, 0, 0);
1004                                 frag = &skb_shinfo(skb)->frags[i];
1005                         } else {
1006                                 err = -EMSGSIZE;
1007                                 goto error;
1008                         }
1009                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1010                                 err = -EFAULT;
1011                                 goto error;
1012                         }
1013                         cork->off += copy;
1014                         frag->size += copy;
1015                         skb->len += copy;
1016                         skb->data_len += copy;
1017                         skb->truesize += copy;
1018                         atomic_add(copy, &sk->sk_wmem_alloc);
1019                 }
1020                 offset += copy;
1021                 length -= copy;
1022         }
1023
1024         return 0;
1025
1026 error:
1027         cork->length -= length;
1028         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1029         return err;
1030 }
1031
1032 static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
1033                          struct ipcm_cookie *ipc, struct rtable **rtp)
1034 {
1035         struct inet_sock *inet = inet_sk(sk);
1036         struct ip_options *opt;
1037         struct rtable *rt;
1038
1039         /*
1040          * setup for corking.
1041          */
1042         opt = ipc->opt;
1043         if (opt) {
1044                 if (cork->opt == NULL) {
1045                         cork->opt = kmalloc(sizeof(struct ip_options) + 40,
1046                                             sk->sk_allocation);
1047                         if (unlikely(cork->opt == NULL))
1048                                 return -ENOBUFS;
1049                 }
1050                 memcpy(cork->opt, opt, sizeof(struct ip_options) + opt->optlen);
1051                 cork->flags |= IPCORK_OPT;
1052                 cork->addr = ipc->addr;
1053         }
1054         rt = *rtp;
1055         if (unlikely(!rt))
1056                 return -EFAULT;
1057         /*
1058          * We steal reference to this route, caller should not release it
1059          */
1060         *rtp = NULL;
1061         cork->fragsize = inet->pmtudisc == IP_PMTUDISC_PROBE ?
1062                          rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1063         cork->dst = &rt->dst;
1064         cork->length = 0;
1065         cork->tx_flags = ipc->tx_flags;
1066         cork->page = NULL;
1067         cork->off = 0;
1068
1069         return 0;
1070 }
1071
1072 /*
1073  *      ip_append_data() and ip_append_page() can make one large IP datagram
1074  *      from many pieces of data. Each pieces will be holded on the socket
1075  *      until ip_push_pending_frames() is called. Each piece can be a page
1076  *      or non-page data.
1077  *
1078  *      Not only UDP, other transport protocols - e.g. raw sockets - can use
1079  *      this interface potentially.
1080  *
1081  *      LATER: length must be adjusted by pad at tail, when it is required.
1082  */
1083 int ip_append_data(struct sock *sk,
1084                    int getfrag(void *from, char *to, int offset, int len,
1085                                int odd, struct sk_buff *skb),
1086                    void *from, int length, int transhdrlen,
1087                    struct ipcm_cookie *ipc, struct rtable **rtp,
1088                    unsigned int flags)
1089 {
1090         struct inet_sock *inet = inet_sk(sk);
1091         int err;
1092
1093         if (flags&MSG_PROBE)
1094                 return 0;
1095
1096         if (skb_queue_empty(&sk->sk_write_queue)) {
1097                 err = ip_setup_cork(sk, &inet->cork, ipc, rtp);
1098                 if (err)
1099                         return err;
1100         } else {
1101                 transhdrlen = 0;
1102         }
1103
1104         return __ip_append_data(sk, &sk->sk_write_queue, &inet->cork, getfrag,
1105                                 from, length, transhdrlen, flags);
1106 }
1107
1108 ssize_t ip_append_page(struct sock *sk, struct page *page,
1109                        int offset, size_t size, int flags)
1110 {
1111         struct inet_sock *inet = inet_sk(sk);
1112         struct sk_buff *skb;
1113         struct rtable *rt;
1114         struct ip_options *opt = NULL;
1115         int hh_len;
1116         int mtu;
1117         int len;
1118         int err;
1119         unsigned int maxfraglen, fragheaderlen, fraggap;
1120
1121         if (inet->hdrincl)
1122                 return -EPERM;
1123
1124         if (flags&MSG_PROBE)
1125                 return 0;
1126
1127         if (skb_queue_empty(&sk->sk_write_queue))
1128                 return -EINVAL;
1129
1130         rt = (struct rtable *)inet->cork.dst;
1131         if (inet->cork.flags & IPCORK_OPT)
1132                 opt = inet->cork.opt;
1133
1134         if (!(rt->dst.dev->features&NETIF_F_SG))
1135                 return -EOPNOTSUPP;
1136
1137         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1138         mtu = inet->cork.fragsize;
1139
1140         fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
1141         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
1142
1143         if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
1144                 ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
1145                 return -EMSGSIZE;
1146         }
1147
1148         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1149                 return -EINVAL;
1150
1151         inet->cork.length += size;
1152         if ((size + skb->len > mtu) &&
1153             (sk->sk_protocol == IPPROTO_UDP) &&
1154             (rt->dst.dev->features & NETIF_F_UFO)) {
1155                 skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
1156                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1157         }
1158
1159
1160         while (size > 0) {
1161                 int i;
1162
1163                 if (skb_is_gso(skb))
1164                         len = size;
1165                 else {
1166
1167                         /* Check if the remaining data fits into current packet. */
1168                         len = mtu - skb->len;
1169                         if (len < size)
1170                                 len = maxfraglen - skb->len;
1171                 }
1172                 if (len <= 0) {
1173                         struct sk_buff *skb_prev;
1174                         int alloclen;
1175
1176                         skb_prev = skb;
1177                         fraggap = skb_prev->len - maxfraglen;
1178
1179                         alloclen = fragheaderlen + hh_len + fraggap + 15;
1180                         skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
1181                         if (unlikely(!skb)) {
1182                                 err = -ENOBUFS;
1183                                 goto error;
1184                         }
1185
1186                         /*
1187                          *      Fill in the control structures
1188                          */
1189                         skb->ip_summed = CHECKSUM_NONE;
1190                         skb->csum = 0;
1191                         skb_reserve(skb, hh_len);
1192
1193                         /*
1194                          *      Find where to start putting bytes.
1195                          */
1196                         skb_put(skb, fragheaderlen + fraggap);
1197                         skb_reset_network_header(skb);
1198                         skb->transport_header = (skb->network_header +
1199                                                  fragheaderlen);
1200                         if (fraggap) {
1201                                 skb->csum = skb_copy_and_csum_bits(skb_prev,
1202                                                                    maxfraglen,
1203                                                     skb_transport_header(skb),
1204                                                                    fraggap, 0);
1205                                 skb_prev->csum = csum_sub(skb_prev->csum,
1206                                                           skb->csum);
1207                                 pskb_trim_unique(skb_prev, maxfraglen);
1208                         }
1209
1210                         /*
1211                          * Put the packet on the pending queue.
1212                          */
1213                         __skb_queue_tail(&sk->sk_write_queue, skb);
1214                         continue;
1215                 }
1216
1217                 i = skb_shinfo(skb)->nr_frags;
1218                 if (len > size)
1219                         len = size;
1220                 if (skb_can_coalesce(skb, i, page, offset)) {
1221                         skb_shinfo(skb)->frags[i-1].size += len;
1222                 } else if (i < MAX_SKB_FRAGS) {
1223                         get_page(page);
1224                         skb_fill_page_desc(skb, i, page, offset, len);
1225                 } else {
1226                         err = -EMSGSIZE;
1227                         goto error;
1228                 }
1229
1230                 if (skb->ip_summed == CHECKSUM_NONE) {
1231                         __wsum csum;
1232                         csum = csum_page(page, offset, len);
1233                         skb->csum = csum_block_add(skb->csum, csum, skb->len);
1234                 }
1235
1236                 skb->len += len;
1237                 skb->data_len += len;
1238                 skb->truesize += len;
1239                 atomic_add(len, &sk->sk_wmem_alloc);
1240                 offset += len;
1241                 size -= len;
1242         }
1243         return 0;
1244
1245 error:
1246         inet->cork.length -= size;
1247         IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
1248         return err;
1249 }
1250
1251 static void ip_cork_release(struct inet_cork *cork)
1252 {
1253         cork->flags &= ~IPCORK_OPT;
1254         kfree(cork->opt);
1255         cork->opt = NULL;
1256         dst_release(cork->dst);
1257         cork->dst = NULL;
1258 }
1259
1260 /*
1261  *      Combined all pending IP fragments on the socket as one IP datagram
1262  *      and push them out.
1263  */
1264 struct sk_buff *__ip_make_skb(struct sock *sk,
1265                               struct sk_buff_head *queue,
1266                               struct inet_cork *cork)
1267 {
1268         struct sk_buff *skb, *tmp_skb;
1269         struct sk_buff **tail_skb;
1270         struct inet_sock *inet = inet_sk(sk);
1271         struct net *net = sock_net(sk);
1272         struct ip_options *opt = NULL;
1273         struct rtable *rt = (struct rtable *)cork->dst;
1274         struct iphdr *iph;
1275         __be16 df = 0;
1276         __u8 ttl;
1277
1278         if ((skb = __skb_dequeue(queue)) == NULL)
1279                 goto out;
1280         tail_skb = &(skb_shinfo(skb)->frag_list);
1281
1282         /* move skb->data to ip header from ext header */
1283         if (skb->data < skb_network_header(skb))
1284                 __skb_pull(skb, skb_network_offset(skb));
1285         while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1286                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1287                 *tail_skb = tmp_skb;
1288                 tail_skb = &(tmp_skb->next);
1289                 skb->len += tmp_skb->len;
1290                 skb->data_len += tmp_skb->len;
1291                 skb->truesize += tmp_skb->truesize;
1292                 tmp_skb->destructor = NULL;
1293                 tmp_skb->sk = NULL;
1294         }
1295
1296         /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
1297          * to fragment the frame generated here. No matter, what transforms
1298          * how transforms change size of the packet, it will come out.
1299          */
1300         if (inet->pmtudisc < IP_PMTUDISC_DO)
1301                 skb->local_df = 1;
1302
1303         /* DF bit is set when we want to see DF on outgoing frames.
1304          * If local_df is set too, we still allow to fragment this frame
1305          * locally. */
1306         if (inet->pmtudisc >= IP_PMTUDISC_DO ||
1307             (skb->len <= dst_mtu(&rt->dst) &&
1308              ip_dont_fragment(sk, &rt->dst)))
1309                 df = htons(IP_DF);
1310
1311         if (cork->flags & IPCORK_OPT)
1312                 opt = cork->opt;
1313
1314         if (rt->rt_type == RTN_MULTICAST)
1315                 ttl = inet->mc_ttl;
1316         else
1317                 ttl = ip_select_ttl(inet, &rt->dst);
1318
1319         iph = (struct iphdr *)skb->data;
1320         iph->version = 4;
1321         iph->ihl = 5;
1322         if (opt) {
1323                 iph->ihl += opt->optlen>>2;
1324                 ip_options_build(skb, opt, cork->addr, rt, 0);
1325         }
1326         iph->tos = inet->tos;
1327         iph->frag_off = df;
1328         ip_select_ident(iph, &rt->dst, sk);
1329         iph->ttl = ttl;
1330         iph->protocol = sk->sk_protocol;
1331         iph->saddr = rt->rt_src;
1332         iph->daddr = rt->rt_dst;
1333
1334         skb->priority = sk->sk_priority;
1335         skb->mark = sk->sk_mark;
1336         /*
1337          * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
1338          * on dst refcount
1339          */
1340         cork->dst = NULL;
1341         skb_dst_set(skb, &rt->dst);
1342
1343         if (iph->protocol == IPPROTO_ICMP)
1344                 icmp_out_count(net, ((struct icmphdr *)
1345                         skb_transport_header(skb))->type);
1346
1347         ip_cork_release(cork);
1348 out:
1349         return skb;
1350 }
1351
1352 int ip_send_skb(struct sk_buff *skb)
1353 {
1354         struct net *net = sock_net(skb->sk);
1355         int err;
1356
1357         err = ip_local_out(skb);
1358         if (err) {
1359                 if (err > 0)
1360                         err = net_xmit_errno(err);
1361                 if (err)
1362                         IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
1363         }
1364
1365         return err;
1366 }
1367
/*
 *	Build one datagram from everything pending on @sk and send it.
 *	Returns 0 when nothing was pending, otherwise the result of
 *	ip_send_skb().
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *datagram = ip_finish_skb(sk);

	/* Netfilter gets the whole, not yet fragmented skb. */
	return datagram ? ip_send_skb(datagram) : 0;
}
1379
1380 /*
1381  *      Throw away all pending data on the socket.
1382  */
1383 static void __ip_flush_pending_frames(struct sock *sk,
1384                                       struct sk_buff_head *queue,
1385                                       struct inet_cork *cork)
1386 {
1387         struct sk_buff *skb;
1388
1389         while ((skb = __skb_dequeue_tail(queue)) != NULL)
1390                 kfree_skb(skb);
1391
1392         ip_cork_release(cork);
1393 }
1394
1395 void ip_flush_pending_frames(struct sock *sk)
1396 {
1397         __ip_flush_pending_frames(sk, &sk->sk_write_queue, &inet_sk(sk)->cork);
1398 }
1399
1400 struct sk_buff *ip_make_skb(struct sock *sk,
1401                             int getfrag(void *from, char *to, int offset,
1402                                         int len, int odd, struct sk_buff *skb),
1403                             void *from, int length, int transhdrlen,
1404                             struct ipcm_cookie *ipc, struct rtable **rtp,
1405                             unsigned int flags)
1406 {
1407         struct inet_cork cork = {};
1408         struct sk_buff_head queue;
1409         int err;
1410
1411         if (flags & MSG_PROBE)
1412                 return NULL;
1413
1414         __skb_queue_head_init(&queue);
1415
1416         err = ip_setup_cork(sk, &cork, ipc, rtp);
1417         if (err)
1418                 return ERR_PTR(err);
1419
1420         err = __ip_append_data(sk, &queue, &cork, getfrag,
1421                                from, length, transhdrlen, flags);
1422         if (err) {
1423                 __ip_flush_pending_frames(sk, &queue, &cork);
1424                 return ERR_PTR(err);
1425         }
1426
1427         return __ip_make_skb(sk, &queue, &cork);
1428 }
1429
1430 /*
1431  *      Fetch data from kernel space and fill in checksum if needed.
1432  */
1433 static int ip_reply_glue_bits(void *dptr, char *to, int offset,
1434                               int len, int odd, struct sk_buff *skb)
1435 {
1436         __wsum csum;
1437
1438         csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
1439         skb->csum = csum_block_add(skb->csum, csum, odd);
1440         return 0;
1441 }
1442
1443 /*
1444  *      Generic function to send a packet as reply to another packet.
1445  *      Used to send TCP resets so far. ICMP should use this function too.
1446  *
1447  *      Should run single threaded per socket because it uses the sock
1448  *      structure to pass arguments.
1449  */
1450 void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
1451                    unsigned int len)
1452 {
1453         struct inet_sock *inet = inet_sk(sk);
1454         struct {
1455                 struct ip_options       opt;
1456                 char                    data[40];
1457         } replyopts;
1458         struct ipcm_cookie ipc;
1459         __be32 daddr;
1460         struct rtable *rt = skb_rtable(skb);
1461
1462         if (ip_options_echo(&replyopts.opt, skb))
1463                 return;
1464
1465         daddr = ipc.addr = rt->rt_src;
1466         ipc.opt = NULL;
1467         ipc.tx_flags = 0;
1468
1469         if (replyopts.opt.optlen) {
1470                 ipc.opt = &replyopts.opt;
1471
1472                 if (ipc.opt->srr)
1473                         daddr = replyopts.opt.faddr;
1474         }
1475
1476         {
1477                 struct flowi fl = { .oif = arg->bound_dev_if,
1478                                     .fl4_dst = daddr,
1479                                     .fl4_src = rt->rt_spec_dst,
1480                                     .fl4_tos = RT_TOS(ip_hdr(skb)->tos),
1481                                     .fl_ip_sport = tcp_hdr(skb)->dest,
1482                                     .fl_ip_dport = tcp_hdr(skb)->source,
1483                                     .proto = sk->sk_protocol,
1484                                     .flags = ip_reply_arg_flowi_flags(arg) };
1485                 security_skb_classify_flow(skb, &fl);
1486                 rt = ip_route_output_key(sock_net(sk), &fl);
1487                 if (IS_ERR(rt))
1488                         return;
1489         }
1490
1491         /* And let IP do all the hard work.
1492
1493            This chunk is not reenterable, hence spinlock.
1494            Note that it uses the fact, that this function is called
1495            with locally disabled BH and that sk cannot be already spinlocked.
1496          */
1497         bh_lock_sock(sk);
1498         inet->tos = ip_hdr(skb)->tos;
1499         sk->sk_priority = skb->priority;
1500         sk->sk_protocol = ip_hdr(skb)->protocol;
1501         sk->sk_bound_dev_if = arg->bound_dev_if;
1502         ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
1503                        &ipc, &rt, MSG_DONTWAIT);
1504         if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
1505                 if (arg->csumoffset >= 0)
1506                         *((__sum16 *)skb_transport_header(skb) +
1507                           arg->csumoffset) = csum_fold(csum_add(skb->csum,
1508                                                                 arg->csum));
1509                 skb->ip_summed = CHECKSUM_NONE;
1510                 ip_push_pending_frames(sk);
1511         }
1512
1513         bh_unlock_sock(sk);
1514
1515         ip_rt_put(rt);
1516 }
1517
1518 void __init ip_init(void)
1519 {
1520         ip_rt_init();
1521         inet_initpeers();
1522
1523 #if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
1524         igmp_mc_proc_init();
1525 #endif
1526 }