net/ipv6/ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
71                        dst_output);
72 }
73
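/*
 * Illustrative sketch: the payload_len rule applied in __ip6_local_out()
 * above, restated as a hypothetical helper.  The Payload Length field is
 * only 16 bits wide, so anything larger than IPV6_MAXPLEN (65535) is left
 * as 0, the value RFC 2675 uses for jumbo payloads.
 */
static inline __be16 ip6_example_payload_len(const struct sk_buff *skb)
{
	int len = skb->len - sizeof(struct ipv6hdr);

	/* 0 tells receivers to look for a Jumbo Payload option instead */
	return htons(len > IPV6_MAXPLEN ? 0 : len);
}
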
74 int ip6_local_out(struct sk_buff *skb)
75 {
76         int err;
77
78         err = __ip6_local_out(skb);
79         if (likely(err == 1))
80                 err = dst_output(skb);
81
82         return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 static int ip6_output_finish(struct sk_buff *skb)
87 {
88         struct dst_entry *dst = skb_dst(skb);
89
90         if (dst->hh)
91                 return neigh_hh_output(dst->hh, skb);
92         else if (dst->neighbour)
93                 return dst->neighbour->output(skb);
94
95         IP6_INC_STATS_BH(dev_net(dst->dev),
96                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
97         kfree_skb(skb);
98         return -EINVAL;
99
100 }
101
102 /* dev_loopback_xmit for use with netfilter. */
103 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
104 {
105         skb_reset_mac_header(newskb);
106         __skb_pull(newskb, skb_network_offset(newskb));
107         newskb->pkt_type = PACKET_LOOPBACK;
108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
109         WARN_ON(!skb_dst(newskb));
110
111         netif_rx_ni(newskb);
112         return 0;
113 }
114
115
116 static int ip6_output2(struct sk_buff *skb)
117 {
118         struct dst_entry *dst = skb_dst(skb);
119         struct net_device *dev = dst->dev;
120
121         skb->protocol = htons(ETH_P_IPV6);
122         skb->dev = dev;
123
124         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
125                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
126
127                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
128                     ((mroute6_socket(dev_net(dev)) &&
129                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
130                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
131                                          &ipv6_hdr(skb)->saddr))) {
132                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
133
134                         /* Do not check for IFF_ALLMULTI; multicast routing
135                            is not supported in any case.
136                          */
137                         if (newskb)
138                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
139                                         NULL, newskb->dev,
140                                         ip6_dev_loopback_xmit);
141
142                         if (ipv6_hdr(skb)->hop_limit == 0) {
143                                 IP6_INC_STATS(dev_net(dev), idev,
144                                               IPSTATS_MIB_OUTDISCARDS);
145                                 kfree_skb(skb);
146                                 return 0;
147                         }
148                 }
149
150                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
151                                 skb->len);
152         }
153
154         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
155                        ip6_output_finish);
156 }
157
158 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
159 {
160         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
161
162         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
163                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
164 }
165
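/*
 * Illustrative note: ip6_skb_dst_mtu() above bypasses the learned path
 * MTU and reports the raw device MTU when the sending socket has opted
 * into IPV6_PMTUDISC_PROBE, e.g. from userspace (sketch):
 *
 *	int val = IPV6_PMTUDISC_PROBE;
 *	setsockopt(fd, IPPROTO_IPV6, IPV6_MTU_DISCOVER, &val, sizeof(val));
 *
 * Such a probing sender deliberately relies on ICMPv6 Packet Too Big
 * feedback instead of having its packets clamped to dst_mtu().
 */
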
166 int ip6_output(struct sk_buff *skb)
167 {
168         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
169         if (unlikely(idev->cnf.disable_ipv6)) {
170                 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
171                               IPSTATS_MIB_OUTDISCARDS);
172                 kfree_skb(skb);
173                 return 0;
174         }
175
176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177                                 dst_allfrag(skb_dst(skb)))
178                 return ip6_fragment(skb, ip6_output2);
179         else
180                 return ip6_output2(skb);
181 }
182
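/*
 * Illustrative note on the dst_allfrag() test above: the IPv6 PMTU code
 * sets RTAX_FEATURE_ALLFRAG on a route after a Packet Too Big message
 * reports an MTU below IPV6_MIN_MTU (1280).  Per RFC 2460, section 5,
 * such a sender keeps using 1280-byte packets but must include a
 * Fragment header in every one of them, which is why even small packets
 * are pushed through ip6_fragment() in that case.
 */
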
183 /*
184  *      xmit an sk_buff (used by TCP)
185  */
186
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188              struct ipv6_txoptions *opt, int ipfragok)
189 {
190         struct net *net = sock_net(sk);
191         struct ipv6_pinfo *np = inet6_sk(sk);
192         struct in6_addr *first_hop = &fl->fl6_dst;
193         struct dst_entry *dst = skb_dst(skb);
194         struct ipv6hdr *hdr;
195         u8  proto = fl->proto;
196         int seg_len = skb->len;
197         int hlimit = -1;
198         int tclass = 0;
199         u32 mtu;
200
201         if (opt) {
202                 unsigned int head_room;
203
204                 /* First: exthdrs may take lots of space (~8K for now);
205                    MAX_HEADER is not enough.
206                  */
207                 head_room = opt->opt_nflen + opt->opt_flen;
208                 seg_len += head_room;
209                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
210
211                 if (skb_headroom(skb) < head_room) {
212                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
213                         if (skb2 == NULL) {
214                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
215                                               IPSTATS_MIB_OUTDISCARDS);
216                                 kfree_skb(skb);
217                                 return -ENOBUFS;
218                         }
219                         kfree_skb(skb);
220                         skb = skb2;
221                         if (sk)
222                                 skb_set_owner_w(skb, sk);
223                 }
224                 if (opt->opt_flen)
225                         ipv6_push_frag_opts(skb, opt, &proto);
226                 if (opt->opt_nflen)
227                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228         }
229
230         skb_push(skb, sizeof(struct ipv6hdr));
231         skb_reset_network_header(skb);
232         hdr = ipv6_hdr(skb);
233
234         /* Allow local fragmentation. */
235         if (ipfragok)
236                 skb->local_df = 1;
237
238         /*
239          *      Fill in the IPv6 header
240          */
241         if (np) {
242                 tclass = np->tclass;
243                 hlimit = np->hop_limit;
244         }
245         if (hlimit < 0)
246                 hlimit = ip6_dst_hoplimit(dst);
247
248         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
249
250         hdr->payload_len = htons(seg_len);
251         hdr->nexthdr = proto;
252         hdr->hop_limit = hlimit;
253
254         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
255         ipv6_addr_copy(&hdr->daddr, first_hop);
256
257         skb->priority = sk->sk_priority;
258         skb->mark = sk->sk_mark;
259
260         mtu = dst_mtu(dst);
261         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
262                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
263                               IPSTATS_MIB_OUT, skb->len);
264                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
265                                 dst_output);
266         }
267
268         if (net_ratelimit())
269                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
270         skb->dev = dst->dev;
271         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
272         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
273         kfree_skb(skb);
274         return -EMSGSIZE;
275 }
276
277 EXPORT_SYMBOL(ip6_xmit);
278
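/*
 * Illustrative sketch of a connected-socket caller (roughly how TCP
 * drives ip6_xmit(); the helper name is made up, and route lookup plus
 * error handling are omitted for brevity):
 */
static int ip6_example_queue_xmit(struct sock *sk, struct sk_buff *skb)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi fl;

	memset(&fl, 0, sizeof(fl));
	fl.proto = sk->sk_protocol;
	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
	ipv6_addr_copy(&fl.fl6_src, &np->saddr);
	fl.fl6_flowlabel = np->flow_label;
	fl.oif = sk->sk_bound_dev_if;

	/* ip6_xmit() expects skb_dst(skb) to have been set up already */
	return ip6_xmit(sk, skb, &fl, np->opt, 0);
}
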
279 /*
280  *      To avoid extra problems ND packets are sent through this
281  *      routine. It's code duplication, but I really want to avoid
282  *      extra checks since ipv6_build_header is used by TCP (which
283  *      is performance critical for us).
284  */
285
286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
287                const struct in6_addr *saddr, const struct in6_addr *daddr,
288                int proto, int len)
289 {
290         struct ipv6_pinfo *np = inet6_sk(sk);
291         struct ipv6hdr *hdr;
292         int totlen;
293
294         skb->protocol = htons(ETH_P_IPV6);
295         skb->dev = dev;
296
297         totlen = len + sizeof(struct ipv6hdr);
298
299         skb_reset_network_header(skb);
300         skb_put(skb, sizeof(struct ipv6hdr));
301         hdr = ipv6_hdr(skb);
302
303         *(__be32*)hdr = htonl(0x60000000);
304
305         hdr->payload_len = htons(len);
306         hdr->nexthdr = proto;
307         hdr->hop_limit = np->hop_limit;
308
309         ipv6_addr_copy(&hdr->saddr, saddr);
310         ipv6_addr_copy(&hdr->daddr, daddr);
311
312         return 0;
313 }
314
315 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
316 {
317         struct ip6_ra_chain *ra;
318         struct sock *last = NULL;
319
320         read_lock(&ip6_ra_lock);
321         for (ra = ip6_ra_chain; ra; ra = ra->next) {
322                 struct sock *sk = ra->sk;
323                 if (sk && ra->sel == sel &&
324                     (!sk->sk_bound_dev_if ||
325                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
326                         if (last) {
327                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
328                                 if (skb2)
329                                         rawv6_rcv(last, skb2);
330                         }
331                         last = sk;
332                 }
333         }
334
335         if (last) {
336                 rawv6_rcv(last, skb);
337                 read_unlock(&ip6_ra_lock);
338                 return 1;
339         }
340         read_unlock(&ip6_ra_lock);
341         return 0;
342 }
343
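/*
 * Illustrative sketch: the Router Alert hop-by-hop option (RFC 2711)
 * whose 16-bit value ip6_call_ra_chain() matches against ra->sel.
 * opt->ra is the offset of the option's type byte as recorded by the
 * hop-by-hop parser, so ptr[2] and ptr[3] above are the value bytes.
 * The struct below is only an illustration, not a kernel type.
 */
struct ip6_example_router_alert {
	__u8	type;		/* IPV6_TLV_ROUTERALERT (5) */
	__u8	length;		/* always 2 */
	__be16	value;		/* 0 = MLD, 1 = RSVP, 2 = Active Networks */
};
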
344 static int ip6_forward_proxy_check(struct sk_buff *skb)
345 {
346         struct ipv6hdr *hdr = ipv6_hdr(skb);
347         u8 nexthdr = hdr->nexthdr;
348         int offset;
349
350         if (ipv6_ext_hdr(nexthdr)) {
351                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
352                 if (offset < 0)
353                         return 0;
354         } else
355                 offset = sizeof(struct ipv6hdr);
356
357         if (nexthdr == IPPROTO_ICMPV6) {
358                 struct icmp6hdr *icmp6;
359
360                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
361                                          offset + 1 - skb->data)))
362                         return 0;
363
364                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
365
366                 switch (icmp6->icmp6_type) {
367                 case NDISC_ROUTER_SOLICITATION:
368                 case NDISC_ROUTER_ADVERTISEMENT:
369                 case NDISC_NEIGHBOUR_SOLICITATION:
370                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
371                 case NDISC_REDIRECT:
372                         /* For reactions involving a unicast neighbor discovery
373                          * message destined to the proxied address, pass it to
374                          * the input function.
375                          */
376                         return 1;
377                 default:
378                         break;
379                 }
380         }
381
382         /*
383          * The proxying router can't forward traffic sent to a link-local
384          * address, so signal the sender and discard the packet. This
385          * behavior is clarified by the MIPv6 specification.
386          */
387         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
388                 dst_link_failure(skb);
389                 return -1;
390         }
391
392         return 0;
393 }
394
395 static inline int ip6_forward_finish(struct sk_buff *skb)
396 {
397         return dst_output(skb);
398 }
399
400 int ip6_forward(struct sk_buff *skb)
401 {
402         struct dst_entry *dst = skb_dst(skb);
403         struct ipv6hdr *hdr = ipv6_hdr(skb);
404         struct inet6_skb_parm *opt = IP6CB(skb);
405         struct net *net = dev_net(dst->dev);
406         u32 mtu;
407
408         if (net->ipv6.devconf_all->forwarding == 0)
409                 goto error;
410
411         if (skb_warn_if_lro(skb))
412                 goto drop;
413
414         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
415                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
416                 goto drop;
417         }
418
419         skb_forward_csum(skb);
420
421         /*
422          *      We DO NOT do any processing on
423          *      RA packets, pushing them to user level AS IS
424          *      without any WARRANTY that the application will be able
425          *      to interpret them. The reason is that we
426          *      cannot do anything clever here.
427          *
428          *      We are not an end node, so if the packet contains
429          *      AH/ESP, we cannot do anything with it.
430          *      Defragmentation would also be a mistake; RA packets
431          *      cannot be fragmented, because there is no guarantee
432          *      that different fragments will go along one path. --ANK
433          */
434         if (opt->ra) {
435                 u8 *ptr = skb_network_header(skb) + opt->ra;
436                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
437                         return 0;
438         }
439
440         /*
441          *      check and decrement ttl
442          */
443         if (hdr->hop_limit <= 1) {
444                 /* Force OUTPUT device used as source address */
445                 skb->dev = dst->dev;
446                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
447                 IP6_INC_STATS_BH(net,
448                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
449
450                 kfree_skb(skb);
451                 return -ETIMEDOUT;
452         }
453
454         /* XXX: idev->cnf.proxy_ndp? */
455         if (net->ipv6.devconf_all->proxy_ndp &&
456             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
457                 int proxied = ip6_forward_proxy_check(skb);
458                 if (proxied > 0)
459                         return ip6_input(skb);
460                 else if (proxied < 0) {
461                         IP6_INC_STATS(net, ip6_dst_idev(dst),
462                                       IPSTATS_MIB_INDISCARDS);
463                         goto drop;
464                 }
465         }
466
467         if (!xfrm6_route_forward(skb)) {
468                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
469                 goto drop;
470         }
471         dst = skb_dst(skb);
472
473         /* IPv6 specs say nothing about it, but it is clear that we cannot
474            send redirects to source routed frames.
475            We don't send redirects to frames decapsulated from IPsec.
476          */
477         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
478             !skb_sec_path(skb)) {
479                 struct in6_addr *target = NULL;
480                 struct rt6_info *rt;
481                 struct neighbour *n = dst->neighbour;
482
483                 /*
484                  *      incoming and outgoing devices are the same;
485                  *      send a redirect.
486                  */
487
488                 rt = (struct rt6_info *) dst;
489                 if ((rt->rt6i_flags & RTF_GATEWAY))
490                         target = (struct in6_addr*)&n->primary_key;
491                 else
492                         target = &hdr->daddr;
493
494                 /* Limit redirects both by destination (here)
495                    and by source (inside ndisc_send_redirect)
496                  */
497                 if (xrlim_allow(dst, 1*HZ))
498                         ndisc_send_redirect(skb, n, target);
499         } else {
500                 int addrtype = ipv6_addr_type(&hdr->saddr);
501
502                 /* This check is security critical. */
503                 if (addrtype == IPV6_ADDR_ANY ||
504                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
505                         goto error;
506                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
507                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
508                                     ICMPV6_NOT_NEIGHBOUR, 0);
509                         goto error;
510                 }
511         }
512
513         mtu = dst_mtu(dst);
514         if (mtu < IPV6_MIN_MTU)
515                 mtu = IPV6_MIN_MTU;
516
517         if (skb->len > mtu) {
518                 /* Again, force OUTPUT device used as source address */
519                 skb->dev = dst->dev;
520                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
521                 IP6_INC_STATS_BH(net,
522                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
523                 IP6_INC_STATS_BH(net,
524                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
525                 kfree_skb(skb);
526                 return -EMSGSIZE;
527         }
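	/*
	 * Illustrative note: because of the IPV6_MIN_MTU clamp above,
	 * a route whose recorded MTU is below 1280 still forwards
	 * packets of up to 1280 bytes, and a Packet Too Big error
	 * generated here never advertises an MTU smaller than 1280.
	 * E.g. with dst_mtu() == 1006, a 1200-byte packet is forwarded
	 * while a 1400-byte packet is dropped with ICMPV6_PKT_TOOBIG
	 * carrying mtu = 1280.
	 */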
528
529         if (skb_cow(skb, dst->dev->hard_header_len)) {
530                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
531                 goto drop;
532         }
533
534         hdr = ipv6_hdr(skb);
535
536         /* Mangling the hop limit is delayed until after the skb COW */
537
538         hdr->hop_limit--;
539
540         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
541         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
542                        ip6_forward_finish);
543
544 error:
545         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
546 drop:
547         kfree_skb(skb);
548         return -EINVAL;
549 }
550
551 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
552 {
553         to->pkt_type = from->pkt_type;
554         to->priority = from->priority;
555         to->protocol = from->protocol;
556         skb_dst_drop(to);
557         skb_dst_set(to, dst_clone(skb_dst(from)));
558         to->dev = from->dev;
559         to->mark = from->mark;
560
561 #ifdef CONFIG_NET_SCHED
562         to->tc_index = from->tc_index;
563 #endif
564         nf_copy(to, from);
565 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
566     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
567         to->nf_trace = from->nf_trace;
568 #endif
569         skb_copy_secmark(to, from);
570 }
571
572 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
573 {
574         u16 offset = sizeof(struct ipv6hdr);
575         struct ipv6_opt_hdr *exthdr =
576                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
577         unsigned int packet_len = skb->tail - skb->network_header;
578         int found_rhdr = 0;
579         *nexthdr = &ipv6_hdr(skb)->nexthdr;
580
581         while (offset + 1 <= packet_len) {
582
583                 switch (**nexthdr) {
584
585                 case NEXTHDR_HOP:
586                         break;
587                 case NEXTHDR_ROUTING:
588                         found_rhdr = 1;
589                         break;
590                 case NEXTHDR_DEST:
591 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
592                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
593                                 break;
594 #endif
595                         if (found_rhdr)
596                                 return offset;
597                         break;
598                 default :
599                         return offset;
600                 }
601
602                 offset += ipv6_optlen(exthdr);
603                 *nexthdr = &exthdr->nexthdr;
604                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
605                                                  offset);
606         }
607
608         return offset;
609 }
610
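/*
 * Illustrative note: ip6_find_1stfragopt() measures the "unfragmentable
 * part" of RFC 2460, section 4.5: the IPv6 header plus any Hop-by-Hop,
 * Routing, and Destination Options headers that must precede a Fragment
 * header (a Destination Options header counts when it sits before the
 * Routing header or carries a Home Address option).  For a packet laid
 * out as
 *
 *	IPv6 | Hop-by-Hop | Routing | Dest opts | TCP | payload
 *
 * it returns the offset of the Destination Options header, and *nexthdr
 * ends up pointing at the Routing header's Next Header byte, which is
 * exactly where ip6_fragment() splices in NEXTHDR_FRAGMENT.
 */
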
611 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
612 {
613         struct sk_buff *frag;
614         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
615         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
616         struct ipv6hdr *tmp_hdr;
617         struct frag_hdr *fh;
618         unsigned int mtu, hlen, left, len;
619         __be32 frag_id = 0;
620         int ptr, offset = 0, err=0;
621         u8 *prevhdr, nexthdr = 0;
622         struct net *net = dev_net(skb_dst(skb)->dev);
623
624         hlen = ip6_find_1stfragopt(skb, &prevhdr);
625         nexthdr = *prevhdr;
626
627         mtu = ip6_skb_dst_mtu(skb);
628
629         /* We must not fragment if the socket is set to force MTU discovery
630          * or if the skb is not generated by a local socket.
631          */
632         if (!skb->local_df && skb->len > mtu) {
633                 skb->dev = skb_dst(skb)->dev;
634                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
635                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
636                               IPSTATS_MIB_FRAGFAILS);
637                 kfree_skb(skb);
638                 return -EMSGSIZE;
639         }
640
641         if (np && np->frag_size < mtu) {
642                 if (np->frag_size)
643                         mtu = np->frag_size;
644         }
645         mtu -= hlen + sizeof(struct frag_hdr);
646
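	/*
	 * Illustrative worked example: with a 1500-byte MTU and no
	 * extension headers (hlen == 40), the per-fragment payload budget
	 * computed above is 1500 - 40 - 8 = 1452 bytes.  The slow path
	 * below also rounds non-final fragments down to a multiple of 8,
	 * so each carries 1448 bytes and goes out as a
	 * 40 + 8 + 1448 = 1496-byte frame.
	 */
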
647         if (skb_has_frags(skb)) {
648                 int first_len = skb_pagelen(skb);
649                 int truesizes = 0;
650
651                 if (first_len - hlen > mtu ||
652                     ((first_len - hlen) & 7) ||
653                     skb_cloned(skb))
654                         goto slow_path;
655
656                 skb_walk_frags(skb, frag) {
657                         /* Correct geometry. */
658                         if (frag->len > mtu ||
659                             ((frag->len & 7) && frag->next) ||
660                             skb_headroom(frag) < hlen)
661                             goto slow_path;
662
663                         /* Partially cloned skb? */
664                         if (skb_shared(frag))
665                                 goto slow_path;
666
667                         BUG_ON(frag->sk);
668                         if (skb->sk) {
669                                 frag->sk = skb->sk;
670                                 frag->destructor = sock_wfree;
671                                 truesizes += frag->truesize;
672                         }
673                 }
674
675                 err = 0;
676                 offset = 0;
677                 frag = skb_shinfo(skb)->frag_list;
678                 skb_frag_list_init(skb);
679                 /* BUILD HEADER */
680
681                 *prevhdr = NEXTHDR_FRAGMENT;
682                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
683                 if (!tmp_hdr) {
684                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
685                                       IPSTATS_MIB_FRAGFAILS);
686                         return -ENOMEM;
687                 }
688
689                 __skb_pull(skb, hlen);
690                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
691                 __skb_push(skb, hlen);
692                 skb_reset_network_header(skb);
693                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
694
695                 ipv6_select_ident(fh);
696                 fh->nexthdr = nexthdr;
697                 fh->reserved = 0;
698                 fh->frag_off = htons(IP6_MF);
699                 frag_id = fh->identification;
700
701                 first_len = skb_pagelen(skb);
702                 skb->data_len = first_len - skb_headlen(skb);
703                 skb->truesize -= truesizes;
704                 skb->len = first_len;
705                 ipv6_hdr(skb)->payload_len = htons(first_len -
706                                                    sizeof(struct ipv6hdr));
707
708                 dst_hold(&rt->u.dst);
709
710                 for (;;) {
711                         /* Prepare header of the next frame,
712                          * before the previous one is sent down. */
713                         if (frag) {
714                                 frag->ip_summed = CHECKSUM_NONE;
715                                 skb_reset_transport_header(frag);
716                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
717                                 __skb_push(frag, hlen);
718                                 skb_reset_network_header(frag);
719                                 memcpy(skb_network_header(frag), tmp_hdr,
720                                        hlen);
721                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
722                                 fh->nexthdr = nexthdr;
723                                 fh->reserved = 0;
724                                 fh->frag_off = htons(offset);
725                                 if (frag->next != NULL)
726                                         fh->frag_off |= htons(IP6_MF);
727                                 fh->identification = frag_id;
728                                 ipv6_hdr(frag)->payload_len =
729                                                 htons(frag->len -
730                                                       sizeof(struct ipv6hdr));
731                                 ip6_copy_metadata(frag, skb);
732                         }
733
734                         err = output(skb);
735                         if(!err)
736                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
737                                               IPSTATS_MIB_FRAGCREATES);
738
739                         if (err || !frag)
740                                 break;
741
742                         skb = frag;
743                         frag = skb->next;
744                         skb->next = NULL;
745                 }
746
747                 kfree(tmp_hdr);
748
749                 if (err == 0) {
750                         IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
751                                       IPSTATS_MIB_FRAGOKS);
752                         dst_release(&rt->u.dst);
753                         return 0;
754                 }
755
756                 while (frag) {
757                         skb = frag->next;
758                         kfree_skb(frag);
759                         frag = skb;
760                 }
761
762                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
763                               IPSTATS_MIB_FRAGFAILS);
764                 dst_release(&rt->u.dst);
765                 return err;
766         }
767
768 slow_path:
769         left = skb->len - hlen;         /* Space per frame */
770         ptr = hlen;                     /* Where to start from */
771
772         /*
773          *      Fragment the datagram.
774          */
775
776         *prevhdr = NEXTHDR_FRAGMENT;
777
778         /*
779          *      Keep copying data until we run out.
780          */
781         while(left > 0) {
782                 len = left;
783                 /* IF: it doesn't fit, use 'mtu' - the data space left */
784                 if (len > mtu)
785                         len = mtu;
786                 /* IF: we are not sending up to and including the packet end,
787                    then align the next start on an eight byte boundary */
788                 if (len < left) {
789                         len &= ~7;
790                 }
791                 /*
792                  *      Allocate buffer.
793                  */
794
795                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
796                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
797                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
798                                       IPSTATS_MIB_FRAGFAILS);
799                         err = -ENOMEM;
800                         goto fail;
801                 }
802
803                 /*
804                  *      Set up data on packet
805                  */
806
807                 ip6_copy_metadata(frag, skb);
808                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
809                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
810                 skb_reset_network_header(frag);
811                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
812                 frag->transport_header = (frag->network_header + hlen +
813                                           sizeof(struct frag_hdr));
814
815                 /*
816                  *      Charge the memory for the fragment to any owner
817                  *      it might possess
818                  */
819                 if (skb->sk)
820                         skb_set_owner_w(frag, skb->sk);
821
822                 /*
823                  *      Copy the packet header into the new buffer.
824                  */
825                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
826
827                 /*
828                  *      Build fragment header.
829                  */
830                 fh->nexthdr = nexthdr;
831                 fh->reserved = 0;
832                 if (!frag_id) {
833                         ipv6_select_ident(fh);
834                         frag_id = fh->identification;
835                 } else
836                         fh->identification = frag_id;
837
838                 /*
839                  *      Copy a block of the IP datagram.
840                  */
841                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
842                         BUG();
843                 left -= len;
844
845                 fh->frag_off = htons(offset);
846                 if (left > 0)
847                         fh->frag_off |= htons(IP6_MF);
848                 ipv6_hdr(frag)->payload_len = htons(frag->len -
849                                                     sizeof(struct ipv6hdr));
850
851                 ptr += len;
852                 offset += len;
853
854                 /*
855                  *      Put this fragment into the sending queue.
856                  */
857                 err = output(frag);
858                 if (err)
859                         goto fail;
860
861                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
862                               IPSTATS_MIB_FRAGCREATES);
863         }
864         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
865                       IPSTATS_MIB_FRAGOKS);
866         kfree_skb(skb);
867         return err;
868
869 fail:
870         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
871                       IPSTATS_MIB_FRAGFAILS);
872         kfree_skb(skb);
873         return err;
874 }
875
876 static inline int ip6_rt_check(struct rt6key *rt_key,
877                                struct in6_addr *fl_addr,
878                                struct in6_addr *addr_cache)
879 {
880         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
881                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
882 }
883
884 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
885                                           struct dst_entry *dst,
886                                           struct flowi *fl)
887 {
888         struct ipv6_pinfo *np = inet6_sk(sk);
889         struct rt6_info *rt = (struct rt6_info *)dst;
890
891         if (!dst)
892                 goto out;
893
894         /* Yes, checking route validity in the unconnected
895          * case is not very simple. Take into account
896          * that we do not support routing by source, TOS,
897          * and MSG_DONTROUTE            --ANK (980726)
898          *
899          * 1. ip6_rt_check(): If route was host route,
900          *    check that cached destination is current.
901          *    If it is network route, we still may
902          *    check its validity using saved pointer
903          *    to the last used address: daddr_cache.
904          *    We do not want to save whole address now,
905          *    (because the main consumer of this service
906          *    is TCP, which does not have this problem),
907          *    so that the last trick works only on connected
908          *    sockets.
909          * 2. oif also should be the same.
910          */
911         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
912 #ifdef CONFIG_IPV6_SUBTREES
913             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
914 #endif
915             (fl->oif && fl->oif != dst->dev->ifindex)) {
916                 dst_release(dst);
917                 dst = NULL;
918         }
919
920 out:
921         return dst;
922 }
923
924 static int ip6_dst_lookup_tail(struct sock *sk,
925                                struct dst_entry **dst, struct flowi *fl)
926 {
927         int err;
928         struct net *net = sock_net(sk);
929
930         if (*dst == NULL)
931                 *dst = ip6_route_output(net, sk, fl);
932
933         if ((err = (*dst)->error))
934                 goto out_err_release;
935
936         if (ipv6_addr_any(&fl->fl6_src)) {
937                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
938                                          &fl->fl6_dst,
939                                          sk ? inet6_sk(sk)->srcprefs : 0,
940                                          &fl->fl6_src);
941                 if (err)
942                         goto out_err_release;
943         }
944
945 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
946         /*
947          * Here if the dst entry we've looked up
948          * has a neighbour entry that is in the INCOMPLETE
949          * state and the src address from the flow is
950          * marked as OPTIMISTIC, we release the found
951          * dst entry and replace it instead with the
952          * dst entry of the nexthop router
953          */
954         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
955                 struct inet6_ifaddr *ifp;
956                 struct flowi fl_gw;
957                 int redirect;
958
959                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
960                                       (*dst)->dev, 1);
961
962                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
963                 if (ifp)
964                         in6_ifa_put(ifp);
965
966                 if (redirect) {
967                         /*
968                          * We need to get the dst entry for the
969                          * default router instead
970                          */
971                         dst_release(*dst);
972                         memcpy(&fl_gw, fl, sizeof(struct flowi));
973                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
974                         *dst = ip6_route_output(net, sk, &fl_gw);
975                         if ((err = (*dst)->error))
976                                 goto out_err_release;
977                 }
978         }
979 #endif
980
981         return 0;
982
983 out_err_release:
984         if (err == -ENETUNREACH)
985                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
986         dst_release(*dst);
987         *dst = NULL;
988         return err;
989 }
990
991 /**
992  *      ip6_dst_lookup - perform route lookup on flow
993  *      @sk: socket which provides route info
994  *      @dst: pointer to dst_entry * for result
995  *      @fl: flow to lookup
996  *
997  *      This function performs a route lookup on the given flow.
998  *
999  *      It returns zero on success, or a standard errno code on error.
1000  */
1001 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1002 {
1003         *dst = NULL;
1004         return ip6_dst_lookup_tail(sk, dst, fl);
1005 }
1006 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1007
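/*
 * Illustrative sketch of a minimal caller (the helper name and flow
 * setup are made up; real users typically pass the result through the
 * xfrm layer and attach it to an skb or cache it on the socket):
 */
static int ip6_example_route_lookup(struct sock *sk, const struct in6_addr *daddr)
{
	struct dst_entry *dst;
	struct flowi fl;
	int err;

	memset(&fl, 0, sizeof(fl));
	ipv6_addr_copy(&fl.fl6_dst, daddr);

	err = ip6_dst_lookup(sk, &dst, &fl);
	if (err)
		return err;

	/* ... use dst ... */
	dst_release(dst);
	return 0;
}
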
1008 /**
1009  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1010  *      @sk: socket which provides the dst cache and route info
1011  *      @dst: pointer to dst_entry * for result
1012  *      @fl: flow to lookup
1013  *
1014  *      This function performs a route lookup on the given flow with the
1015  *      possibility of using the cached route in the socket if it is valid.
1016  *      It will take the socket dst lock when operating on the dst cache.
1017  *      As a result, this function can only be used in process context.
1018  *
1019  *      It returns zero on success, or a standard errno code on error.
1020  */
1021 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1022 {
1023         *dst = NULL;
1024         if (sk) {
1025                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1026                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1027         }
1028
1029         return ip6_dst_lookup_tail(sk, dst, fl);
1030 }
1031 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1032
1033 static inline int ip6_ufo_append_data(struct sock *sk,
1034                         int getfrag(void *from, char *to, int offset, int len,
1035                         int odd, struct sk_buff *skb),
1036                         void *from, int length, int hh_len, int fragheaderlen,
1037                         int transhdrlen, int mtu,unsigned int flags)
1038
1039 {
1040         struct sk_buff *skb;
1041         int err;
1042
1043         /* There is support for UDP large send offload by the network
1044          * device, so create one single skb containing the complete
1045          * UDP datagram
1046          */
1047         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1048                 skb = sock_alloc_send_skb(sk,
1049                         hh_len + fragheaderlen + transhdrlen + 20,
1050                         (flags & MSG_DONTWAIT), &err);
1051                 if (skb == NULL)
1052                         return -ENOMEM;
1053
1054                 /* reserve space for Hardware header */
1055                 skb_reserve(skb, hh_len);
1056
1057                 /* create space for UDP/IP header */
1058                 skb_put(skb,fragheaderlen + transhdrlen);
1059
1060                 /* initialize network header pointer */
1061                 skb_reset_network_header(skb);
1062
1063                 /* initialize protocol header pointer */
1064                 skb->transport_header = skb->network_header + fragheaderlen;
1065
1066                 skb->ip_summed = CHECKSUM_PARTIAL;
1067                 skb->csum = 0;
1068                 sk->sk_sndmsg_off = 0;
1069         }
1070
1071         err = skb_append_datato_frags(sk,skb, getfrag, from,
1072                                       (length - transhdrlen));
1073         if (!err) {
1074                 struct frag_hdr fhdr;
1075
1076                 /* Specify the length of each IPv6 datagram fragment.
1077                  * It has to be a multiple of 8.
1078                  */
1079                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1080                                              sizeof(struct frag_hdr)) & ~7;
1081                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1082                 ipv6_select_ident(&fhdr);
1083                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1084                 __skb_queue_tail(&sk->sk_write_queue, skb);
1085
1086                 return 0;
1087         }
1088         /* There is not enough support to do UDP LSO,
1089          * so follow the normal path
1090          */
1091         kfree_skb(skb);
1092
1093         return err;
1094 }
1095
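/*
 * Illustrative worked example: for a 1500-byte MTU and a plain IPv6
 * header (fragheaderlen == 40), ip6_ufo_append_data() above sets
 * gso_size to (1500 - 40 - 8) & ~7 = 1448, so segmentation later emits
 * fragments carrying 1448 bytes of payload each, prefixed by an 8-byte
 * Fragment header that reuses the identification chosen by
 * ipv6_select_ident().
 */
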
1096 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1097                                                gfp_t gfp)
1098 {
1099         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1100 }
1101
1102 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1103                                                 gfp_t gfp)
1104 {
1105         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1106 }
1107
1108 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1109         int offset, int len, int odd, struct sk_buff *skb),
1110         void *from, int length, int transhdrlen,
1111         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1112         struct rt6_info *rt, unsigned int flags)
1113 {
1114         struct inet_sock *inet = inet_sk(sk);
1115         struct ipv6_pinfo *np = inet6_sk(sk);
1116         struct sk_buff *skb;
1117         unsigned int maxfraglen, fragheaderlen;
1118         int exthdrlen;
1119         int hh_len;
1120         int mtu;
1121         int copy;
1122         int err;
1123         int offset = 0;
1124         int csummode = CHECKSUM_NONE;
1125
1126         if (flags&MSG_PROBE)
1127                 return 0;
1128         if (skb_queue_empty(&sk->sk_write_queue)) {
1129                 /*
1130                  * setup for corking
1131                  */
1132                 if (opt) {
1133                         if (WARN_ON(np->cork.opt))
1134                                 return -EINVAL;
1135
1136                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1137                         if (unlikely(np->cork.opt == NULL))
1138                                 return -ENOBUFS;
1139
1140                         np->cork.opt->tot_len = opt->tot_len;
1141                         np->cork.opt->opt_flen = opt->opt_flen;
1142                         np->cork.opt->opt_nflen = opt->opt_nflen;
1143
1144                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1145                                                             sk->sk_allocation);
1146                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1147                                 return -ENOBUFS;
1148
1149                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1150                                                             sk->sk_allocation);
1151                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1152                                 return -ENOBUFS;
1153
1154                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1155                                                            sk->sk_allocation);
1156                         if (opt->hopopt && !np->cork.opt->hopopt)
1157                                 return -ENOBUFS;
1158
1159                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1160                                                             sk->sk_allocation);
1161                         if (opt->srcrt && !np->cork.opt->srcrt)
1162                                 return -ENOBUFS;
1163
1164                         /* need source address above miyazawa*/
1165                 }
1166                 dst_hold(&rt->u.dst);
1167                 inet->cork.dst = &rt->u.dst;
1168                 inet->cork.fl = *fl;
1169                 np->cork.hop_limit = hlimit;
1170                 np->cork.tclass = tclass;
1171                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1172                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1173                 if (np->frag_size < mtu) {
1174                         if (np->frag_size)
1175                                 mtu = np->frag_size;
1176                 }
1177                 inet->cork.fragsize = mtu;
1178                 if (dst_allfrag(rt->u.dst.path))
1179                         inet->cork.flags |= IPCORK_ALLFRAG;
1180                 inet->cork.length = 0;
1181                 sk->sk_sndmsg_page = NULL;
1182                 sk->sk_sndmsg_off = 0;
1183                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1184                             rt->rt6i_nfheader_len;
1185                 length += exthdrlen;
1186                 transhdrlen += exthdrlen;
1187         } else {
1188                 rt = (struct rt6_info *)inet->cork.dst;
1189                 fl = &inet->cork.fl;
1190                 opt = np->cork.opt;
1191                 transhdrlen = 0;
1192                 exthdrlen = 0;
1193                 mtu = inet->cork.fragsize;
1194         }
1195
1196         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1197
1198         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1199                         (opt ? opt->opt_nflen : 0);
1200         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1201
1202         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1203                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1204                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1205                         return -EMSGSIZE;
1206                 }
1207         }
1208
1209         /*
1210          * Let's try using as much space as possible.
1211          * Use MTU if total length of the message fits into the MTU.
1212          * Otherwise, we need to reserve fragment header and
1213          * fragment alignment (= 8-15 octets, in total).
1214          *
1215          * Note that we may need to "move" the data from the tail
1216          * of the buffer to the new fragment when we split
1217          * the message.
1218          *
1219          * FIXME: It may be fragmented into multiple chunks
1220          *        at once if non-fragmentable extension headers
1221          *        are too large.
1222          * --yoshfuji
1223          */
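	/*
	 * Illustrative worked example: with mtu == 1500 and
	 * fragheaderlen == 40, maxfraglen above is
	 * ((1500 - 40) & ~7) + 40 - 8 = 1488, so once ip6_fragment()
	 * later inserts the 8-byte Fragment header the frame still fits
	 * the MTU and the fragment data stays 8-byte aligned.
	 */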
1224
1225         inet->cork.length += length;
1226         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1227             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1228
1229                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1230                                           fragheaderlen, transhdrlen, mtu,
1231                                           flags);
1232                 if (err)
1233                         goto error;
1234                 return 0;
1235         }
1236
1237         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1238                 goto alloc_new_skb;
1239
1240         while (length > 0) {
1241                 /* Check if the remaining data fits into current packet. */
1242                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1243                 if (copy < length)
1244                         copy = maxfraglen - skb->len;
1245
1246                 if (copy <= 0) {
1247                         char *data;
1248                         unsigned int datalen;
1249                         unsigned int fraglen;
1250                         unsigned int fraggap;
1251                         unsigned int alloclen;
1252                         struct sk_buff *skb_prev;
1253 alloc_new_skb:
1254                         skb_prev = skb;
1255
1256                         /* There's no room in the current skb */
1257                         if (skb_prev)
1258                                 fraggap = skb_prev->len - maxfraglen;
1259                         else
1260                                 fraggap = 0;
1261
1262                         /*
1263                          * If remaining data exceeds the mtu,
1264                          * we know we need more fragment(s).
1265                          */
1266                         datalen = length + fraggap;
1267                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1268                                 datalen = maxfraglen - fragheaderlen;
1269
1270                         fraglen = datalen + fragheaderlen;
1271                         if ((flags & MSG_MORE) &&
1272                             !(rt->u.dst.dev->features&NETIF_F_SG))
1273                                 alloclen = mtu;
1274                         else
1275                                 alloclen = datalen + fragheaderlen;
1276
1277                         /*
1278                          * The last fragment gets additional space at tail.
1279                          * Note: we overallocate on fragments with MSG_MORE
1280                          * because we have no idea if we're the last one.
1281                          */
1282                         if (datalen == length + fraggap)
1283                                 alloclen += rt->u.dst.trailer_len;
1284
1285                         /*
1286                          * We just reserve space for the fragment header.
1287                          * Note: this may be overallocation if the message
1288                          * (without MSG_MORE) fits into the MTU.
1289                          */
1290                         alloclen += sizeof(struct frag_hdr);
1291
1292                         if (transhdrlen) {
1293                                 skb = sock_alloc_send_skb(sk,
1294                                                 alloclen + hh_len,
1295                                                 (flags & MSG_DONTWAIT), &err);
1296                         } else {
1297                                 skb = NULL;
1298                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1299                                     2 * sk->sk_sndbuf)
1300                                         skb = sock_wmalloc(sk,
1301                                                            alloclen + hh_len, 1,
1302                                                            sk->sk_allocation);
1303                                 if (unlikely(skb == NULL))
1304                                         err = -ENOBUFS;
1305                         }
1306                         if (skb == NULL)
1307                                 goto error;
1308                         /*
1309                          *      Fill in the control structures
1310                          */
1311                         skb->ip_summed = csummode;
1312                         skb->csum = 0;
1313                         /* reserve for fragmentation */
1314                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1315
1316                         /*
1317                          *      Find where to start putting bytes
1318                          */
1319                         data = skb_put(skb, fraglen);
1320                         skb_set_network_header(skb, exthdrlen);
1321                         data += fragheaderlen;
1322                         skb->transport_header = (skb->network_header +
1323                                                  fragheaderlen);
1324                         if (fraggap) {
1325                                 skb->csum = skb_copy_and_csum_bits(
1326                                         skb_prev, maxfraglen,
1327                                         data + transhdrlen, fraggap, 0);
1328                                 skb_prev->csum = csum_sub(skb_prev->csum,
1329                                                           skb->csum);
1330                                 data += fraggap;
1331                                 pskb_trim_unique(skb_prev, maxfraglen);
1332                         }
1333                         copy = datalen - transhdrlen - fraggap;
1334                         if (copy < 0) {
1335                                 err = -EINVAL;
1336                                 kfree_skb(skb);
1337                                 goto error;
1338                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1339                                 err = -EFAULT;
1340                                 kfree_skb(skb);
1341                                 goto error;
1342                         }
1343
1344                         offset += copy;
1345                         length -= datalen - fraggap;
1346                         transhdrlen = 0;
1347                         exthdrlen = 0;
1348                         csummode = CHECKSUM_NONE;
1349
1350                         /*
1351                          * Put the packet on the pending queue
1352                          */
1353                         __skb_queue_tail(&sk->sk_write_queue, skb);
1354                         continue;
1355                 }
1356
1357                 if (copy > length)
1358                         copy = length;
1359
1360                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1361                         unsigned int off;
1362
1363                         off = skb->len;
1364                         if (getfrag(from, skb_put(skb, copy),
1365                                                 offset, copy, off, skb) < 0) {
1366                                 __skb_trim(skb, off);
1367                                 err = -EFAULT;
1368                                 goto error;
1369                         }
1370                 } else {
1371                         int i = skb_shinfo(skb)->nr_frags;
1372                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1373                         struct page *page = sk->sk_sndmsg_page;
1374                         int off = sk->sk_sndmsg_off;
1375                         unsigned int left;
1376
1377                         if (page && (left = PAGE_SIZE - off) > 0) {
1378                                 if (copy >= left)
1379                                         copy = left;
1380                                 if (page != frag->page) {
1381                                         if (i == MAX_SKB_FRAGS) {
1382                                                 err = -EMSGSIZE;
1383                                                 goto error;
1384                                         }
1385                                         get_page(page);
1386                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1387                                         frag = &skb_shinfo(skb)->frags[i];
1388                                 }
1389                         } else if(i < MAX_SKB_FRAGS) {
1390                                 if (copy > PAGE_SIZE)
1391                                         copy = PAGE_SIZE;
1392                                 page = alloc_pages(sk->sk_allocation, 0);
1393                                 if (page == NULL) {
1394                                         err = -ENOMEM;
1395                                         goto error;
1396                                 }
1397                                 sk->sk_sndmsg_page = page;
1398                                 sk->sk_sndmsg_off = 0;
1399
1400                                 skb_fill_page_desc(skb, i, page, 0, 0);
1401                                 frag = &skb_shinfo(skb)->frags[i];
1402                         } else {
1403                                 err = -EMSGSIZE;
1404                                 goto error;
1405                         }
1406                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1407                                 err = -EFAULT;
1408                                 goto error;
1409                         }
1410                         sk->sk_sndmsg_off += copy;
1411                         frag->size += copy;
1412                         skb->len += copy;
1413                         skb->data_len += copy;
1414                         skb->truesize += copy;
1415                         atomic_add(copy, &sk->sk_wmem_alloc);
1416                 }
1417                 offset += copy;
1418                 length -= copy;
1419         }
1420         return 0;
1421 error:
1422         inet->cork.length -= length;
1423         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1424         return err;
1425 }
1426
1427 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1428 {
1429         if (np->cork.opt) {
1430                 kfree(np->cork.opt->dst0opt);
1431                 kfree(np->cork.opt->dst1opt);
1432                 kfree(np->cork.opt->hopopt);
1433                 kfree(np->cork.opt->srcrt);
1434                 kfree(np->cork.opt);
1435                 np->cork.opt = NULL;
1436         }
1437
1438         if (inet->cork.dst) {
1439                 dst_release(inet->cork.dst);
1440                 inet->cork.dst = NULL;
1441                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1442         }
1443         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1444 }
1445
1446 int ip6_push_pending_frames(struct sock *sk)
1447 {
1448         struct sk_buff *skb, *tmp_skb;
1449         struct sk_buff **tail_skb;
1450         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1451         struct inet_sock *inet = inet_sk(sk);
1452         struct ipv6_pinfo *np = inet6_sk(sk);
1453         struct net *net = sock_net(sk);
1454         struct ipv6hdr *hdr;
1455         struct ipv6_txoptions *opt = np->cork.opt;
1456         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1457         struct flowi *fl = &inet->cork.fl;
1458         unsigned char proto = fl->proto;
1459         int err = 0;
1460
1461         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1462                 goto out;
1463         tail_skb = &(skb_shinfo(skb)->frag_list);
1464
1465         /* move skb->data to ip header from ext header */
1466         if (skb->data < skb_network_header(skb))
1467                 __skb_pull(skb, skb_network_offset(skb));
1468         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1469                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1470                 *tail_skb = tmp_skb;
1471                 tail_skb = &(tmp_skb->next);
1472                 skb->len += tmp_skb->len;
1473                 skb->data_len += tmp_skb->len;
1474                 skb->truesize += tmp_skb->truesize;
1475                 tmp_skb->destructor = NULL;
1476                 tmp_skb->sk = NULL;
1477         }
1478
1479         /* Allow local fragmentation. */
1480         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1481                 skb->local_df = 1;
1482
1483         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1484         __skb_pull(skb, skb_network_header_len(skb));
1485         if (opt && opt->opt_flen)
1486                 ipv6_push_frag_opts(skb, opt, &proto);
1487         if (opt && opt->opt_nflen)
1488                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1489
1490         skb_push(skb, sizeof(struct ipv6hdr));
1491         skb_reset_network_header(skb);
1492         hdr = ipv6_hdr(skb);
1493
1494         *(__be32*)hdr = fl->fl6_flowlabel |
1495                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1496
1497         hdr->hop_limit = np->cork.hop_limit;
1498         hdr->nexthdr = proto;
1499         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1500         ipv6_addr_copy(&hdr->daddr, final_dst);
1501
1502         skb->priority = sk->sk_priority;
1503         skb->mark = sk->sk_mark;
1504
1505         skb_dst_set(skb, dst_clone(&rt->u.dst));
1506         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1507         if (proto == IPPROTO_ICMPV6) {
1508                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1509
1510                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1511                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1512         }
1513
1514         err = ip6_local_out(skb);
1515         if (err) {
1516                 if (err > 0)
1517                         err = net_xmit_errno(err);
1518                 if (err)
1519                         goto error;
1520         }
1521
1522 out:
1523         ip6_cork_release(inet, np);
1524         return err;
1525 error:
1526         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1527         goto out;
1528 }
1529
1530 void ip6_flush_pending_frames(struct sock *sk)
1531 {
1532         struct sk_buff *skb;
1533
1534         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1535                 if (skb_dst(skb))
1536                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1537                                       IPSTATS_MIB_OUTDISCARDS);
1538                 kfree_skb(skb);
1539         }
1540
1541         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1542 }
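
/*
 * Illustrative sketch of how a datagram sender (e.g. UDPv6) might drive
 * the corking API above.  The helper name is made up, locking and the
 * corking decision are simplified, and ip_generic_getfrag() (declared in
 * <net/ip.h>) is the stock iovec copier the real callers use.
 */
static int ip6_example_sendmsg_tail(struct sock *sk, struct msghdr *msg,
				    size_t len, int hlimit, int tclass,
				    struct ipv6_txoptions *opt,
				    struct flowi *fl, struct rt6_info *rt)
{
	int err;

	lock_sock(sk);
	err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
			      hlimit, tclass, opt, fl, rt, msg->msg_flags);
	if (err)
		ip6_flush_pending_frames(sk);
	else if (!(msg->msg_flags & MSG_MORE))
		err = ip6_push_pending_frames(sk);
	release_sock(sk);

	return err;
}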