ipv6: Fragment locally generated tunnel-mode IPSec6 packets as needed.
[linux-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 *	Send a locally generated packet: run __ip6_local_out() and, when it
 *	returns 1, hand the packet to dst_output().  Any other value from
 *	__ip6_local_out() is returned to the caller as is.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int verdict = __ip6_local_out(skb);

	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101         struct dst_entry *dst = skb_dst(skb);
102         struct net_device *dev = dst->dev;
103
104         skb->protocol = htons(ETH_P_IPV6);
105         skb->dev = dev;
106
107         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
108                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
109
110                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
111                     ((mroute6_socket(dev_net(dev), skb) &&
112                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
113                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
114                                          &ipv6_hdr(skb)->saddr))) {
115                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
116
117                         /* Do not check for IFF_ALLMULTI; multicast routing
118                            is not supported in any case.
119                          */
120                         if (newskb)
121                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
122                                         newskb, NULL, newskb->dev,
123                                         ip6_dev_loopback_xmit);
124
125                         if (ipv6_hdr(skb)->hop_limit == 0) {
126                                 IP6_INC_STATS(dev_net(dev), idev,
127                                               IPSTATS_MIB_OUTDISCARDS);
128                                 kfree_skb(skb);
129                                 return 0;
130                         }
131                 }
132
133                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
134                                 skb->len);
135         }
136
137         if (dst->hh)
138                 return neigh_hh_output(dst->hh, skb);
139         else if (dst->neighbour)
140                 return dst->neighbour->output(skb);
141
142         IP6_INC_STATS_BH(dev_net(dst->dev),
143                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
144         kfree_skb(skb);
145         return -EINVAL;
146 }
147
148 static int ip6_finish_output(struct sk_buff *skb)
149 {
150         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
151             dst_allfrag(skb_dst(skb)))
152                 return ip6_fragment(skb, ip6_finish_output2);
153         else
154                 return ip6_finish_output2(skb);
155 }
156
157 int ip6_output(struct sk_buff *skb)
158 {
159         struct net_device *dev = skb_dst(skb)->dev;
160         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161         if (unlikely(idev->cnf.disable_ipv6)) {
162                 IP6_INC_STATS(dev_net(dev), idev,
163                               IPSTATS_MIB_OUTDISCARDS);
164                 kfree_skb(skb);
165                 return 0;
166         }
167
168         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
169                             ip6_finish_output,
170                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
171 }
172
173 /*
174  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
175  */
176
177 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
178              struct ipv6_txoptions *opt)
179 {
180         struct net *net = sock_net(sk);
181         struct ipv6_pinfo *np = inet6_sk(sk);
182         struct in6_addr *first_hop = &fl->fl6_dst;
183         struct dst_entry *dst = skb_dst(skb);
184         struct ipv6hdr *hdr;
185         u8  proto = fl->proto;
186         int seg_len = skb->len;
187         int hlimit = -1;
188         int tclass = 0;
189         u32 mtu;
190
191         if (opt) {
192                 unsigned int head_room;
193
194                 /* First: exthdrs may take lots of space (~8K for now)
195                    MAX_HEADER is not enough.
196                  */
197                 head_room = opt->opt_nflen + opt->opt_flen;
198                 seg_len += head_room;
199                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
200
201                 if (skb_headroom(skb) < head_room) {
202                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
203                         if (skb2 == NULL) {
204                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
205                                               IPSTATS_MIB_OUTDISCARDS);
206                                 kfree_skb(skb);
207                                 return -ENOBUFS;
208                         }
209                         kfree_skb(skb);
210                         skb = skb2;
211                         skb_set_owner_w(skb, sk);
212                 }
213                 if (opt->opt_flen)
214                         ipv6_push_frag_opts(skb, opt, &proto);
215                 if (opt->opt_nflen)
216                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
217         }
218
219         skb_push(skb, sizeof(struct ipv6hdr));
220         skb_reset_network_header(skb);
221         hdr = ipv6_hdr(skb);
222
223         /*
224          *      Fill in the IPv6 header
225          */
226         if (np) {
227                 tclass = np->tclass;
228                 hlimit = np->hop_limit;
229         }
230         if (hlimit < 0)
231                 hlimit = ip6_dst_hoplimit(dst);
232
233         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
234
235         hdr->payload_len = htons(seg_len);
236         hdr->nexthdr = proto;
237         hdr->hop_limit = hlimit;
238
239         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
240         ipv6_addr_copy(&hdr->daddr, first_hop);
241
242         skb->priority = sk->sk_priority;
243         skb->mark = sk->sk_mark;
244
245         mtu = dst_mtu(dst);
246         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
247                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
248                               IPSTATS_MIB_OUT, skb->len);
249                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
250                                dst->dev, dst_output);
251         }
252
253         if (net_ratelimit())
254                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
255         skb->dev = dst->dev;
256         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
257         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
258         kfree_skb(skb);
259         return -EMSGSIZE;
260 }
261
262 EXPORT_SYMBOL(ip6_xmit);
263
264 /*
265  *      To avoid extra problems ND packets are send through this
266  *      routine. It's code duplication but I really want to avoid
267  *      extra checks since ipv6_build_header is used by TCP (which
268  *      is for us performance critical)
269  */
270
271 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
272                const struct in6_addr *saddr, const struct in6_addr *daddr,
273                int proto, int len)
274 {
275         struct ipv6_pinfo *np = inet6_sk(sk);
276         struct ipv6hdr *hdr;
277         int totlen;
278
279         skb->protocol = htons(ETH_P_IPV6);
280         skb->dev = dev;
281
282         totlen = len + sizeof(struct ipv6hdr);
283
284         skb_reset_network_header(skb);
285         skb_put(skb, sizeof(struct ipv6hdr));
286         hdr = ipv6_hdr(skb);
287
288         *(__be32*)hdr = htonl(0x60000000);
289
290         hdr->payload_len = htons(len);
291         hdr->nexthdr = proto;
292         hdr->hop_limit = np->hop_limit;
293
294         ipv6_addr_copy(&hdr->saddr, saddr);
295         ipv6_addr_copy(&hdr->daddr, daddr);
296
297         return 0;
298 }
299
300 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
301 {
302         struct ip6_ra_chain *ra;
303         struct sock *last = NULL;
304
305         read_lock(&ip6_ra_lock);
306         for (ra = ip6_ra_chain; ra; ra = ra->next) {
307                 struct sock *sk = ra->sk;
308                 if (sk && ra->sel == sel &&
309                     (!sk->sk_bound_dev_if ||
310                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
311                         if (last) {
312                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
313                                 if (skb2)
314                                         rawv6_rcv(last, skb2);
315                         }
316                         last = sk;
317                 }
318         }
319
320         if (last) {
321                 rawv6_rcv(last, skb);
322                 read_unlock(&ip6_ra_lock);
323                 return 1;
324         }
325         read_unlock(&ip6_ra_lock);
326         return 0;
327 }
328
329 static int ip6_forward_proxy_check(struct sk_buff *skb)
330 {
331         struct ipv6hdr *hdr = ipv6_hdr(skb);
332         u8 nexthdr = hdr->nexthdr;
333         int offset;
334
335         if (ipv6_ext_hdr(nexthdr)) {
336                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
337                 if (offset < 0)
338                         return 0;
339         } else
340                 offset = sizeof(struct ipv6hdr);
341
342         if (nexthdr == IPPROTO_ICMPV6) {
343                 struct icmp6hdr *icmp6;
344
345                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
346                                          offset + 1 - skb->data)))
347                         return 0;
348
349                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
350
351                 switch (icmp6->icmp6_type) {
352                 case NDISC_ROUTER_SOLICITATION:
353                 case NDISC_ROUTER_ADVERTISEMENT:
354                 case NDISC_NEIGHBOUR_SOLICITATION:
355                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
356                 case NDISC_REDIRECT:
357                         /* For reaction involving unicast neighbor discovery
358                          * message destined to the proxied address, pass it to
359                          * input function.
360                          */
361                         return 1;
362                 default:
363                         break;
364                 }
365         }
366
367         /*
368          * The proxying router can't forward traffic sent to a link-local
369          * address, so signal the sender and discard the packet. This
370          * behavior is clarified by the MIPv6 specification.
371          */
372         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
373                 dst_link_failure(skb);
374                 return -1;
375         }
376
377         return 0;
378 }
379
/* Netfilter okfn for NF_INET_FORWARD: send the packet out via its dst. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int rc = dst_output(skb);

	return rc;
}
384
385 int ip6_forward(struct sk_buff *skb)
386 {
387         struct dst_entry *dst = skb_dst(skb);
388         struct ipv6hdr *hdr = ipv6_hdr(skb);
389         struct inet6_skb_parm *opt = IP6CB(skb);
390         struct net *net = dev_net(dst->dev);
391         u32 mtu;
392
393         if (net->ipv6.devconf_all->forwarding == 0)
394                 goto error;
395
396         if (skb_warn_if_lro(skb))
397                 goto drop;
398
399         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401                 goto drop;
402         }
403
404         skb_forward_csum(skb);
405
406         /*
407          *      We DO NOT make any processing on
408          *      RA packets, pushing them to user level AS IS
409          *      without ane WARRANTY that application will be able
410          *      to interpret them. The reason is that we
411          *      cannot make anything clever here.
412          *
413          *      We are not end-node, so that if packet contains
414          *      AH/ESP, we cannot make anything.
415          *      Defragmentation also would be mistake, RA packets
416          *      cannot be fragmented, because there is no warranty
417          *      that different fragments will go along one path. --ANK
418          */
419         if (opt->ra) {
420                 u8 *ptr = skb_network_header(skb) + opt->ra;
421                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
422                         return 0;
423         }
424
425         /*
426          *      check and decrement ttl
427          */
428         if (hdr->hop_limit <= 1) {
429                 /* Force OUTPUT device used as source address */
430                 skb->dev = dst->dev;
431                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
432                 IP6_INC_STATS_BH(net,
433                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
434
435                 kfree_skb(skb);
436                 return -ETIMEDOUT;
437         }
438
439         /* XXX: idev->cnf.proxy_ndp? */
440         if (net->ipv6.devconf_all->proxy_ndp &&
441             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
442                 int proxied = ip6_forward_proxy_check(skb);
443                 if (proxied > 0)
444                         return ip6_input(skb);
445                 else if (proxied < 0) {
446                         IP6_INC_STATS(net, ip6_dst_idev(dst),
447                                       IPSTATS_MIB_INDISCARDS);
448                         goto drop;
449                 }
450         }
451
452         if (!xfrm6_route_forward(skb)) {
453                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
454                 goto drop;
455         }
456         dst = skb_dst(skb);
457
458         /* IPv6 specs say nothing about it, but it is clear that we cannot
459            send redirects to source routed frames.
460            We don't send redirects to frames decapsulated from IPsec.
461          */
462         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
463             !skb_sec_path(skb)) {
464                 struct in6_addr *target = NULL;
465                 struct rt6_info *rt;
466                 struct neighbour *n = dst->neighbour;
467
468                 /*
469                  *      incoming and outgoing devices are the same
470                  *      send a redirect.
471                  */
472
473                 rt = (struct rt6_info *) dst;
474                 if ((rt->rt6i_flags & RTF_GATEWAY))
475                         target = (struct in6_addr*)&n->primary_key;
476                 else
477                         target = &hdr->daddr;
478
479                 /* Limit redirects both by destination (here)
480                    and by source (inside ndisc_send_redirect)
481                  */
482                 if (xrlim_allow(dst, 1*HZ))
483                         ndisc_send_redirect(skb, n, target);
484         } else {
485                 int addrtype = ipv6_addr_type(&hdr->saddr);
486
487                 /* This check is security critical. */
488                 if (addrtype == IPV6_ADDR_ANY ||
489                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
490                         goto error;
491                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
492                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
493                                     ICMPV6_NOT_NEIGHBOUR, 0);
494                         goto error;
495                 }
496         }
497
498         mtu = dst_mtu(dst);
499         if (mtu < IPV6_MIN_MTU)
500                 mtu = IPV6_MIN_MTU;
501
502         if (skb->len > mtu && !skb_is_gso(skb)) {
503                 /* Again, force OUTPUT device used as source address */
504                 skb->dev = dst->dev;
505                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
506                 IP6_INC_STATS_BH(net,
507                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
508                 IP6_INC_STATS_BH(net,
509                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
510                 kfree_skb(skb);
511                 return -EMSGSIZE;
512         }
513
514         if (skb_cow(skb, dst->dev->hard_header_len)) {
515                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
516                 goto drop;
517         }
518
519         hdr = ipv6_hdr(skb);
520
521         /* Mangling hops number delayed to point after skb COW */
522
523         hdr->hop_limit--;
524
525         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
526         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
527                        ip6_forward_finish);
528
529 error:
530         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
531 drop:
532         kfree_skb(skb);
533         return -EINVAL;
534 }
535
536 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
537 {
538         to->pkt_type = from->pkt_type;
539         to->priority = from->priority;
540         to->protocol = from->protocol;
541         skb_dst_drop(to);
542         skb_dst_set(to, dst_clone(skb_dst(from)));
543         to->dev = from->dev;
544         to->mark = from->mark;
545
546 #ifdef CONFIG_NET_SCHED
547         to->tc_index = from->tc_index;
548 #endif
549         nf_copy(to, from);
550 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
551     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
552         to->nf_trace = from->nf_trace;
553 #endif
554         skb_copy_secmark(to, from);
555 }
556
557 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
558 {
559         u16 offset = sizeof(struct ipv6hdr);
560         struct ipv6_opt_hdr *exthdr =
561                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
562         unsigned int packet_len = skb->tail - skb->network_header;
563         int found_rhdr = 0;
564         *nexthdr = &ipv6_hdr(skb)->nexthdr;
565
566         while (offset + 1 <= packet_len) {
567
568                 switch (**nexthdr) {
569
570                 case NEXTHDR_HOP:
571                         break;
572                 case NEXTHDR_ROUTING:
573                         found_rhdr = 1;
574                         break;
575                 case NEXTHDR_DEST:
576 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
577                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
578                                 break;
579 #endif
580                         if (found_rhdr)
581                                 return offset;
582                         break;
583                 default :
584                         return offset;
585                 }
586
587                 offset += ipv6_optlen(exthdr);
588                 *nexthdr = &exthdr->nexthdr;
589                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
590                                                  offset);
591         }
592
593         return offset;
594 }
595
596 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
597 {
598         struct sk_buff *frag;
599         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
600         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
601         struct ipv6hdr *tmp_hdr;
602         struct frag_hdr *fh;
603         unsigned int mtu, hlen, left, len;
604         __be32 frag_id = 0;
605         int ptr, offset = 0, err=0;
606         u8 *prevhdr, nexthdr = 0;
607         struct net *net = dev_net(skb_dst(skb)->dev);
608
609         hlen = ip6_find_1stfragopt(skb, &prevhdr);
610         nexthdr = *prevhdr;
611
612         mtu = ip6_skb_dst_mtu(skb);
613
614         /* We must not fragment if the socket is set to force MTU discovery
615          * or if the skb it not generated by a local socket.
616          */
617         if (!skb->local_df && skb->len > mtu) {
618                 skb->dev = skb_dst(skb)->dev;
619                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
620                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
621                               IPSTATS_MIB_FRAGFAILS);
622                 kfree_skb(skb);
623                 return -EMSGSIZE;
624         }
625
626         if (np && np->frag_size < mtu) {
627                 if (np->frag_size)
628                         mtu = np->frag_size;
629         }
630         mtu -= hlen + sizeof(struct frag_hdr);
631
632         if (skb_has_frag_list(skb)) {
633                 int first_len = skb_pagelen(skb);
634                 struct sk_buff *frag2;
635
636                 if (first_len - hlen > mtu ||
637                     ((first_len - hlen) & 7) ||
638                     skb_cloned(skb))
639                         goto slow_path;
640
641                 skb_walk_frags(skb, frag) {
642                         /* Correct geometry. */
643                         if (frag->len > mtu ||
644                             ((frag->len & 7) && frag->next) ||
645                             skb_headroom(frag) < hlen)
646                                 goto slow_path_clean;
647
648                         /* Partially cloned skb? */
649                         if (skb_shared(frag))
650                                 goto slow_path_clean;
651
652                         BUG_ON(frag->sk);
653                         if (skb->sk) {
654                                 frag->sk = skb->sk;
655                                 frag->destructor = sock_wfree;
656                         }
657                         skb->truesize -= frag->truesize;
658                 }
659
660                 err = 0;
661                 offset = 0;
662                 frag = skb_shinfo(skb)->frag_list;
663                 skb_frag_list_init(skb);
664                 /* BUILD HEADER */
665
666                 *prevhdr = NEXTHDR_FRAGMENT;
667                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
668                 if (!tmp_hdr) {
669                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
670                                       IPSTATS_MIB_FRAGFAILS);
671                         return -ENOMEM;
672                 }
673
674                 __skb_pull(skb, hlen);
675                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
676                 __skb_push(skb, hlen);
677                 skb_reset_network_header(skb);
678                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
679
680                 ipv6_select_ident(fh);
681                 fh->nexthdr = nexthdr;
682                 fh->reserved = 0;
683                 fh->frag_off = htons(IP6_MF);
684                 frag_id = fh->identification;
685
686                 first_len = skb_pagelen(skb);
687                 skb->data_len = first_len - skb_headlen(skb);
688                 skb->len = first_len;
689                 ipv6_hdr(skb)->payload_len = htons(first_len -
690                                                    sizeof(struct ipv6hdr));
691
692                 dst_hold(&rt->dst);
693
694                 for (;;) {
695                         /* Prepare header of the next frame,
696                          * before previous one went down. */
697                         if (frag) {
698                                 frag->ip_summed = CHECKSUM_NONE;
699                                 skb_reset_transport_header(frag);
700                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
701                                 __skb_push(frag, hlen);
702                                 skb_reset_network_header(frag);
703                                 memcpy(skb_network_header(frag), tmp_hdr,
704                                        hlen);
705                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
706                                 fh->nexthdr = nexthdr;
707                                 fh->reserved = 0;
708                                 fh->frag_off = htons(offset);
709                                 if (frag->next != NULL)
710                                         fh->frag_off |= htons(IP6_MF);
711                                 fh->identification = frag_id;
712                                 ipv6_hdr(frag)->payload_len =
713                                                 htons(frag->len -
714                                                       sizeof(struct ipv6hdr));
715                                 ip6_copy_metadata(frag, skb);
716                         }
717
718                         err = output(skb);
719                         if(!err)
720                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
721                                               IPSTATS_MIB_FRAGCREATES);
722
723                         if (err || !frag)
724                                 break;
725
726                         skb = frag;
727                         frag = skb->next;
728                         skb->next = NULL;
729                 }
730
731                 kfree(tmp_hdr);
732
733                 if (err == 0) {
734                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
735                                       IPSTATS_MIB_FRAGOKS);
736                         dst_release(&rt->dst);
737                         return 0;
738                 }
739
740                 while (frag) {
741                         skb = frag->next;
742                         kfree_skb(frag);
743                         frag = skb;
744                 }
745
746                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
747                               IPSTATS_MIB_FRAGFAILS);
748                 dst_release(&rt->dst);
749                 return err;
750
751 slow_path_clean:
752                 skb_walk_frags(skb, frag2) {
753                         if (frag2 == frag)
754                                 break;
755                         frag2->sk = NULL;
756                         frag2->destructor = NULL;
757                         skb->truesize += frag2->truesize;
758                 }
759         }
760
761 slow_path:
762         left = skb->len - hlen;         /* Space per frame */
763         ptr = hlen;                     /* Where to start from */
764
765         /*
766          *      Fragment the datagram.
767          */
768
769         *prevhdr = NEXTHDR_FRAGMENT;
770
771         /*
772          *      Keep copying data until we run out.
773          */
774         while(left > 0) {
775                 len = left;
776                 /* IF: it doesn't fit, use 'mtu' - the data space left */
777                 if (len > mtu)
778                         len = mtu;
779                 /* IF: we are not sending upto and including the packet end
780                    then align the next start on an eight byte boundary */
781                 if (len < left) {
782                         len &= ~7;
783                 }
784                 /*
785                  *      Allocate buffer.
786                  */
787
788                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
789                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
790                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
791                                       IPSTATS_MIB_FRAGFAILS);
792                         err = -ENOMEM;
793                         goto fail;
794                 }
795
796                 /*
797                  *      Set up data on packet
798                  */
799
800                 ip6_copy_metadata(frag, skb);
801                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
802                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
803                 skb_reset_network_header(frag);
804                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
805                 frag->transport_header = (frag->network_header + hlen +
806                                           sizeof(struct frag_hdr));
807
808                 /*
809                  *      Charge the memory for the fragment to any owner
810                  *      it might possess
811                  */
812                 if (skb->sk)
813                         skb_set_owner_w(frag, skb->sk);
814
815                 /*
816                  *      Copy the packet header into the new buffer.
817                  */
818                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
819
820                 /*
821                  *      Build fragment header.
822                  */
823                 fh->nexthdr = nexthdr;
824                 fh->reserved = 0;
825                 if (!frag_id) {
826                         ipv6_select_ident(fh);
827                         frag_id = fh->identification;
828                 } else
829                         fh->identification = frag_id;
830
831                 /*
832                  *      Copy a block of the IP datagram.
833                  */
834                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
835                         BUG();
836                 left -= len;
837
838                 fh->frag_off = htons(offset);
839                 if (left > 0)
840                         fh->frag_off |= htons(IP6_MF);
841                 ipv6_hdr(frag)->payload_len = htons(frag->len -
842                                                     sizeof(struct ipv6hdr));
843
844                 ptr += len;
845                 offset += len;
846
847                 /*
848                  *      Put this fragment into the sending queue.
849                  */
850                 err = output(frag);
851                 if (err)
852                         goto fail;
853
854                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
855                               IPSTATS_MIB_FRAGCREATES);
856         }
857         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
858                       IPSTATS_MIB_FRAGOKS);
859         kfree_skb(skb);
860         return err;
861
862 fail:
863         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
864                       IPSTATS_MIB_FRAGFAILS);
865         kfree_skb(skb);
866         return err;
867 }
868
869 static inline int ip6_rt_check(struct rt6key *rt_key,
870                                struct in6_addr *fl_addr,
871                                struct in6_addr *addr_cache)
872 {
873         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
874                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
875 }
876
877 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
878                                           struct dst_entry *dst,
879                                           struct flowi *fl)
880 {
881         struct ipv6_pinfo *np = inet6_sk(sk);
882         struct rt6_info *rt = (struct rt6_info *)dst;
883
884         if (!dst)
885                 goto out;
886
887         /* Yes, checking route validity in not connected
888          * case is not very simple. Take into account,
889          * that we do not support routing by source, TOS,
890          * and MSG_DONTROUTE            --ANK (980726)
891          *
892          * 1. ip6_rt_check(): If route was host route,
893          *    check that cached destination is current.
894          *    If it is network route, we still may
895          *    check its validity using saved pointer
896          *    to the last used address: daddr_cache.
897          *    We do not want to save whole address now,
898          *    (because main consumer of this service
899          *    is tcp, which has not this problem),
900          *    so that the last trick works only on connected
901          *    sockets.
902          * 2. oif also should be the same.
903          */
904         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
905 #ifdef CONFIG_IPV6_SUBTREES
906             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
907 #endif
908             (fl->oif && fl->oif != dst->dev->ifindex)) {
909                 dst_release(dst);
910                 dst = NULL;
911         }
912
913 out:
914         return dst;
915 }
916
917 static int ip6_dst_lookup_tail(struct sock *sk,
918                                struct dst_entry **dst, struct flowi *fl)
919 {
920         int err;
921         struct net *net = sock_net(sk);
922
923         if (*dst == NULL)
924                 *dst = ip6_route_output(net, sk, fl);
925
926         if ((err = (*dst)->error))
927                 goto out_err_release;
928
929         if (ipv6_addr_any(&fl->fl6_src)) {
930                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
931                                          &fl->fl6_dst,
932                                          sk ? inet6_sk(sk)->srcprefs : 0,
933                                          &fl->fl6_src);
934                 if (err)
935                         goto out_err_release;
936         }
937
938 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
939         /*
940          * Here if the dst entry we've looked up
941          * has a neighbour entry that is in the INCOMPLETE
942          * state and the src address from the flow is
943          * marked as OPTIMISTIC, we release the found
944          * dst entry and replace it instead with the
945          * dst entry of the nexthop router
946          */
947         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
948                 struct inet6_ifaddr *ifp;
949                 struct flowi fl_gw;
950                 int redirect;
951
952                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
953                                       (*dst)->dev, 1);
954
955                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
956                 if (ifp)
957                         in6_ifa_put(ifp);
958
959                 if (redirect) {
960                         /*
961                          * We need to get the dst entry for the
962                          * default router instead
963                          */
964                         dst_release(*dst);
965                         memcpy(&fl_gw, fl, sizeof(struct flowi));
966                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
967                         *dst = ip6_route_output(net, sk, &fl_gw);
968                         if ((err = (*dst)->error))
969                                 goto out_err_release;
970                 }
971         }
972 #endif
973
974         return 0;
975
976 out_err_release:
977         if (err == -ENETUNREACH)
978                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
979         dst_release(*dst);
980         *dst = NULL;
981         return err;
982 }
983
984 /**
985  *      ip6_dst_lookup - perform route lookup on flow
986  *      @sk: socket which provides route info
987  *      @dst: pointer to dst_entry * for result
988  *      @fl: flow to lookup
989  *
990  *      This function performs a route lookup on the given flow.
991  *
992  *      It returns zero on success, or a standard errno code on error.
993  */
994 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
995 {
996         *dst = NULL;
997         return ip6_dst_lookup_tail(sk, dst, fl);
998 }
999 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1000
1001 /**
1002  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1003  *      @sk: socket which provides the dst cache and route info
1004  *      @dst: pointer to dst_entry * for result
1005  *      @fl: flow to lookup
1006  *
1007  *      This function performs a route lookup on the given flow with the
1008  *      possibility of using the cached route in the socket if it is valid.
1009  *      It will take the socket dst lock when operating on the dst cache.
1010  *      As a result, this function can only be used in process context.
1011  *
1012  *      It returns zero on success, or a standard errno code on error.
1013  */
1014 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1015 {
1016         *dst = NULL;
1017         if (sk) {
1018                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1019                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1020         }
1021
1022         return ip6_dst_lookup_tail(sk, dst, fl);
1023 }
1024 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1025
1026 static inline int ip6_ufo_append_data(struct sock *sk,
1027                         int getfrag(void *from, char *to, int offset, int len,
1028                         int odd, struct sk_buff *skb),
1029                         void *from, int length, int hh_len, int fragheaderlen,
1030                         int transhdrlen, int mtu,unsigned int flags)
1031
1032 {
1033         struct sk_buff *skb;
1034         int err;
1035
1036         /* There is support for UDP large send offload by network
1037          * device, so create one single skb packet containing complete
1038          * udp datagram
1039          */
1040         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1041                 skb = sock_alloc_send_skb(sk,
1042                         hh_len + fragheaderlen + transhdrlen + 20,
1043                         (flags & MSG_DONTWAIT), &err);
1044                 if (skb == NULL)
1045                         return -ENOMEM;
1046
1047                 /* reserve space for Hardware header */
1048                 skb_reserve(skb, hh_len);
1049
1050                 /* create space for UDP/IP header */
1051                 skb_put(skb,fragheaderlen + transhdrlen);
1052
1053                 /* initialize network header pointer */
1054                 skb_reset_network_header(skb);
1055
1056                 /* initialize protocol header pointer */
1057                 skb->transport_header = skb->network_header + fragheaderlen;
1058
1059                 skb->ip_summed = CHECKSUM_PARTIAL;
1060                 skb->csum = 0;
1061                 sk->sk_sndmsg_off = 0;
1062         }
1063
1064         err = skb_append_datato_frags(sk,skb, getfrag, from,
1065                                       (length - transhdrlen));
1066         if (!err) {
1067                 struct frag_hdr fhdr;
1068
1069                 /* Specify the length of each IPv6 datagram fragment.
1070                  * It has to be a multiple of 8.
1071                  */
1072                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1073                                              sizeof(struct frag_hdr)) & ~7;
1074                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1075                 ipv6_select_ident(&fhdr);
1076                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1077                 __skb_queue_tail(&sk->sk_write_queue, skb);
1078
1079                 return 0;
1080         }
1081         /* There is not enough support do UPD LSO,
1082          * so follow normal path
1083          */
1084         kfree_skb(skb);
1085
1086         return err;
1087 }
1088
1089 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1090                                                gfp_t gfp)
1091 {
1092         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1093 }
1094
1095 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1096                                                 gfp_t gfp)
1097 {
1098         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1099 }
1100
1101 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1102         int offset, int len, int odd, struct sk_buff *skb),
1103         void *from, int length, int transhdrlen,
1104         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1105         struct rt6_info *rt, unsigned int flags, int dontfrag)
1106 {
1107         struct inet_sock *inet = inet_sk(sk);
1108         struct ipv6_pinfo *np = inet6_sk(sk);
1109         struct sk_buff *skb;
1110         unsigned int maxfraglen, fragheaderlen;
1111         int exthdrlen;
1112         int hh_len;
1113         int mtu;
1114         int copy;
1115         int err;
1116         int offset = 0;
1117         int csummode = CHECKSUM_NONE;
1118
1119         if (flags&MSG_PROBE)
1120                 return 0;
1121         if (skb_queue_empty(&sk->sk_write_queue)) {
1122                 /*
1123                  * setup for corking
1124                  */
1125                 if (opt) {
1126                         if (WARN_ON(np->cork.opt))
1127                                 return -EINVAL;
1128
1129                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1130                         if (unlikely(np->cork.opt == NULL))
1131                                 return -ENOBUFS;
1132
1133                         np->cork.opt->tot_len = opt->tot_len;
1134                         np->cork.opt->opt_flen = opt->opt_flen;
1135                         np->cork.opt->opt_nflen = opt->opt_nflen;
1136
1137                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1138                                                             sk->sk_allocation);
1139                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1140                                 return -ENOBUFS;
1141
1142                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1143                                                             sk->sk_allocation);
1144                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1145                                 return -ENOBUFS;
1146
1147                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1148                                                            sk->sk_allocation);
1149                         if (opt->hopopt && !np->cork.opt->hopopt)
1150                                 return -ENOBUFS;
1151
1152                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1153                                                             sk->sk_allocation);
1154                         if (opt->srcrt && !np->cork.opt->srcrt)
1155                                 return -ENOBUFS;
1156
1157                         /* need source address above miyazawa*/
1158                 }
1159                 dst_hold(&rt->dst);
1160                 inet->cork.dst = &rt->dst;
1161                 inet->cork.fl = *fl;
1162                 np->cork.hop_limit = hlimit;
1163                 np->cork.tclass = tclass;
1164                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1165                       rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1166                 if (np->frag_size < mtu) {
1167                         if (np->frag_size)
1168                                 mtu = np->frag_size;
1169                 }
1170                 inet->cork.fragsize = mtu;
1171                 if (dst_allfrag(rt->dst.path))
1172                         inet->cork.flags |= IPCORK_ALLFRAG;
1173                 inet->cork.length = 0;
1174                 sk->sk_sndmsg_page = NULL;
1175                 sk->sk_sndmsg_off = 0;
1176                 exthdrlen = rt->dst.header_len + (opt ? opt->opt_flen : 0) -
1177                             rt->rt6i_nfheader_len;
1178                 length += exthdrlen;
1179                 transhdrlen += exthdrlen;
1180         } else {
1181                 rt = (struct rt6_info *)inet->cork.dst;
1182                 fl = &inet->cork.fl;
1183                 opt = np->cork.opt;
1184                 transhdrlen = 0;
1185                 exthdrlen = 0;
1186                 mtu = inet->cork.fragsize;
1187         }
1188
1189         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1190
1191         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1192                         (opt ? opt->opt_nflen : 0);
1193         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1194
1195         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1196                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1197                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1198                         return -EMSGSIZE;
1199                 }
1200         }
1201
1202         /*
1203          * Let's try using as much space as possible.
1204          * Use MTU if total length of the message fits into the MTU.
1205          * Otherwise, we need to reserve fragment header and
1206          * fragment alignment (= 8-15 octects, in total).
1207          *
1208          * Note that we may need to "move" the data from the tail of
1209          * of the buffer to the new fragment when we split
1210          * the message.
1211          *
1212          * FIXME: It may be fragmented into multiple chunks
1213          *        at once if non-fragmentable extension headers
1214          *        are too large.
1215          * --yoshfuji
1216          */
1217
1218         inet->cork.length += length;
1219         if (length > mtu) {
1220                 int proto = sk->sk_protocol;
1221                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1222                         ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1223                         return -EMSGSIZE;
1224                 }
1225
1226                 if (proto == IPPROTO_UDP &&
1227                     (rt->dst.dev->features & NETIF_F_UFO)) {
1228
1229                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1230                                                   hh_len, fragheaderlen,
1231                                                   transhdrlen, mtu, flags);
1232                         if (err)
1233                                 goto error;
1234                         return 0;
1235                 }
1236         }
1237
1238         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1239                 goto alloc_new_skb;
1240
1241         while (length > 0) {
1242                 /* Check if the remaining data fits into current packet. */
1243                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1244                 if (copy < length)
1245                         copy = maxfraglen - skb->len;
1246
1247                 if (copy <= 0) {
1248                         char *data;
1249                         unsigned int datalen;
1250                         unsigned int fraglen;
1251                         unsigned int fraggap;
1252                         unsigned int alloclen;
1253                         struct sk_buff *skb_prev;
1254 alloc_new_skb:
1255                         skb_prev = skb;
1256
1257                         /* There's no room in the current skb */
1258                         if (skb_prev)
1259                                 fraggap = skb_prev->len - maxfraglen;
1260                         else
1261                                 fraggap = 0;
1262
1263                         /*
1264                          * If remaining data exceeds the mtu,
1265                          * we know we need more fragment(s).
1266                          */
1267                         datalen = length + fraggap;
1268                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1269                                 datalen = maxfraglen - fragheaderlen;
1270
1271                         fraglen = datalen + fragheaderlen;
1272                         if ((flags & MSG_MORE) &&
1273                             !(rt->dst.dev->features&NETIF_F_SG))
1274                                 alloclen = mtu;
1275                         else
1276                                 alloclen = datalen + fragheaderlen;
1277
1278                         /*
1279                          * The last fragment gets additional space at tail.
1280                          * Note: we overallocate on fragments with MSG_MODE
1281                          * because we have no idea if we're the last one.
1282                          */
1283                         if (datalen == length + fraggap)
1284                                 alloclen += rt->dst.trailer_len;
1285
1286                         /*
1287                          * We just reserve space for fragment header.
1288                          * Note: this may be overallocation if the message
1289                          * (without MSG_MORE) fits into the MTU.
1290                          */
1291                         alloclen += sizeof(struct frag_hdr);
1292
1293                         if (transhdrlen) {
1294                                 skb = sock_alloc_send_skb(sk,
1295                                                 alloclen + hh_len,
1296                                                 (flags & MSG_DONTWAIT), &err);
1297                         } else {
1298                                 skb = NULL;
1299                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1300                                     2 * sk->sk_sndbuf)
1301                                         skb = sock_wmalloc(sk,
1302                                                            alloclen + hh_len, 1,
1303                                                            sk->sk_allocation);
1304                                 if (unlikely(skb == NULL))
1305                                         err = -ENOBUFS;
1306                         }
1307                         if (skb == NULL)
1308                                 goto error;
1309                         /*
1310                          *      Fill in the control structures
1311                          */
1312                         skb->ip_summed = csummode;
1313                         skb->csum = 0;
1314                         /* reserve for fragmentation */
1315                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1316
1317                         /*
1318                          *      Find where to start putting bytes
1319                          */
1320                         data = skb_put(skb, fraglen);
1321                         skb_set_network_header(skb, exthdrlen);
1322                         data += fragheaderlen;
1323                         skb->transport_header = (skb->network_header +
1324                                                  fragheaderlen);
1325                         if (fraggap) {
1326                                 skb->csum = skb_copy_and_csum_bits(
1327                                         skb_prev, maxfraglen,
1328                                         data + transhdrlen, fraggap, 0);
1329                                 skb_prev->csum = csum_sub(skb_prev->csum,
1330                                                           skb->csum);
1331                                 data += fraggap;
1332                                 pskb_trim_unique(skb_prev, maxfraglen);
1333                         }
1334                         copy = datalen - transhdrlen - fraggap;
1335                         if (copy < 0) {
1336                                 err = -EINVAL;
1337                                 kfree_skb(skb);
1338                                 goto error;
1339                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1340                                 err = -EFAULT;
1341                                 kfree_skb(skb);
1342                                 goto error;
1343                         }
1344
1345                         offset += copy;
1346                         length -= datalen - fraggap;
1347                         transhdrlen = 0;
1348                         exthdrlen = 0;
1349                         csummode = CHECKSUM_NONE;
1350
1351                         /*
1352                          * Put the packet on the pending queue
1353                          */
1354                         __skb_queue_tail(&sk->sk_write_queue, skb);
1355                         continue;
1356                 }
1357
1358                 if (copy > length)
1359                         copy = length;
1360
1361                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1362                         unsigned int off;
1363
1364                         off = skb->len;
1365                         if (getfrag(from, skb_put(skb, copy),
1366                                                 offset, copy, off, skb) < 0) {
1367                                 __skb_trim(skb, off);
1368                                 err = -EFAULT;
1369                                 goto error;
1370                         }
1371                 } else {
1372                         int i = skb_shinfo(skb)->nr_frags;
1373                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1374                         struct page *page = sk->sk_sndmsg_page;
1375                         int off = sk->sk_sndmsg_off;
1376                         unsigned int left;
1377
1378                         if (page && (left = PAGE_SIZE - off) > 0) {
1379                                 if (copy >= left)
1380                                         copy = left;
1381                                 if (page != frag->page) {
1382                                         if (i == MAX_SKB_FRAGS) {
1383                                                 err = -EMSGSIZE;
1384                                                 goto error;
1385                                         }
1386                                         get_page(page);
1387                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1388                                         frag = &skb_shinfo(skb)->frags[i];
1389                                 }
1390                         } else if(i < MAX_SKB_FRAGS) {
1391                                 if (copy > PAGE_SIZE)
1392                                         copy = PAGE_SIZE;
1393                                 page = alloc_pages(sk->sk_allocation, 0);
1394                                 if (page == NULL) {
1395                                         err = -ENOMEM;
1396                                         goto error;
1397                                 }
1398                                 sk->sk_sndmsg_page = page;
1399                                 sk->sk_sndmsg_off = 0;
1400
1401                                 skb_fill_page_desc(skb, i, page, 0, 0);
1402                                 frag = &skb_shinfo(skb)->frags[i];
1403                         } else {
1404                                 err = -EMSGSIZE;
1405                                 goto error;
1406                         }
1407                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1408                                 err = -EFAULT;
1409                                 goto error;
1410                         }
1411                         sk->sk_sndmsg_off += copy;
1412                         frag->size += copy;
1413                         skb->len += copy;
1414                         skb->data_len += copy;
1415                         skb->truesize += copy;
1416                         atomic_add(copy, &sk->sk_wmem_alloc);
1417                 }
1418                 offset += copy;
1419                 length -= copy;
1420         }
1421         return 0;
1422 error:
1423         inet->cork.length -= length;
1424         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1425         return err;
1426 }
1427
1428 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1429 {
1430         if (np->cork.opt) {
1431                 kfree(np->cork.opt->dst0opt);
1432                 kfree(np->cork.opt->dst1opt);
1433                 kfree(np->cork.opt->hopopt);
1434                 kfree(np->cork.opt->srcrt);
1435                 kfree(np->cork.opt);
1436                 np->cork.opt = NULL;
1437         }
1438
1439         if (inet->cork.dst) {
1440                 dst_release(inet->cork.dst);
1441                 inet->cork.dst = NULL;
1442                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1443         }
1444         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1445 }
1446
1447 int ip6_push_pending_frames(struct sock *sk)
1448 {
1449         struct sk_buff *skb, *tmp_skb;
1450         struct sk_buff **tail_skb;
1451         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1452         struct inet_sock *inet = inet_sk(sk);
1453         struct ipv6_pinfo *np = inet6_sk(sk);
1454         struct net *net = sock_net(sk);
1455         struct ipv6hdr *hdr;
1456         struct ipv6_txoptions *opt = np->cork.opt;
1457         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1458         struct flowi *fl = &inet->cork.fl;
1459         unsigned char proto = fl->proto;
1460         int err = 0;
1461
1462         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1463                 goto out;
1464         tail_skb = &(skb_shinfo(skb)->frag_list);
1465
1466         /* move skb->data to ip header from ext header */
1467         if (skb->data < skb_network_header(skb))
1468                 __skb_pull(skb, skb_network_offset(skb));
1469         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1470                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1471                 *tail_skb = tmp_skb;
1472                 tail_skb = &(tmp_skb->next);
1473                 skb->len += tmp_skb->len;
1474                 skb->data_len += tmp_skb->len;
1475                 skb->truesize += tmp_skb->truesize;
1476                 tmp_skb->destructor = NULL;
1477                 tmp_skb->sk = NULL;
1478         }
1479
1480         /* Allow local fragmentation. */
1481         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1482                 skb->local_df = 1;
1483
1484         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1485         __skb_pull(skb, skb_network_header_len(skb));
1486         if (opt && opt->opt_flen)
1487                 ipv6_push_frag_opts(skb, opt, &proto);
1488         if (opt && opt->opt_nflen)
1489                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1490
1491         skb_push(skb, sizeof(struct ipv6hdr));
1492         skb_reset_network_header(skb);
1493         hdr = ipv6_hdr(skb);
1494
1495         *(__be32*)hdr = fl->fl6_flowlabel |
1496                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1497
1498         hdr->hop_limit = np->cork.hop_limit;
1499         hdr->nexthdr = proto;
1500         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1501         ipv6_addr_copy(&hdr->daddr, final_dst);
1502
1503         skb->priority = sk->sk_priority;
1504         skb->mark = sk->sk_mark;
1505
1506         skb_dst_set(skb, dst_clone(&rt->dst));
1507         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1508         if (proto == IPPROTO_ICMPV6) {
1509                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1510
1511                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1512                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1513         }
1514
1515         err = ip6_local_out(skb);
1516         if (err) {
1517                 if (err > 0)
1518                         err = net_xmit_errno(err);
1519                 if (err)
1520                         goto error;
1521         }
1522
1523 out:
1524         ip6_cork_release(inet, np);
1525         return err;
1526 error:
1527         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1528         goto out;
1529 }
1530
1531 void ip6_flush_pending_frames(struct sock *sk)
1532 {
1533         struct sk_buff *skb;
1534
1535         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1536                 if (skb_dst(skb))
1537                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1538                                       IPSTATS_MIB_OUTDISCARDS);
1539                 kfree_skb(skb);
1540         }
1541
1542         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1543 }