ip: Report qdisc packet drops
[linux-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40
41 #include <linux/netfilter.h>
42 #include <linux/netfilter_ipv6.h>
43
44 #include <net/sock.h>
45 #include <net/snmp.h>
46
47 #include <net/ipv6.h>
48 #include <net/ndisc.h>
49 #include <net/protocol.h>
50 #include <net/ip6_route.h>
51 #include <net/addrconf.h>
52 #include <net/rawv6.h>
53 #include <net/icmp.h>
54 #include <net/xfrm.h>
55 #include <net/checksum.h>
56 #include <linux/mroute6.h>
57
58 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59
60 int __ip6_local_out(struct sk_buff *skb)
61 {
62         int len;
63
64         len = skb->len - sizeof(struct ipv6hdr);
65         if (len > IPV6_MAXPLEN)
66                 len = 0;
67         ipv6_hdr(skb)->payload_len = htons(len);
68
69         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
70                        dst_output);
71 }
72
/*
 *      Send a locally generated packet: finish the header via
 *      __ip6_local_out() and, when that returns 1, continue with
 *      dst_output().  Any other value from __ip6_local_out() is returned
 *      to the caller unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
        int rc = __ip6_local_out(skb);

        if (unlikely(rc != 1))
                return rc;

        return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
84
85 static int ip6_output_finish(struct sk_buff *skb)
86 {
87         struct dst_entry *dst = skb_dst(skb);
88
89         if (dst->hh)
90                 return neigh_hh_output(dst->hh, skb);
91         else if (dst->neighbour)
92                 return dst->neighbour->output(skb);
93
94         IP6_INC_STATS_BH(dev_net(dst->dev),
95                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
96         kfree_skb(skb);
97         return -EINVAL;
98
99 }
100
101 /* dev_loopback_xmit for use with netfilter. */
102 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
103 {
104         skb_reset_mac_header(newskb);
105         __skb_pull(newskb, skb_network_offset(newskb));
106         newskb->pkt_type = PACKET_LOOPBACK;
107         newskb->ip_summed = CHECKSUM_UNNECESSARY;
108         WARN_ON(!skb_dst(newskb));
109
110         netif_rx(newskb);
111         return 0;
112 }
113
114
115 static int ip6_output2(struct sk_buff *skb)
116 {
117         struct dst_entry *dst = skb_dst(skb);
118         struct net_device *dev = dst->dev;
119
120         skb->protocol = htons(ETH_P_IPV6);
121         skb->dev = dev;
122
123         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
124                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
125                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
126
127                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
128                     ((mroute6_socket(dev_net(dev)) &&
129                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
130                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
131                                          &ipv6_hdr(skb)->saddr))) {
132                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
133
134                         /* Do not check for IFF_ALLMULTI; multicast routing
135                            is not supported in any case.
136                          */
137                         if (newskb)
138                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
139                                         NULL, newskb->dev,
140                                         ip6_dev_loopback_xmit);
141
142                         if (ipv6_hdr(skb)->hop_limit == 0) {
143                                 IP6_INC_STATS(dev_net(dev), idev,
144                                               IPSTATS_MIB_OUTDISCARDS);
145                                 kfree_skb(skb);
146                                 return 0;
147                         }
148                 }
149
150                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
151                                 skb->len);
152         }
153
154         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
155                        ip6_output_finish);
156 }
157
158 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
159 {
160         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
161
162         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
163                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
164 }
165
166 int ip6_output(struct sk_buff *skb)
167 {
168         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
169         if (unlikely(idev->cnf.disable_ipv6)) {
170                 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
171                               IPSTATS_MIB_OUTDISCARDS);
172                 kfree_skb(skb);
173                 return 0;
174         }
175
176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177                                 dst_allfrag(skb_dst(skb)))
178                 return ip6_fragment(skb, ip6_output2);
179         else
180                 return ip6_output2(skb);
181 }
182
/*
 *      xmit an sk_buff (used by TCP)
 *
 *      Prepends any extension headers from @opt and the IPv6 header
 *      (built from @fl and the socket's settings) to @skb, then passes
 *      the packet to the NF_INET_LOCAL_OUT hook with dst_output() as the
 *      continuation.
 *
 *      @ipfragok: when non-zero, skb->local_df is set so the packet may
 *                 be fragmented locally.
 *
 *      Returns the NF_HOOK() result when the packet is sent, -ENOBUFS if
 *      headroom for the extension headers could not be allocated, or
 *      -EMSGSIZE if the packet exceeds the route MTU and may not be
 *      fragmented (a PKT_TOOBIG ICMPv6 error is sent to ourselves).  The
 *      skb is freed on both error paths.
 *
 *      NOTE(review): sk is dereferenced unconditionally (sock_net(sk),
 *      sk->sk_priority, sk->sk_mark) although the headroom branch tests
 *      "if (sk)"; callers presumably always pass a socket -- confirm.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
             struct ipv6_txoptions *opt, int ipfragok)
{
        struct net *net = sock_net(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl->fl6_dst;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl->proto;
        int seg_len = skb->len;
        int hlimit = -1;
        int tclass = 0;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now)
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                /* The extension headers count towards payload_len. */
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (skb2 == NULL) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        /* Continue with the reallocated copy and charge
                         * it to the socket.
                         */
                        kfree_skb(skb);
                        skb = skb2;
                        if (sk)
                                skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                /* May replace first_hop, which is used below as the
                 * destination address written into the header.
                 */
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /* Allow local fragmentation. */
        if (ipfragok)
                skb->local_df = 1;

        /*
         *      Fill in the IPv6 header
         */
        if (np) {
                tclass = np->tclass;
                hlimit = np->hop_limit;
        }
        /* A negative hop limit means "not set": use the route's value. */
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        /* First 32 bits: version 6, traffic class, flow label. */
        *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
        ipv6_addr_copy(&hdr->daddr, first_hop);

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);
                return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
                                dst_output);
        }

        /* Too big and fragmentation not permitted: report it locally. */
        if (net_ratelimit())
                printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
278
279 /*
280  *      To avoid extra problems ND packets are send through this
281  *      routine. It's code duplication but I really want to avoid
282  *      extra checks since ipv6_build_header is used by TCP (which
283  *      is for us performance critical)
284  */
285
286 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
287                const struct in6_addr *saddr, const struct in6_addr *daddr,
288                int proto, int len)
289 {
290         struct ipv6_pinfo *np = inet6_sk(sk);
291         struct ipv6hdr *hdr;
292         int totlen;
293
294         skb->protocol = htons(ETH_P_IPV6);
295         skb->dev = dev;
296
297         totlen = len + sizeof(struct ipv6hdr);
298
299         skb_reset_network_header(skb);
300         skb_put(skb, sizeof(struct ipv6hdr));
301         hdr = ipv6_hdr(skb);
302
303         *(__be32*)hdr = htonl(0x60000000);
304
305         hdr->payload_len = htons(len);
306         hdr->nexthdr = proto;
307         hdr->hop_limit = np->hop_limit;
308
309         ipv6_addr_copy(&hdr->saddr, saddr);
310         ipv6_addr_copy(&hdr->daddr, daddr);
311
312         return 0;
313 }
314
315 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
316 {
317         struct ip6_ra_chain *ra;
318         struct sock *last = NULL;
319
320         read_lock(&ip6_ra_lock);
321         for (ra = ip6_ra_chain; ra; ra = ra->next) {
322                 struct sock *sk = ra->sk;
323                 if (sk && ra->sel == sel &&
324                     (!sk->sk_bound_dev_if ||
325                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
326                         if (last) {
327                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
328                                 if (skb2)
329                                         rawv6_rcv(last, skb2);
330                         }
331                         last = sk;
332                 }
333         }
334
335         if (last) {
336                 rawv6_rcv(last, skb);
337                 read_unlock(&ip6_ra_lock);
338                 return 1;
339         }
340         read_unlock(&ip6_ra_lock);
341         return 0;
342 }
343
344 static int ip6_forward_proxy_check(struct sk_buff *skb)
345 {
346         struct ipv6hdr *hdr = ipv6_hdr(skb);
347         u8 nexthdr = hdr->nexthdr;
348         int offset;
349
350         if (ipv6_ext_hdr(nexthdr)) {
351                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
352                 if (offset < 0)
353                         return 0;
354         } else
355                 offset = sizeof(struct ipv6hdr);
356
357         if (nexthdr == IPPROTO_ICMPV6) {
358                 struct icmp6hdr *icmp6;
359
360                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
361                                          offset + 1 - skb->data)))
362                         return 0;
363
364                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
365
366                 switch (icmp6->icmp6_type) {
367                 case NDISC_ROUTER_SOLICITATION:
368                 case NDISC_ROUTER_ADVERTISEMENT:
369                 case NDISC_NEIGHBOUR_SOLICITATION:
370                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
371                 case NDISC_REDIRECT:
372                         /* For reaction involving unicast neighbor discovery
373                          * message destined to the proxied address, pass it to
374                          * input function.
375                          */
376                         return 1;
377                 default:
378                         break;
379                 }
380         }
381
382         /*
383          * The proxying router can't forward traffic sent to a link-local
384          * address, so signal the sender and discard the packet. This
385          * behavior is clarified by the MIPv6 specification.
386          */
387         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
388                 dst_link_failure(skb);
389                 return -1;
390         }
391
392         return 0;
393 }
394
/*
 *      Continuation handed to the NF_INET_FORWARD hook by ip6_forward():
 *      passes the packet on to dst_output() and returns its result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}
399
/*
 *      Forward a received IPv6 packet towards its destination.
 *
 *      Performs, in order: the forwarding-enabled check, LRO and xfrm
 *      policy checks, Router Alert delivery, the hop limit check, NDP
 *      proxy handling, xfrm route lookup, redirect generation or source
 *      address validation, the MTU check, and finally decrements the hop
 *      limit and passes the packet to the NF_INET_FORWARD hook.
 *
 *      Returns 0 if the packet was consumed by a Router Alert socket,
 *      -ETIMEDOUT if the hop limit ran out, -EMSGSIZE if the packet is
 *      larger than the route MTU, -EINVAL if it was dropped for any other
 *      reason, and otherwise the result of ip6_input() (proxied ND
 *      messages) or NF_HOOK().  The skb is freed on every error path.
 */
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        skb_forward_csum(skb);

        /*
         *      We DO NOT make any processing on
         *      RA packets, pushing them to user level AS IS
         *      without any WARRANTY that application will be able
         *      to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not end-node, so that if packet contains
         *      AH/ESP, we cannot make anything.
         *      Defragmentation also would be mistake, RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (opt->ra) {
                /* opt->ra is the offset of the Router Alert option from
                 * the network header; bytes 2 and 3 of the option form
                 * the 16-bit selector matched against the RA chain.
                 */
                u8 *ptr = skb_network_header(skb) + opt->ra;
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
                            0, skb->dev);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                /* >0: ND message for the proxied address, deliver locally;
                 * <0: must be discarded; 0: forward as usual.
                 */
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(dst),
                                      IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        /* Re-read the route after xfrm6_route_forward(). */
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
            !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct rt6_info *rt;
                struct neighbour *n = dst->neighbour;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                /* Redirect target: the gateway's address for gateway
                 * routes, otherwise the destination itself.
                 */
                rt = (struct rt6_info *) dst;
                if ((rt->rt6i_flags & RTF_GATEWAY))
                        target = (struct in6_addr*)&n->primary_key;
                else
                        target = &hdr->daddr;

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (xrlim_allow(dst, 1*HZ))
                        ndisc_send_redirect(skb, n, target);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
                        goto error;
                }
        }

        if (skb->len > dst_mtu(dst)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        /* Re-read the header pointer after skb_cow(). */
        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

        /* "error" additionally counts an address error, then falls
         * through to the common drop path.
         */
error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}
546
547 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
548 {
549         to->pkt_type = from->pkt_type;
550         to->priority = from->priority;
551         to->protocol = from->protocol;
552         skb_dst_drop(to);
553         skb_dst_set(to, dst_clone(skb_dst(from)));
554         to->dev = from->dev;
555         to->mark = from->mark;
556
557 #ifdef CONFIG_NET_SCHED
558         to->tc_index = from->tc_index;
559 #endif
560         nf_copy(to, from);
561 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
562     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
563         to->nf_trace = from->nf_trace;
564 #endif
565         skb_copy_secmark(to, from);
566 }
567
568 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
569 {
570         u16 offset = sizeof(struct ipv6hdr);
571         struct ipv6_opt_hdr *exthdr =
572                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
573         unsigned int packet_len = skb->tail - skb->network_header;
574         int found_rhdr = 0;
575         *nexthdr = &ipv6_hdr(skb)->nexthdr;
576
577         while (offset + 1 <= packet_len) {
578
579                 switch (**nexthdr) {
580
581                 case NEXTHDR_HOP:
582                         break;
583                 case NEXTHDR_ROUTING:
584                         found_rhdr = 1;
585                         break;
586                 case NEXTHDR_DEST:
587 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
588                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
589                                 break;
590 #endif
591                         if (found_rhdr)
592                                 return offset;
593                         break;
594                 default :
595                         return offset;
596                 }
597
598                 offset += ipv6_optlen(exthdr);
599                 *nexthdr = &exthdr->nexthdr;
600                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
601                                                  offset);
602         }
603
604         return offset;
605 }
606
607 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
608 {
609         struct sk_buff *frag;
610         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
611         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
612         struct ipv6hdr *tmp_hdr;
613         struct frag_hdr *fh;
614         unsigned int mtu, hlen, left, len;
615         __be32 frag_id = 0;
616         int ptr, offset = 0, err=0;
617         u8 *prevhdr, nexthdr = 0;
618         struct net *net = dev_net(skb_dst(skb)->dev);
619
620         hlen = ip6_find_1stfragopt(skb, &prevhdr);
621         nexthdr = *prevhdr;
622
623         mtu = ip6_skb_dst_mtu(skb);
624
625         /* We must not fragment if the socket is set to force MTU discovery
626          * or if the skb it not generated by a local socket.  (This last
627          * check should be redundant, but it's free.)
628          */
629         if (!skb->local_df) {
630                 skb->dev = skb_dst(skb)->dev;
631                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
632                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
633                               IPSTATS_MIB_FRAGFAILS);
634                 kfree_skb(skb);
635                 return -EMSGSIZE;
636         }
637
638         if (np && np->frag_size < mtu) {
639                 if (np->frag_size)
640                         mtu = np->frag_size;
641         }
642         mtu -= hlen + sizeof(struct frag_hdr);
643
644         if (skb_has_frags(skb)) {
645                 int first_len = skb_pagelen(skb);
646                 int truesizes = 0;
647
648                 if (first_len - hlen > mtu ||
649                     ((first_len - hlen) & 7) ||
650                     skb_cloned(skb))
651                         goto slow_path;
652
653                 skb_walk_frags(skb, frag) {
654                         /* Correct geometry. */
655                         if (frag->len > mtu ||
656                             ((frag->len & 7) && frag->next) ||
657                             skb_headroom(frag) < hlen)
658                             goto slow_path;
659
660                         /* Partially cloned skb? */
661                         if (skb_shared(frag))
662                                 goto slow_path;
663
664                         BUG_ON(frag->sk);
665                         if (skb->sk) {
666                                 frag->sk = skb->sk;
667                                 frag->destructor = sock_wfree;
668                                 truesizes += frag->truesize;
669                         }
670                 }
671
672                 err = 0;
673                 offset = 0;
674                 frag = skb_shinfo(skb)->frag_list;
675                 skb_frag_list_init(skb);
676                 /* BUILD HEADER */
677
678                 *prevhdr = NEXTHDR_FRAGMENT;
679                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
680                 if (!tmp_hdr) {
681                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
682                                       IPSTATS_MIB_FRAGFAILS);
683                         return -ENOMEM;
684                 }
685
686                 __skb_pull(skb, hlen);
687                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
688                 __skb_push(skb, hlen);
689                 skb_reset_network_header(skb);
690                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
691
692                 ipv6_select_ident(fh);
693                 fh->nexthdr = nexthdr;
694                 fh->reserved = 0;
695                 fh->frag_off = htons(IP6_MF);
696                 frag_id = fh->identification;
697
698                 first_len = skb_pagelen(skb);
699                 skb->data_len = first_len - skb_headlen(skb);
700                 skb->truesize -= truesizes;
701                 skb->len = first_len;
702                 ipv6_hdr(skb)->payload_len = htons(first_len -
703                                                    sizeof(struct ipv6hdr));
704
705                 dst_hold(&rt->u.dst);
706
707                 for (;;) {
708                         /* Prepare header of the next frame,
709                          * before previous one went down. */
710                         if (frag) {
711                                 frag->ip_summed = CHECKSUM_NONE;
712                                 skb_reset_transport_header(frag);
713                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
714                                 __skb_push(frag, hlen);
715                                 skb_reset_network_header(frag);
716                                 memcpy(skb_network_header(frag), tmp_hdr,
717                                        hlen);
718                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
719                                 fh->nexthdr = nexthdr;
720                                 fh->reserved = 0;
721                                 fh->frag_off = htons(offset);
722                                 if (frag->next != NULL)
723                                         fh->frag_off |= htons(IP6_MF);
724                                 fh->identification = frag_id;
725                                 ipv6_hdr(frag)->payload_len =
726                                                 htons(frag->len -
727                                                       sizeof(struct ipv6hdr));
728                                 ip6_copy_metadata(frag, skb);
729                         }
730
731                         err = output(skb);
732                         if(!err)
733                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
734                                               IPSTATS_MIB_FRAGCREATES);
735
736                         if (err || !frag)
737                                 break;
738
739                         skb = frag;
740                         frag = skb->next;
741                         skb->next = NULL;
742                 }
743
744                 kfree(tmp_hdr);
745
746                 if (err == 0) {
747                         IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
748                                       IPSTATS_MIB_FRAGOKS);
749                         dst_release(&rt->u.dst);
750                         return 0;
751                 }
752
753                 while (frag) {
754                         skb = frag->next;
755                         kfree_skb(frag);
756                         frag = skb;
757                 }
758
759                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
760                               IPSTATS_MIB_FRAGFAILS);
761                 dst_release(&rt->u.dst);
762                 return err;
763         }
764
765 slow_path:
766         left = skb->len - hlen;         /* Space per frame */
767         ptr = hlen;                     /* Where to start from */
768
769         /*
770          *      Fragment the datagram.
771          */
772
773         *prevhdr = NEXTHDR_FRAGMENT;
774
775         /*
776          *      Keep copying data until we run out.
777          */
778         while(left > 0) {
779                 len = left;
780                 /* IF: it doesn't fit, use 'mtu' - the data space left */
781                 if (len > mtu)
782                         len = mtu;
783                 /* IF: we are not sending upto and including the packet end
784                    then align the next start on an eight byte boundary */
785                 if (len < left) {
786                         len &= ~7;
787                 }
788                 /*
789                  *      Allocate buffer.
790                  */
791
792                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
793                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
794                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
795                                       IPSTATS_MIB_FRAGFAILS);
796                         err = -ENOMEM;
797                         goto fail;
798                 }
799
800                 /*
801                  *      Set up data on packet
802                  */
803
804                 ip6_copy_metadata(frag, skb);
805                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
806                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
807                 skb_reset_network_header(frag);
808                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
809                 frag->transport_header = (frag->network_header + hlen +
810                                           sizeof(struct frag_hdr));
811
812                 /*
813                  *      Charge the memory for the fragment to any owner
814                  *      it might possess
815                  */
816                 if (skb->sk)
817                         skb_set_owner_w(frag, skb->sk);
818
819                 /*
820                  *      Copy the packet header into the new buffer.
821                  */
822                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
823
824                 /*
825                  *      Build fragment header.
826                  */
827                 fh->nexthdr = nexthdr;
828                 fh->reserved = 0;
829                 if (!frag_id) {
830                         ipv6_select_ident(fh);
831                         frag_id = fh->identification;
832                 } else
833                         fh->identification = frag_id;
834
835                 /*
836                  *      Copy a block of the IP datagram.
837                  */
838                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
839                         BUG();
840                 left -= len;
841
842                 fh->frag_off = htons(offset);
843                 if (left > 0)
844                         fh->frag_off |= htons(IP6_MF);
845                 ipv6_hdr(frag)->payload_len = htons(frag->len -
846                                                     sizeof(struct ipv6hdr));
847
848                 ptr += len;
849                 offset += len;
850
851                 /*
852                  *      Put this fragment into the sending queue.
853                  */
854                 err = output(frag);
855                 if (err)
856                         goto fail;
857
858                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
859                               IPSTATS_MIB_FRAGCREATES);
860         }
861         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
862                       IPSTATS_MIB_FRAGOKS);
863         kfree_skb(skb);
864         return err;
865
866 fail:
867         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
868                       IPSTATS_MIB_FRAGFAILS);
869         kfree_skb(skb);
870         return err;
871 }
872
873 static inline int ip6_rt_check(struct rt6key *rt_key,
874                                struct in6_addr *fl_addr,
875                                struct in6_addr *addr_cache)
876 {
877         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
878                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
879 }
880
881 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
882                                           struct dst_entry *dst,
883                                           struct flowi *fl)
884 {
885         struct ipv6_pinfo *np = inet6_sk(sk);
886         struct rt6_info *rt = (struct rt6_info *)dst;
887
888         if (!dst)
889                 goto out;
890
891         /* Yes, checking route validity in not connected
892          * case is not very simple. Take into account,
893          * that we do not support routing by source, TOS,
894          * and MSG_DONTROUTE            --ANK (980726)
895          *
896          * 1. ip6_rt_check(): If route was host route,
897          *    check that cached destination is current.
898          *    If it is network route, we still may
899          *    check its validity using saved pointer
900          *    to the last used address: daddr_cache.
901          *    We do not want to save whole address now,
902          *    (because main consumer of this service
903          *    is tcp, which has not this problem),
904          *    so that the last trick works only on connected
905          *    sockets.
906          * 2. oif also should be the same.
907          */
908         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
909 #ifdef CONFIG_IPV6_SUBTREES
910             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
911 #endif
912             (fl->oif && fl->oif != dst->dev->ifindex)) {
913                 dst_release(dst);
914                 dst = NULL;
915         }
916
917 out:
918         return dst;
919 }
920
921 static int ip6_dst_lookup_tail(struct sock *sk,
922                                struct dst_entry **dst, struct flowi *fl)
923 {
924         int err;
925         struct net *net = sock_net(sk);
926
927         if (*dst == NULL)
928                 *dst = ip6_route_output(net, sk, fl);
929
930         if ((err = (*dst)->error))
931                 goto out_err_release;
932
933         if (ipv6_addr_any(&fl->fl6_src)) {
934                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
935                                          &fl->fl6_dst,
936                                          sk ? inet6_sk(sk)->srcprefs : 0,
937                                          &fl->fl6_src);
938                 if (err)
939                         goto out_err_release;
940         }
941
942 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
943         /*
944          * Here if the dst entry we've looked up
945          * has a neighbour entry that is in the INCOMPLETE
946          * state and the src address from the flow is
947          * marked as OPTIMISTIC, we release the found
948          * dst entry and replace it instead with the
949          * dst entry of the nexthop router
950          */
951         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
952                 struct inet6_ifaddr *ifp;
953                 struct flowi fl_gw;
954                 int redirect;
955
956                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
957                                       (*dst)->dev, 1);
958
959                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
960                 if (ifp)
961                         in6_ifa_put(ifp);
962
963                 if (redirect) {
964                         /*
965                          * We need to get the dst entry for the
966                          * default router instead
967                          */
968                         dst_release(*dst);
969                         memcpy(&fl_gw, fl, sizeof(struct flowi));
970                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
971                         *dst = ip6_route_output(net, sk, &fl_gw);
972                         if ((err = (*dst)->error))
973                                 goto out_err_release;
974                 }
975         }
976 #endif
977
978         return 0;
979
980 out_err_release:
981         if (err == -ENETUNREACH)
982                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
983         dst_release(*dst);
984         *dst = NULL;
985         return err;
986 }
987
988 /**
989  *      ip6_dst_lookup - perform route lookup on flow
990  *      @sk: socket which provides route info
991  *      @dst: pointer to dst_entry * for result
992  *      @fl: flow to lookup
993  *
994  *      This function performs a route lookup on the given flow.
995  *
996  *      It returns zero on success, or a standard errno code on error.
997  */
998 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
999 {
1000         *dst = NULL;
1001         return ip6_dst_lookup_tail(sk, dst, fl);
1002 }
1003 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1004
1005 /**
1006  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1007  *      @sk: socket which provides the dst cache and route info
1008  *      @dst: pointer to dst_entry * for result
1009  *      @fl: flow to lookup
1010  *
1011  *      This function performs a route lookup on the given flow with the
1012  *      possibility of using the cached route in the socket if it is valid.
1013  *      It will take the socket dst lock when operating on the dst cache.
1014  *      As a result, this function can only be used in process context.
1015  *
1016  *      It returns zero on success, or a standard errno code on error.
1017  */
1018 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1019 {
1020         *dst = NULL;
1021         if (sk) {
1022                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1023                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1024         }
1025
1026         return ip6_dst_lookup_tail(sk, dst, fl);
1027 }
1028 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1029
1030 static inline int ip6_ufo_append_data(struct sock *sk,
1031                         int getfrag(void *from, char *to, int offset, int len,
1032                         int odd, struct sk_buff *skb),
1033                         void *from, int length, int hh_len, int fragheaderlen,
1034                         int transhdrlen, int mtu,unsigned int flags)
1035
1036 {
1037         struct sk_buff *skb;
1038         int err;
1039
1040         /* There is support for UDP large send offload by network
1041          * device, so create one single skb packet containing complete
1042          * udp datagram
1043          */
1044         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1045                 skb = sock_alloc_send_skb(sk,
1046                         hh_len + fragheaderlen + transhdrlen + 20,
1047                         (flags & MSG_DONTWAIT), &err);
1048                 if (skb == NULL)
1049                         return -ENOMEM;
1050
1051                 /* reserve space for Hardware header */
1052                 skb_reserve(skb, hh_len);
1053
1054                 /* create space for UDP/IP header */
1055                 skb_put(skb,fragheaderlen + transhdrlen);
1056
1057                 /* initialize network header pointer */
1058                 skb_reset_network_header(skb);
1059
1060                 /* initialize protocol header pointer */
1061                 skb->transport_header = skb->network_header + fragheaderlen;
1062
1063                 skb->ip_summed = CHECKSUM_PARTIAL;
1064                 skb->csum = 0;
1065                 sk->sk_sndmsg_off = 0;
1066         }
1067
1068         err = skb_append_datato_frags(sk,skb, getfrag, from,
1069                                       (length - transhdrlen));
1070         if (!err) {
1071                 struct frag_hdr fhdr;
1072
1073                 /* Specify the length of each IPv6 datagram fragment.
1074                  * It has to be a multiple of 8.
1075                  */
1076                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1077                                              sizeof(struct frag_hdr)) & ~7;
1078                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1079                 ipv6_select_ident(&fhdr);
1080                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1081                 __skb_queue_tail(&sk->sk_write_queue, skb);
1082
1083                 return 0;
1084         }
1085         /* There is not enough support do UPD LSO,
1086          * so follow normal path
1087          */
1088         kfree_skb(skb);
1089
1090         return err;
1091 }
1092
1093 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1094                                                gfp_t gfp)
1095 {
1096         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1097 }
1098
1099 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1100                                                 gfp_t gfp)
1101 {
1102         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1103 }
1104
1105 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1106         int offset, int len, int odd, struct sk_buff *skb),
1107         void *from, int length, int transhdrlen,
1108         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1109         struct rt6_info *rt, unsigned int flags)
1110 {
1111         struct inet_sock *inet = inet_sk(sk);
1112         struct ipv6_pinfo *np = inet6_sk(sk);
1113         struct sk_buff *skb;
1114         unsigned int maxfraglen, fragheaderlen;
1115         int exthdrlen;
1116         int hh_len;
1117         int mtu;
1118         int copy;
1119         int err;
1120         int offset = 0;
1121         int csummode = CHECKSUM_NONE;
1122
1123         if (flags&MSG_PROBE)
1124                 return 0;
1125         if (skb_queue_empty(&sk->sk_write_queue)) {
1126                 /*
1127                  * setup for corking
1128                  */
1129                 if (opt) {
1130                         if (WARN_ON(np->cork.opt))
1131                                 return -EINVAL;
1132
1133                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1134                         if (unlikely(np->cork.opt == NULL))
1135                                 return -ENOBUFS;
1136
1137                         np->cork.opt->tot_len = opt->tot_len;
1138                         np->cork.opt->opt_flen = opt->opt_flen;
1139                         np->cork.opt->opt_nflen = opt->opt_nflen;
1140
1141                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1142                                                             sk->sk_allocation);
1143                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1144                                 return -ENOBUFS;
1145
1146                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1147                                                             sk->sk_allocation);
1148                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1149                                 return -ENOBUFS;
1150
1151                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1152                                                            sk->sk_allocation);
1153                         if (opt->hopopt && !np->cork.opt->hopopt)
1154                                 return -ENOBUFS;
1155
1156                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1157                                                             sk->sk_allocation);
1158                         if (opt->srcrt && !np->cork.opt->srcrt)
1159                                 return -ENOBUFS;
1160
1161                         /* need source address above miyazawa*/
1162                 }
1163                 dst_hold(&rt->u.dst);
1164                 inet->cork.dst = &rt->u.dst;
1165                 inet->cork.fl = *fl;
1166                 np->cork.hop_limit = hlimit;
1167                 np->cork.tclass = tclass;
1168                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1169                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1170                 if (np->frag_size < mtu) {
1171                         if (np->frag_size)
1172                                 mtu = np->frag_size;
1173                 }
1174                 inet->cork.fragsize = mtu;
1175                 if (dst_allfrag(rt->u.dst.path))
1176                         inet->cork.flags |= IPCORK_ALLFRAG;
1177                 inet->cork.length = 0;
1178                 sk->sk_sndmsg_page = NULL;
1179                 sk->sk_sndmsg_off = 0;
1180                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1181                             rt->rt6i_nfheader_len;
1182                 length += exthdrlen;
1183                 transhdrlen += exthdrlen;
1184         } else {
1185                 rt = (struct rt6_info *)inet->cork.dst;
1186                 fl = &inet->cork.fl;
1187                 opt = np->cork.opt;
1188                 transhdrlen = 0;
1189                 exthdrlen = 0;
1190                 mtu = inet->cork.fragsize;
1191         }
1192
1193         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1194
1195         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1196                         (opt ? opt->opt_nflen : 0);
1197         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1198
1199         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1200                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1201                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1202                         return -EMSGSIZE;
1203                 }
1204         }
1205
1206         /*
1207          * Let's try using as much space as possible.
1208          * Use MTU if total length of the message fits into the MTU.
1209          * Otherwise, we need to reserve fragment header and
1210          * fragment alignment (= 8-15 octects, in total).
1211          *
1212          * Note that we may need to "move" the data from the tail of
1213          * of the buffer to the new fragment when we split
1214          * the message.
1215          *
1216          * FIXME: It may be fragmented into multiple chunks
1217          *        at once if non-fragmentable extension headers
1218          *        are too large.
1219          * --yoshfuji
1220          */
1221
1222         inet->cork.length += length;
1223         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1224             (rt->u.dst.dev->features & NETIF_F_UFO)) {
1225
1226                 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1227                                           fragheaderlen, transhdrlen, mtu,
1228                                           flags);
1229                 if (err)
1230                         goto error;
1231                 return 0;
1232         }
1233
1234         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1235                 goto alloc_new_skb;
1236
1237         while (length > 0) {
1238                 /* Check if the remaining data fits into current packet. */
1239                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1240                 if (copy < length)
1241                         copy = maxfraglen - skb->len;
1242
1243                 if (copy <= 0) {
1244                         char *data;
1245                         unsigned int datalen;
1246                         unsigned int fraglen;
1247                         unsigned int fraggap;
1248                         unsigned int alloclen;
1249                         struct sk_buff *skb_prev;
1250 alloc_new_skb:
1251                         skb_prev = skb;
1252
1253                         /* There's no room in the current skb */
1254                         if (skb_prev)
1255                                 fraggap = skb_prev->len - maxfraglen;
1256                         else
1257                                 fraggap = 0;
1258
1259                         /*
1260                          * If remaining data exceeds the mtu,
1261                          * we know we need more fragment(s).
1262                          */
1263                         datalen = length + fraggap;
1264                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1265                                 datalen = maxfraglen - fragheaderlen;
1266
1267                         fraglen = datalen + fragheaderlen;
1268                         if ((flags & MSG_MORE) &&
1269                             !(rt->u.dst.dev->features&NETIF_F_SG))
1270                                 alloclen = mtu;
1271                         else
1272                                 alloclen = datalen + fragheaderlen;
1273
1274                         /*
1275                          * The last fragment gets additional space at tail.
1276                          * Note: we overallocate on fragments with MSG_MODE
1277                          * because we have no idea if we're the last one.
1278                          */
1279                         if (datalen == length + fraggap)
1280                                 alloclen += rt->u.dst.trailer_len;
1281
1282                         /*
1283                          * We just reserve space for fragment header.
1284                          * Note: this may be overallocation if the message
1285                          * (without MSG_MORE) fits into the MTU.
1286                          */
1287                         alloclen += sizeof(struct frag_hdr);
1288
1289                         if (transhdrlen) {
1290                                 skb = sock_alloc_send_skb(sk,
1291                                                 alloclen + hh_len,
1292                                                 (flags & MSG_DONTWAIT), &err);
1293                         } else {
1294                                 skb = NULL;
1295                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1296                                     2 * sk->sk_sndbuf)
1297                                         skb = sock_wmalloc(sk,
1298                                                            alloclen + hh_len, 1,
1299                                                            sk->sk_allocation);
1300                                 if (unlikely(skb == NULL))
1301                                         err = -ENOBUFS;
1302                         }
1303                         if (skb == NULL)
1304                                 goto error;
1305                         /*
1306                          *      Fill in the control structures
1307                          */
1308                         skb->ip_summed = csummode;
1309                         skb->csum = 0;
1310                         /* reserve for fragmentation */
1311                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1312
1313                         /*
1314                          *      Find where to start putting bytes
1315                          */
1316                         data = skb_put(skb, fraglen);
1317                         skb_set_network_header(skb, exthdrlen);
1318                         data += fragheaderlen;
1319                         skb->transport_header = (skb->network_header +
1320                                                  fragheaderlen);
1321                         if (fraggap) {
1322                                 skb->csum = skb_copy_and_csum_bits(
1323                                         skb_prev, maxfraglen,
1324                                         data + transhdrlen, fraggap, 0);
1325                                 skb_prev->csum = csum_sub(skb_prev->csum,
1326                                                           skb->csum);
1327                                 data += fraggap;
1328                                 pskb_trim_unique(skb_prev, maxfraglen);
1329                         }
1330                         copy = datalen - transhdrlen - fraggap;
1331                         if (copy < 0) {
1332                                 err = -EINVAL;
1333                                 kfree_skb(skb);
1334                                 goto error;
1335                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1336                                 err = -EFAULT;
1337                                 kfree_skb(skb);
1338                                 goto error;
1339                         }
1340
1341                         offset += copy;
1342                         length -= datalen - fraggap;
1343                         transhdrlen = 0;
1344                         exthdrlen = 0;
1345                         csummode = CHECKSUM_NONE;
1346
1347                         /*
1348                          * Put the packet on the pending queue
1349                          */
1350                         __skb_queue_tail(&sk->sk_write_queue, skb);
1351                         continue;
1352                 }
1353
1354                 if (copy > length)
1355                         copy = length;
1356
1357                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1358                         unsigned int off;
1359
1360                         off = skb->len;
1361                         if (getfrag(from, skb_put(skb, copy),
1362                                                 offset, copy, off, skb) < 0) {
1363                                 __skb_trim(skb, off);
1364                                 err = -EFAULT;
1365                                 goto error;
1366                         }
1367                 } else {
1368                         int i = skb_shinfo(skb)->nr_frags;
1369                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1370                         struct page *page = sk->sk_sndmsg_page;
1371                         int off = sk->sk_sndmsg_off;
1372                         unsigned int left;
1373
1374                         if (page && (left = PAGE_SIZE - off) > 0) {
1375                                 if (copy >= left)
1376                                         copy = left;
1377                                 if (page != frag->page) {
1378                                         if (i == MAX_SKB_FRAGS) {
1379                                                 err = -EMSGSIZE;
1380                                                 goto error;
1381                                         }
1382                                         get_page(page);
1383                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1384                                         frag = &skb_shinfo(skb)->frags[i];
1385                                 }
1386                         } else if(i < MAX_SKB_FRAGS) {
1387                                 if (copy > PAGE_SIZE)
1388                                         copy = PAGE_SIZE;
1389                                 page = alloc_pages(sk->sk_allocation, 0);
1390                                 if (page == NULL) {
1391                                         err = -ENOMEM;
1392                                         goto error;
1393                                 }
1394                                 sk->sk_sndmsg_page = page;
1395                                 sk->sk_sndmsg_off = 0;
1396
1397                                 skb_fill_page_desc(skb, i, page, 0, 0);
1398                                 frag = &skb_shinfo(skb)->frags[i];
1399                         } else {
1400                                 err = -EMSGSIZE;
1401                                 goto error;
1402                         }
1403                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1404                                 err = -EFAULT;
1405                                 goto error;
1406                         }
1407                         sk->sk_sndmsg_off += copy;
1408                         frag->size += copy;
1409                         skb->len += copy;
1410                         skb->data_len += copy;
1411                         skb->truesize += copy;
1412                         atomic_add(copy, &sk->sk_wmem_alloc);
1413                 }
1414                 offset += copy;
1415                 length -= copy;
1416         }
1417         return 0;
1418 error:
1419         inet->cork.length -= length;
1420         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1421         return err;
1422 }
1423
1424 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1425 {
1426         if (np->cork.opt) {
1427                 kfree(np->cork.opt->dst0opt);
1428                 kfree(np->cork.opt->dst1opt);
1429                 kfree(np->cork.opt->hopopt);
1430                 kfree(np->cork.opt->srcrt);
1431                 kfree(np->cork.opt);
1432                 np->cork.opt = NULL;
1433         }
1434
1435         if (inet->cork.dst) {
1436                 dst_release(inet->cork.dst);
1437                 inet->cork.dst = NULL;
1438                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1439         }
1440         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1441 }
1442
1443 int ip6_push_pending_frames(struct sock *sk)
1444 {
1445         struct sk_buff *skb, *tmp_skb;
1446         struct sk_buff **tail_skb;
1447         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1448         struct inet_sock *inet = inet_sk(sk);
1449         struct ipv6_pinfo *np = inet6_sk(sk);
1450         struct net *net = sock_net(sk);
1451         struct ipv6hdr *hdr;
1452         struct ipv6_txoptions *opt = np->cork.opt;
1453         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1454         struct flowi *fl = &inet->cork.fl;
1455         unsigned char proto = fl->proto;
1456         int err = 0;
1457
1458         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1459                 goto out;
1460         tail_skb = &(skb_shinfo(skb)->frag_list);
1461
1462         /* move skb->data to ip header from ext header */
1463         if (skb->data < skb_network_header(skb))
1464                 __skb_pull(skb, skb_network_offset(skb));
1465         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1466                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1467                 *tail_skb = tmp_skb;
1468                 tail_skb = &(tmp_skb->next);
1469                 skb->len += tmp_skb->len;
1470                 skb->data_len += tmp_skb->len;
1471                 skb->truesize += tmp_skb->truesize;
1472                 tmp_skb->destructor = NULL;
1473                 tmp_skb->sk = NULL;
1474         }
1475
1476         /* Allow local fragmentation. */
1477         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1478                 skb->local_df = 1;
1479
1480         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1481         __skb_pull(skb, skb_network_header_len(skb));
1482         if (opt && opt->opt_flen)
1483                 ipv6_push_frag_opts(skb, opt, &proto);
1484         if (opt && opt->opt_nflen)
1485                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1486
1487         skb_push(skb, sizeof(struct ipv6hdr));
1488         skb_reset_network_header(skb);
1489         hdr = ipv6_hdr(skb);
1490
1491         *(__be32*)hdr = fl->fl6_flowlabel |
1492                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1493
1494         hdr->hop_limit = np->cork.hop_limit;
1495         hdr->nexthdr = proto;
1496         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1497         ipv6_addr_copy(&hdr->daddr, final_dst);
1498
1499         skb->priority = sk->sk_priority;
1500         skb->mark = sk->sk_mark;
1501
1502         skb_dst_set(skb, dst_clone(&rt->u.dst));
1503         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1504         if (proto == IPPROTO_ICMPV6) {
1505                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1506
1507                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1508                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1509         }
1510
1511         err = ip6_local_out(skb);
1512         if (err) {
1513                 if (err > 0)
1514                         err = net_xmit_errno(err);
1515                 if (err)
1516                         goto error;
1517         }
1518
1519 out:
1520         ip6_cork_release(inet, np);
1521         return err;
1522 error:
1523         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1524         goto out;
1525 }
1526
1527 void ip6_flush_pending_frames(struct sock *sk)
1528 {
1529         struct sk_buff *skb;
1530
1531         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1532                 if (skb_dst(skb))
1533                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1534                                       IPSTATS_MIB_OUTDISCARDS);
1535                 kfree_skb(skb);
1536         }
1537
1538         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1539 }