ipv6: tcp: fix TCLASS value in ACK messages sent from TIME_WAIT
[linux-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
/*
 * Forward declaration: ip6_fragment() is defined further down in this file
 * but is already referenced by ip6_finish_output().
 */
int ip6_fragment(struct sk_buff *skb,
		 int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 * Send a locally generated packet: run the LOCAL_OUT hook through
 * __ip6_local_out() and, if it answers 1, pass the packet on to
 * dst_output().  Any other hook result is returned to the caller as is.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int ret = __ip6_local_out(skb);

	if (unlikely(ret != 1))
		return ret;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101         struct dst_entry *dst = skb_dst(skb);
102         struct net_device *dev = dst->dev;
103         struct neighbour *neigh;
104
105         skb->protocol = htons(ETH_P_IPV6);
106         skb->dev = dev;
107
108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112                     ((mroute6_socket(dev_net(dev), skb) &&
113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115                                          &ipv6_hdr(skb)->saddr))) {
116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118                         /* Do not check for IFF_ALLMULTI; multicast routing
119                            is not supported in any case.
120                          */
121                         if (newskb)
122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123                                         newskb, NULL, newskb->dev,
124                                         ip6_dev_loopback_xmit);
125
126                         if (ipv6_hdr(skb)->hop_limit == 0) {
127                                 IP6_INC_STATS(dev_net(dev), idev,
128                                               IPSTATS_MIB_OUTDISCARDS);
129                                 kfree_skb(skb);
130                                 return 0;
131                         }
132                 }
133
134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135                                 skb->len);
136         }
137
138         rcu_read_lock();
139         neigh = dst_get_neighbour(dst);
140         if (neigh) {
141                 int res = neigh_output(neigh, skb);
142
143                 rcu_read_unlock();
144                 return res;
145         }
146         rcu_read_unlock();
147         IP6_INC_STATS_BH(dev_net(dst->dev),
148                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149         kfree_skb(skb);
150         return -EINVAL;
151 }
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183              struct ipv6_txoptions *opt, int tclass)
184 {
185         struct net *net = sock_net(sk);
186         struct ipv6_pinfo *np = inet6_sk(sk);
187         struct in6_addr *first_hop = &fl6->daddr;
188         struct dst_entry *dst = skb_dst(skb);
189         struct ipv6hdr *hdr;
190         u8  proto = fl6->flowi6_proto;
191         int seg_len = skb->len;
192         int hlimit = -1;
193         u32 mtu;
194
195         if (opt) {
196                 unsigned int head_room;
197
198                 /* First: exthdrs may take lots of space (~8K for now)
199                    MAX_HEADER is not enough.
200                  */
201                 head_room = opt->opt_nflen + opt->opt_flen;
202                 seg_len += head_room;
203                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204
205                 if (skb_headroom(skb) < head_room) {
206                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207                         if (skb2 == NULL) {
208                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209                                               IPSTATS_MIB_OUTDISCARDS);
210                                 kfree_skb(skb);
211                                 return -ENOBUFS;
212                         }
213                         kfree_skb(skb);
214                         skb = skb2;
215                         skb_set_owner_w(skb, sk);
216                 }
217                 if (opt->opt_flen)
218                         ipv6_push_frag_opts(skb, opt, &proto);
219                 if (opt->opt_nflen)
220                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221         }
222
223         skb_push(skb, sizeof(struct ipv6hdr));
224         skb_reset_network_header(skb);
225         hdr = ipv6_hdr(skb);
226
227         /*
228          *      Fill in the IPv6 header
229          */
230         if (np)
231                 hlimit = np->hop_limit;
232         if (hlimit < 0)
233                 hlimit = ip6_dst_hoplimit(dst);
234
235         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236
237         hdr->payload_len = htons(seg_len);
238         hdr->nexthdr = proto;
239         hdr->hop_limit = hlimit;
240
241         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
242         ipv6_addr_copy(&hdr->daddr, first_hop);
243
244         skb->priority = sk->sk_priority;
245         skb->mark = sk->sk_mark;
246
247         mtu = dst_mtu(dst);
248         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250                               IPSTATS_MIB_OUT, skb->len);
251                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252                                dst->dev, dst_output);
253         }
254
255         if (net_ratelimit())
256                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
257         skb->dev = dst->dev;
258         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
259         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
260         kfree_skb(skb);
261         return -EMSGSIZE;
262 }
263
264 EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are send through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         ipv6_addr_copy(&hdr->saddr, saddr);
294         ipv6_addr_copy(&hdr->daddr, daddr);
295
296         return 0;
297 }
298
299 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
300 {
301         struct ip6_ra_chain *ra;
302         struct sock *last = NULL;
303
304         read_lock(&ip6_ra_lock);
305         for (ra = ip6_ra_chain; ra; ra = ra->next) {
306                 struct sock *sk = ra->sk;
307                 if (sk && ra->sel == sel &&
308                     (!sk->sk_bound_dev_if ||
309                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
310                         if (last) {
311                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
312                                 if (skb2)
313                                         rawv6_rcv(last, skb2);
314                         }
315                         last = sk;
316                 }
317         }
318
319         if (last) {
320                 rawv6_rcv(last, skb);
321                 read_unlock(&ip6_ra_lock);
322                 return 1;
323         }
324         read_unlock(&ip6_ra_lock);
325         return 0;
326 }
327
328 static int ip6_forward_proxy_check(struct sk_buff *skb)
329 {
330         struct ipv6hdr *hdr = ipv6_hdr(skb);
331         u8 nexthdr = hdr->nexthdr;
332         int offset;
333
334         if (ipv6_ext_hdr(nexthdr)) {
335                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
336                 if (offset < 0)
337                         return 0;
338         } else
339                 offset = sizeof(struct ipv6hdr);
340
341         if (nexthdr == IPPROTO_ICMPV6) {
342                 struct icmp6hdr *icmp6;
343
344                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345                                          offset + 1 - skb->data)))
346                         return 0;
347
348                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350                 switch (icmp6->icmp6_type) {
351                 case NDISC_ROUTER_SOLICITATION:
352                 case NDISC_ROUTER_ADVERTISEMENT:
353                 case NDISC_NEIGHBOUR_SOLICITATION:
354                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355                 case NDISC_REDIRECT:
356                         /* For reaction involving unicast neighbor discovery
357                          * message destined to the proxied address, pass it to
358                          * input function.
359                          */
360                         return 1;
361                 default:
362                         break;
363                 }
364         }
365
366         /*
367          * The proxying router can't forward traffic sent to a link-local
368          * address, so signal the sender and discard the packet. This
369          * behavior is clarified by the MIPv6 specification.
370          */
371         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372                 dst_link_failure(skb);
373                 return -1;
374         }
375
376         return 0;
377 }
378
/*
 * Continuation of the FORWARD netfilter hook used by ip6_forward():
 * transmit the packet along its route.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int rc = dst_output(skb);

	return rc;
}
383
384 int ip6_forward(struct sk_buff *skb)
385 {
386         struct dst_entry *dst = skb_dst(skb);
387         struct ipv6hdr *hdr = ipv6_hdr(skb);
388         struct inet6_skb_parm *opt = IP6CB(skb);
389         struct net *net = dev_net(dst->dev);
390         struct neighbour *n;
391         u32 mtu;
392
393         if (net->ipv6.devconf_all->forwarding == 0)
394                 goto error;
395
396         if (skb_warn_if_lro(skb))
397                 goto drop;
398
399         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
400                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
401                 goto drop;
402         }
403
404         if (skb->pkt_type != PACKET_HOST)
405                 goto drop;
406
407         skb_forward_csum(skb);
408
409         /*
410          *      We DO NOT make any processing on
411          *      RA packets, pushing them to user level AS IS
412          *      without ane WARRANTY that application will be able
413          *      to interpret them. The reason is that we
414          *      cannot make anything clever here.
415          *
416          *      We are not end-node, so that if packet contains
417          *      AH/ESP, we cannot make anything.
418          *      Defragmentation also would be mistake, RA packets
419          *      cannot be fragmented, because there is no warranty
420          *      that different fragments will go along one path. --ANK
421          */
422         if (opt->ra) {
423                 u8 *ptr = skb_network_header(skb) + opt->ra;
424                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
425                         return 0;
426         }
427
428         /*
429          *      check and decrement ttl
430          */
431         if (hdr->hop_limit <= 1) {
432                 /* Force OUTPUT device used as source address */
433                 skb->dev = dst->dev;
434                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
435                 IP6_INC_STATS_BH(net,
436                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
437
438                 kfree_skb(skb);
439                 return -ETIMEDOUT;
440         }
441
442         /* XXX: idev->cnf.proxy_ndp? */
443         if (net->ipv6.devconf_all->proxy_ndp &&
444             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
445                 int proxied = ip6_forward_proxy_check(skb);
446                 if (proxied > 0)
447                         return ip6_input(skb);
448                 else if (proxied < 0) {
449                         IP6_INC_STATS(net, ip6_dst_idev(dst),
450                                       IPSTATS_MIB_INDISCARDS);
451                         goto drop;
452                 }
453         }
454
455         if (!xfrm6_route_forward(skb)) {
456                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
457                 goto drop;
458         }
459         dst = skb_dst(skb);
460
461         /* IPv6 specs say nothing about it, but it is clear that we cannot
462            send redirects to source routed frames.
463            We don't send redirects to frames decapsulated from IPsec.
464          */
465         n = dst_get_neighbour(dst);
466         if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
467                 struct in6_addr *target = NULL;
468                 struct rt6_info *rt;
469
470                 /*
471                  *      incoming and outgoing devices are the same
472                  *      send a redirect.
473                  */
474
475                 rt = (struct rt6_info *) dst;
476                 if ((rt->rt6i_flags & RTF_GATEWAY))
477                         target = (struct in6_addr*)&n->primary_key;
478                 else
479                         target = &hdr->daddr;
480
481                 if (!rt->rt6i_peer)
482                         rt6_bind_peer(rt, 1);
483
484                 /* Limit redirects both by destination (here)
485                    and by source (inside ndisc_send_redirect)
486                  */
487                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
488                         ndisc_send_redirect(skb, n, target);
489         } else {
490                 int addrtype = ipv6_addr_type(&hdr->saddr);
491
492                 /* This check is security critical. */
493                 if (addrtype == IPV6_ADDR_ANY ||
494                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
495                         goto error;
496                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
497                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
498                                     ICMPV6_NOT_NEIGHBOUR, 0);
499                         goto error;
500                 }
501         }
502
503         mtu = dst_mtu(dst);
504         if (mtu < IPV6_MIN_MTU)
505                 mtu = IPV6_MIN_MTU;
506
507         if (skb->len > mtu && !skb_is_gso(skb)) {
508                 /* Again, force OUTPUT device used as source address */
509                 skb->dev = dst->dev;
510                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
511                 IP6_INC_STATS_BH(net,
512                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
513                 IP6_INC_STATS_BH(net,
514                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
515                 kfree_skb(skb);
516                 return -EMSGSIZE;
517         }
518
519         if (skb_cow(skb, dst->dev->hard_header_len)) {
520                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
521                 goto drop;
522         }
523
524         hdr = ipv6_hdr(skb);
525
526         /* Mangling hops number delayed to point after skb COW */
527
528         hdr->hop_limit--;
529
530         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
531         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
532                        ip6_forward_finish);
533
534 error:
535         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
536 drop:
537         kfree_skb(skb);
538         return -EINVAL;
539 }
540
541 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
542 {
543         to->pkt_type = from->pkt_type;
544         to->priority = from->priority;
545         to->protocol = from->protocol;
546         skb_dst_drop(to);
547         skb_dst_set(to, dst_clone(skb_dst(from)));
548         to->dev = from->dev;
549         to->mark = from->mark;
550
551 #ifdef CONFIG_NET_SCHED
552         to->tc_index = from->tc_index;
553 #endif
554         nf_copy(to, from);
555 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
556     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
557         to->nf_trace = from->nf_trace;
558 #endif
559         skb_copy_secmark(to, from);
560 }
561
562 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
563 {
564         u16 offset = sizeof(struct ipv6hdr);
565         struct ipv6_opt_hdr *exthdr =
566                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
567         unsigned int packet_len = skb->tail - skb->network_header;
568         int found_rhdr = 0;
569         *nexthdr = &ipv6_hdr(skb)->nexthdr;
570
571         while (offset + 1 <= packet_len) {
572
573                 switch (**nexthdr) {
574
575                 case NEXTHDR_HOP:
576                         break;
577                 case NEXTHDR_ROUTING:
578                         found_rhdr = 1;
579                         break;
580                 case NEXTHDR_DEST:
581 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
582                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
583                                 break;
584 #endif
585                         if (found_rhdr)
586                                 return offset;
587                         break;
588                 default :
589                         return offset;
590                 }
591
592                 offset += ipv6_optlen(exthdr);
593                 *nexthdr = &exthdr->nexthdr;
594                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
595                                                  offset);
596         }
597
598         return offset;
599 }
600
601 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
602 {
603         static atomic_t ipv6_fragmentation_id;
604         int old, new;
605
606         if (rt) {
607                 struct inet_peer *peer;
608
609                 if (!rt->rt6i_peer)
610                         rt6_bind_peer(rt, 1);
611                 peer = rt->rt6i_peer;
612                 if (peer) {
613                         fhdr->identification = htonl(inet_getid(peer, 0));
614                         return;
615                 }
616         }
617         do {
618                 old = atomic_read(&ipv6_fragmentation_id);
619                 new = old + 1;
620                 if (!new)
621                         new = 1;
622         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
623         fhdr->identification = htonl(new);
624 }
625
626 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
627 {
628         struct sk_buff *frag;
629         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
630         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
631         struct ipv6hdr *tmp_hdr;
632         struct frag_hdr *fh;
633         unsigned int mtu, hlen, left, len;
634         __be32 frag_id = 0;
635         int ptr, offset = 0, err=0;
636         u8 *prevhdr, nexthdr = 0;
637         struct net *net = dev_net(skb_dst(skb)->dev);
638
639         hlen = ip6_find_1stfragopt(skb, &prevhdr);
640         nexthdr = *prevhdr;
641
642         mtu = ip6_skb_dst_mtu(skb);
643
644         /* We must not fragment if the socket is set to force MTU discovery
645          * or if the skb it not generated by a local socket.
646          */
647         if (!skb->local_df && skb->len > mtu) {
648                 skb->dev = skb_dst(skb)->dev;
649                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
650                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
651                               IPSTATS_MIB_FRAGFAILS);
652                 kfree_skb(skb);
653                 return -EMSGSIZE;
654         }
655
656         if (np && np->frag_size < mtu) {
657                 if (np->frag_size)
658                         mtu = np->frag_size;
659         }
660         mtu -= hlen + sizeof(struct frag_hdr);
661
662         if (skb_has_frag_list(skb)) {
663                 int first_len = skb_pagelen(skb);
664                 struct sk_buff *frag2;
665
666                 if (first_len - hlen > mtu ||
667                     ((first_len - hlen) & 7) ||
668                     skb_cloned(skb))
669                         goto slow_path;
670
671                 skb_walk_frags(skb, frag) {
672                         /* Correct geometry. */
673                         if (frag->len > mtu ||
674                             ((frag->len & 7) && frag->next) ||
675                             skb_headroom(frag) < hlen)
676                                 goto slow_path_clean;
677
678                         /* Partially cloned skb? */
679                         if (skb_shared(frag))
680                                 goto slow_path_clean;
681
682                         BUG_ON(frag->sk);
683                         if (skb->sk) {
684                                 frag->sk = skb->sk;
685                                 frag->destructor = sock_wfree;
686                         }
687                         skb->truesize -= frag->truesize;
688                 }
689
690                 err = 0;
691                 offset = 0;
692                 frag = skb_shinfo(skb)->frag_list;
693                 skb_frag_list_init(skb);
694                 /* BUILD HEADER */
695
696                 *prevhdr = NEXTHDR_FRAGMENT;
697                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
698                 if (!tmp_hdr) {
699                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
700                                       IPSTATS_MIB_FRAGFAILS);
701                         return -ENOMEM;
702                 }
703
704                 __skb_pull(skb, hlen);
705                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
706                 __skb_push(skb, hlen);
707                 skb_reset_network_header(skb);
708                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
709
710                 ipv6_select_ident(fh, rt);
711                 fh->nexthdr = nexthdr;
712                 fh->reserved = 0;
713                 fh->frag_off = htons(IP6_MF);
714                 frag_id = fh->identification;
715
716                 first_len = skb_pagelen(skb);
717                 skb->data_len = first_len - skb_headlen(skb);
718                 skb->len = first_len;
719                 ipv6_hdr(skb)->payload_len = htons(first_len -
720                                                    sizeof(struct ipv6hdr));
721
722                 dst_hold(&rt->dst);
723
724                 for (;;) {
725                         /* Prepare header of the next frame,
726                          * before previous one went down. */
727                         if (frag) {
728                                 frag->ip_summed = CHECKSUM_NONE;
729                                 skb_reset_transport_header(frag);
730                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
731                                 __skb_push(frag, hlen);
732                                 skb_reset_network_header(frag);
733                                 memcpy(skb_network_header(frag), tmp_hdr,
734                                        hlen);
735                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
736                                 fh->nexthdr = nexthdr;
737                                 fh->reserved = 0;
738                                 fh->frag_off = htons(offset);
739                                 if (frag->next != NULL)
740                                         fh->frag_off |= htons(IP6_MF);
741                                 fh->identification = frag_id;
742                                 ipv6_hdr(frag)->payload_len =
743                                                 htons(frag->len -
744                                                       sizeof(struct ipv6hdr));
745                                 ip6_copy_metadata(frag, skb);
746                         }
747
748                         err = output(skb);
749                         if(!err)
750                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
751                                               IPSTATS_MIB_FRAGCREATES);
752
753                         if (err || !frag)
754                                 break;
755
756                         skb = frag;
757                         frag = skb->next;
758                         skb->next = NULL;
759                 }
760
761                 kfree(tmp_hdr);
762
763                 if (err == 0) {
764                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
765                                       IPSTATS_MIB_FRAGOKS);
766                         dst_release(&rt->dst);
767                         return 0;
768                 }
769
770                 while (frag) {
771                         skb = frag->next;
772                         kfree_skb(frag);
773                         frag = skb;
774                 }
775
776                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
777                               IPSTATS_MIB_FRAGFAILS);
778                 dst_release(&rt->dst);
779                 return err;
780
781 slow_path_clean:
782                 skb_walk_frags(skb, frag2) {
783                         if (frag2 == frag)
784                                 break;
785                         frag2->sk = NULL;
786                         frag2->destructor = NULL;
787                         skb->truesize += frag2->truesize;
788                 }
789         }
790
791 slow_path:
792         left = skb->len - hlen;         /* Space per frame */
793         ptr = hlen;                     /* Where to start from */
794
795         /*
796          *      Fragment the datagram.
797          */
798
799         *prevhdr = NEXTHDR_FRAGMENT;
800
801         /*
802          *      Keep copying data until we run out.
803          */
804         while(left > 0) {
805                 len = left;
806                 /* IF: it doesn't fit, use 'mtu' - the data space left */
807                 if (len > mtu)
808                         len = mtu;
809                 /* IF: we are not sending up to and including the packet end
810                    then align the next start on an eight byte boundary */
811                 if (len < left) {
812                         len &= ~7;
813                 }
814                 /*
815                  *      Allocate buffer.
816                  */
817
818                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->dst.dev), GFP_ATOMIC)) == NULL) {
819                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
820                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
821                                       IPSTATS_MIB_FRAGFAILS);
822                         err = -ENOMEM;
823                         goto fail;
824                 }
825
826                 /*
827                  *      Set up data on packet
828                  */
829
830                 ip6_copy_metadata(frag, skb);
831                 skb_reserve(frag, LL_RESERVED_SPACE(rt->dst.dev));
832                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
833                 skb_reset_network_header(frag);
834                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
835                 frag->transport_header = (frag->network_header + hlen +
836                                           sizeof(struct frag_hdr));
837
838                 /*
839                  *      Charge the memory for the fragment to any owner
840                  *      it might possess
841                  */
842                 if (skb->sk)
843                         skb_set_owner_w(frag, skb->sk);
844
845                 /*
846                  *      Copy the packet header into the new buffer.
847                  */
848                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
849
850                 /*
851                  *      Build fragment header.
852                  */
853                 fh->nexthdr = nexthdr;
854                 fh->reserved = 0;
855                 if (!frag_id) {
856                         ipv6_select_ident(fh, rt);
857                         frag_id = fh->identification;
858                 } else
859                         fh->identification = frag_id;
860
861                 /*
862                  *      Copy a block of the IP datagram.
863                  */
864                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
865                         BUG();
866                 left -= len;
867
868                 fh->frag_off = htons(offset);
869                 if (left > 0)
870                         fh->frag_off |= htons(IP6_MF);
871                 ipv6_hdr(frag)->payload_len = htons(frag->len -
872                                                     sizeof(struct ipv6hdr));
873
874                 ptr += len;
875                 offset += len;
876
877                 /*
878                  *      Put this fragment into the sending queue.
879                  */
880                 err = output(frag);
881                 if (err)
882                         goto fail;
883
884                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
885                               IPSTATS_MIB_FRAGCREATES);
886         }
887         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
888                       IPSTATS_MIB_FRAGOKS);
889         kfree_skb(skb);
890         return err;
891
892 fail:
893         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894                       IPSTATS_MIB_FRAGFAILS);
895         kfree_skb(skb);
896         return err;
897 }
898
899 static inline int ip6_rt_check(const struct rt6key *rt_key,
900                                const struct in6_addr *fl_addr,
901                                const struct in6_addr *addr_cache)
902 {
903         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
904                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
905 }
906
907 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
908                                           struct dst_entry *dst,
909                                           const struct flowi6 *fl6)
910 {
911         struct ipv6_pinfo *np = inet6_sk(sk);
912         struct rt6_info *rt = (struct rt6_info *)dst;
913
914         if (!dst)
915                 goto out;
916
917         /* Yes, checking route validity in not connected
918          * case is not very simple. Take into account,
919          * that we do not support routing by source, TOS,
920          * and MSG_DONTROUTE            --ANK (980726)
921          *
922          * 1. ip6_rt_check(): If route was host route,
923          *    check that cached destination is current.
924          *    If it is network route, we still may
925          *    check its validity using saved pointer
926          *    to the last used address: daddr_cache.
927          *    We do not want to save whole address now,
928          *    (because main consumer of this service
929          *    is tcp, which has not this problem),
930          *    so that the last trick works only on connected
931          *    sockets.
932          * 2. oif also should be the same.
933          */
934         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
935 #ifdef CONFIG_IPV6_SUBTREES
936             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
937 #endif
938             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
939                 dst_release(dst);
940                 dst = NULL;
941         }
942
943 out:
944         return dst;
945 }
946
947 static int ip6_dst_lookup_tail(struct sock *sk,
948                                struct dst_entry **dst, struct flowi6 *fl6)
949 {
950         struct net *net = sock_net(sk);
951 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
952         struct neighbour *n;
953 #endif
954         int err;
955
956         if (*dst == NULL)
957                 *dst = ip6_route_output(net, sk, fl6);
958
959         if ((err = (*dst)->error))
960                 goto out_err_release;
961
962         if (ipv6_addr_any(&fl6->saddr)) {
963                 struct rt6_info *rt = (struct rt6_info *) *dst;
964                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
965                                           sk ? inet6_sk(sk)->srcprefs : 0,
966                                           &fl6->saddr);
967                 if (err)
968                         goto out_err_release;
969         }
970
971 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
972         /*
973          * Here if the dst entry we've looked up
974          * has a neighbour entry that is in the INCOMPLETE
975          * state and the src address from the flow is
976          * marked as OPTIMISTIC, we release the found
977          * dst entry and replace it instead with the
978          * dst entry of the nexthop router
979          */
980         rcu_read_lock();
981         n = dst_get_neighbour(*dst);
982         if (n && !(n->nud_state & NUD_VALID)) {
983                 struct inet6_ifaddr *ifp;
984                 struct flowi6 fl_gw6;
985                 int redirect;
986
987                 rcu_read_unlock();
988                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
989                                       (*dst)->dev, 1);
990
991                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
992                 if (ifp)
993                         in6_ifa_put(ifp);
994
995                 if (redirect) {
996                         /*
997                          * We need to get the dst entry for the
998                          * default router instead
999                          */
1000                         dst_release(*dst);
1001                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1002                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1003                         *dst = ip6_route_output(net, sk, &fl_gw6);
1004                         if ((err = (*dst)->error))
1005                                 goto out_err_release;
1006                 }
1007         } else {
1008                 rcu_read_unlock();
1009         }
1010 #endif
1011
1012         return 0;
1013
1014 out_err_release:
1015         if (err == -ENETUNREACH)
1016                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1017         dst_release(*dst);
1018         *dst = NULL;
1019         return err;
1020 }
1021
1022 /**
1023  *      ip6_dst_lookup - perform route lookup on flow
1024  *      @sk: socket which provides route info
1025  *      @dst: pointer to dst_entry * for result
1026  *      @fl6: flow to lookup
1027  *
1028  *      This function performs a route lookup on the given flow.
1029  *
1030  *      It returns zero on success, or a standard errno code on error.
1031  */
1032 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1033 {
1034         *dst = NULL;
1035         return ip6_dst_lookup_tail(sk, dst, fl6);
1036 }
1037 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1038
1039 /**
1040  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1041  *      @sk: socket which provides route info
1042  *      @fl6: flow to lookup
1043  *      @final_dst: final destination address for ipsec lookup
1044  *      @can_sleep: we are in a sleepable context
1045  *
1046  *      This function performs a route lookup on the given flow.
1047  *
1048  *      It returns a valid dst pointer on success, or a pointer encoded
1049  *      error code.
1050  */
1051 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1052                                       const struct in6_addr *final_dst,
1053                                       bool can_sleep)
1054 {
1055         struct dst_entry *dst = NULL;
1056         int err;
1057
1058         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1059         if (err)
1060                 return ERR_PTR(err);
1061         if (final_dst)
1062                 ipv6_addr_copy(&fl6->daddr, final_dst);
1063         if (can_sleep)
1064                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1065
1066         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1067 }
1068 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1069
1070 /**
1071  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1072  *      @sk: socket which provides the dst cache and route info
1073  *      @fl6: flow to lookup
1074  *      @final_dst: final destination address for ipsec lookup
1075  *      @can_sleep: we are in a sleepable context
1076  *
1077  *      This function performs a route lookup on the given flow with the
1078  *      possibility of using the cached route in the socket if it is valid.
1079  *      It will take the socket dst lock when operating on the dst cache.
1080  *      As a result, this function can only be used in process context.
1081  *
1082  *      It returns a valid dst pointer on success, or a pointer encoded
1083  *      error code.
1084  */
1085 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1086                                          const struct in6_addr *final_dst,
1087                                          bool can_sleep)
1088 {
1089         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1090         int err;
1091
1092         dst = ip6_sk_dst_check(sk, dst, fl6);
1093
1094         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1095         if (err)
1096                 return ERR_PTR(err);
1097         if (final_dst)
1098                 ipv6_addr_copy(&fl6->daddr, final_dst);
1099         if (can_sleep)
1100                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1101
1102         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1103 }
1104 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1105
1106 static inline int ip6_ufo_append_data(struct sock *sk,
1107                         int getfrag(void *from, char *to, int offset, int len,
1108                         int odd, struct sk_buff *skb),
1109                         void *from, int length, int hh_len, int fragheaderlen,
1110                         int transhdrlen, int mtu,unsigned int flags,
1111                         struct rt6_info *rt)
1112
1113 {
1114         struct sk_buff *skb;
1115         int err;
1116
1117         /* There is support for UDP large send offload by network
1118          * device, so create one single skb packet containing complete
1119          * udp datagram
1120          */
1121         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1122                 skb = sock_alloc_send_skb(sk,
1123                         hh_len + fragheaderlen + transhdrlen + 20,
1124                         (flags & MSG_DONTWAIT), &err);
1125                 if (skb == NULL)
1126                         return -ENOMEM;
1127
1128                 /* reserve space for Hardware header */
1129                 skb_reserve(skb, hh_len);
1130
1131                 /* create space for UDP/IP header */
1132                 skb_put(skb,fragheaderlen + transhdrlen);
1133
1134                 /* initialize network header pointer */
1135                 skb_reset_network_header(skb);
1136
1137                 /* initialize protocol header pointer */
1138                 skb->transport_header = skb->network_header + fragheaderlen;
1139
1140                 skb->ip_summed = CHECKSUM_PARTIAL;
1141                 skb->csum = 0;
1142         }
1143
1144         err = skb_append_datato_frags(sk,skb, getfrag, from,
1145                                       (length - transhdrlen));
1146         if (!err) {
1147                 struct frag_hdr fhdr;
1148
1149                 /* Specify the length of each IPv6 datagram fragment.
1150                  * It has to be a multiple of 8.
1151                  */
1152                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1153                                              sizeof(struct frag_hdr)) & ~7;
1154                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1155                 ipv6_select_ident(&fhdr, rt);
1156                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1157                 __skb_queue_tail(&sk->sk_write_queue, skb);
1158
1159                 return 0;
1160         }
1161         /* There is not enough support do UPD LSO,
1162          * so follow normal path
1163          */
1164         kfree_skb(skb);
1165
1166         return err;
1167 }
1168
1169 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1170                                                gfp_t gfp)
1171 {
1172         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1173 }
1174
1175 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1176                                                 gfp_t gfp)
1177 {
1178         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1179 }
1180
1181 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1182         int offset, int len, int odd, struct sk_buff *skb),
1183         void *from, int length, int transhdrlen,
1184         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1185         struct rt6_info *rt, unsigned int flags, int dontfrag)
1186 {
1187         struct inet_sock *inet = inet_sk(sk);
1188         struct ipv6_pinfo *np = inet6_sk(sk);
1189         struct inet_cork *cork;
1190         struct sk_buff *skb;
1191         unsigned int maxfraglen, fragheaderlen;
1192         int exthdrlen;
1193         int dst_exthdrlen;
1194         int hh_len;
1195         int mtu;
1196         int copy;
1197         int err;
1198         int offset = 0;
1199         int csummode = CHECKSUM_NONE;
1200         __u8 tx_flags = 0;
1201
1202         if (flags&MSG_PROBE)
1203                 return 0;
1204         cork = &inet->cork.base;
1205         if (skb_queue_empty(&sk->sk_write_queue)) {
1206                 /*
1207                  * setup for corking
1208                  */
1209                 if (opt) {
1210                         if (WARN_ON(np->cork.opt))
1211                                 return -EINVAL;
1212
1213                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1214                         if (unlikely(np->cork.opt == NULL))
1215                                 return -ENOBUFS;
1216
1217                         np->cork.opt->tot_len = opt->tot_len;
1218                         np->cork.opt->opt_flen = opt->opt_flen;
1219                         np->cork.opt->opt_nflen = opt->opt_nflen;
1220
1221                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1222                                                             sk->sk_allocation);
1223                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1224                                 return -ENOBUFS;
1225
1226                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1227                                                             sk->sk_allocation);
1228                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1229                                 return -ENOBUFS;
1230
1231                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1232                                                            sk->sk_allocation);
1233                         if (opt->hopopt && !np->cork.opt->hopopt)
1234                                 return -ENOBUFS;
1235
1236                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1237                                                             sk->sk_allocation);
1238                         if (opt->srcrt && !np->cork.opt->srcrt)
1239                                 return -ENOBUFS;
1240
1241                         /* need source address above miyazawa*/
1242                 }
1243                 dst_hold(&rt->dst);
1244                 cork->dst = &rt->dst;
1245                 inet->cork.fl.u.ip6 = *fl6;
1246                 np->cork.hop_limit = hlimit;
1247                 np->cork.tclass = tclass;
1248                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1249                       rt->dst.dev->mtu : dst_mtu(&rt->dst);
1250                 if (np->frag_size < mtu) {
1251                         if (np->frag_size)
1252                                 mtu = np->frag_size;
1253                 }
1254                 cork->fragsize = mtu;
1255                 if (dst_allfrag(rt->dst.path))
1256                         cork->flags |= IPCORK_ALLFRAG;
1257                 cork->length = 0;
1258                 sk->sk_sndmsg_page = NULL;
1259                 sk->sk_sndmsg_off = 0;
1260                 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1261                 length += exthdrlen;
1262                 transhdrlen += exthdrlen;
1263                 dst_exthdrlen = rt->dst.header_len;
1264         } else {
1265                 rt = (struct rt6_info *)cork->dst;
1266                 fl6 = &inet->cork.fl.u.ip6;
1267                 opt = np->cork.opt;
1268                 transhdrlen = 0;
1269                 exthdrlen = 0;
1270                 dst_exthdrlen = 0;
1271                 mtu = cork->fragsize;
1272         }
1273
1274         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1275
1276         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1277                         (opt ? opt->opt_nflen : 0);
1278         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1279
1280         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1281                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1282                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1283                         return -EMSGSIZE;
1284                 }
1285         }
1286
1287         /* For UDP, check if TX timestamp is enabled */
1288         if (sk->sk_type == SOCK_DGRAM) {
1289                 err = sock_tx_timestamp(sk, &tx_flags);
1290                 if (err)
1291                         goto error;
1292         }
1293
1294         /*
1295          * Let's try using as much space as possible.
1296          * Use MTU if total length of the message fits into the MTU.
1297          * Otherwise, we need to reserve fragment header and
1298          * fragment alignment (= 8-15 octects, in total).
1299          *
1300          * Note that we may need to "move" the data from the tail of
1301          * of the buffer to the new fragment when we split
1302          * the message.
1303          *
1304          * FIXME: It may be fragmented into multiple chunks
1305          *        at once if non-fragmentable extension headers
1306          *        are too large.
1307          * --yoshfuji
1308          */
1309
1310         cork->length += length;
1311         if (length > mtu) {
1312                 int proto = sk->sk_protocol;
1313                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1314                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1315                         return -EMSGSIZE;
1316                 }
1317
1318                 if (proto == IPPROTO_UDP &&
1319                     (rt->dst.dev->features & NETIF_F_UFO)) {
1320
1321                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1322                                                   hh_len, fragheaderlen,
1323                                                   transhdrlen, mtu, flags, rt);
1324                         if (err)
1325                                 goto error;
1326                         return 0;
1327                 }
1328         }
1329
1330         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1331                 goto alloc_new_skb;
1332
1333         while (length > 0) {
1334                 /* Check if the remaining data fits into current packet. */
1335                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1336                 if (copy < length)
1337                         copy = maxfraglen - skb->len;
1338
1339                 if (copy <= 0) {
1340                         char *data;
1341                         unsigned int datalen;
1342                         unsigned int fraglen;
1343                         unsigned int fraggap;
1344                         unsigned int alloclen;
1345                         struct sk_buff *skb_prev;
1346 alloc_new_skb:
1347                         skb_prev = skb;
1348
1349                         /* There's no room in the current skb */
1350                         if (skb_prev)
1351                                 fraggap = skb_prev->len - maxfraglen;
1352                         else
1353                                 fraggap = 0;
1354
1355                         /*
1356                          * If remaining data exceeds the mtu,
1357                          * we know we need more fragment(s).
1358                          */
1359                         datalen = length + fraggap;
1360                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1361                                 datalen = maxfraglen - fragheaderlen;
1362
1363                         fraglen = datalen + fragheaderlen;
1364                         if ((flags & MSG_MORE) &&
1365                             !(rt->dst.dev->features&NETIF_F_SG))
1366                                 alloclen = mtu;
1367                         else
1368                                 alloclen = datalen + fragheaderlen;
1369
1370                         alloclen += dst_exthdrlen;
1371
1372                         /*
1373                          * The last fragment gets additional space at tail.
1374                          * Note: we overallocate on fragments with MSG_MODE
1375                          * because we have no idea if we're the last one.
1376                          */
1377                         if (datalen == length + fraggap)
1378                                 alloclen += rt->dst.trailer_len;
1379
1380                         /*
1381                          * We just reserve space for fragment header.
1382                          * Note: this may be overallocation if the message
1383                          * (without MSG_MORE) fits into the MTU.
1384                          */
1385                         alloclen += sizeof(struct frag_hdr);
1386
1387                         if (transhdrlen) {
1388                                 skb = sock_alloc_send_skb(sk,
1389                                                 alloclen + hh_len,
1390                                                 (flags & MSG_DONTWAIT), &err);
1391                         } else {
1392                                 skb = NULL;
1393                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1394                                     2 * sk->sk_sndbuf)
1395                                         skb = sock_wmalloc(sk,
1396                                                            alloclen + hh_len, 1,
1397                                                            sk->sk_allocation);
1398                                 if (unlikely(skb == NULL))
1399                                         err = -ENOBUFS;
1400                                 else {
1401                                         /* Only the initial fragment
1402                                          * is time stamped.
1403                                          */
1404                                         tx_flags = 0;
1405                                 }
1406                         }
1407                         if (skb == NULL)
1408                                 goto error;
1409                         /*
1410                          *      Fill in the control structures
1411                          */
1412                         skb->ip_summed = csummode;
1413                         skb->csum = 0;
1414                         /* reserve for fragmentation */
1415                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1416
1417                         if (sk->sk_type == SOCK_DGRAM)
1418                                 skb_shinfo(skb)->tx_flags = tx_flags;
1419
1420                         /*
1421                          *      Find where to start putting bytes
1422                          */
1423                         data = skb_put(skb, fraglen + dst_exthdrlen);
1424                         skb_set_network_header(skb, exthdrlen + dst_exthdrlen);
1425                         data += fragheaderlen + dst_exthdrlen;
1426                         skb->transport_header = (skb->network_header +
1427                                                  fragheaderlen);
1428                         if (fraggap) {
1429                                 skb->csum = skb_copy_and_csum_bits(
1430                                         skb_prev, maxfraglen,
1431                                         data + transhdrlen, fraggap, 0);
1432                                 skb_prev->csum = csum_sub(skb_prev->csum,
1433                                                           skb->csum);
1434                                 data += fraggap;
1435                                 pskb_trim_unique(skb_prev, maxfraglen);
1436                         }
1437                         copy = datalen - transhdrlen - fraggap;
1438
1439                         if (copy < 0) {
1440                                 err = -EINVAL;
1441                                 kfree_skb(skb);
1442                                 goto error;
1443                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1444                                 err = -EFAULT;
1445                                 kfree_skb(skb);
1446                                 goto error;
1447                         }
1448
1449                         offset += copy;
1450                         length -= datalen - fraggap;
1451                         transhdrlen = 0;
1452                         exthdrlen = 0;
1453                         dst_exthdrlen = 0;
1454                         csummode = CHECKSUM_NONE;
1455
1456                         /*
1457                          * Put the packet on the pending queue
1458                          */
1459                         __skb_queue_tail(&sk->sk_write_queue, skb);
1460                         continue;
1461                 }
1462
1463                 if (copy > length)
1464                         copy = length;
1465
1466                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1467                         unsigned int off;
1468
1469                         off = skb->len;
1470                         if (getfrag(from, skb_put(skb, copy),
1471                                                 offset, copy, off, skb) < 0) {
1472                                 __skb_trim(skb, off);
1473                                 err = -EFAULT;
1474                                 goto error;
1475                         }
1476                 } else {
1477                         int i = skb_shinfo(skb)->nr_frags;
1478                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1479                         struct page *page = sk->sk_sndmsg_page;
1480                         int off = sk->sk_sndmsg_off;
1481                         unsigned int left;
1482
1483                         if (page && (left = PAGE_SIZE - off) > 0) {
1484                                 if (copy >= left)
1485                                         copy = left;
1486                                 if (page != skb_frag_page(frag)) {
1487                                         if (i == MAX_SKB_FRAGS) {
1488                                                 err = -EMSGSIZE;
1489                                                 goto error;
1490                                         }
1491                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1492                                         skb_frag_ref(skb, i);
1493                                         frag = &skb_shinfo(skb)->frags[i];
1494                                 }
1495                         } else if(i < MAX_SKB_FRAGS) {
1496                                 if (copy > PAGE_SIZE)
1497                                         copy = PAGE_SIZE;
1498                                 page = alloc_pages(sk->sk_allocation, 0);
1499                                 if (page == NULL) {
1500                                         err = -ENOMEM;
1501                                         goto error;
1502                                 }
1503                                 sk->sk_sndmsg_page = page;
1504                                 sk->sk_sndmsg_off = 0;
1505
1506                                 skb_fill_page_desc(skb, i, page, 0, 0);
1507                                 frag = &skb_shinfo(skb)->frags[i];
1508                         } else {
1509                                 err = -EMSGSIZE;
1510                                 goto error;
1511                         }
1512                         if (getfrag(from,
1513                                     skb_frag_address(frag) + skb_frag_size(frag),
1514                                     offset, copy, skb->len, skb) < 0) {
1515                                 err = -EFAULT;
1516                                 goto error;
1517                         }
1518                         sk->sk_sndmsg_off += copy;
1519                         skb_frag_size_add(frag, copy);
1520                         skb->len += copy;
1521                         skb->data_len += copy;
1522                         skb->truesize += copy;
1523                         atomic_add(copy, &sk->sk_wmem_alloc);
1524                 }
1525                 offset += copy;
1526                 length -= copy;
1527         }
1528         return 0;
1529 error:
1530         cork->length -= length;
1531         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1532         return err;
1533 }
1534
1535 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1536 {
1537         if (np->cork.opt) {
1538                 kfree(np->cork.opt->dst0opt);
1539                 kfree(np->cork.opt->dst1opt);
1540                 kfree(np->cork.opt->hopopt);
1541                 kfree(np->cork.opt->srcrt);
1542                 kfree(np->cork.opt);
1543                 np->cork.opt = NULL;
1544         }
1545
1546         if (inet->cork.base.dst) {
1547                 dst_release(inet->cork.base.dst);
1548                 inet->cork.base.dst = NULL;
1549                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1550         }
1551         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1552 }
1553
1554 int ip6_push_pending_frames(struct sock *sk)
1555 {
1556         struct sk_buff *skb, *tmp_skb;
1557         struct sk_buff **tail_skb;
1558         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1559         struct inet_sock *inet = inet_sk(sk);
1560         struct ipv6_pinfo *np = inet6_sk(sk);
1561         struct net *net = sock_net(sk);
1562         struct ipv6hdr *hdr;
1563         struct ipv6_txoptions *opt = np->cork.opt;
1564         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1565         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1566         unsigned char proto = fl6->flowi6_proto;
1567         int err = 0;
1568
1569         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1570                 goto out;
1571         tail_skb = &(skb_shinfo(skb)->frag_list);
1572
1573         /* move skb->data to ip header from ext header */
1574         if (skb->data < skb_network_header(skb))
1575                 __skb_pull(skb, skb_network_offset(skb));
1576         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1577                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1578                 *tail_skb = tmp_skb;
1579                 tail_skb = &(tmp_skb->next);
1580                 skb->len += tmp_skb->len;
1581                 skb->data_len += tmp_skb->len;
1582                 skb->truesize += tmp_skb->truesize;
1583                 tmp_skb->destructor = NULL;
1584                 tmp_skb->sk = NULL;
1585         }
1586
1587         /* Allow local fragmentation. */
1588         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1589                 skb->local_df = 1;
1590
1591         ipv6_addr_copy(final_dst, &fl6->daddr);
1592         __skb_pull(skb, skb_network_header_len(skb));
1593         if (opt && opt->opt_flen)
1594                 ipv6_push_frag_opts(skb, opt, &proto);
1595         if (opt && opt->opt_nflen)
1596                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1597
1598         skb_push(skb, sizeof(struct ipv6hdr));
1599         skb_reset_network_header(skb);
1600         hdr = ipv6_hdr(skb);
1601
1602         *(__be32*)hdr = fl6->flowlabel |
1603                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1604
1605         hdr->hop_limit = np->cork.hop_limit;
1606         hdr->nexthdr = proto;
1607         ipv6_addr_copy(&hdr->saddr, &fl6->saddr);
1608         ipv6_addr_copy(&hdr->daddr, final_dst);
1609
1610         skb->priority = sk->sk_priority;
1611         skb->mark = sk->sk_mark;
1612
1613         skb_dst_set(skb, dst_clone(&rt->dst));
1614         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1615         if (proto == IPPROTO_ICMPV6) {
1616                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1617
1618                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1619                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1620         }
1621
1622         err = ip6_local_out(skb);
1623         if (err) {
1624                 if (err > 0)
1625                         err = net_xmit_errno(err);
1626                 if (err)
1627                         goto error;
1628         }
1629
1630 out:
1631         ip6_cork_release(inet, np);
1632         return err;
1633 error:
1634         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1635         goto out;
1636 }
1637
1638 void ip6_flush_pending_frames(struct sock *sk)
1639 {
1640         struct sk_buff *skb;
1641
1642         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1643                 if (skb_dst(skb))
1644                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1645                                       IPSTATS_MIB_OUTDISCARDS);
1646                 kfree_skb(skb);
1647         }
1648
1649         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1650 }