d97e07183ce9c12123ece97e954004b912c88b30
[linux-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetics in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
int ip6_local_out(struct sk_buff *skb)
{
	int ret = __ip6_local_out(skb);

	/* A return value of 1 from the hook means the packet was accepted
	 * and not consumed, so we must push it to dst_output() ourselves;
	 * any other value (stolen/queued/error) is passed straight back.
	 */
	if (likely(ret == 1))
		ret = dst_output(skb);

	return ret;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
/* dev_loopback_xmit for use with netfilter.
 * Rewinds the skb so it looks like a freshly received frame and feeds it
 * back into the local receive path (used to loop multicast back to local
 * listeners).  Always returns 0.
 */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* Locally generated data: checksum already known to be good. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	/* Caller must have attached a dst before looping the skb back. */
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}
98
/*
 * Final transmit step: resolve the dst's neighbour entry and emit the
 * packet.  Multicast destinations may additionally be cloned and looped
 * back to local listeners, and are accounted under OUTMCAST.
 */
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back locally when the socket wants mcast
		 * loopback and either a multicast-router socket exists
		 * (and the packet was not already forwarded) or a local
		 * listener has joined the group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* hop_limit 0 means the packet must not leave the
			 * host: the loopback copy above is all that is
			 * delivered; drop the original. */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				skb->len);
	}

	/* Neighbour pointer is RCU-protected; hold the read lock across
	 * the lookup and the output call. */
	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	/* No neighbour entry: nowhere to send the packet. */
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 *
 *	Pushes extension headers (from @opt) and the IPv6 header onto an
 *	skb whose transport payload is already in place, then hands it to
 *	the NF_INET_LOCAL_OUT hook with dst_output as continuation.
 *	Oversized packets that may not be sent get ICMPV6_PKT_TOOBIG
 *	delivered back to the sender and -EMSGSIZE returned.
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;	/* -1: not set, fall back to route hoplimit */
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			/* Re-own the reallocated skb so socket write-memory
			 * accounting stays correct. */
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			/* first_hop is passed by reference and may be
			 * updated by the pushed options. */
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* First 32 bits: version 6, traffic class, flow label. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	/* Send if it fits the MTU, local_df allows it, or GSO will
	 * segment it later. */
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	/* Too big and not fragmentable here: notify the local sender. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
265
266 /*
267  *      To avoid extra problems ND packets are send through this
268  *      routine. It's code duplication but I really want to avoid
269  *      extra checks since ipv6_build_header is used by TCP (which
270  *      is for us performance critical)
271  */
272
273 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
274                const struct in6_addr *saddr, const struct in6_addr *daddr,
275                int proto, int len)
276 {
277         struct ipv6_pinfo *np = inet6_sk(sk);
278         struct ipv6hdr *hdr;
279
280         skb->protocol = htons(ETH_P_IPV6);
281         skb->dev = dev;
282
283         skb_reset_network_header(skb);
284         skb_put(skb, sizeof(struct ipv6hdr));
285         hdr = ipv6_hdr(skb);
286
287         *(__be32*)hdr = htonl(0x60000000);
288
289         hdr->payload_len = htons(len);
290         hdr->nexthdr = proto;
291         hdr->hop_limit = np->hop_limit;
292
293         hdr->saddr = *saddr;
294         hdr->daddr = *daddr;
295
296         return 0;
297 }
298
/*
 * Deliver a packet to every raw socket registered on ip6_ra_chain whose
 * selector matches @sel (and, if the socket is device-bound, whose bound
 * device matches the ingress device).  Returns 1 when the skb was
 * consumed by at least one listener, 0 when the caller still owns it.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Every listener except the final one receives a
			 * clone; the original skb is handed to the last
			 * match below, saving one copy. */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
327
/*
 * Classify a packet whose destination is proxied by this host.
 * Returns:
 *   1  - neighbour-discovery ICMPv6 message; caller should hand it to
 *        local input instead of forwarding,
 *   0  - forward normally,
 *  -1  - link-local destination that cannot be proxied; link failure
 *        has been signalled and the caller should drop the packet.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Skip any extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
379
/* Final step of forwarding after the NF_INET_FORWARD hook: hand the
 * packet to its dst output path. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
384
/*
 * Forward an IPv6 packet that is not addressed to this host.
 * Performs the router-side checks (forwarding enabled, xfrm policy,
 * hop limit, MTU), handles Router Alert and proxy-ND special cases,
 * may emit an NDISC redirect, decrements the hop limit, and finally
 * runs the packet through the NF_INET_FORWARD hook.
 * Consumes the skb on every path; returns 0 or a negative errno.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	/* Not acting as a router: refuse to forward. */
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		/* ptr[2..3] hold the Router Alert value in network order. */
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm may have replaced the route; reload it. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour_noref(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	/* Routers never fragment; tell the sender the packet is too big. */
	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow may have copied the header; re-read the pointer. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
541
542 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 {
544         to->pkt_type = from->pkt_type;
545         to->priority = from->priority;
546         to->protocol = from->protocol;
547         skb_dst_drop(to);
548         skb_dst_set(to, dst_clone(skb_dst(from)));
549         to->dev = from->dev;
550         to->mark = from->mark;
551
552 #ifdef CONFIG_NET_SCHED
553         to->tc_index = from->tc_index;
554 #endif
555         nf_copy(to, from);
556 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
557     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
558         to->nf_trace = from->nf_trace;
559 #endif
560         skb_copy_secmark(to, from);
561 }
562
/*
 * Walk the extension-header chain to find the offset (from the network
 * header) at which a fragment header should be inserted.  On return,
 * *nexthdr points at the "next header" byte that the caller will
 * rewrite to NEXTHDR_FRAGMENT.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	/* Total length of headers present in the linear area. */
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			/* Keep walking past a destination-options header
			 * that carries a Home Address option (MIPv6). */
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			/* A dest-options header after a routing header is
			 * where the fragment header goes. */
			if (found_rhdr)
				return offset;
			break;
		default :
			/* First non-extension (or fragmentable) header. */
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
601
602 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
603 {
604         static atomic_t ipv6_fragmentation_id;
605         int old, new;
606
607         if (rt && !(rt->dst.flags & DST_NOPEER)) {
608                 struct inet_peer *peer;
609
610                 if (!rt->rt6i_peer)
611                         rt6_bind_peer(rt, 1);
612                 peer = rt->rt6i_peer;
613                 if (peer) {
614                         fhdr->identification = htonl(inet_getid(peer, 0));
615                         return;
616                 }
617         }
618         do {
619                 old = atomic_read(&ipv6_fragmentation_id);
620                 new = old + 1;
621                 if (!new)
622                         new = 1;
623         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
624         fhdr->identification = htonl(new);
625 }
626
627 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
628 {
629         struct sk_buff *frag;
630         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
631         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
632         struct ipv6hdr *tmp_hdr;
633         struct frag_hdr *fh;
634         unsigned int mtu, hlen, left, len;
635         int hroom, troom;
636         __be32 frag_id = 0;
637         int ptr, offset = 0, err=0;
638         u8 *prevhdr, nexthdr = 0;
639         struct net *net = dev_net(skb_dst(skb)->dev);
640
641         hlen = ip6_find_1stfragopt(skb, &prevhdr);
642         nexthdr = *prevhdr;
643
644         mtu = ip6_skb_dst_mtu(skb);
645
646         /* We must not fragment if the socket is set to force MTU discovery
647          * or if the skb it not generated by a local socket.
648          */
649         if (!skb->local_df && skb->len > mtu) {
650                 skb->dev = skb_dst(skb)->dev;
651                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
652                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
653                               IPSTATS_MIB_FRAGFAILS);
654                 kfree_skb(skb);
655                 return -EMSGSIZE;
656         }
657
658         if (np && np->frag_size < mtu) {
659                 if (np->frag_size)
660                         mtu = np->frag_size;
661         }
662         mtu -= hlen + sizeof(struct frag_hdr);
663
664         if (skb_has_frag_list(skb)) {
665                 int first_len = skb_pagelen(skb);
666                 struct sk_buff *frag2;
667
668                 if (first_len - hlen > mtu ||
669                     ((first_len - hlen) & 7) ||
670                     skb_cloned(skb))
671                         goto slow_path;
672
673                 skb_walk_frags(skb, frag) {
674                         /* Correct geometry. */
675                         if (frag->len > mtu ||
676                             ((frag->len & 7) && frag->next) ||
677                             skb_headroom(frag) < hlen)
678                                 goto slow_path_clean;
679
680                         /* Partially cloned skb? */
681                         if (skb_shared(frag))
682                                 goto slow_path_clean;
683
684                         BUG_ON(frag->sk);
685                         if (skb->sk) {
686                                 frag->sk = skb->sk;
687                                 frag->destructor = sock_wfree;
688                         }
689                         skb->truesize -= frag->truesize;
690                 }
691
692                 err = 0;
693                 offset = 0;
694                 frag = skb_shinfo(skb)->frag_list;
695                 skb_frag_list_init(skb);
696                 /* BUILD HEADER */
697
698                 *prevhdr = NEXTHDR_FRAGMENT;
699                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
700                 if (!tmp_hdr) {
701                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
702                                       IPSTATS_MIB_FRAGFAILS);
703                         return -ENOMEM;
704                 }
705
706                 __skb_pull(skb, hlen);
707                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
708                 __skb_push(skb, hlen);
709                 skb_reset_network_header(skb);
710                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
711
712                 ipv6_select_ident(fh, rt);
713                 fh->nexthdr = nexthdr;
714                 fh->reserved = 0;
715                 fh->frag_off = htons(IP6_MF);
716                 frag_id = fh->identification;
717
718                 first_len = skb_pagelen(skb);
719                 skb->data_len = first_len - skb_headlen(skb);
720                 skb->len = first_len;
721                 ipv6_hdr(skb)->payload_len = htons(first_len -
722                                                    sizeof(struct ipv6hdr));
723
724                 dst_hold(&rt->dst);
725
726                 for (;;) {
727                         /* Prepare header of the next frame,
728                          * before previous one went down. */
729                         if (frag) {
730                                 frag->ip_summed = CHECKSUM_NONE;
731                                 skb_reset_transport_header(frag);
732                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
733                                 __skb_push(frag, hlen);
734                                 skb_reset_network_header(frag);
735                                 memcpy(skb_network_header(frag), tmp_hdr,
736                                        hlen);
737                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
738                                 fh->nexthdr = nexthdr;
739                                 fh->reserved = 0;
740                                 fh->frag_off = htons(offset);
741                                 if (frag->next != NULL)
742                                         fh->frag_off |= htons(IP6_MF);
743                                 fh->identification = frag_id;
744                                 ipv6_hdr(frag)->payload_len =
745                                                 htons(frag->len -
746                                                       sizeof(struct ipv6hdr));
747                                 ip6_copy_metadata(frag, skb);
748                         }
749
750                         err = output(skb);
751                         if(!err)
752                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
753                                               IPSTATS_MIB_FRAGCREATES);
754
755                         if (err || !frag)
756                                 break;
757
758                         skb = frag;
759                         frag = skb->next;
760                         skb->next = NULL;
761                 }
762
763                 kfree(tmp_hdr);
764
765                 if (err == 0) {
766                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
767                                       IPSTATS_MIB_FRAGOKS);
768                         dst_release(&rt->dst);
769                         return 0;
770                 }
771
772                 while (frag) {
773                         skb = frag->next;
774                         kfree_skb(frag);
775                         frag = skb;
776                 }
777
778                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
779                               IPSTATS_MIB_FRAGFAILS);
780                 dst_release(&rt->dst);
781                 return err;
782
783 slow_path_clean:
784                 skb_walk_frags(skb, frag2) {
785                         if (frag2 == frag)
786                                 break;
787                         frag2->sk = NULL;
788                         frag2->destructor = NULL;
789                         skb->truesize += frag2->truesize;
790                 }
791         }
792
793 slow_path:
794         left = skb->len - hlen;         /* Space per frame */
795         ptr = hlen;                     /* Where to start from */
796
797         /*
798          *      Fragment the datagram.
799          */
800
801         *prevhdr = NEXTHDR_FRAGMENT;
802         hroom = LL_RESERVED_SPACE(rt->dst.dev);
803         troom = rt->dst.dev->needed_tailroom;
804
805         /*
806          *      Keep copying data until we run out.
807          */
808         while(left > 0) {
809                 len = left;
810                 /* IF: it doesn't fit, use 'mtu' - the data space left */
811                 if (len > mtu)
812                         len = mtu;
813                 /* IF: we are not sending up to and including the packet end
814                    then align the next start on an eight byte boundary */
815                 if (len < left) {
816                         len &= ~7;
817                 }
818                 /*
819                  *      Allocate buffer.
820                  */
821
822                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
823                                       hroom + troom, GFP_ATOMIC)) == NULL) {
824                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
825                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
826                                       IPSTATS_MIB_FRAGFAILS);
827                         err = -ENOMEM;
828                         goto fail;
829                 }
830
831                 /*
832                  *      Set up data on packet
833                  */
834
835                 ip6_copy_metadata(frag, skb);
836                 skb_reserve(frag, hroom);
837                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
838                 skb_reset_network_header(frag);
839                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
840                 frag->transport_header = (frag->network_header + hlen +
841                                           sizeof(struct frag_hdr));
842
843                 /*
844                  *      Charge the memory for the fragment to any owner
845                  *      it might possess
846                  */
847                 if (skb->sk)
848                         skb_set_owner_w(frag, skb->sk);
849
850                 /*
851                  *      Copy the packet header into the new buffer.
852                  */
853                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
854
855                 /*
856                  *      Build fragment header.
857                  */
858                 fh->nexthdr = nexthdr;
859                 fh->reserved = 0;
860                 if (!frag_id) {
861                         ipv6_select_ident(fh, rt);
862                         frag_id = fh->identification;
863                 } else
864                         fh->identification = frag_id;
865
866                 /*
867                  *      Copy a block of the IP datagram.
868                  */
869                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
870                         BUG();
871                 left -= len;
872
873                 fh->frag_off = htons(offset);
874                 if (left > 0)
875                         fh->frag_off |= htons(IP6_MF);
876                 ipv6_hdr(frag)->payload_len = htons(frag->len -
877                                                     sizeof(struct ipv6hdr));
878
879                 ptr += len;
880                 offset += len;
881
882                 /*
883                  *      Put this fragment into the sending queue.
884                  */
885                 err = output(frag);
886                 if (err)
887                         goto fail;
888
889                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
890                               IPSTATS_MIB_FRAGCREATES);
891         }
892         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
893                       IPSTATS_MIB_FRAGOKS);
894         kfree_skb(skb);
895         return err;
896
897 fail:
898         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
899                       IPSTATS_MIB_FRAGFAILS);
900         kfree_skb(skb);
901         return err;
902 }
903
904 static inline int ip6_rt_check(const struct rt6key *rt_key,
905                                const struct in6_addr *fl_addr,
906                                const struct in6_addr *addr_cache)
907 {
908         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
909                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
910 }
911
912 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
913                                           struct dst_entry *dst,
914                                           const struct flowi6 *fl6)
915 {
916         struct ipv6_pinfo *np = inet6_sk(sk);
917         struct rt6_info *rt = (struct rt6_info *)dst;
918
919         if (!dst)
920                 goto out;
921
922         /* Yes, checking route validity in not connected
923          * case is not very simple. Take into account,
924          * that we do not support routing by source, TOS,
925          * and MSG_DONTROUTE            --ANK (980726)
926          *
927          * 1. ip6_rt_check(): If route was host route,
928          *    check that cached destination is current.
929          *    If it is network route, we still may
930          *    check its validity using saved pointer
931          *    to the last used address: daddr_cache.
932          *    We do not want to save whole address now,
933          *    (because main consumer of this service
934          *    is tcp, which has not this problem),
935          *    so that the last trick works only on connected
936          *    sockets.
937          * 2. oif also should be the same.
938          */
939         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
940 #ifdef CONFIG_IPV6_SUBTREES
941             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
942 #endif
943             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
944                 dst_release(dst);
945                 dst = NULL;
946         }
947
948 out:
949         return dst;
950 }
951
/*
 * ip6_dst_lookup_tail - finish a route lookup for @fl6.
 *
 * On entry *@dst may already hold a candidate route (e.g. from the
 * socket cache); when it is NULL a fresh routing lookup is done.
 * Also fills in fl6->saddr via source address selection when the
 * caller left it unspecified.  Returns 0 on success; on failure the
 * route in *@dst is released, *@dst is set to NULL and a negative
 * errno is returned.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	/* ip6_route_output() reports failure through dst->error rather
	 * than a NULL return. */
	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		/* Caller gave no source address: run source address
		 * selection against the route we just found. */
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour_noref(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);	/* drop ref taken by ipv6_get_ifaddr() */

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Re-run the lookup with an unspecified daddr so the
			 * default route / nexthop router is selected. */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
1026
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *	*@dst is reset to NULL first, so any route cached on the socket
 *	is ignored and a fresh lookup is always performed.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1043
1044 /**
1045  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1046  *      @sk: socket which provides route info
1047  *      @fl6: flow to lookup
1048  *      @final_dst: final destination address for ipsec lookup
1049  *      @can_sleep: we are in a sleepable context
1050  *
1051  *      This function performs a route lookup on the given flow.
1052  *
1053  *      It returns a valid dst pointer on success, or a pointer encoded
1054  *      error code.
1055  */
1056 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1057                                       const struct in6_addr *final_dst,
1058                                       bool can_sleep)
1059 {
1060         struct dst_entry *dst = NULL;
1061         int err;
1062
1063         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1064         if (err)
1065                 return ERR_PTR(err);
1066         if (final_dst)
1067                 fl6->daddr = *final_dst;
1068         if (can_sleep)
1069                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1070
1071         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1072 }
1073 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1074
1075 /**
1076  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1077  *      @sk: socket which provides the dst cache and route info
1078  *      @fl6: flow to lookup
1079  *      @final_dst: final destination address for ipsec lookup
1080  *      @can_sleep: we are in a sleepable context
1081  *
1082  *      This function performs a route lookup on the given flow with the
1083  *      possibility of using the cached route in the socket if it is valid.
1084  *      It will take the socket dst lock when operating on the dst cache.
1085  *      As a result, this function can only be used in process context.
1086  *
1087  *      It returns a valid dst pointer on success, or a pointer encoded
1088  *      error code.
1089  */
1090 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1091                                          const struct in6_addr *final_dst,
1092                                          bool can_sleep)
1093 {
1094         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1095         int err;
1096
1097         dst = ip6_sk_dst_check(sk, dst, fl6);
1098
1099         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1100         if (err)
1101                 return ERR_PTR(err);
1102         if (final_dst)
1103                 fl6->daddr = *final_dst;
1104         if (can_sleep)
1105                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1106
1107         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1108 }
1109 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1110
/*
 * ip6_ufo_append_data - queue one large skb for a UFO-capable device.
 *
 * Instead of software fragmentation, builds (or extends) a single skb
 * covering the whole UDP datagram and lets the NIC segment it.  The
 * gso_size and the IPv6 fragment id the driver will use are stored in
 * skb_shinfo().  Returns 0 on success or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu,unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		/* the device finishes the checksum for us */
		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	/* Append the payload as page fragments; transhdrlen bytes of
	 * transport header were already accounted in skb_put() above. */
	err = skb_append_datato_frags(sk,skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support do UPD LSO,
	 * so follow normal path
	 */
	/* NOTE(review): when skb was obtained via skb_peek_tail() above it
	 * is still linked on sk_write_queue, so kfree_skb() here would free
	 * a queued skb -- verify this error path cannot be reached with a
	 * previously queued skb. */
	kfree_skb(skb);

	return err;
}
1173
1174 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1175                                                gfp_t gfp)
1176 {
1177         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1178 }
1179
1180 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1181                                                 gfp_t gfp)
1182 {
1183         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1184 }
1185
/*
 * ip6_append_data - append user data to the socket's pending (corked)
 * output queue.
 *
 * @getfrag copies user data into the skbs being built; @transhdrlen is
 * non-zero only on the first call for a datagram (it covers the
 * transport header).  The first call also sets up the cork state
 * (duplicated extension headers, route, fragment size); later calls
 * reuse it.  Data is packed into MTU-sized skbs, each reserving room
 * for a fragment header, so ip6_push_pending_frames() can transmit or
 * fragment them.  Returns 0 on success or a negative errno (on error
 * the bytes not appended are removed from cork->length again).
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;		/* running offset into the user data */
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			/* Duplicate the caller's extension headers so the
			 * cork keeps its own copy until flush time. */
			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		/* Pin the route and flow in the cork for later calls. */
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* Fragment size: device MTU when probing PMTU, else the
		 * path MTU, optionally capped by IPV6_MTU (frag_size). */
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		/* Subsequent call: reuse the state cached in the cork and
		 * ignore the per-call route/options arguments. */
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header overhead, and the largest 8-byte-aligned
	 * fragment payload boundary within the MTU. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		/* Reject datagrams whose payload cannot be expressed in the
		 * 16-bit IPv6 payload length field. */
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		/* IPV6_DONTFRAG: report the PMTU to the app instead of
		 * fragmenting locally. */
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		/* Hand over-MTU UDP to the device via UFO when supported. */
		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MODE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First skb of the datagram: may block per
				 * MSG_DONTWAIT. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Follow-up skbs: non-blocking, bounded by
				 * twice the send buffer. */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen + dst_exthdrlen);
			skb_set_network_header(skb, exthdrlen + dst_exthdrlen);
			data += fragheaderlen + dst_exthdrlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhang of the previous skb into
				 * this one, keeping its checksum consistent. */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Headers are only emitted into the first skb. */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy straight into the skb's
			 * linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: append into page fragments, reusing
			 * the socket's current send page when possible. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Undo the accounting for the bytes we failed to append. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1539
1540 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1541 {
1542         if (np->cork.opt) {
1543                 kfree(np->cork.opt->dst0opt);
1544                 kfree(np->cork.opt->dst1opt);
1545                 kfree(np->cork.opt->hopopt);
1546                 kfree(np->cork.opt->srcrt);
1547                 kfree(np->cork.opt);
1548                 np->cork.opt = NULL;
1549         }
1550
1551         if (inet->cork.base.dst) {
1552                 dst_release(inet->cork.base.dst);
1553                 inet->cork.base.dst = NULL;
1554                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1555         }
1556         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1557 }
1558
1559 int ip6_push_pending_frames(struct sock *sk)
1560 {
1561         struct sk_buff *skb, *tmp_skb;
1562         struct sk_buff **tail_skb;
1563         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1564         struct inet_sock *inet = inet_sk(sk);
1565         struct ipv6_pinfo *np = inet6_sk(sk);
1566         struct net *net = sock_net(sk);
1567         struct ipv6hdr *hdr;
1568         struct ipv6_txoptions *opt = np->cork.opt;
1569         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1570         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1571         unsigned char proto = fl6->flowi6_proto;
1572         int err = 0;
1573
1574         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1575                 goto out;
1576         tail_skb = &(skb_shinfo(skb)->frag_list);
1577
1578         /* move skb->data to ip header from ext header */
1579         if (skb->data < skb_network_header(skb))
1580                 __skb_pull(skb, skb_network_offset(skb));
1581         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1582                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1583                 *tail_skb = tmp_skb;
1584                 tail_skb = &(tmp_skb->next);
1585                 skb->len += tmp_skb->len;
1586                 skb->data_len += tmp_skb->len;
1587                 skb->truesize += tmp_skb->truesize;
1588                 tmp_skb->destructor = NULL;
1589                 tmp_skb->sk = NULL;
1590         }
1591
1592         /* Allow local fragmentation. */
1593         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1594                 skb->local_df = 1;
1595
1596         *final_dst = fl6->daddr;
1597         __skb_pull(skb, skb_network_header_len(skb));
1598         if (opt && opt->opt_flen)
1599                 ipv6_push_frag_opts(skb, opt, &proto);
1600         if (opt && opt->opt_nflen)
1601                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1602
1603         skb_push(skb, sizeof(struct ipv6hdr));
1604         skb_reset_network_header(skb);
1605         hdr = ipv6_hdr(skb);
1606
1607         *(__be32*)hdr = fl6->flowlabel |
1608                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1609
1610         hdr->hop_limit = np->cork.hop_limit;
1611         hdr->nexthdr = proto;
1612         hdr->saddr = fl6->saddr;
1613         hdr->daddr = *final_dst;
1614
1615         skb->priority = sk->sk_priority;
1616         skb->mark = sk->sk_mark;
1617
1618         skb_dst_set(skb, dst_clone(&rt->dst));
1619         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1620         if (proto == IPPROTO_ICMPV6) {
1621                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1622
1623                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1624                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1625         }
1626
1627         err = ip6_local_out(skb);
1628         if (err) {
1629                 if (err > 0)
1630                         err = net_xmit_errno(err);
1631                 if (err)
1632                         goto error;
1633         }
1634
1635 out:
1636         ip6_cork_release(inet, np);
1637         return err;
1638 error:
1639         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1640         goto out;
1641 }
1642
1643 void ip6_flush_pending_frames(struct sock *sk)
1644 {
1645         struct sk_buff *skb;
1646
1647         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1648                 if (skb_dst(skb))
1649                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1650                                       IPSTATS_MIB_OUTDISCARDS);
1651                 kfree_skb(skb);
1652         }
1653
1654         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1655 }