[IPV6]: Do not ignore IPV6_MTU socket option.
[linux-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation 
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
9  *
10  *      Based on linux/net/ipv4/ip_output.c
11  *
12  *      This program is free software; you can redistribute it and/or
13  *      modify it under the terms of the GNU General Public License
14  *      as published by the Free Software Foundation; either version
15  *      2 of the License, or (at your option) any later version.
16  *
17  *      Changes:
18  *      A.N.Kuznetsov   :       airthmetics in fragmentation.
19  *                              extension headers are implemented.
20  *                              route changes now work.
21  *                              ip6_forward does not confuse sniffers.
22  *                              etc.
23  *
24  *      H. von Brand    :       Added missing #include <linux/string.h>
25  *      Imran Patel     :       frag id should be in NBO
26  *      Kazunori MIYAZAWA @USAGI
27  *                      :       add ip6_append_data and related functions
28  *                              for datagram xmit
29  */
30
31 #include <linux/config.h>
32 #include <linux/errno.h>
33 #include <linux/types.h>
34 #include <linux/string.h>
35 #include <linux/socket.h>
36 #include <linux/net.h>
37 #include <linux/netdevice.h>
38 #include <linux/if_arp.h>
39 #include <linux/in6.h>
40 #include <linux/tcp.h>
41 #include <linux/route.h>
42
43 #include <linux/netfilter.h>
44 #include <linux/netfilter_ipv6.h>
45
46 #include <net/sock.h>
47 #include <net/snmp.h>
48
49 #include <net/ipv6.h>
50 #include <net/ndisc.h>
51 #include <net/protocol.h>
52 #include <net/ip6_route.h>
53 #include <net/addrconf.h>
54 #include <net/rawv6.h>
55 #include <net/icmp.h>
56 #include <net/xfrm.h>
57 #include <net/checksum.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
62 {
63         static u32 ipv6_fragmentation_id = 1;
64         static DEFINE_SPINLOCK(ip6_id_lock);
65
66         spin_lock_bh(&ip6_id_lock);
67         fhdr->identification = htonl(ipv6_fragmentation_id);
68         if (++ipv6_fragmentation_id == 0)
69                 ipv6_fragmentation_id = 1;
70         spin_unlock_bh(&ip6_id_lock);
71 }
72
73 static inline int ip6_output_finish(struct sk_buff *skb)
74 {
75
76         struct dst_entry *dst = skb->dst;
77         struct hh_cache *hh = dst->hh;
78
79         if (hh) {
80                 int hh_alen;
81
82                 read_lock_bh(&hh->hh_lock);
83                 hh_alen = HH_DATA_ALIGN(hh->hh_len);
84                 memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
85                 read_unlock_bh(&hh->hh_lock);
86                 skb_push(skb, hh->hh_len);
87                 return hh->hh_output(skb);
88         } else if (dst->neighbour)
89                 return dst->neighbour->output(skb);
90
91         IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
92         kfree_skb(skb);
93         return -EINVAL;
94
95 }
96
97 /* dev_loopback_xmit for use with netfilter. */
98 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
99 {
100         newskb->mac.raw = newskb->data;
101         __skb_pull(newskb, newskb->nh.raw - newskb->data);
102         newskb->pkt_type = PACKET_LOOPBACK;
103         newskb->ip_summed = CHECKSUM_UNNECESSARY;
104         BUG_TRAP(newskb->dst);
105
106         netif_rx(newskb);
107         return 0;
108 }
109
110
111 static int ip6_output2(struct sk_buff *skb)
112 {
113         struct dst_entry *dst = skb->dst;
114         struct net_device *dev = dst->dev;
115
116         skb->protocol = htons(ETH_P_IPV6);
117         skb->dev = dev;
118
119         if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) {
120                 struct ipv6_pinfo* np = skb->sk ? inet6_sk(skb->sk) : NULL;
121
122                 if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
123                     ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr,
124                                 &skb->nh.ipv6h->saddr)) {
125                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
126
127                         /* Do not check for IFF_ALLMULTI; multicast routing
128                            is not supported in any case.
129                          */
130                         if (newskb)
131                                 NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL,
132                                         newskb->dev,
133                                         ip6_dev_loopback_xmit);
134
135                         if (skb->nh.ipv6h->hop_limit == 0) {
136                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
137                                 kfree_skb(skb);
138                                 return 0;
139                         }
140                 }
141
142                 IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
143         }
144
145         return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
146 }
147
148 int ip6_output(struct sk_buff *skb)
149 {
150         if ((skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->ufo_size) ||
151                                 dst_allfrag(skb->dst))
152                 return ip6_fragment(skb, ip6_output2);
153         else
154                 return ip6_output2(skb);
155 }
156
157 /*
158  *      xmit an sk_buff (used by TCP)
159  */
160
161 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
162              struct ipv6_txoptions *opt, int ipfragok)
163 {
164         struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL;
165         struct in6_addr *first_hop = &fl->fl6_dst;
166         struct dst_entry *dst = skb->dst;
167         struct ipv6hdr *hdr;
168         u8  proto = fl->proto;
169         int seg_len = skb->len;
170         int hlimit, tclass;
171         u32 mtu;
172
173         if (opt) {
174                 int head_room;
175
176                 /* First: exthdrs may take lots of space (~8K for now)
177                    MAX_HEADER is not enough.
178                  */
179                 head_room = opt->opt_nflen + opt->opt_flen;
180                 seg_len += head_room;
181                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
182
183                 if (skb_headroom(skb) < head_room) {
184                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
185                         kfree_skb(skb);
186                         skb = skb2;
187                         if (skb == NULL) {      
188                                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
189                                 return -ENOBUFS;
190                         }
191                         if (sk)
192                                 skb_set_owner_w(skb, sk);
193                 }
194                 if (opt->opt_flen)
195                         ipv6_push_frag_opts(skb, opt, &proto);
196                 if (opt->opt_nflen)
197                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
198         }
199
200         hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr));
201
202         /*
203          *      Fill in the IPv6 header
204          */
205
206         hlimit = -1;
207         if (np)
208                 hlimit = np->hop_limit;
209         if (hlimit < 0)
210                 hlimit = dst_metric(dst, RTAX_HOPLIMIT);
211         if (hlimit < 0)
212                 hlimit = ipv6_get_hoplimit(dst->dev);
213
214         tclass = -1;
215         if (np)
216                 tclass = np->tclass;
217         if (tclass < 0)
218                 tclass = 0;
219
220         *(u32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
221
222         hdr->payload_len = htons(seg_len);
223         hdr->nexthdr = proto;
224         hdr->hop_limit = hlimit;
225
226         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
227         ipv6_addr_copy(&hdr->daddr, first_hop);
228
229         skb->priority = sk->sk_priority;
230
231         mtu = dst_mtu(dst);
232         if ((skb->len <= mtu) || ipfragok) {
233                 IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
234                 return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev,
235                                 dst_output);
236         }
237
238         if (net_ratelimit())
239                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
240         skb->dev = dst->dev;
241         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
242         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
243         kfree_skb(skb);
244         return -EMSGSIZE;
245 }
246
247 /*
248  *      To avoid extra problems ND packets are send through this
249  *      routine. It's code duplication but I really want to avoid
250  *      extra checks since ipv6_build_header is used by TCP (which
251  *      is for us performance critical)
252  */
253
254 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
255                struct in6_addr *saddr, struct in6_addr *daddr,
256                int proto, int len)
257 {
258         struct ipv6_pinfo *np = inet6_sk(sk);
259         struct ipv6hdr *hdr;
260         int totlen;
261
262         skb->protocol = htons(ETH_P_IPV6);
263         skb->dev = dev;
264
265         totlen = len + sizeof(struct ipv6hdr);
266
267         hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
268         skb->nh.ipv6h = hdr;
269
270         *(u32*)hdr = htonl(0x60000000);
271
272         hdr->payload_len = htons(len);
273         hdr->nexthdr = proto;
274         hdr->hop_limit = np->hop_limit;
275
276         ipv6_addr_copy(&hdr->saddr, saddr);
277         ipv6_addr_copy(&hdr->daddr, daddr);
278
279         return 0;
280 }
281
282 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
283 {
284         struct ip6_ra_chain *ra;
285         struct sock *last = NULL;
286
287         read_lock(&ip6_ra_lock);
288         for (ra = ip6_ra_chain; ra; ra = ra->next) {
289                 struct sock *sk = ra->sk;
290                 if (sk && ra->sel == sel &&
291                     (!sk->sk_bound_dev_if ||
292                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
293                         if (last) {
294                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
295                                 if (skb2)
296                                         rawv6_rcv(last, skb2);
297                         }
298                         last = sk;
299                 }
300         }
301
302         if (last) {
303                 rawv6_rcv(last, skb);
304                 read_unlock(&ip6_ra_lock);
305                 return 1;
306         }
307         read_unlock(&ip6_ra_lock);
308         return 0;
309 }
310
311 static inline int ip6_forward_finish(struct sk_buff *skb)
312 {
313         return dst_output(skb);
314 }
315
316 int ip6_forward(struct sk_buff *skb)
317 {
318         struct dst_entry *dst = skb->dst;
319         struct ipv6hdr *hdr = skb->nh.ipv6h;
320         struct inet6_skb_parm *opt = IP6CB(skb);
321         
322         if (ipv6_devconf.forwarding == 0)
323                 goto error;
324
325         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
326                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
327                 goto drop;
328         }
329
330         skb->ip_summed = CHECKSUM_NONE;
331
332         /*
333          *      We DO NOT make any processing on
334          *      RA packets, pushing them to user level AS IS
335          *      without ane WARRANTY that application will be able
336          *      to interpret them. The reason is that we
337          *      cannot make anything clever here.
338          *
339          *      We are not end-node, so that if packet contains
340          *      AH/ESP, we cannot make anything.
341          *      Defragmentation also would be mistake, RA packets
342          *      cannot be fragmented, because there is no warranty
343          *      that different fragments will go along one path. --ANK
344          */
345         if (opt->ra) {
346                 u8 *ptr = skb->nh.raw + opt->ra;
347                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
348                         return 0;
349         }
350
351         /*
352          *      check and decrement ttl
353          */
354         if (hdr->hop_limit <= 1) {
355                 /* Force OUTPUT device used as source address */
356                 skb->dev = dst->dev;
357                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
358                             0, skb->dev);
359
360                 kfree_skb(skb);
361                 return -ETIMEDOUT;
362         }
363
364         if (!xfrm6_route_forward(skb)) {
365                 IP6_INC_STATS(IPSTATS_MIB_INDISCARDS);
366                 goto drop;
367         }
368         dst = skb->dst;
369
370         /* IPv6 specs say nothing about it, but it is clear that we cannot
371            send redirects to source routed frames.
372          */
373         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) {
374                 struct in6_addr *target = NULL;
375                 struct rt6_info *rt;
376                 struct neighbour *n = dst->neighbour;
377
378                 /*
379                  *      incoming and outgoing devices are the same
380                  *      send a redirect.
381                  */
382
383                 rt = (struct rt6_info *) dst;
384                 if ((rt->rt6i_flags & RTF_GATEWAY))
385                         target = (struct in6_addr*)&n->primary_key;
386                 else
387                         target = &hdr->daddr;
388
389                 /* Limit redirects both by destination (here)
390                    and by source (inside ndisc_send_redirect)
391                  */
392                 if (xrlim_allow(dst, 1*HZ))
393                         ndisc_send_redirect(skb, n, target);
394         } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK
395                                                 |IPV6_ADDR_LINKLOCAL)) {
396                 /* This check is security critical. */
397                 goto error;
398         }
399
400         if (skb->len > dst_mtu(dst)) {
401                 /* Again, force OUTPUT device used as source address */
402                 skb->dev = dst->dev;
403                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
404                 IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS);
405                 IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
406                 kfree_skb(skb);
407                 return -EMSGSIZE;
408         }
409
410         if (skb_cow(skb, dst->dev->hard_header_len)) {
411                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
412                 goto drop;
413         }
414
415         hdr = skb->nh.ipv6h;
416
417         /* Mangling hops number delayed to point after skb COW */
418  
419         hdr->hop_limit--;
420
421         IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
422         return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish);
423
424 error:
425         IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
426 drop:
427         kfree_skb(skb);
428         return -EINVAL;
429 }
430
431 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
432 {
433         to->pkt_type = from->pkt_type;
434         to->priority = from->priority;
435         to->protocol = from->protocol;
436         dst_release(to->dst);
437         to->dst = dst_clone(from->dst);
438         to->dev = from->dev;
439
440 #ifdef CONFIG_NET_SCHED
441         to->tc_index = from->tc_index;
442 #endif
443 #ifdef CONFIG_NETFILTER
444         to->nfmark = from->nfmark;
445         /* Connection association is same as pre-frag packet */
446         nf_conntrack_put(to->nfct);
447         to->nfct = from->nfct;
448         nf_conntrack_get(to->nfct);
449         to->nfctinfo = from->nfctinfo;
450 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
451         nf_conntrack_put_reasm(to->nfct_reasm);
452         to->nfct_reasm = from->nfct_reasm;
453         nf_conntrack_get_reasm(to->nfct_reasm);
454 #endif
455 #ifdef CONFIG_BRIDGE_NETFILTER
456         nf_bridge_put(to->nf_bridge);
457         to->nf_bridge = from->nf_bridge;
458         nf_bridge_get(to->nf_bridge);
459 #endif
460 #endif
461 }
462
463 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
464 {
465         u16 offset = sizeof(struct ipv6hdr);
466         struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1);
467         unsigned int packet_len = skb->tail - skb->nh.raw;
468         int found_rhdr = 0;
469         *nexthdr = &skb->nh.ipv6h->nexthdr;
470
471         while (offset + 1 <= packet_len) {
472
473                 switch (**nexthdr) {
474
475                 case NEXTHDR_HOP:
476                 case NEXTHDR_ROUTING:
477                 case NEXTHDR_DEST:
478                         if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1;
479                         if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset;
480                         offset += ipv6_optlen(exthdr);
481                         *nexthdr = &exthdr->nexthdr;
482                         exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset);
483                         break;
484                 default :
485                         return offset;
486                 }
487         }
488
489         return offset;
490 }
491
492 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
493 {
494         struct net_device *dev;
495         struct sk_buff *frag;
496         struct rt6_info *rt = (struct rt6_info*)skb->dst;
497         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
498         struct ipv6hdr *tmp_hdr;
499         struct frag_hdr *fh;
500         unsigned int mtu, hlen, left, len;
501         u32 frag_id = 0;
502         int ptr, offset = 0, err=0;
503         u8 *prevhdr, nexthdr = 0;
504
505         dev = rt->u.dst.dev;
506         hlen = ip6_find_1stfragopt(skb, &prevhdr);
507         nexthdr = *prevhdr;
508
509         mtu = dst_mtu(&rt->u.dst);
510         if (np && np->frag_size < mtu) {
511                 if (np->frag_size)
512                         mtu = np->frag_size;
513         }
514         mtu -= hlen + sizeof(struct frag_hdr);
515
516         if (skb_shinfo(skb)->frag_list) {
517                 int first_len = skb_pagelen(skb);
518
519                 if (first_len - hlen > mtu ||
520                     ((first_len - hlen) & 7) ||
521                     skb_cloned(skb))
522                         goto slow_path;
523
524                 for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
525                         /* Correct geometry. */
526                         if (frag->len > mtu ||
527                             ((frag->len & 7) && frag->next) ||
528                             skb_headroom(frag) < hlen)
529                             goto slow_path;
530
531                         /* Partially cloned skb? */
532                         if (skb_shared(frag))
533                                 goto slow_path;
534
535                         BUG_ON(frag->sk);
536                         if (skb->sk) {
537                                 sock_hold(skb->sk);
538                                 frag->sk = skb->sk;
539                                 frag->destructor = sock_wfree;
540                                 skb->truesize -= frag->truesize;
541                         }
542                 }
543
544                 err = 0;
545                 offset = 0;
546                 frag = skb_shinfo(skb)->frag_list;
547                 skb_shinfo(skb)->frag_list = NULL;
548                 /* BUILD HEADER */
549
550                 tmp_hdr = kmalloc(hlen, GFP_ATOMIC);
551                 if (!tmp_hdr) {
552                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
553                         return -ENOMEM;
554                 }
555
556                 *prevhdr = NEXTHDR_FRAGMENT;
557                 memcpy(tmp_hdr, skb->nh.raw, hlen);
558                 __skb_pull(skb, hlen);
559                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
560                 skb->nh.raw = __skb_push(skb, hlen);
561                 memcpy(skb->nh.raw, tmp_hdr, hlen);
562
563                 ipv6_select_ident(skb, fh);
564                 fh->nexthdr = nexthdr;
565                 fh->reserved = 0;
566                 fh->frag_off = htons(IP6_MF);
567                 frag_id = fh->identification;
568
569                 first_len = skb_pagelen(skb);
570                 skb->data_len = first_len - skb_headlen(skb);
571                 skb->len = first_len;
572                 skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr));
573  
574
575                 for (;;) {
576                         /* Prepare header of the next frame,
577                          * before previous one went down. */
578                         if (frag) {
579                                 frag->ip_summed = CHECKSUM_NONE;
580                                 frag->h.raw = frag->data;
581                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
582                                 frag->nh.raw = __skb_push(frag, hlen);
583                                 memcpy(frag->nh.raw, tmp_hdr, hlen);
584                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
585                                 fh->nexthdr = nexthdr;
586                                 fh->reserved = 0;
587                                 fh->frag_off = htons(offset);
588                                 if (frag->next != NULL)
589                                         fh->frag_off |= htons(IP6_MF);
590                                 fh->identification = frag_id;
591                                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
592                                 ip6_copy_metadata(frag, skb);
593                         }
594                         
595                         err = output(skb);
596                         if (err || !frag)
597                                 break;
598
599                         skb = frag;
600                         frag = skb->next;
601                         skb->next = NULL;
602                 }
603
604                 kfree(tmp_hdr);
605
606                 if (err == 0) {
607                         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
608                         return 0;
609                 }
610
611                 while (frag) {
612                         skb = frag->next;
613                         kfree_skb(frag);
614                         frag = skb;
615                 }
616
617                 IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
618                 return err;
619         }
620
621 slow_path:
622         left = skb->len - hlen;         /* Space per frame */
623         ptr = hlen;                     /* Where to start from */
624
625         /*
626          *      Fragment the datagram.
627          */
628
629         *prevhdr = NEXTHDR_FRAGMENT;
630
631         /*
632          *      Keep copying data until we run out.
633          */
634         while(left > 0) {
635                 len = left;
636                 /* IF: it doesn't fit, use 'mtu' - the data space left */
637                 if (len > mtu)
638                         len = mtu;
639                 /* IF: we are not sending upto and including the packet end
640                    then align the next start on an eight byte boundary */
641                 if (len < left) {
642                         len &= ~7;
643                 }
644                 /*
645                  *      Allocate buffer.
646                  */
647
648                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
649                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
650                         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
651                         err = -ENOMEM;
652                         goto fail;
653                 }
654
655                 /*
656                  *      Set up data on packet
657                  */
658
659                 ip6_copy_metadata(frag, skb);
660                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
661                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
662                 frag->nh.raw = frag->data;
663                 fh = (struct frag_hdr*)(frag->data + hlen);
664                 frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr);
665
666                 /*
667                  *      Charge the memory for the fragment to any owner
668                  *      it might possess
669                  */
670                 if (skb->sk)
671                         skb_set_owner_w(frag, skb->sk);
672
673                 /*
674                  *      Copy the packet header into the new buffer.
675                  */
676                 memcpy(frag->nh.raw, skb->data, hlen);
677
678                 /*
679                  *      Build fragment header.
680                  */
681                 fh->nexthdr = nexthdr;
682                 fh->reserved = 0;
683                 if (!frag_id) {
684                         ipv6_select_ident(skb, fh);
685                         frag_id = fh->identification;
686                 } else
687                         fh->identification = frag_id;
688
689                 /*
690                  *      Copy a block of the IP datagram.
691                  */
692                 if (skb_copy_bits(skb, ptr, frag->h.raw, len))
693                         BUG();
694                 left -= len;
695
696                 fh->frag_off = htons(offset);
697                 if (left > 0)
698                         fh->frag_off |= htons(IP6_MF);
699                 frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
700
701                 ptr += len;
702                 offset += len;
703
704                 /*
705                  *      Put this fragment into the sending queue.
706                  */
707
708                 IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES);
709
710                 err = output(frag);
711                 if (err)
712                         goto fail;
713         }
714         kfree_skb(skb);
715         IP6_INC_STATS(IPSTATS_MIB_FRAGOKS);
716         return err;
717
718 fail:
719         kfree_skb(skb); 
720         IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS);
721         return err;
722 }
723
724 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
725 {
726         int err = 0;
727
728         *dst = NULL;
729         if (sk) {
730                 struct ipv6_pinfo *np = inet6_sk(sk);
731         
732                 *dst = sk_dst_check(sk, np->dst_cookie);
733                 if (*dst) {
734                         struct rt6_info *rt = (struct rt6_info*)*dst;
735         
736                                 /* Yes, checking route validity in not connected
737                                    case is not very simple. Take into account,
738                                    that we do not support routing by source, TOS,
739                                    and MSG_DONTROUTE            --ANK (980726)
740         
741                                    1. If route was host route, check that
742                                       cached destination is current.
743                                       If it is network route, we still may
744                                       check its validity using saved pointer
745                                       to the last used address: daddr_cache.
746                                       We do not want to save whole address now,
747                                       (because main consumer of this service
748                                        is tcp, which has not this problem),
749                                       so that the last trick works only on connected
750                                       sockets.
751                                    2. oif also should be the same.
752                                  */
753         
754                         if (((rt->rt6i_dst.plen != 128 ||
755                               !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr))
756                              && (np->daddr_cache == NULL ||
757                                  !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache)))
758                             || (fl->oif && fl->oif != (*dst)->dev->ifindex)) {
759                                 dst_release(*dst);
760                                 *dst = NULL;
761                         }
762                 }
763         }
764
765         if (*dst == NULL)
766                 *dst = ip6_route_output(sk, fl);
767
768         if ((err = (*dst)->error))
769                 goto out_err_release;
770
771         if (ipv6_addr_any(&fl->fl6_src)) {
772                 err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src);
773
774                 if (err)
775                         goto out_err_release;
776         }
777
778         return 0;
779
780 out_err_release:
781         dst_release(*dst);
782         *dst = NULL;
783         return err;
784 }
785
786 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
787
788 static inline int ip6_ufo_append_data(struct sock *sk,
789                         int getfrag(void *from, char *to, int offset, int len,
790                         int odd, struct sk_buff *skb),
791                         void *from, int length, int hh_len, int fragheaderlen,
792                         int transhdrlen, int mtu,unsigned int flags)
793
794 {
795         struct sk_buff *skb;
796         int err;
797
798         /* There is support for UDP large send offload by network
799          * device, so create one single skb packet containing complete
800          * udp datagram
801          */
802         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
803                 skb = sock_alloc_send_skb(sk,
804                         hh_len + fragheaderlen + transhdrlen + 20,
805                         (flags & MSG_DONTWAIT), &err);
806                 if (skb == NULL)
807                         return -ENOMEM;
808
809                 /* reserve space for Hardware header */
810                 skb_reserve(skb, hh_len);
811
812                 /* create space for UDP/IP header */
813                 skb_put(skb,fragheaderlen + transhdrlen);
814
815                 /* initialize network header pointer */
816                 skb->nh.raw = skb->data;
817
818                 /* initialize protocol header pointer */
819                 skb->h.raw = skb->data + fragheaderlen;
820
821                 skb->ip_summed = CHECKSUM_HW;
822                 skb->csum = 0;
823                 sk->sk_sndmsg_off = 0;
824         }
825
826         err = skb_append_datato_frags(sk,skb, getfrag, from,
827                                       (length - transhdrlen));
828         if (!err) {
829                 struct frag_hdr fhdr;
830
831                 /* specify the length of each IP datagram fragment*/
832                 skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen) - 
833                                                 sizeof(struct frag_hdr);
834                 ipv6_select_ident(skb, &fhdr);
835                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
836                 __skb_queue_tail(&sk->sk_write_queue, skb);
837
838                 return 0;
839         }
840         /* There is not enough support do UPD LSO,
841          * so follow normal path
842          */
843         kfree_skb(skb);
844
845         return err;
846 }
847
848 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
849         int offset, int len, int odd, struct sk_buff *skb),
850         void *from, int length, int transhdrlen,
851         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
852         struct rt6_info *rt, unsigned int flags)
853 {
854         struct inet_sock *inet = inet_sk(sk);
855         struct ipv6_pinfo *np = inet6_sk(sk);
856         struct sk_buff *skb;
857         unsigned int maxfraglen, fragheaderlen;
858         int exthdrlen;
859         int hh_len;
860         int mtu;
861         int copy;
862         int err;
863         int offset = 0;
864         int csummode = CHECKSUM_NONE;
865
866         if (flags&MSG_PROBE)
867                 return 0;
868         if (skb_queue_empty(&sk->sk_write_queue)) {
869                 /*
870                  * setup for corking
871                  */
872                 if (opt) {
873                         if (np->cork.opt == NULL) {
874                                 np->cork.opt = kmalloc(opt->tot_len,
875                                                        sk->sk_allocation);
876                                 if (unlikely(np->cork.opt == NULL))
877                                         return -ENOBUFS;
878                         } else if (np->cork.opt->tot_len < opt->tot_len) {
879                                 printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
880                                 return -EINVAL;
881                         }
882                         memcpy(np->cork.opt, opt, opt->tot_len);
883                         inet->cork.flags |= IPCORK_OPT;
884                         /* need source address above miyazawa*/
885                 }
886                 dst_hold(&rt->u.dst);
887                 np->cork.rt = rt;
888                 inet->cork.fl = *fl;
889                 np->cork.hop_limit = hlimit;
890                 np->cork.tclass = tclass;
891                 mtu = dst_mtu(rt->u.dst.path);
892                 if (np && np->frag_size < mtu) {
893                         if (np->frag_size)
894                                 mtu = np->frag_size;
895                 }
896                 inet->cork.fragsize = mtu;
897                 if (dst_allfrag(rt->u.dst.path))
898                         inet->cork.flags |= IPCORK_ALLFRAG;
899                 inet->cork.length = 0;
900                 sk->sk_sndmsg_page = NULL;
901                 sk->sk_sndmsg_off = 0;
902                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0);
903                 length += exthdrlen;
904                 transhdrlen += exthdrlen;
905         } else {
906                 rt = np->cork.rt;
907                 fl = &inet->cork.fl;
908                 if (inet->cork.flags & IPCORK_OPT)
909                         opt = np->cork.opt;
910                 transhdrlen = 0;
911                 exthdrlen = 0;
912                 mtu = inet->cork.fragsize;
913         }
914
915         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
916
917         fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
918         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
919
920         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
921                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
922                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
923                         return -EMSGSIZE;
924                 }
925         }
926
927         /*
928          * Let's try using as much space as possible.
929          * Use MTU if total length of the message fits into the MTU.
930          * Otherwise, we need to reserve fragment header and
931          * fragment alignment (= 8-15 octects, in total).
932          *
933          * Note that we may need to "move" the data from the tail of
934          * of the buffer to the new fragment when we split 
935          * the message.
936          *
937          * FIXME: It may be fragmented into multiple chunks 
938          *        at once if non-fragmentable extension headers
939          *        are too large.
940          * --yoshfuji 
941          */
942
943         inet->cork.length += length;
944         if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
945             (rt->u.dst.dev->features & NETIF_F_UFO)) {
946
947                 if(ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
948                                 fragheaderlen, transhdrlen, mtu, flags))
949                         goto error;
950
951                 return 0;
952         }
953
954         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
955                 goto alloc_new_skb;
956
957         while (length > 0) {
958                 /* Check if the remaining data fits into current packet. */
959                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
960                 if (copy < length)
961                         copy = maxfraglen - skb->len;
962
963                 if (copy <= 0) {
964                         char *data;
965                         unsigned int datalen;
966                         unsigned int fraglen;
967                         unsigned int fraggap;
968                         unsigned int alloclen;
969                         struct sk_buff *skb_prev;
970 alloc_new_skb:
971                         skb_prev = skb;
972
973                         /* There's no room in the current skb */
974                         if (skb_prev)
975                                 fraggap = skb_prev->len - maxfraglen;
976                         else
977                                 fraggap = 0;
978
979                         /*
980                          * If remaining data exceeds the mtu,
981                          * we know we need more fragment(s).
982                          */
983                         datalen = length + fraggap;
984                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
985                                 datalen = maxfraglen - fragheaderlen;
986
987                         fraglen = datalen + fragheaderlen;
988                         if ((flags & MSG_MORE) &&
989                             !(rt->u.dst.dev->features&NETIF_F_SG))
990                                 alloclen = mtu;
991                         else
992                                 alloclen = datalen + fragheaderlen;
993
994                         /*
995                          * The last fragment gets additional space at tail.
996                          * Note: we overallocate on fragments with MSG_MODE
997                          * because we have no idea if we're the last one.
998                          */
999                         if (datalen == length + fraggap)
1000                                 alloclen += rt->u.dst.trailer_len;
1001
1002                         /*
1003                          * We just reserve space for fragment header.
1004                          * Note: this may be overallocation if the message 
1005                          * (without MSG_MORE) fits into the MTU.
1006                          */
1007                         alloclen += sizeof(struct frag_hdr);
1008
1009                         if (transhdrlen) {
1010                                 skb = sock_alloc_send_skb(sk,
1011                                                 alloclen + hh_len,
1012                                                 (flags & MSG_DONTWAIT), &err);
1013                         } else {
1014                                 skb = NULL;
1015                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1016                                     2 * sk->sk_sndbuf)
1017                                         skb = sock_wmalloc(sk,
1018                                                            alloclen + hh_len, 1,
1019                                                            sk->sk_allocation);
1020                                 if (unlikely(skb == NULL))
1021                                         err = -ENOBUFS;
1022                         }
1023                         if (skb == NULL)
1024                                 goto error;
1025                         /*
1026                          *      Fill in the control structures
1027                          */
1028                         skb->ip_summed = csummode;
1029                         skb->csum = 0;
1030                         /* reserve for fragmentation */
1031                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1032
1033                         /*
1034                          *      Find where to start putting bytes
1035                          */
1036                         data = skb_put(skb, fraglen);
1037                         skb->nh.raw = data + exthdrlen;
1038                         data += fragheaderlen;
1039                         skb->h.raw = data + exthdrlen;
1040
1041                         if (fraggap) {
1042                                 skb->csum = skb_copy_and_csum_bits(
1043                                         skb_prev, maxfraglen,
1044                                         data + transhdrlen, fraggap, 0);
1045                                 skb_prev->csum = csum_sub(skb_prev->csum,
1046                                                           skb->csum);
1047                                 data += fraggap;
1048                                 skb_trim(skb_prev, maxfraglen);
1049                         }
1050                         copy = datalen - transhdrlen - fraggap;
1051                         if (copy < 0) {
1052                                 err = -EINVAL;
1053                                 kfree_skb(skb);
1054                                 goto error;
1055                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1056                                 err = -EFAULT;
1057                                 kfree_skb(skb);
1058                                 goto error;
1059                         }
1060
1061                         offset += copy;
1062                         length -= datalen - fraggap;
1063                         transhdrlen = 0;
1064                         exthdrlen = 0;
1065                         csummode = CHECKSUM_NONE;
1066
1067                         /*
1068                          * Put the packet on the pending queue
1069                          */
1070                         __skb_queue_tail(&sk->sk_write_queue, skb);
1071                         continue;
1072                 }
1073
1074                 if (copy > length)
1075                         copy = length;
1076
1077                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1078                         unsigned int off;
1079
1080                         off = skb->len;
1081                         if (getfrag(from, skb_put(skb, copy),
1082                                                 offset, copy, off, skb) < 0) {
1083                                 __skb_trim(skb, off);
1084                                 err = -EFAULT;
1085                                 goto error;
1086                         }
1087                 } else {
1088                         int i = skb_shinfo(skb)->nr_frags;
1089                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1090                         struct page *page = sk->sk_sndmsg_page;
1091                         int off = sk->sk_sndmsg_off;
1092                         unsigned int left;
1093
1094                         if (page && (left = PAGE_SIZE - off) > 0) {
1095                                 if (copy >= left)
1096                                         copy = left;
1097                                 if (page != frag->page) {
1098                                         if (i == MAX_SKB_FRAGS) {
1099                                                 err = -EMSGSIZE;
1100                                                 goto error;
1101                                         }
1102                                         get_page(page);
1103                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1104                                         frag = &skb_shinfo(skb)->frags[i];
1105                                 }
1106                         } else if(i < MAX_SKB_FRAGS) {
1107                                 if (copy > PAGE_SIZE)
1108                                         copy = PAGE_SIZE;
1109                                 page = alloc_pages(sk->sk_allocation, 0);
1110                                 if (page == NULL) {
1111                                         err = -ENOMEM;
1112                                         goto error;
1113                                 }
1114                                 sk->sk_sndmsg_page = page;
1115                                 sk->sk_sndmsg_off = 0;
1116
1117                                 skb_fill_page_desc(skb, i, page, 0, 0);
1118                                 frag = &skb_shinfo(skb)->frags[i];
1119                                 skb->truesize += PAGE_SIZE;
1120                                 atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
1121                         } else {
1122                                 err = -EMSGSIZE;
1123                                 goto error;
1124                         }
1125                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1126                                 err = -EFAULT;
1127                                 goto error;
1128                         }
1129                         sk->sk_sndmsg_off += copy;
1130                         frag->size += copy;
1131                         skb->len += copy;
1132                         skb->data_len += copy;
1133                 }
1134                 offset += copy;
1135                 length -= copy;
1136         }
1137         return 0;
1138 error:
1139         inet->cork.length -= length;
1140         IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1141         return err;
1142 }
1143
1144 int ip6_push_pending_frames(struct sock *sk)
1145 {
1146         struct sk_buff *skb, *tmp_skb;
1147         struct sk_buff **tail_skb;
1148         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1149         struct inet_sock *inet = inet_sk(sk);
1150         struct ipv6_pinfo *np = inet6_sk(sk);
1151         struct ipv6hdr *hdr;
1152         struct ipv6_txoptions *opt = np->cork.opt;
1153         struct rt6_info *rt = np->cork.rt;
1154         struct flowi *fl = &inet->cork.fl;
1155         unsigned char proto = fl->proto;
1156         int err = 0;
1157
1158         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1159                 goto out;
1160         tail_skb = &(skb_shinfo(skb)->frag_list);
1161
1162         /* move skb->data to ip header from ext header */
1163         if (skb->data < skb->nh.raw)
1164                 __skb_pull(skb, skb->nh.raw - skb->data);
1165         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1166                 __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
1167                 *tail_skb = tmp_skb;
1168                 tail_skb = &(tmp_skb->next);
1169                 skb->len += tmp_skb->len;
1170                 skb->data_len += tmp_skb->len;
1171                 skb->truesize += tmp_skb->truesize;
1172                 __sock_put(tmp_skb->sk);
1173                 tmp_skb->destructor = NULL;
1174                 tmp_skb->sk = NULL;
1175         }
1176
1177         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1178         __skb_pull(skb, skb->h.raw - skb->nh.raw);
1179         if (opt && opt->opt_flen)
1180                 ipv6_push_frag_opts(skb, opt, &proto);
1181         if (opt && opt->opt_nflen)
1182                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1183
1184         skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr));
1185         
1186         *(u32*)hdr = fl->fl6_flowlabel |
1187                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1188
1189         if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN)
1190                 hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
1191         else
1192                 hdr->payload_len = 0;
1193         hdr->hop_limit = np->cork.hop_limit;
1194         hdr->nexthdr = proto;
1195         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1196         ipv6_addr_copy(&hdr->daddr, final_dst);
1197
1198         skb->priority = sk->sk_priority;
1199
1200         skb->dst = dst_clone(&rt->u.dst);
1201         IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); 
1202         err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output);
1203         if (err) {
1204                 if (err > 0)
1205                         err = np->recverr ? net_xmit_errno(err) : 0;
1206                 if (err)
1207                         goto error;
1208         }
1209
1210 out:
1211         inet->cork.flags &= ~IPCORK_OPT;
1212         kfree(np->cork.opt);
1213         np->cork.opt = NULL;
1214         if (np->cork.rt) {
1215                 dst_release(&np->cork.rt->u.dst);
1216                 np->cork.rt = NULL;
1217                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1218         }
1219         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1220         return err;
1221 error:
1222         goto out;
1223 }
1224
1225 void ip6_flush_pending_frames(struct sock *sk)
1226 {
1227         struct inet_sock *inet = inet_sk(sk);
1228         struct ipv6_pinfo *np = inet6_sk(sk);
1229         struct sk_buff *skb;
1230
1231         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1232                 IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
1233                 kfree_skb(skb);
1234         }
1235
1236         inet->cork.flags &= ~IPCORK_OPT;
1237
1238         kfree(np->cork.opt);
1239         np->cork.opt = NULL;
1240         if (np->cork.rt) {
1241                 dst_release(&np->cork.rt->u.dst);
1242                 np->cork.rt = NULL;
1243                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1244         }
1245         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1246 }