]> nv-tegra.nvidia Code Review - linux-2.6.git/blob - net/ipv6/route.c
[IPV6]: flowlabels are net-endian
[linux-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41
42 #ifdef  CONFIG_PROC_FS
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #endif
46
47 #include <net/snmp.h>
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #include <net/ndisc.h>
52 #include <net/addrconf.h>
53 #include <net/tcp.h>
54 #include <linux/rtnetlink.h>
55 #include <net/dst.h>
56 #include <net/xfrm.h>
57 #include <net/netevent.h>
58 #include <net/netlink.h>
59
60 #include <asm/uaccess.h>
61
62 #ifdef CONFIG_SYSCTL
63 #include <linux/sysctl.h>
64 #endif
65
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() become printk() calls; below that both expand to
 * nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
/* RDBG() takes a fully parenthesised printk argument list: RDBG(("fmt", a)). */
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * When non-zero, the policy lookups clone off-link routes through
 * rt6_alloc_clone() instead of handing back the matched entry itself.
 */
#define CLONE_OFFLINK_ROUTE 0
78
79 static int ip6_rt_max_size = 4096;
80 static int ip6_rt_gc_min_interval = HZ / 2;
81 static int ip6_rt_gc_timeout = 60*HZ;
82 int ip6_rt_gc_interval = 30*HZ;
83 static int ip6_rt_gc_elasticity = 9;
84 static int ip6_rt_mtu_expires = 10*60*HZ;
85 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
86
87 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
90 static void             ip6_dst_destroy(struct dst_entry *);
91 static void             ip6_dst_ifdown(struct dst_entry *,
92                                        struct net_device *dev, int how);
93 static int               ip6_dst_gc(void);
94
95 static int              ip6_pkt_discard(struct sk_buff *skb);
96 static int              ip6_pkt_discard_out(struct sk_buff *skb);
97 static void             ip6_link_failure(struct sk_buff *skb);
98 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
99
100 #ifdef CONFIG_IPV6_ROUTE_INFO
101 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex,
103                                            unsigned pref);
104 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
105                                            struct in6_addr *gwaddr, int ifindex);
106 #endif
107
108 static struct dst_ops ip6_dst_ops = {
109         .family                 =       AF_INET6,
110         .protocol               =       __constant_htons(ETH_P_IPV6),
111         .gc                     =       ip6_dst_gc,
112         .gc_thresh              =       1024,
113         .check                  =       ip6_dst_check,
114         .destroy                =       ip6_dst_destroy,
115         .ifdown                 =       ip6_dst_ifdown,
116         .negative_advice        =       ip6_negative_advice,
117         .link_failure           =       ip6_link_failure,
118         .update_pmtu            =       ip6_rt_update_pmtu,
119         .entry_size             =       sizeof(struct rt6_info),
120 };
121
122 struct rt6_info ip6_null_entry = {
123         .u = {
124                 .dst = {
125                         .__refcnt       = ATOMIC_INIT(1),
126                         .__use          = 1,
127                         .dev            = &loopback_dev,
128                         .obsolete       = -1,
129                         .error          = -ENETUNREACH,
130                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
131                         .input          = ip6_pkt_discard,
132                         .output         = ip6_pkt_discard_out,
133                         .ops            = &ip6_dst_ops,
134                         .path           = (struct dst_entry*)&ip6_null_entry,
135                 }
136         },
137         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
138         .rt6i_metric    = ~(u32) 0,
139         .rt6i_ref       = ATOMIC_INIT(1),
140 };
141
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/* Handlers for the two extra static reject routes defined below. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
static int ip6_pkt_blk_hole(struct sk_buff *skb);

/* Static reject route whose error is -EACCES. */
struct rt6_info ip6_prohibit_entry = {
	.u = {
		.dst = {
			.ops		= &ip6_dst_ops,
			.dev		= &loopback_dev,
			.path		= (struct dst_entry*)&ip6_prohibit_entry,
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.obsolete	= -1,
			.error		= -EACCES,
			.input		= ip6_pkt_prohibit,
			.output		= ip6_pkt_prohibit_out,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
		}
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

/*
 * Static reject route whose error is -EINVAL; the same handler is used
 * for both input and output.
 */
struct rt6_info ip6_blk_hole_entry = {
	.u = {
		.dst = {
			.ops		= &ip6_dst_ops,
			.dev		= &loopback_dev,
			.path		= (struct dst_entry*)&ip6_blk_hole_entry,
			.__refcnt	= ATOMIC_INIT(1),
			.__use		= 1,
			.obsolete	= -1,
			.error		= -EINVAL,
			.input		= ip6_pkt_blk_hole,
			.output		= ip6_pkt_blk_hole,
			.metrics	= { [RTAX_HOPLIMIT - 1] = 255, },
		}
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_metric	= ~(u32) 0,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif
189
190 /* allocate dst with ip6_dst_ops */
191 static __inline__ struct rt6_info *ip6_dst_alloc(void)
192 {
193         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
194 }
195
196 static void ip6_dst_destroy(struct dst_entry *dst)
197 {
198         struct rt6_info *rt = (struct rt6_info *)dst;
199         struct inet6_dev *idev = rt->rt6i_idev;
200
201         if (idev != NULL) {
202                 rt->rt6i_idev = NULL;
203                 in6_dev_put(idev);
204         }       
205 }
206
207 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
208                            int how)
209 {
210         struct rt6_info *rt = (struct rt6_info *)dst;
211         struct inet6_dev *idev = rt->rt6i_idev;
212
213         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
214                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
215                 if (loopback_idev != NULL) {
216                         rt->rt6i_idev = loopback_idev;
217                         in6_dev_put(idev);
218                 }
219         }
220 }
221
222 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
223 {
224         return (rt->rt6i_flags & RTF_EXPIRES &&
225                 time_after(jiffies, rt->rt6i_expires));
226 }
227
228 static inline int rt6_need_strict(struct in6_addr *daddr)
229 {
230         return (ipv6_addr_type(daddr) &
231                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
232 }
233
234 /*
235  *      Route lookup. Any table->tb6_lock is implied.
236  */
237
238 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
239                                                     int oif,
240                                                     int strict)
241 {
242         struct rt6_info *local = NULL;
243         struct rt6_info *sprt;
244
245         if (oif) {
246                 for (sprt = rt; sprt; sprt = sprt->u.next) {
247                         struct net_device *dev = sprt->rt6i_dev;
248                         if (dev->ifindex == oif)
249                                 return sprt;
250                         if (dev->flags & IFF_LOOPBACK) {
251                                 if (sprt->rt6i_idev == NULL ||
252                                     sprt->rt6i_idev->dev->ifindex != oif) {
253                                         if (strict && oif)
254                                                 continue;
255                                         if (local && (!oif || 
256                                                       local->rt6i_idev->dev->ifindex == oif))
257                                                 continue;
258                                 }
259                                 local = sprt;
260                         }
261                 }
262
263                 if (local)
264                         return local;
265
266                 if (strict)
267                         return &ip6_null_entry;
268         }
269         return rt;
270 }
271
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the next hop of @rt is not in a
 * NUD_VALID state and the per-interface probe interval
 * (cnf.rtr_probe_interval) has elapsed since neigh->updated, send a
 * Neighbour Solicitation to the solicited-node multicast address of the
 * next hop.  A NULL @rt or a route without a next hop is ignored.
 *
 * NOTE(review): neigh->updated is written while only the read side of
 * neigh->lock is held -- confirm this is intentional.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	if (!rt)
		return;

	neigh = rt->rt6i_nexthop;
	/* Cheap unlocked pre-check; repeated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;	/* rate-limit stamp */
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* The solicitation is sent with the lock released. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
307
308 /*
309  * Default Router Selection (RFC 2461 6.3.6)
310  */
311 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
312 {
313         struct net_device *dev = rt->rt6i_dev;
314         if (!oif || dev->ifindex == oif)
315                 return 2;
316         if ((dev->flags & IFF_LOOPBACK) &&
317             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
318                 return 1;
319         return 0;
320 }
321
322 static int inline rt6_check_neigh(struct rt6_info *rt)
323 {
324         struct neighbour *neigh = rt->rt6i_nexthop;
325         int m = 0;
326         if (rt->rt6i_flags & RTF_NONEXTHOP ||
327             !(rt->rt6i_flags & RTF_GATEWAY))
328                 m = 1;
329         else if (neigh) {
330                 read_lock_bh(&neigh->lock);
331                 if (neigh->nud_state & NUD_VALID)
332                         m = 2;
333                 else if (!(neigh->nud_state & NUD_FAILED))
334                         m = 1;
335                 read_unlock_bh(&neigh->lock);
336         }
337         return m;
338 }
339
340 static int rt6_score_route(struct rt6_info *rt, int oif,
341                            int strict)
342 {
343         int m, n;
344                 
345         m = rt6_check_dev(rt, oif);
346         if (!m && (strict & RT6_LOOKUP_F_IFACE))
347                 return -1;
348 #ifdef CONFIG_IPV6_ROUTER_PREF
349         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
350 #endif
351         n = rt6_check_neigh(rt);
352         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
353                 return -1;
354         return m;
355 }
356
357 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
358                                    int strict)
359 {
360         struct rt6_info *match = NULL, *last = NULL;
361         struct rt6_info *rt, *rt0 = *head;
362         u32 metric;
363         int mpri = -1;
364
365         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
366                   __FUNCTION__, head, head ? *head : NULL, oif);
367
368         for (rt = rt0, metric = rt0->rt6i_metric;
369              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
370              rt = rt->u.next) {
371                 int m;
372
373                 if (rt6_check_expired(rt))
374                         continue;
375
376                 last = rt;
377
378                 m = rt6_score_route(rt, oif, strict);
379                 if (m < 0)
380                         continue;
381
382                 if (m > mpri) {
383                         if (strict & RT6_LOOKUP_F_REACHABLE)
384                                 rt6_probe(match);
385                         match = rt;
386                         mpri = m;
387                 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
388                         rt6_probe(rt);
389                 }
390         }
391
392         if (!match &&
393             (strict & RT6_LOOKUP_F_REACHABLE) &&
394             last && last != rt0) {
395                 /* no entries matched; do round-robin */
396                 static DEFINE_SPINLOCK(lock);
397                 spin_lock(&lock);
398                 *head = rt0->u.next;
399                 rt0->u.next = last->u.next;
400                 last->u.next = rt0;
401                 spin_unlock(&lock);
402         }
403
404         RT6_TRACE("%s() => %p, score=%d\n",
405                   __FUNCTION__, match, mpri);
406
407         return (match ? match : &ip6_null_entry);
408 }
409
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Handle one Route Information option received from router @gwaddr on
 * @dev.
 *
 * @opt/@len: the raw option and its length in bytes.
 *
 * The option is validated, then the matching route-info route is
 * looked up.  A zero lifetime deletes an existing route; otherwise the
 * route is created if missing, or its preference refreshed, and its
 * expiry is (re)armed.  A lifetime of 0xffffffff means "never expires".
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	u32 lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/*
	 * Sanity check for prefix_len and length: the option length must
	 * be large enough to carry prefix_len bits of prefix (the checks
	 * are consistent with length counting 8-octet units, as in
	 * RFC 4191).
	 */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	/* An invalid preference value is treated as medium. */
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		pref = ICMPV6_ROUTER_PREF_MEDIUM;

	/* The lifetime arrives in network byte order: convert to host. */
	lifetime = ntohl(rinfo->lifetime);
	if (lifetime == 0xffffffff) {
		/* infinity */
	} else if (lifetime > 0x7fffffff/HZ) {
		/* Avoid arithmetic overflow in HZ * lifetime below */
		lifetime = 0x7fffffff/HZ - 1;
	}

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/*
		 * The option carries fewer than 16 prefix bytes: build a
		 * full address from the first prefix_len bits only.
		 */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);

	/* Zero lifetime withdraws the route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (lifetime == 0xffffffff) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* Presumably drops the reference returned by the get/add helper. */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
487
/*
 * BACKTRACK(saddr) - climb the FIB tree after a failed selection.
 *
 * Expects the enclosing function to provide the locals `rt` and `fn`
 * and the labels `out` and `restart`.  Does nothing unless `rt` is
 * &ip6_null_entry.  Otherwise it walks from `fn` towards the root:
 * reaching a node flagged RTN_TL_ROOT jumps to `out`; at each parent
 * that has a source-address subtree other than the one just left, the
 * subtree is searched for @saddr; as soon as the new `fn` carries
 * route information (RTN_RTINFO) control jumps to `restart`.
 *
 * The subtree is reached through FIB6_SUBTREE() in both the test and
 * the lookup, so the macro relies on that accessor alone.
 * NOTE(review): assumes FIB6_SUBTREE(pn) yields pn->subtree whenever it
 * is non-NULL -- confirm against its definition.
 */
#define BACKTRACK(saddr) \
do { \
	if (rt == &ip6_null_entry) { \
		struct fib6_node *pn; \
		while (1) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else \
				fn = pn; \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while(0)
505
506 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
507                                              struct flowi *fl, int flags)
508 {
509         struct fib6_node *fn;
510         struct rt6_info *rt;
511
512         read_lock_bh(&table->tb6_lock);
513         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
514 restart:
515         rt = fn->leaf;
516         rt = rt6_device_match(rt, fl->oif, flags);
517         BACKTRACK(&fl->fl6_src);
518 out:
519         dst_hold(&rt->u.dst);
520         read_unlock_bh(&table->tb6_lock);
521
522         rt->u.dst.lastuse = jiffies;
523         rt->u.dst.__use++;
524
525         return rt;
526
527 }
528
529 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
530                             int oif, int strict)
531 {
532         struct flowi fl = {
533                 .oif = oif,
534                 .nl_u = {
535                         .ip6_u = {
536                                 .daddr = *daddr,
537                         },
538                 },
539         };
540         struct dst_entry *dst;
541         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
542
543         if (saddr) {
544                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
545                 flags |= RT6_LOOKUP_F_HAS_SADDR;
546         }
547
548         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
549         if (dst->error == 0)
550                 return (struct rt6_info *) dst;
551
552         dst_release(dst);
553
554         return NULL;
555 }
556
/* ip6_ins_rt is called with table->tb6_lock NOT held (it takes the lock
   itself). It takes a new route entry; if the addition fails for any
   reason, the route is freed. In any case, if the caller does not hold
   a reference to it, it may be destroyed.
 */
562
563 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
564 {
565         int err;
566         struct fib6_table *table;
567
568         table = rt->rt6i_table;
569         write_lock_bh(&table->tb6_lock);
570         err = fib6_add(&table->tb6_root, rt, info);
571         write_unlock_bh(&table->tb6_lock);
572
573         return err;
574 }
575
576 int ip6_ins_rt(struct rt6_info *rt)
577 {
578         return __ip6_ins_rt(rt, NULL);
579 }
580
581 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
582                                       struct in6_addr *saddr)
583 {
584         struct rt6_info *rt;
585
586         /*
587          *      Clone the route.
588          */
589
590         rt = ip6_rt_copy(ort);
591
592         if (rt) {
593                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
594                         if (rt->rt6i_dst.plen != 128 &&
595                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
596                                 rt->rt6i_flags |= RTF_ANYCAST;
597                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
598                 }
599
600                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
601                 rt->rt6i_dst.plen = 128;
602                 rt->rt6i_flags |= RTF_CACHE;
603                 rt->u.dst.flags |= DST_HOST;
604
605 #ifdef CONFIG_IPV6_SUBTREES
606                 if (rt->rt6i_src.plen && saddr) {
607                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
608                         rt->rt6i_src.plen = 128;
609                 }
610 #endif
611
612                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
613
614         }
615
616         return rt;
617 }
618
619 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
620 {
621         struct rt6_info *rt = ip6_rt_copy(ort);
622         if (rt) {
623                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
624                 rt->rt6i_dst.plen = 128;
625                 rt->rt6i_flags |= RTF_CACHE;
626                 rt->u.dst.flags |= DST_HOST;
627                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
628         }
629         return rt;
630 }
631
632 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
633                                             struct flowi *fl, int flags)
634 {
635         struct fib6_node *fn;
636         struct rt6_info *rt, *nrt;
637         int strict = 0;
638         int attempts = 3;
639         int err;
640         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
641
642         strict |= flags & RT6_LOOKUP_F_IFACE;
643
644 relookup:
645         read_lock_bh(&table->tb6_lock);
646
647 restart_2:
648         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
649
650 restart:
651         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
652         BACKTRACK(&fl->fl6_src);
653         if (rt == &ip6_null_entry ||
654             rt->rt6i_flags & RTF_CACHE)
655                 goto out;
656
657         dst_hold(&rt->u.dst);
658         read_unlock_bh(&table->tb6_lock);
659
660         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
661                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
662         else {
663 #if CLONE_OFFLINK_ROUTE
664                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
665 #else
666                 goto out2;
667 #endif
668         }
669
670         dst_release(&rt->u.dst);
671         rt = nrt ? : &ip6_null_entry;
672
673         dst_hold(&rt->u.dst);
674         if (nrt) {
675                 err = ip6_ins_rt(nrt);
676                 if (!err)
677                         goto out2;
678         }
679
680         if (--attempts <= 0)
681                 goto out2;
682
683         /*
684          * Race condition! In the gap, when table->tb6_lock was
685          * released someone could insert this route.  Relookup.
686          */
687         dst_release(&rt->u.dst);
688         goto relookup;
689
690 out:
691         if (reachable) {
692                 reachable = 0;
693                 goto restart_2;
694         }
695         dst_hold(&rt->u.dst);
696         read_unlock_bh(&table->tb6_lock);
697 out2:
698         rt->u.dst.lastuse = jiffies;
699         rt->u.dst.__use++;
700
701         return rt;
702 }
703
704 void ip6_route_input(struct sk_buff *skb)
705 {
706         struct ipv6hdr *iph = skb->nh.ipv6h;
707         int flags = RT6_LOOKUP_F_HAS_SADDR;
708         struct flowi fl = {
709                 .iif = skb->dev->ifindex,
710                 .nl_u = {
711                         .ip6_u = {
712                                 .daddr = iph->daddr,
713                                 .saddr = iph->saddr,
714 #ifdef CONFIG_IPV6_ROUTE_FWMARK
715                                 .fwmark = skb->nfmark,
716 #endif
717                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
718                         },
719                 },
720                 .proto = iph->nexthdr,
721         };
722
723         if (rt6_need_strict(&iph->daddr))
724                 flags |= RT6_LOOKUP_F_IFACE;
725
726         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
727 }
728
729 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
730                                              struct flowi *fl, int flags)
731 {
732         struct fib6_node *fn;
733         struct rt6_info *rt, *nrt;
734         int strict = 0;
735         int attempts = 3;
736         int err;
737         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
738
739         strict |= flags & RT6_LOOKUP_F_IFACE;
740
741 relookup:
742         read_lock_bh(&table->tb6_lock);
743
744 restart_2:
745         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
746
747 restart:
748         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
749         BACKTRACK(&fl->fl6_src);
750         if (rt == &ip6_null_entry ||
751             rt->rt6i_flags & RTF_CACHE)
752                 goto out;
753
754         dst_hold(&rt->u.dst);
755         read_unlock_bh(&table->tb6_lock);
756
757         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
758                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
759         else {
760 #if CLONE_OFFLINK_ROUTE
761                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
762 #else
763                 goto out2;
764 #endif
765         }
766
767         dst_release(&rt->u.dst);
768         rt = nrt ? : &ip6_null_entry;
769
770         dst_hold(&rt->u.dst);
771         if (nrt) {
772                 err = ip6_ins_rt(nrt);
773                 if (!err)
774                         goto out2;
775         }
776
777         if (--attempts <= 0)
778                 goto out2;
779
780         /*
781          * Race condition! In the gap, when table->tb6_lock was
782          * released someone could insert this route.  Relookup.
783          */
784         dst_release(&rt->u.dst);
785         goto relookup;
786
787 out:
788         if (reachable) {
789                 reachable = 0;
790                 goto restart_2;
791         }
792         dst_hold(&rt->u.dst);
793         read_unlock_bh(&table->tb6_lock);
794 out2:
795         rt->u.dst.lastuse = jiffies;
796         rt->u.dst.__use++;
797         return rt;
798 }
799
800 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
801 {
802         int flags = 0;
803
804         if (rt6_need_strict(&fl->fl6_dst))
805                 flags |= RT6_LOOKUP_F_IFACE;
806
807         if (!ipv6_addr_any(&fl->fl6_src))
808                 flags |= RT6_LOOKUP_F_HAS_SADDR;
809
810         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
811 }
812
813
814 /*
815  *      Destination cache support functions
816  */
817
818 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
819 {
820         struct rt6_info *rt;
821
822         rt = (struct rt6_info *) dst;
823
824         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
825                 return dst;
826
827         return NULL;
828 }
829
830 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
831 {
832         struct rt6_info *rt = (struct rt6_info *) dst;
833
834         if (rt) {
835                 if (rt->rt6i_flags & RTF_CACHE)
836                         ip6_del_rt(rt);
837                 else
838                         dst_release(dst);
839         }
840         return NULL;
841 }
842
843 static void ip6_link_failure(struct sk_buff *skb)
844 {
845         struct rt6_info *rt;
846
847         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
848
849         rt = (struct rt6_info *) skb->dst;
850         if (rt) {
851                 if (rt->rt6i_flags&RTF_CACHE) {
852                         dst_set_expires(&rt->u.dst, 0);
853                         rt->rt6i_flags |= RTF_EXPIRES;
854                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
855                         rt->rt6i_node->fn_sernum = -1;
856         }
857 }
858
859 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
860 {
861         struct rt6_info *rt6 = (struct rt6_info*)dst;
862
863         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
864                 rt6->rt6i_flags |= RTF_MODIFIED;
865                 if (mtu < IPV6_MIN_MTU) {
866                         mtu = IPV6_MIN_MTU;
867                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
868                 }
869                 dst->metrics[RTAX_MTU-1] = mtu;
870                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
871         }
872 }
873
874 static int ipv6_get_mtu(struct net_device *dev);
875
876 static inline unsigned int ipv6_advmss(unsigned int mtu)
877 {
878         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
879
880         if (mtu < ip6_rt_min_advmss)
881                 mtu = ip6_rt_min_advmss;
882
883         /*
884          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
885          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
886          * IPV6_MAXPLEN is also valid and means: "any MSS, 
887          * rely only on pmtu discovery"
888          */
889         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
890                 mtu = IPV6_MAXPLEN;
891         return mtu;
892 }
893
894 static struct dst_entry *ndisc_dst_gc_list;
895 static DEFINE_SPINLOCK(ndisc_lock);
896
897 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
898                                   struct neighbour *neigh,
899                                   struct in6_addr *addr,
900                                   int (*output)(struct sk_buff *))
901 {
902         struct rt6_info *rt;
903         struct inet6_dev *idev = in6_dev_get(dev);
904
905         if (unlikely(idev == NULL))
906                 return NULL;
907
908         rt = ip6_dst_alloc();
909         if (unlikely(rt == NULL)) {
910                 in6_dev_put(idev);
911                 goto out;
912         }
913
914         dev_hold(dev);
915         if (neigh)
916                 neigh_hold(neigh);
917         else
918                 neigh = ndisc_get_neigh(dev, addr);
919
920         rt->rt6i_dev      = dev;
921         rt->rt6i_idev     = idev;
922         rt->rt6i_nexthop  = neigh;
923         atomic_set(&rt->u.dst.__refcnt, 1);
924         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
925         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
926         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
927         rt->u.dst.output  = output;
928
929 #if 0   /* there's no chance to use these for ndisc */
930         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
931                                 ? DST_HOST 
932                                 : 0;
933         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
934         rt->rt6i_dst.plen = 128;
935 #endif
936
937         spin_lock_bh(&ndisc_lock);
938         rt->u.dst.next = ndisc_dst_gc_list;
939         ndisc_dst_gc_list = &rt->u.dst;
940         spin_unlock_bh(&ndisc_lock);
941
942         fib6_force_start_gc();
943
944 out:
945         return (struct dst_entry *)rt;
946 }
947
948 int ndisc_dst_gc(int *more)
949 {
950         struct dst_entry *dst, *next, **pprev;
951         int freed;
952
953         next = NULL;
954         freed = 0;
955
956         spin_lock_bh(&ndisc_lock);
957         pprev = &ndisc_dst_gc_list;
958
959         while ((dst = *pprev) != NULL) {
960                 if (!atomic_read(&dst->__refcnt)) {
961                         *pprev = dst->next;
962                         dst_free(dst);
963                         freed++;
964                 } else {
965                         pprev = &dst->next;
966                         (*more)++;
967                 }
968         }
969
970         spin_unlock_bh(&ndisc_lock);
971
972         return freed;
973 }
974
975 static int ip6_dst_gc(void)
976 {
977         static unsigned expire = 30*HZ;
978         static unsigned long last_gc;
979         unsigned long now = jiffies;
980
981         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
982             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
983                 goto out;
984
985         expire++;
986         fib6_run_gc(expire);
987         last_gc = now;
988         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
989                 expire = ip6_rt_gc_timeout>>1;
990
991 out:
992         expire -= expire>>ip6_rt_gc_elasticity;
993         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
994 }
995
996 /* Clean host part of a prefix. Not necessary in radix tree,
997    but results in cleaner routing tables.
998
999    Remove it only when all the things will work!
1000  */
1001
1002 static int ipv6_get_mtu(struct net_device *dev)
1003 {
1004         int mtu = IPV6_MIN_MTU;
1005         struct inet6_dev *idev;
1006
1007         idev = in6_dev_get(dev);
1008         if (idev) {
1009                 mtu = idev->cnf.mtu6;
1010                 in6_dev_put(idev);
1011         }
1012         return mtu;
1013 }
1014
1015 int ipv6_get_hoplimit(struct net_device *dev)
1016 {
1017         int hoplimit = ipv6_devconf.hop_limit;
1018         struct inet6_dev *idev;
1019
1020         idev = in6_dev_get(dev);
1021         if (idev) {
1022                 hoplimit = idev->cnf.hop_limit;
1023                 in6_dev_put(idev);
1024         }
1025         return hoplimit;
1026 }
1027
1028 /*
1029  *
1030  */
1031
/*
 * ip6_route_add - build a route from @cfg and insert it into its table.
 *
 * @cfg: route description.  Note that fc_metric and fc_protocol are
 *       rewritten in place when they carry their "unspecified" values.
 *
 * Returns the result of __ip6_ins_rt() once the route has been fully set
 * up.  Earlier failures return:
 *   -EINVAL       prefix length > 128, a source prefix without
 *                 CONFIG_IPV6_SUBTREES, a non-unicast gateway, a gateway
 *                 route without a usable (non-loopback) device, or a
 *                 metric attribute type above RTAX_MAX;
 *   -ENODEV       no device / inet6_dev could be determined;
 *   -ENOBUFS      the FIB table could not be obtained;
 *   -ENOMEM       the route could not be allocated;
 *   -EHOSTUNREACH the gateway is not directly reachable on the device;
 *   or the error from the neighbour lookup.
 *
 * On every failure path (label "out") the dev and idev references taken
 * here are dropped and the partially built route is freed.  On the
 * install path the dev/idev pointers are stored in the route instead.
 */
1032 int ip6_route_add(struct fib6_config *cfg)
1033 {
1034         int err;
1035         struct rt6_info *rt = NULL;
1036         struct net_device *dev = NULL;
1037         struct inet6_dev *idev = NULL;
1038         struct fib6_table *table;
1039         int addr_type;
1040
1041         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1042                 return -EINVAL;
1043 #ifndef CONFIG_IPV6_SUBTREES
1044         if (cfg->fc_src_len)
1045                 return -EINVAL;
1046 #endif
             /* An explicit interface pins dev/idev; both references are held. */
1047         if (cfg->fc_ifindex) {
1048                 err = -ENODEV;
1049                 dev = dev_get_by_index(cfg->fc_ifindex);
1050                 if (!dev)
1051                         goto out;
1052                 idev = in6_dev_get(dev);
1053                 if (!idev)
1054                         goto out;
1055         }
1056
             /* Metric 0 means "not given": use the default user priority. */
1057         if (cfg->fc_metric == 0)
1058                 cfg->fc_metric = IP6_RT_PRIO_USER;
1059
1060         table = fib6_new_table(cfg->fc_table);
1061         if (table == NULL) {
1062                 err = -ENOBUFS;
1063                 goto out;
1064         }
1065
1066         rt = ip6_dst_alloc();
1067
1068         if (rt == NULL) {
1069                 err = -ENOMEM;
1070                 goto out;
1071         }
1072
1073         rt->u.dst.obsolete = -1;
1074         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1075
1076         if (cfg->fc_protocol == RTPROT_UNSPEC)
1077                 cfg->fc_protocol = RTPROT_BOOT;
1078         rt->rt6i_protocol = cfg->fc_protocol;
1079
1080         addr_type = ipv6_addr_type(&cfg->fc_dst);
1081
             /* Input handler depends on whether the destination is multicast. */
1082         if (addr_type & IPV6_ADDR_MULTICAST)
1083                 rt->u.dst.input = ip6_mc_input;
1084         else
1085                 rt->u.dst.input = ip6_forward;
1086
1087         rt->u.dst.output = ip6_output;
1088
             /* Store only the prefix bits of the destination; /128 is a host route. */
1089         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1090         rt->rt6i_dst.plen = cfg->fc_dst_len;
1091         if (rt->rt6i_dst.plen == 128)
1092                rt->u.dst.flags = DST_HOST;
1093
1094 #ifdef CONFIG_IPV6_SUBTREES
1095         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1096         rt->rt6i_src.plen = cfg->fc_src_len;
1097 #endif
1098
1099         rt->rt6i_metric = cfg->fc_metric;
1100
1101         /* We cannot add true routes via loopback here,
1102            they would result in kernel looping; promote them to reject routes
1103          */
1104         if ((cfg->fc_flags & RTF_REJECT) ||
1105             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1106                 /* hold loopback dev/idev if we haven't done so. */
1107                 if (dev != &loopback_dev) {
1108                         if (dev) {
1109                                 dev_put(dev);
1110                                 in6_dev_put(idev);
1111                         }
1112                         dev = &loopback_dev;
1113                         dev_hold(dev);
1114                         idev = in6_dev_get(dev);
1115                         if (!idev) {
1116                                 err = -ENODEV;
1117                                 goto out;
1118                         }
1119                 }
                     /* Reject routes discard in both directions and skip nexthop setup. */
1120                 rt->u.dst.output = ip6_pkt_discard_out;
1121                 rt->u.dst.input = ip6_pkt_discard;
1122                 rt->u.dst.error = -ENETUNREACH;
1123                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1124                 goto install_route;
1125         }
1126
1127         if (cfg->fc_flags & RTF_GATEWAY) {
1128                 struct in6_addr *gw_addr;
1129                 int gwa_type;
1130
1131                 gw_addr = &cfg->fc_gateway;
1132                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1133                 gwa_type = ipv6_addr_type(gw_addr);
1134
1135                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1136                         struct rt6_info *grt;
1137
1138                         /* IPv6 strictly inhibits using not link-local
1139                            addresses as nexthop address.
1140                            Otherwise, router will not able to send redirects.
1141                            It is very good, but in some (rare!) circumstances
1142                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1143                            some exceptions. --ANK
1144                          */
1145                         err = -EINVAL;
1146                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1147                                 goto out;
1148
1149                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1150
1151                         err = -EHOSTUNREACH;
1152                         if (grt == NULL)
1153                                 goto out;
                             /* The gateway must be reached through the requested device;
                                without a requested device, adopt (and hold) the one the
                                lookup found. */
1154                         if (dev) {
1155                                 if (dev != grt->rt6i_dev) {
1156                                         dst_release(&grt->u.dst);
1157                                         goto out;
1158                                 }
1159                         } else {
1160                                 dev = grt->rt6i_dev;
1161                                 idev = grt->rt6i_idev;
1162                                 dev_hold(dev);
1163                                 in6_dev_hold(grt->rt6i_idev);
1164                         }
                             /* err stays -EHOSTUNREACH unless the route to the gateway
                                is itself not a gateway route. */
1165                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1166                                 err = 0;
1167                         dst_release(&grt->u.dst);
1168
1169                         if (err)
1170                                 goto out;
1171                 }
1172                 err = -EINVAL;
1173                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1174                         goto out;
1175         }
1176
1177         err = -ENODEV;
1178         if (dev == NULL)
1179                 goto out;
1180
1181         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1182                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1183                 if (IS_ERR(rt->rt6i_nexthop)) {
1184                         err = PTR_ERR(rt->rt6i_nexthop);
                             /* Clear the error pointer before the route is freed at out. */
1185                         rt->rt6i_nexthop = NULL;
1186                         goto out;
1187                 }
1188         }
1189
1190         rt->rt6i_flags = cfg->fc_flags;
1191
1192 install_route:
             /* Copy caller-supplied metrics: attribute type N lands in
                metrics[N - 1]; type 0 is skipped, types above RTAX_MAX are
                rejected. */
1193         if (cfg->fc_mx) {
1194                 struct nlattr *nla;
1195                 int remaining;
1196
1197                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1198                         int type = nla->nla_type;
1199
1200                         if (type) {
1201                                 if (type > RTAX_MAX) {
1202                                         err = -EINVAL;
1203                                         goto out;
1204                                 }
1205
1206                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1207                         }
1208                 }
1209         }
1210
             /* Fill in defaults for metrics the caller left at zero. */
1211         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1212                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1213         if (!rt->u.dst.metrics[RTAX_MTU-1])
1214                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1215         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1216                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
             /* The dev/idev pointers (and the references held on them) move
                into the route here. */
1217         rt->u.dst.dev = dev;
1218         rt->rt6i_idev = idev;
1219         rt->rt6i_table = table;
1220         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1221
             /* Common failure exit: undo whatever has been acquired so far. */
1222 out:
1223         if (dev)
1224                 dev_put(dev);
1225         if (idev)
1226                 in6_dev_put(idev);
1227         if (rt)
1228                 dst_free((struct dst_entry *) rt);
1229         return err;
1230 }
1231
1232 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1233 {
1234         int err;
1235         struct fib6_table *table;
1236
1237         if (rt == &ip6_null_entry)
1238                 return -ENOENT;
1239
1240         table = rt->rt6i_table;
1241         write_lock_bh(&table->tb6_lock);
1242
1243         err = fib6_del(rt, info);
1244         dst_release(&rt->u.dst);
1245
1246         write_unlock_bh(&table->tb6_lock);
1247
1248         return err;
1249 }
1250
1251 int ip6_del_rt(struct rt6_info *rt)
1252 {
1253         return __ip6_del_rt(rt, NULL);
1254 }
1255
1256 static int ip6_route_del(struct fib6_config *cfg)
1257 {
1258         struct fib6_table *table;
1259         struct fib6_node *fn;
1260         struct rt6_info *rt;
1261         int err = -ESRCH;
1262
1263         table = fib6_get_table(cfg->fc_table);
1264         if (table == NULL)
1265                 return err;
1266
1267         read_lock_bh(&table->tb6_lock);
1268
1269         fn = fib6_locate(&table->tb6_root,
1270                          &cfg->fc_dst, cfg->fc_dst_len,
1271                          &cfg->fc_src, cfg->fc_src_len);
1272         
1273         if (fn) {
1274                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1275                         if (cfg->fc_ifindex &&
1276                             (rt->rt6i_dev == NULL ||
1277                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1278                                 continue;
1279                         if (cfg->fc_flags & RTF_GATEWAY &&
1280                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1281                                 continue;
1282                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1283                                 continue;
1284                         dst_hold(&rt->u.dst);
1285                         read_unlock_bh(&table->tb6_lock);
1286
1287                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1288                 }
1289         }
1290         read_unlock_bh(&table->tb6_lock);
1291
1292         return err;
1293 }
1294
1295 /*
1296  *      Handle redirects
1297  */
/*
 * Flow key used for redirect lookups: a regular flowi followed by the
 * address of the router that sent the redirect.  ip6_route_redirect()
 * passes a pointer to this structure as a struct flowi *, and
 * __ip6_route_redirect() casts it back, so fl must stay the first member.
 */
1298 struct ip6rd_flowi {
1299         struct flowi fl;
1300         struct in6_addr gateway;
1301 };
1302
1303 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1304                                              struct flowi *fl,
1305                                              int flags)
1306 {
1307         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1308         struct rt6_info *rt;
1309         struct fib6_node *fn;
1310
1311         /*
1312          * Get the "current" route for this destination and
1313          * check if the redirect has come from approriate router.
1314          *
1315          * RFC 2461 specifies that redirects should only be
1316          * accepted if they come from the nexthop to the target.
1317          * Due to the way the routes are chosen, this notion
1318          * is a bit fuzzy and one might need to check all possible
1319          * routes.
1320          */
1321
1322         read_lock_bh(&table->tb6_lock);
1323         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1324 restart:
1325         for (rt = fn->leaf; rt; rt = rt->u.next) {
1326                 /*
1327                  * Current route is on-link; redirect is always invalid.
1328                  *
1329                  * Seems, previous statement is not true. It could
1330                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1331                  * But then router serving it might decide, that we should
1332                  * know truth 8)8) --ANK (980726).
1333                  */
1334                 if (rt6_check_expired(rt))
1335                         continue;
1336                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1337                         continue;
1338                 if (fl->oif != rt->rt6i_dev->ifindex)
1339                         continue;
1340                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1341                         continue;
1342                 break;
1343         }
1344
1345         if (!rt)
1346                 rt = &ip6_null_entry;
1347         BACKTRACK(&fl->fl6_src);
1348 out:
1349         dst_hold(&rt->u.dst);
1350
1351         read_unlock_bh(&table->tb6_lock);
1352
1353         return rt;
1354 };
1355
1356 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1357                                            struct in6_addr *src,
1358                                            struct in6_addr *gateway,
1359                                            struct net_device *dev)
1360 {
1361         int flags = RT6_LOOKUP_F_HAS_SADDR;
1362         struct ip6rd_flowi rdfl = {
1363                 .fl = {
1364                         .oif = dev->ifindex,
1365                         .nl_u = {
1366                                 .ip6_u = {
1367                                         .daddr = *dest,
1368                                         .saddr = *src,
1369                                 },
1370                         },
1371                 },
1372                 .gateway = *gateway,
1373         };
1374
1375         if (rt6_need_strict(dest))
1376                 flags |= RT6_LOOKUP_F_IFACE;
1377
1378         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1379 }
1380
1381 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1382                   struct in6_addr *saddr,
1383                   struct neighbour *neigh, u8 *lladdr, int on_link)
1384 {
1385         struct rt6_info *rt, *nrt = NULL;
1386         struct netevent_redirect netevent;
1387
1388         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1389
1390         if (rt == &ip6_null_entry) {
1391                 if (net_ratelimit())
1392                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1393                                "for redirect target\n");
1394                 goto out;
1395         }
1396
1397         /*
1398          *      We have finally decided to accept it.
1399          */
1400
1401         neigh_update(neigh, lladdr, NUD_STALE, 
1402                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1403                      NEIGH_UPDATE_F_OVERRIDE|
1404                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1405                                      NEIGH_UPDATE_F_ISROUTER))
1406                      );
1407
1408         /*
1409          * Redirect received -> path was valid.
1410          * Look, redirects are sent only in response to data packets,
1411          * so that this nexthop apparently is reachable. --ANK
1412          */
1413         dst_confirm(&rt->u.dst);
1414
1415         /* Duplicate redirect: silently ignore. */
1416         if (neigh == rt->u.dst.neighbour)
1417                 goto out;
1418
1419         nrt = ip6_rt_copy(rt);
1420         if (nrt == NULL)
1421                 goto out;
1422
1423         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1424         if (on_link)
1425                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1426
1427         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1428         nrt->rt6i_dst.plen = 128;
1429         nrt->u.dst.flags |= DST_HOST;
1430
1431         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1432         nrt->rt6i_nexthop = neigh_clone(neigh);
1433         /* Reset pmtu, it may be better */
1434         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1435         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1436
1437         if (ip6_ins_rt(nrt))
1438                 goto out;
1439
1440         netevent.old = &rt->u.dst;
1441         netevent.new = &nrt->u.dst;
1442         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1443
1444         if (rt->rt6i_flags&RTF_CACHE) {
1445                 ip6_del_rt(rt);
1446                 return;
1447         }
1448
1449 out:
1450         dst_release(&rt->u.dst);
1451         return;
1452 }
1453
1454 /*
1455  *      Handle ICMP "packet too big" messages
1456  *      i.e. Path MTU discovery
1457  */
1458
1459 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1460                         struct net_device *dev, u32 pmtu)
1461 {
1462         struct rt6_info *rt, *nrt;
1463         int allfrag = 0;
1464
1465         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1466         if (rt == NULL)
1467                 return;
1468
1469         if (pmtu >= dst_mtu(&rt->u.dst))
1470                 goto out;
1471
1472         if (pmtu < IPV6_MIN_MTU) {
1473                 /*
1474                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1475                  * MTU (1280) and a fragment header should always be included
1476                  * after a node receiving Too Big message reporting PMTU is
1477                  * less than the IPv6 Minimum Link MTU.
1478                  */
1479                 pmtu = IPV6_MIN_MTU;
1480                 allfrag = 1;
1481         }
1482
1483         /* New mtu received -> path was valid.
1484            They are sent only in response to data packets,
1485            so that this nexthop apparently is reachable. --ANK
1486          */
1487         dst_confirm(&rt->u.dst);
1488
1489         /* Host route. If it is static, it would be better
1490            not to override it, but add new one, so that
1491            when cache entry will expire old pmtu
1492            would return automatically.
1493          */
1494         if (rt->rt6i_flags & RTF_CACHE) {
1495                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1496                 if (allfrag)
1497                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1498                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1499                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1500                 goto out;
1501         }
1502
1503         /* Network route.
1504            Two cases are possible:
1505            1. It is connected route. Action: COW
1506            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1507          */
1508         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1509                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1510         else
1511                 nrt = rt6_alloc_clone(rt, daddr);
1512
1513         if (nrt) {
1514                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1515                 if (allfrag)
1516                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1517
1518                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1519                  * happened within 5 mins, the recommended timer is 10 mins.
1520                  * Here this route expiration time is set to ip6_rt_mtu_expires
1521                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1522                  * and detecting PMTU increase will be automatically happened.
1523                  */
1524                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1525                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1526
1527                 ip6_ins_rt(nrt);
1528         }
1529 out:
1530         dst_release(&rt->u.dst);
1531 }
1532
1533 /*
1534  *      Misc support functions
1535  */
1536
1537 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1538 {
1539         struct rt6_info *rt = ip6_dst_alloc();
1540
1541         if (rt) {
1542                 rt->u.dst.input = ort->u.dst.input;
1543                 rt->u.dst.output = ort->u.dst.output;
1544
1545                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1546                 rt->u.dst.error = ort->u.dst.error;
1547                 rt->u.dst.dev = ort->u.dst.dev;
1548                 if (rt->u.dst.dev)
1549                         dev_hold(rt->u.dst.dev);
1550                 rt->rt6i_idev = ort->rt6i_idev;
1551                 if (rt->rt6i_idev)
1552                         in6_dev_hold(rt->rt6i_idev);
1553                 rt->u.dst.lastuse = jiffies;
1554                 rt->rt6i_expires = 0;
1555
1556                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1557                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1558                 rt->rt6i_metric = 0;
1559
1560                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1561 #ifdef CONFIG_IPV6_SUBTREES
1562                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1563 #endif
1564                 rt->rt6i_table = ort->rt6i_table;
1565         }
1566         return rt;
1567 }
1568
1569 #ifdef CONFIG_IPV6_ROUTE_INFO
1570 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1571                                            struct in6_addr *gwaddr, int ifindex)
1572 {
1573         struct fib6_node *fn;
1574         struct rt6_info *rt = NULL;
1575         struct fib6_table *table;
1576
1577         table = fib6_get_table(RT6_TABLE_INFO);
1578         if (table == NULL)
1579                 return NULL;
1580
1581         write_lock_bh(&table->tb6_lock);
1582         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1583         if (!fn)
1584                 goto out;
1585
1586         for (rt = fn->leaf; rt; rt = rt->u.next) {
1587                 if (rt->rt6i_dev->ifindex != ifindex)
1588                         continue;
1589                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1590                         continue;
1591                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1592                         continue;
1593                 dst_hold(&rt->u.dst);
1594                 break;
1595         }
1596 out:
1597         write_unlock_bh(&table->tb6_lock);
1598         return rt;
1599 }
1600
1601 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1602                                            struct in6_addr *gwaddr, int ifindex,
1603                                            unsigned pref)
1604 {
1605         struct fib6_config cfg = {
1606                 .fc_table       = RT6_TABLE_INFO,
1607                 .fc_metric      = 1024,
1608                 .fc_ifindex     = ifindex,
1609                 .fc_dst_len     = prefixlen,
1610                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1611                                   RTF_UP | RTF_PREF(pref),
1612         };
1613
1614         ipv6_addr_copy(&cfg.fc_dst, prefix);
1615         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1616
1617         /* We should treat it as a default route if prefix length is 0. */
1618         if (!prefixlen)
1619                 cfg.fc_flags |= RTF_DEFAULT;
1620
1621         ip6_route_add(&cfg);
1622
1623         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1624 }
1625 #endif
1626
1627 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1628 {       
1629         struct rt6_info *rt;
1630         struct fib6_table *table;
1631
1632         table = fib6_get_table(RT6_TABLE_DFLT);
1633         if (table == NULL)
1634                 return NULL;
1635
1636         write_lock_bh(&table->tb6_lock);
1637         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1638                 if (dev == rt->rt6i_dev &&
1639                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1640                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1641                         break;
1642         }
1643         if (rt)
1644                 dst_hold(&rt->u.dst);
1645         write_unlock_bh(&table->tb6_lock);
1646         return rt;
1647 }
1648
1649 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1650                                      struct net_device *dev,
1651                                      unsigned int pref)
1652 {
1653         struct fib6_config cfg = {
1654                 .fc_table       = RT6_TABLE_DFLT,
1655                 .fc_metric      = 1024,
1656                 .fc_ifindex     = dev->ifindex,
1657                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1658                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1659         };
1660
1661         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1662
1663         ip6_route_add(&cfg);
1664
1665         return rt6_get_dflt_router(gwaddr, dev);
1666 }
1667
1668 void rt6_purge_dflt_routers(void)
1669 {
1670         struct rt6_info *rt;
1671         struct fib6_table *table;
1672
1673         /* NOTE: Keep consistent with rt6_get_dflt_router */
1674         table = fib6_get_table(RT6_TABLE_DFLT);
1675         if (table == NULL)
1676                 return;
1677
1678 restart:
1679         read_lock_bh(&table->tb6_lock);
1680         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1681                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1682                         dst_hold(&rt->u.dst);
1683                         read_unlock_bh(&table->tb6_lock);
1684                         ip6_del_rt(rt);
1685                         goto restart;
1686                 }
1687         }
1688         read_unlock_bh(&table->tb6_lock);
1689 }
1690
1691 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1692                                  struct fib6_config *cfg)
1693 {
1694         memset(cfg, 0, sizeof(*cfg));
1695
1696         cfg->fc_table = RT6_TABLE_MAIN;
1697         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1698         cfg->fc_metric = rtmsg->rtmsg_metric;
1699         cfg->fc_expires = rtmsg->rtmsg_info;
1700         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1701         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1702         cfg->fc_flags = rtmsg->rtmsg_flags;
1703
1704         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1705         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1706         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1707 }
1708
1709 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1710 {
1711         struct fib6_config cfg;
1712         struct in6_rtmsg rtmsg;
1713         int err;
1714
1715         switch(cmd) {
1716         case SIOCADDRT:         /* Add a route */
1717         case SIOCDELRT:         /* Delete a route */
1718                 if (!capable(CAP_NET_ADMIN))
1719                         return -EPERM;
1720                 err = copy_from_user(&rtmsg, arg,
1721                                      sizeof(struct in6_rtmsg));
1722                 if (err)
1723                         return -EFAULT;
1724
1725                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1726
1727                 rtnl_lock();
1728                 switch (cmd) {
1729                 case SIOCADDRT:
1730                         err = ip6_route_add(&cfg);
1731                         break;
1732                 case SIOCDELRT:
1733                         err = ip6_route_del(&cfg);
1734                         break;
1735                 default:
1736                         err = -EINVAL;
1737                 }
1738                 rtnl_unlock();
1739
1740                 return err;
1741         };
1742
1743         return -EINVAL;
1744 }
1745
1746 /*
1747  *      Drop the packet on the floor
1748  */
1749
1750 static inline int ip6_pkt_drop(struct sk_buff *skb, int code)
1751 {
1752         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1753         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1754                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1755
1756         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1757         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1758         kfree_skb(skb);
1759         return 0;
1760 }
1761
1762 static int ip6_pkt_discard(struct sk_buff *skb)
1763 {
1764         return ip6_pkt_drop(skb, ICMPV6_NOROUTE);
1765 }
1766
1767 static int ip6_pkt_discard_out(struct sk_buff *skb)
1768 {
1769         skb->dev = skb->dst->dev;
1770         return ip6_pkt_discard(skb);
1771 }
1772
1773 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1774
1775 static int ip6_pkt_prohibit(struct sk_buff *skb)
1776 {
1777         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED);
1778 }
1779
1780 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1781 {
1782         skb->dev = skb->dst->dev;
1783         return ip6_pkt_prohibit(skb);
1784 }
1785
/*
 * Silently free the packet: no statistics are updated and no ICMPv6
 * error is sent.  Always returns 0.
 */
static int ip6_pkt_blk_hole(struct sk_buff *skb)
{
        kfree_skb(skb);

        return 0;
}
1791
1792 #endif
1793
1794 /*
1795  *      Allocate a dst for local (unicast / anycast) address.
1796  */
1797
1798 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1799                                     const struct in6_addr *addr,
1800                                     int anycast)
1801 {
1802         struct rt6_info *rt = ip6_dst_alloc();
1803
1804         if (rt == NULL)
1805                 return ERR_PTR(-ENOMEM);
1806
1807         dev_hold(&loopback_dev);
1808         in6_dev_hold(idev);
1809
1810         rt->u.dst.flags = DST_HOST;
1811         rt->u.dst.input = ip6_input;
1812         rt->u.dst.output = ip6_output;
1813         rt->rt6i_dev = &loopback_dev;
1814         rt->rt6i_idev = idev;
1815         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1816         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1817         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1818         rt->u.dst.obsolete = -1;
1819
1820         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1821         if (anycast)
1822                 rt->rt6i_flags |= RTF_ANYCAST;
1823         else
1824                 rt->rt6i_flags |= RTF_LOCAL;
1825         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1826         if (rt->rt6i_nexthop == NULL) {
1827                 dst_free((struct dst_entry *) rt);
1828                 return ERR_PTR(-ENOMEM);
1829         }
1830
1831         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1832         rt->rt6i_dst.plen = 128;
1833         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1834
1835         atomic_set(&rt->u.dst.__refcnt, 1);
1836
1837         return rt;
1838 }
1839
1840 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1841 {
1842         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1843             rt != &ip6_null_entry) {
1844                 RT6_TRACE("deleted by ifdown %p\n", rt);
1845                 return -1;
1846         }
1847         return 0;
1848 }
1849
1850 void rt6_ifdown(struct net_device *dev)
1851 {
1852         fib6_clean_all(fib6_ifdown, 0, dev);
1853 }
1854
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
        struct net_device *dev;         /* device whose MTU changed */
        unsigned int mtu;               /* the new MTU of that device */
};
1860
1861 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1862 {
1863         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1864         struct inet6_dev *idev;
1865
1866         /* In IPv6 pmtu discovery is not optional,
1867            so that RTAX_MTU lock cannot disable it.
1868            We still use this lock to block changes
1869            caused by addrconf/ndisc.
1870         */
1871
1872         idev = __in6_dev_get(arg->dev);
1873         if (idev == NULL)
1874                 return 0;
1875
1876         /* For administrative MTU increase, there is no way to discover
1877            IPv6 PMTU increase, so PMTU increase should be updated here.
1878            Since RFC 1981 doesn't include administrative MTU increase
1879            update PMTU increase is a MUST. (i.e. jumbo frame)
1880          */
1881         /*
1882            If new MTU is less than route PMTU, this new MTU will be the
1883            lowest MTU in the path, update the route PMTU to reflect PMTU
1884            decreases; if new MTU is greater than route PMTU, and the
1885            old MTU is the lowest MTU in the path, update the route PMTU
1886            to reflect the increase. In this case if the other nodes' MTU
1887            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1888            PMTU discouvery.
1889          */
1890         if (rt->rt6i_dev == arg->dev &&
1891             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1892             (dst_mtu(&rt->u.dst) > arg->mtu ||
1893              (dst_mtu(&rt->u.dst) < arg->mtu &&
1894               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1895                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1896         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1897         return 0;
1898 }
1899
1900 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1901 {
1902         struct rt6_mtu_change_arg arg = {
1903                 .dev = dev,
1904                 .mtu = mtu,
1905         };
1906
1907         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1908 }
1909
1910 static struct nla_policy rtm_ipv6_policy[RTA_MAX+1] __read_mostly = {
1911         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1912         [RTA_OIF]               = { .type = NLA_U32 },
1913         [RTA_IIF]               = { .type = NLA_U32 },
1914         [RTA_PRIORITY]          = { .type = NLA_U32 },
1915         [RTA_METRICS]           = { .type = NLA_NESTED },
1916 };
1917
1918 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1919                               struct fib6_config *cfg)
1920 {
1921         struct rtmsg *rtm;
1922         struct nlattr *tb[RTA_MAX+1];
1923         int err;
1924
1925         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1926         if (err < 0)
1927                 goto errout;
1928
1929         err = -EINVAL;
1930         rtm = nlmsg_data(nlh);
1931         memset(cfg, 0, sizeof(*cfg));
1932
1933         cfg->fc_table = rtm->rtm_table;
1934         cfg->fc_dst_len = rtm->rtm_dst_len;
1935         cfg->fc_src_len = rtm->rtm_src_len;
1936         cfg->fc_flags = RTF_UP;
1937         cfg->fc_protocol = rtm->rtm_protocol;
1938
1939         if (rtm->rtm_type == RTN_UNREACHABLE)
1940                 cfg->fc_flags |= RTF_REJECT;
1941
1942         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1943         cfg->fc_nlinfo.nlh = nlh;
1944
1945         if (tb[RTA_GATEWAY]) {
1946                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1947                 cfg->fc_flags |= RTF_GATEWAY;
1948         }
1949
1950         if (tb[RTA_DST]) {
1951                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1952
1953                 if (nla_len(tb[RTA_DST]) < plen)
1954                         goto errout;
1955
1956                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1957         }
1958
1959         if (tb[RTA_SRC]) {
1960                 int plen = (rtm->rtm_src_len + 7) >> 3;
1961
1962                 if (nla_len(tb[RTA_SRC]) < plen)
1963                         goto errout;
1964
1965                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1966         }
1967
1968         if (tb[RTA_OIF])
1969                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1970
1971         if (tb[RTA_PRIORITY])
1972                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1973
1974         if (tb[RTA_METRICS]) {
1975                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1976                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1977         }
1978
1979         if (tb[RTA_TABLE])
1980                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1981
1982         err = 0;
1983 errout:
1984         return err;
1985 }
1986
1987 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1988 {
1989         struct fib6_config cfg;
1990         int err;
1991
1992         err = rtm_to_fib6_config(skb, nlh, &cfg);
1993         if (err < 0)
1994                 return err;
1995
1996         return ip6_route_del(&cfg);
1997 }
1998
1999 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2000 {
2001         struct fib6_config cfg;
2002         int err;
2003
2004         err = rtm_to_fib6_config(skb, nlh, &cfg);
2005         if (err < 0)
2006                 return err;
2007
2008         return ip6_route_add(&cfg);
2009 }
2010
2011 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2012                          struct in6_addr *dst, struct in6_addr *src,
2013                          int iif, int type, u32 pid, u32 seq,
2014                          int prefix, unsigned int flags)
2015 {
2016         struct rtmsg *rtm;
2017         struct nlmsghdr *nlh;
2018         struct rta_cacheinfo ci;
2019         u32 table;
2020
2021         if (prefix) {   /* user wants prefix routes only */
2022                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2023                         /* success since this is not a prefix route */
2024                         return 1;
2025                 }
2026         }
2027
2028         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2029         if (nlh == NULL)
2030                 return -ENOBUFS;
2031
2032         rtm = nlmsg_data(nlh);
2033         rtm->rtm_family = AF_INET6;
2034         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2035         rtm->rtm_src_len = rt->rt6i_src.plen;
2036         rtm->rtm_tos = 0;
2037         if (rt->rt6i_table)
2038                 table = rt->rt6i_table->tb6_id;
2039         else
2040                 table = RT6_TABLE_UNSPEC;
2041         rtm->rtm_table = table;
2042         NLA_PUT_U32(skb, RTA_TABLE, table);
2043         if (rt->rt6i_flags&RTF_REJECT)
2044                 rtm->rtm_type = RTN_UNREACHABLE;
2045         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2046                 rtm->rtm_type = RTN_LOCAL;
2047         else
2048                 rtm->rtm_type = RTN_UNICAST;
2049         rtm->rtm_flags = 0;
2050         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2051         rtm->rtm_protocol = rt->rt6i_protocol;
2052         if (rt->rt6i_flags&RTF_DYNAMIC)
2053                 rtm->rtm_protocol = RTPROT_REDIRECT;
2054         else if (rt->rt6i_flags & RTF_ADDRCONF)
2055                 rtm->rtm_protocol = RTPROT_KERNEL;
2056         else if (rt->rt6i_flags&RTF_DEFAULT)
2057                 rtm->rtm_protocol = RTPROT_RA;
2058
2059         if (rt->rt6i_flags&RTF_CACHE)
2060                 rtm->rtm_flags |= RTM_F_CLONED;
2061
2062         if (dst) {
2063                 NLA_PUT(skb, RTA_DST, 16, dst);
2064                 rtm->rtm_dst_len = 128;
2065         } else if (rtm->rtm_dst_len)
2066                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2067 #ifdef CONFIG_IPV6_SUBTREES
2068         if (src) {
2069                 NLA_PUT(skb, RTA_SRC, 16, src);
2070                 rtm->rtm_src_len = 128;
2071         } else if (rtm->rtm_src_len)
2072                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2073 #endif
2074         if (iif)
2075                 NLA_PUT_U32(skb, RTA_IIF, iif);
2076         else if (dst) {
2077                 struct in6_addr saddr_buf;
2078                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2079                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2080         }
2081
2082         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2083                 goto nla_put_failure;
2084
2085         if (rt->u.dst.neighbour)
2086                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2087
2088         if (rt->u.dst.dev)
2089                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2090
2091         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2092         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
2093         if (rt->rt6i_expires)
2094                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
2095         else
2096                 ci.rta_expires = 0;
2097         ci.rta_used = rt->u.dst.__use;
2098         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
2099         ci.rta_error = rt->u.dst.error;
2100         ci.rta_id = 0;
2101         ci.rta_ts = 0;
2102         ci.rta_tsage = 0;
2103         NLA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
2104
2105         return nlmsg_end(skb, nlh);
2106
2107 nla_put_failure:
2108         return nlmsg_cancel(skb, nlh);
2109 }
2110
2111 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2112 {
2113         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2114         int prefix;
2115
2116         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2117                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2118                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2119         } else
2120                 prefix = 0;
2121
2122         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2123                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2124                      prefix, NLM_F_MULTI);
2125 }
2126
2127 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2128 {
2129         struct nlattr *tb[RTA_MAX+1];
2130         struct rt6_info *rt;
2131         struct sk_buff *skb;
2132         struct rtmsg *rtm;
2133         struct flowi fl;
2134         int err, iif = 0;
2135
2136         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2137         if (err < 0)
2138                 goto errout;
2139
2140         err = -EINVAL;
2141         memset(&fl, 0, sizeof(fl));
2142
2143         if (tb[RTA_SRC]) {
2144                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2145                         goto errout;
2146
2147                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2148         }
2149
2150         if (tb[RTA_DST]) {
2151                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2152                         goto errout;
2153
2154                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2155         }
2156
2157         if (tb[RTA_IIF])
2158                 iif = nla_get_u32(tb[RTA_IIF]);
2159
2160         if (tb[RTA_OIF])
2161                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2162
2163         if (iif) {
2164                 struct net_device *dev;
2165                 dev = __dev_get_by_index(iif);
2166                 if (!dev) {
2167                         err = -ENODEV;
2168                         goto errout;
2169                 }
2170         }
2171
2172         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2173         if (skb == NULL) {
2174                 err = -ENOBUFS;
2175                 goto errout;
2176         }
2177
2178         /* Reserve room for dummy headers, this skb can pass
2179            through good chunk of routing engine.
2180          */
2181         skb->mac.raw = skb->data;
2182         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2183
2184         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2185         skb->dst = &rt->u.dst;
2186
2187         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2188                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2189                             nlh->nlmsg_seq, 0, 0);
2190         if (err < 0) {
2191                 kfree_skb(skb);
2192                 goto errout;
2193         }
2194
2195         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2196 errout:
2197         return err;
2198 }
2199
2200 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2201 {
2202         struct sk_buff *skb;
2203         u32 pid = 0, seq = 0;
2204         struct nlmsghdr *nlh = NULL;
2205         int payload = sizeof(struct rtmsg) + 256;
2206         int err = -ENOBUFS;
2207
2208         if (info) {
2209                 pid = info->pid;
2210                 nlh = info->nlh;
2211                 if (nlh)
2212                         seq = nlh->nlmsg_seq;
2213         }
2214
2215         skb = nlmsg_new(nlmsg_total_size(payload), gfp_any());
2216         if (skb == NULL)
2217                 goto errout;
2218
2219         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2220         if (err < 0) {
2221                 kfree_skb(skb);
2222                 goto errout;
2223         }
2224
2225         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2226 errout:
2227         if (err < 0)
2228                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2229 }
2230
2231 /*
2232  *      /proc
2233  */
2234
2235 #ifdef CONFIG_PROC_FS
2236
/*
 * Nominal length of one line produced by rt6_info_route(): three
 * 32-digit hex addresses, two " %02x " prefix lengths, and the trailing
 * metric / refcnt / use / flags / device-name columns plus newline.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Cursor state shared between rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg
{
        char *buffer;           /* output buffer */
        int offset;             /* read offset requested by the caller */
        int length;             /* number of bytes requested */
        int skip;               /* routes skipped so far */
        int len;                /* bytes written to buffer so far */
};
2247
2248 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2249 {
2250         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2251         int i;
2252
2253         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2254                 arg->skip++;
2255                 return 0;
2256         }
2257
2258         if (arg->len >= arg->length)
2259                 return 0;
2260
2261         for (i=0; i<16; i++) {
2262                 sprintf(arg->buffer + arg->len, "%02x",
2263                         rt->rt6i_dst.addr.s6_addr[i]);
2264                 arg->len += 2;
2265         }
2266         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2267                             rt->rt6i_dst.plen);
2268
2269 #ifdef CONFIG_IPV6_SUBTREES
2270         for (i=0; i<16; i++) {
2271                 sprintf(arg->buffer + arg->len, "%02x",
2272                         rt->rt6i_src.addr.s6_addr[i]);
2273                 arg->len += 2;
2274         }
2275         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2276                             rt->rt6i_src.plen);
2277 #else
2278         sprintf(arg->buffer + arg->len,
2279                 "00000000000000000000000000000000 00 ");
2280         arg->len += 36;
2281 #endif
2282
2283         if (rt->rt6i_nexthop) {
2284                 for (i=0; i<16; i++) {
2285                         sprintf(arg->buffer + arg->len, "%02x",
2286                                 rt->rt6i_nexthop->primary_key[i]);
2287                         arg->len += 2;
2288                 }
2289         } else {
2290                 sprintf(arg->buffer + arg->len,
2291                         "00000000000000000000000000000000");
2292                 arg->len += 32;
2293         }
2294         arg->len += sprintf(arg->buffer + arg->len,
2295                             " %08x %08x %08x %08x %8s\n",
2296                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2297                             rt->u.dst.__use, rt->rt6i_flags, 
2298                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2299         return 0;
2300 }
2301
2302 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2303 {
2304         struct rt6_proc_arg arg = {
2305                 .buffer = buffer,
2306                 .offset = offset,
2307                 .length = length,
2308         };
2309
2310         fib6_clean_all(rt6_info_route, 0, &arg);
2311
2312         *start = buffer;
2313         if (offset)
2314                 *start += offset % RT6_INFO_LEN;
2315
2316         arg.len -= offset % RT6_INFO_LEN;
2317
2318         if (arg.len > length)
2319                 arg.len = length;
2320         if (arg.len < 0)
2321                 arg.len = 0;
2322
2323         return arg.len;
2324 }
2325
2326 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2327 {
2328         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2329                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2330                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2331                       rt6_stats.fib_rt_cache,
2332                       atomic_read(&ip6_dst_ops.entries),
2333                       rt6_stats.fib_discarded_routes);
2334
2335         return 0;
2336 }
2337
2338 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2339 {
2340         return single_open(file, rt6_stats_seq_show, NULL);
2341 }
2342
2343 static struct file_operations rt6_stats_seq_fops = {
2344         .owner   = THIS_MODULE,
2345         .open    = rt6_stats_seq_open,
2346         .read    = seq_read,
2347         .llseek  = seq_lseek,
2348         .release = single_release,
2349 };
2350 #endif  /* CONFIG_PROC_FS */
2351
2352 #ifdef CONFIG_SYSCTL
2353
2354 static int flush_delay;
2355
2356 static
2357 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2358                               void __user *buffer, size_t *lenp, loff_t *ppos)
2359 {
2360         if (write) {
2361                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2362                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2363                 return 0;
2364         } else
2365                 return -EINVAL;
2366 }
2367
2368 ctl_table ipv6_route_table[] = {
2369         {
2370                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2371                 .procname       =       "flush",
2372                 .data           =       &flush_delay,
2373                 .maxlen         =       sizeof(int),
2374                 .mode           =       0200,
2375                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2376         },
2377         {
2378                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2379                 .procname       =       "gc_thresh",
2380                 .data           =       &ip6_dst_ops.gc_thresh,
2381                 .maxlen         =       sizeof(int),
2382                 .mode           =       0644,
2383                 .proc_handler   =       &proc_dointvec,
2384         },
2385         {
2386                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2387                 .procname       =       "max_size",
2388                 .data           =       &ip6_rt_max_size,
2389                 .maxlen         =       sizeof(int),
2390                 .mode           =       0644,
2391                 .proc_handler   =       &proc_dointvec,
2392         },
2393         {
2394                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2395                 .procname       =       "gc_min_interval",
2396                 .data           =       &ip6_rt_gc_min_interval,
2397                 .maxlen         =       sizeof(int),
2398                 .mode           =       0644,
2399                 .proc_handler   =       &proc_dointvec_jiffies,
2400                 .strategy       =       &sysctl_jiffies,
2401         },
2402         {
2403                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2404                 .procname       =       "gc_timeout",
2405                 .data           =       &ip6_rt_gc_timeout,
2406                 .maxlen         =       sizeof(int),
2407                 .mode           =       0644,
2408                 .proc_handler   =       &proc_dointvec_jiffies,
2409                 .strategy       =       &sysctl_jiffies,
2410         },
2411         {
2412                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2413                 .procname       =       "gc_interval",
2414                 .data           =       &ip6_rt_gc_interval,
2415                 .maxlen         =       sizeof(int),
2416                 .mode           =       0644,
2417                 .proc_handler   =       &proc_dointvec_jiffies,
2418                 .strategy       =       &sysctl_jiffies,
2419         },
2420         {
2421                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2422                 .procname       =       "gc_elasticity",
2423                 .data           =       &ip6_rt_gc_elasticity,
2424                 .maxlen         =       sizeof(int),
2425                 .mode           =       0644,
2426                 .proc_handler   =       &proc_dointvec_jiffies,
2427                 .strategy       =       &sysctl_jiffies,
2428         },
2429         {
2430                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2431                 .procname       =       "mtu_expires",
2432                 .data           =       &ip6_rt_mtu_expires,
2433                 .maxlen         =       sizeof(int),
2434                 .mode           =       0644,
2435                 .proc_handler   =       &proc_dointvec_jiffies,
2436                 .strategy       =       &sysctl_jiffies,
2437         },
2438         {
2439                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2440                 .procname       =       "min_adv_mss",
2441                 .data           =       &ip6_rt_min_advmss,
2442                 .maxlen         =       sizeof(int),
2443                 .mode           =       0644,
2444                 .proc_handler   =       &proc_dointvec_jiffies,
2445                 .strategy       =       &sysctl_jiffies,
2446         },
2447         {
2448                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2449                 .procname       =       "gc_min_interval_ms",
2450                 .data           =       &ip6_rt_gc_min_interval,
2451                 .maxlen         =       sizeof(int),
2452                 .mode           =       0644,
2453                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2454                 .strategy       =       &sysctl_ms_jiffies,
2455         },
2456         { .ctl_name = 0 }
2457 };
2458
2459 #endif
2460
2461 void __init ip6_route_init(void)
2462 {
2463         struct proc_dir_entry *p;
2464
2465         ip6_dst_ops.kmem_cachep =
2466                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2467                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2468         fib6_init();
2469 #ifdef  CONFIG_PROC_FS
2470         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2471         if (p)
2472                 p->owner = THIS_MODULE;
2473
2474         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2475 #endif
2476 #ifdef CONFIG_XFRM
2477         xfrm6_init();
2478 #endif
2479 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2480         fib6_rules_init();
2481 #endif
2482 }
2483
2484 void ip6_route_cleanup(void)
2485 {
2486 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2487         fib6_rules_cleanup();
2488 #endif
2489 #ifdef CONFIG_PROC_FS
2490         proc_net_remove("ipv6_route");
2491         proc_net_remove("rt6_stats");
2492 #endif
2493 #ifdef CONFIG_XFRM
2494         xfrm6_fini();
2495 #endif
2496         rt6_ifdown(NULL);
2497         fib6_gc_cleanup();
2498         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2499 }