e08d84063c1fdfaa7b46bdcc10ab9e01929d1339
[linux-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>     
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/init.h>
38 #include <linux/netlink.h>
39 #include <linux/if_arp.h>
40
41 #ifdef  CONFIG_PROC_FS
42 #include <linux/proc_fs.h>
43 #include <linux/seq_file.h>
44 #endif
45
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Compile-time debug switch for this file.  RDBG() and RT6_TRACE() only
 * expand to printk() calls when RT6_DEBUG is 3 or higher; at the level
 * selected below both macros expand to nothing.
 */
64 /* Set to 3 to get tracing. */
65 #define RT6_DEBUG 2
66
67 #if RT6_DEBUG >= 3
/* RDBG takes a fully parenthesised printk argument list: RDBG(("fmt", a)). */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
70 #else
71 #define RDBG(x)
72 #define RT6_TRACE(x...) do { ; } while (0)
73 #endif
74
/*
 * When 0, the policy lookup functions in this file do not clone a route
 * that already has a nexthop (or is flagged RTF_NONEXTHOP); they return
 * the matched route itself.  When non-zero they call rt6_alloc_clone().
 */
75 #define CLONE_OFFLINK_ROUTE 0
76
/*
 * Bits for the "strict" argument of rt6_select()/rt6_score_route():
 *   RT6_SELECT_F_IFACE     - reject routes whose device does not match oif
 *   RT6_SELECT_F_REACHABLE - reject routes for which rt6_check_neigh()
 *                            returns 0
 */
77 #define RT6_SELECT_F_IFACE      0x1
78 #define RT6_SELECT_F_REACHABLE  0x2
79
/*
 * Tunables for the IPv6 routing cache.  The time values are written as
 * multiples of HZ.  Within the code visible here:
 *   - ip6_rt_max_size, ip6_rt_gc_min_interval, ip6_rt_gc_timeout and
 *     ip6_rt_gc_elasticity are consumed by ip6_dst_gc();
 *   - ip6_rt_min_advmss is the lower bound applied by ipv6_advmss().
 * ip6_rt_gc_interval is the only one with external linkage.
 */
80 static int ip6_rt_max_size = 4096;
81 static int ip6_rt_gc_min_interval = HZ / 2;
82 static int ip6_rt_gc_timeout = 60*HZ;
83 int ip6_rt_gc_interval = 30*HZ;
84 static int ip6_rt_gc_elasticity = 9;
85 static int ip6_rt_mtu_expires = 10*60*HZ;
/* presumably 20 = TCP header and 40 = IPv6 header -- TODO confirm */
86 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
87
88 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
89 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
90 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
91 static void             ip6_dst_destroy(struct dst_entry *);
92 static void             ip6_dst_ifdown(struct dst_entry *,
93                                        struct net_device *dev, int how);
94 static int               ip6_dst_gc(void);
95
96 static int              ip6_pkt_discard(struct sk_buff *skb);
97 static int              ip6_pkt_discard_out(struct sk_buff *skb);
98 static void             ip6_link_failure(struct sk_buff *skb);
99 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
100
101 #ifdef CONFIG_IPV6_ROUTE_INFO
102 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
103                                            struct in6_addr *gwaddr, int ifindex,
104                                            unsigned pref);
105 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
106                                            struct in6_addr *gwaddr, int ifindex);
107 #endif
108
/*
 * Destination-cache operations for IPv6 routes.  Every rt6_info allocated
 * through ip6_dst_alloc() uses this table; the callbacks are the
 * ip6_dst_* / ip6_* helpers defined in this file, and entry_size makes
 * each allocated entry large enough to hold a full struct rt6_info.
 */
109 static struct dst_ops ip6_dst_ops = {
110         .family                 =       AF_INET6,
111         .protocol               =       __constant_htons(ETH_P_IPV6),
112         .gc                     =       ip6_dst_gc,
113         .gc_thresh              =       1024,
114         .check                  =       ip6_dst_check,
115         .destroy                =       ip6_dst_destroy,
116         .ifdown                 =       ip6_dst_ifdown,
117         .negative_advice        =       ip6_negative_advice,
118         .link_failure           =       ip6_link_failure,
119         .update_pmtu            =       ip6_rt_update_pmtu,
120         .entry_size             =       sizeof(struct rt6_info),
121 };
122
/*
 * Statically allocated "no route" entry.  rt6_device_match() and
 * rt6_select() return a pointer to it when nothing matches.  It carries
 * error -ENETUNREACH, is bound to the loopback device, and both its input
 * and output handlers are the packet-discard functions.  The reference
 * counts start at 1 and .path points back at the entry itself.
 */
123 struct rt6_info ip6_null_entry = {
124         .u = {
125                 .dst = {
126                         .__refcnt       = ATOMIC_INIT(1),
127                         .__use          = 1,
128                         .dev            = &loopback_dev,
129                         .obsolete       = -1,
130                         .error          = -ENETUNREACH,
131                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
132                         .input          = ip6_pkt_discard,
133                         .output         = ip6_pkt_discard_out,
134                         .ops            = &ip6_dst_ops,
135                         .path           = (struct dst_entry*)&ip6_null_entry,
136                 }
137         },
138         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
139         .rt6i_metric    = ~(u32) 0,
140         .rt6i_ref       = ATOMIC_INIT(1),
141 };
142
143 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
144
/*
 * Static reject entry, only built with CONFIG_IPV6_MULTIPLE_TABLES.
 * Identical in layout to ip6_null_entry except that it reports -EACCES.
 */
145 struct rt6_info ip6_prohibit_entry = {
146         .u = {
147                 .dst = {
148                         .__refcnt       = ATOMIC_INIT(1),
149                         .__use          = 1,
150                         .dev            = &loopback_dev,
151                         .obsolete       = -1,
152                         .error          = -EACCES,
153                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
154                         .input          = ip6_pkt_discard,
155                         .output         = ip6_pkt_discard_out,
156                         .ops            = &ip6_dst_ops,
157                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
158                 }
159         },
160         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
161         .rt6i_metric    = ~(u32) 0,
162         .rt6i_ref       = ATOMIC_INIT(1),
163 };
164
/*
 * Static reject entry, only built with CONFIG_IPV6_MULTIPLE_TABLES.
 * Identical in layout to ip6_null_entry except that it reports -EINVAL.
 */
165 struct rt6_info ip6_blk_hole_entry = {
166         .u = {
167                 .dst = {
168                         .__refcnt       = ATOMIC_INIT(1),
169                         .__use          = 1,
170                         .dev            = &loopback_dev,
171                         .obsolete       = -1,
172                         .error          = -EINVAL,
173                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
174                         .input          = ip6_pkt_discard,
175                         .output         = ip6_pkt_discard_out,
176                         .ops            = &ip6_dst_ops,
177                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
178                 }
179         },
180         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
181         .rt6i_metric    = ~(u32) 0,
182         .rt6i_ref       = ATOMIC_INIT(1),
183 };
184
185 #endif
186
187 /* allocate dst with ip6_dst_ops */
188 static __inline__ struct rt6_info *ip6_dst_alloc(void)
189 {
190         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
191 }
192
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195         struct rt6_info *rt = (struct rt6_info *)dst;
196         struct inet6_dev *idev = rt->rt6i_idev;
197
198         if (idev != NULL) {
199                 rt->rt6i_idev = NULL;
200                 in6_dev_put(idev);
201         }       
202 }
203
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
205                            int how)
206 {
207         struct rt6_info *rt = (struct rt6_info *)dst;
208         struct inet6_dev *idev = rt->rt6i_idev;
209
210         if (dev != &loopback_dev && idev != NULL && idev->dev == dev) {
211                 struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev);
212                 if (loopback_idev != NULL) {
213                         rt->rt6i_idev = loopback_idev;
214                         in6_dev_put(idev);
215                 }
216         }
217 }
218
219 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
220 {
221         return (rt->rt6i_flags & RTF_EXPIRES &&
222                 time_after(jiffies, rt->rt6i_expires));
223 }
224
225 static inline int rt6_need_strict(struct in6_addr *daddr)
226 {
227         return (ipv6_addr_type(daddr) &
228                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
229 }
230
231 /*
232  *      Route lookup. Any table->tb6_lock is implied.
233  */
234
235 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
236                                                     int oif,
237                                                     int strict)
238 {
239         struct rt6_info *local = NULL;
240         struct rt6_info *sprt;
241
242         if (oif) {
243                 for (sprt = rt; sprt; sprt = sprt->u.next) {
244                         struct net_device *dev = sprt->rt6i_dev;
245                         if (dev->ifindex == oif)
246                                 return sprt;
247                         if (dev->flags & IFF_LOOPBACK) {
248                                 if (sprt->rt6i_idev == NULL ||
249                                     sprt->rt6i_idev->dev->ifindex != oif) {
250                                         if (strict && oif)
251                                                 continue;
252                                         if (local && (!oif || 
253                                                       local->rt6i_idev->dev->ifindex == oif))
254                                                 continue;
255                                 }
256                                 local = sprt;
257                         }
258                 }
259
260                 if (local)
261                         return local;
262
263                 if (strict)
264                         return &ip6_null_entry;
265         }
266         return rt;
267 }
268
269 #ifdef CONFIG_IPV6_ROUTER_PREF
270 static void rt6_probe(struct rt6_info *rt)
271 {
272         struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
273         /*
274          * Okay, this does not seem to be appropriate
275          * for now, however, we need to check if it
276          * is really so; aka Router Reachability Probing.
277          *
278          * Router Reachability Probe MUST be rate-limited
279          * to no more than one per minute.
280          */
281         if (!neigh || (neigh->nud_state & NUD_VALID))
282                 return;
283         read_lock_bh(&neigh->lock);
284         if (!(neigh->nud_state & NUD_VALID) &&
285             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
286                 struct in6_addr mcaddr;
287                 struct in6_addr *target;
288
289                 neigh->updated = jiffies;
290                 read_unlock_bh(&neigh->lock);
291
292                 target = (struct in6_addr *)&neigh->primary_key;
293                 addrconf_addr_solict_mult(target, &mcaddr);
294                 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
295         } else
296                 read_unlock_bh(&neigh->lock);
297 }
298 #else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
303 #endif
304
305 /*
306  * Default Router Selection (RFC 2461 6.3.6)
307  */
308 static int inline rt6_check_dev(struct rt6_info *rt, int oif)
309 {
310         struct net_device *dev = rt->rt6i_dev;
311         if (!oif || dev->ifindex == oif)
312                 return 2;
313         if ((dev->flags & IFF_LOOPBACK) &&
314             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
315                 return 1;
316         return 0;
317 }
318
319 static int inline rt6_check_neigh(struct rt6_info *rt)
320 {
321         struct neighbour *neigh = rt->rt6i_nexthop;
322         int m = 0;
323         if (rt->rt6i_flags & RTF_NONEXTHOP ||
324             !(rt->rt6i_flags & RTF_GATEWAY))
325                 m = 1;
326         else if (neigh) {
327                 read_lock_bh(&neigh->lock);
328                 if (neigh->nud_state & NUD_VALID)
329                         m = 2;
330                 read_unlock_bh(&neigh->lock);
331         }
332         return m;
333 }
334
335 static int rt6_score_route(struct rt6_info *rt, int oif,
336                            int strict)
337 {
338         int m, n;
339                 
340         m = rt6_check_dev(rt, oif);
341         if (!m && (strict & RT6_SELECT_F_IFACE))
342                 return -1;
343 #ifdef CONFIG_IPV6_ROUTER_PREF
344         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
345 #endif
346         n = rt6_check_neigh(rt);
347         if (n > 1)
348                 m |= 16;
349         else if (!n && strict & RT6_SELECT_F_REACHABLE)
350                 return -1;
351         return m;
352 }
353
/*
 * Choose the best route among the entries at *head that share the metric
 * of the first entry.
 *
 * @head:   address of the list head (a fib6 node's leaf pointer in the
 *          callers visible in this file); may be rewritten, see below.
 * @oif:    interface index handed to rt6_score_route().
 * @strict: RT6_SELECT_F_* bits.
 *
 * Expired routes are skipped.  Every other candidate is scored with
 * rt6_score_route(); the highest score wins and, on equal scores, the
 * earlier entry is kept.  Candidates that lose are passed to rt6_probe().
 *
 * If nothing matched, RT6_SELECT_F_REACHABLE was requested and more than
 * one candidate was seen, the first entry is moved behind the last
 * candidate so that the next call starts from a different route.
 *
 * Returns the selected route, or &ip6_null_entry when none qualified.
 * No reference is taken here.
 */
354 static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
355                                    int strict)
356 {
357         struct rt6_info *match = NULL, *last = NULL;
358         struct rt6_info *rt, *rt0 = *head;
359         u32 metric;
360         int mpri = -1;
361
362         RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
363                   __FUNCTION__, head, head ? *head : NULL, oif);
364
        /*
         * Walk only the leading run of entries with the same metric as rt0.
         * NOTE(review): the "(!last || rt != rt0)" term looks like a guard
         * against arriving back at rt0 -- confirm whether the list can be
         * circular.
         */
365         for (rt = rt0, metric = rt0->rt6i_metric;
366              rt && rt->rt6i_metric == metric && (!last || rt != rt0);
367              rt = rt->u.next) {
368                 int m;
369
370                 if (rt6_check_expired(rt))
371                         continue;
372
                /* last = most recent non-expired candidate seen so far */
373                 last = rt;
374
375                 m = rt6_score_route(rt, oif, strict);
376                 if (m < 0)
377                         continue;
378
379                 if (m > mpri) {
                        /* the previous best (possibly NULL) loses: probe it */
380                         rt6_probe(match);
381                         match = rt;
382                         mpri = m;
383                 } else {
384                         rt6_probe(rt);
385                 }
386         }
387
388         if (!match &&
389             (strict & RT6_SELECT_F_REACHABLE) &&
390             last && last != rt0) {
391                 /* no entries matched; do round-robin */
                /* unlink rt0 from the front and re-insert it right after last */
392                 static DEFINE_SPINLOCK(lock);
393                 spin_lock(&lock);
394                 *head = rt0->u.next;
395                 rt0->u.next = last->u.next;
396                 last->u.next = rt0;
397                 spin_unlock(&lock);
398         }
399
400         RT6_TRACE("%s() => %p, score=%d\n",
401                   __FUNCTION__, match, mpri);
402
403         return (match ? match : &ip6_null_entry);
404 }
405
406 #ifdef CONFIG_IPV6_ROUTE_INFO
407 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
408                   struct in6_addr *gwaddr)
409 {
410         struct route_info *rinfo = (struct route_info *) opt;
411         struct in6_addr prefix_buf, *prefix;
412         unsigned int pref;
413         u32 lifetime;
414         struct rt6_info *rt;
415
416         if (len < sizeof(struct route_info)) {
417                 return -EINVAL;
418         }
419
420         /* Sanity check for prefix_len and length */
421         if (rinfo->length > 3) {
422                 return -EINVAL;
423         } else if (rinfo->prefix_len > 128) {
424                 return -EINVAL;
425         } else if (rinfo->prefix_len > 64) {
426                 if (rinfo->length < 2) {
427                         return -EINVAL;
428                 }
429         } else if (rinfo->prefix_len > 0) {
430                 if (rinfo->length < 1) {
431                         return -EINVAL;
432                 }
433         }
434
435         pref = rinfo->route_pref;
436         if (pref == ICMPV6_ROUTER_PREF_INVALID)
437                 pref = ICMPV6_ROUTER_PREF_MEDIUM;
438
439         lifetime = htonl(rinfo->lifetime);
440         if (lifetime == 0xffffffff) {
441                 /* infinity */
442         } else if (lifetime > 0x7fffffff/HZ) {
443                 /* Avoid arithmetic overflow */
444                 lifetime = 0x7fffffff/HZ - 1;
445         }
446
447         if (rinfo->length == 3)
448                 prefix = (struct in6_addr *)rinfo->prefix;
449         else {
450                 /* this function is safe */
451                 ipv6_addr_prefix(&prefix_buf,
452                                  (struct in6_addr *)rinfo->prefix,
453                                  rinfo->prefix_len);
454                 prefix = &prefix_buf;
455         }
456
457         rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
458
459         if (rt && !lifetime) {
460                 ip6_del_rt(rt, NULL, NULL, NULL);
461                 rt = NULL;
462         }
463
464         if (!rt && lifetime)
465                 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
466                                         pref);
467         else if (rt)
468                 rt->rt6i_flags = RTF_ROUTEINFO |
469                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
470
471         if (rt) {
472                 if (lifetime == 0xffffffff) {
473                         rt->rt6i_flags &= ~RTF_EXPIRES;
474                 } else {
475                         rt->rt6i_expires = jiffies + HZ * lifetime;
476                         rt->rt6i_flags |= RTF_EXPIRES;
477                 }
478                 dst_release(&rt->u.dst);
479         }
480         return 0;
481 }
482 #endif
483
/*
 * BACKTRACK() - retry a strict lookup further up the fib6 tree.
 *
 * Must be expanded inside a function that has locals named "rt", "fn" and
 * "flags" and labels "out" and "restart".  When the current result is
 * &ip6_null_entry and RT6_F_STRICT is set, the macro climbs fn->parent:
 *   - at a node flagged RTN_TL_ROOT it takes a reference on rt and jumps
 *     to "out";
 *   - at a node flagged RTN_RTINFO it jumps to "restart" with fn pointing
 *     at that node.
 * In every other case execution falls through after the macro.
 *
 * NOTE(review): the expansion is a bare if-statement rather than a
 * do { } while (0) block, so it must not be followed by an "else".
 */
484 #define BACKTRACK() \
485 if (rt == &ip6_null_entry && flags & RT6_F_STRICT) { \
486         while ((fn = fn->parent) != NULL) { \
487                 if (fn->fn_flags & RTN_TL_ROOT) { \
488                         dst_hold(&rt->u.dst); \
489                         goto out; \
490                 } \
491                 if (fn->fn_flags & RTN_RTINFO) \
492                         goto restart; \
493         } \
494 }
495
/*
 * Plain route lookup in one table (the callback rt6_lookup() passes to
 * fib6_rule_lookup()).
 *
 * @table: routing table to search; its tb6_lock is read-held for the
 *         duration of the lookup.
 * @fl:    flow; the destination, source and oif fields are used.
 * @flags: RT6_F_STRICT requests strict output-interface matching.
 *
 * Returns the matching route, or &ip6_null_entry, with a reference held
 * and its lastuse/__use statistics updated.
 */
496 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
497                                              struct flowi *fl, int flags)
498 {
499         struct fib6_node *fn;
500         struct rt6_info *rt;
501
502         read_lock_bh(&table->tb6_lock);
503         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
504 restart:
505         rt = fn->leaf;
506         rt = rt6_device_match(rt, fl->oif, flags & RT6_F_STRICT);
        /* may jump to "restart" or, with a reference already taken, to "out" */
507         BACKTRACK();
508         dst_hold(&rt->u.dst);
509 out:
510         read_unlock_bh(&table->tb6_lock);
511
512         rt->u.dst.lastuse = jiffies;
513         rt->u.dst.__use++;
514
515         return rt;
516
517 }
518
519 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
520                             int oif, int strict)
521 {
522         struct flowi fl = {
523                 .oif = oif,
524                 .nl_u = {
525                         .ip6_u = {
526                                 .daddr = *daddr,
527                                 /* TODO: saddr */
528                         },
529                 },
530         };
531         struct dst_entry *dst;
532         int flags = strict ? RT6_F_STRICT : 0;
533
534         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
535         if (dst->error == 0)
536                 return (struct rt6_info *) dst;
537
538         dst_release(dst);
539
540         return NULL;
541 }
542
543 /* ip6_ins_rt is called with FREE table->tb6_lock.
544    It takes new route entry, the addition fails by any reason the
545    route is freed. In any case, if caller does not hold it, it may
546    be destroyed.
547  */
548
549 int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh,
550                 void *_rtattr, struct netlink_skb_parms *req)
551 {
552         int err;
553         struct fib6_table *table;
554
555         table = rt->rt6i_table;
556         write_lock_bh(&table->tb6_lock);
557         err = fib6_add(&table->tb6_root, rt, nlh, _rtattr, req);
558         write_unlock_bh(&table->tb6_lock);
559
560         return err;
561 }
562
563 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
564                                       struct in6_addr *saddr)
565 {
566         struct rt6_info *rt;
567
568         /*
569          *      Clone the route.
570          */
571
572         rt = ip6_rt_copy(ort);
573
574         if (rt) {
575                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
576                         if (rt->rt6i_dst.plen != 128 &&
577                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
578                                 rt->rt6i_flags |= RTF_ANYCAST;
579                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
580                 }
581
582                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
583                 rt->rt6i_dst.plen = 128;
584                 rt->rt6i_flags |= RTF_CACHE;
585                 rt->u.dst.flags |= DST_HOST;
586
587 #ifdef CONFIG_IPV6_SUBTREES
588                 if (rt->rt6i_src.plen && saddr) {
589                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
590                         rt->rt6i_src.plen = 128;
591                 }
592 #endif
593
594                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
595
596         }
597
598         return rt;
599 }
600
601 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
602 {
603         struct rt6_info *rt = ip6_rt_copy(ort);
604         if (rt) {
605                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
606                 rt->rt6i_dst.plen = 128;
607                 rt->rt6i_flags |= RTF_CACHE;
608                 if (rt->rt6i_flags & RTF_REJECT)
609                         rt->u.dst.error = ort->u.dst.error;
610                 rt->u.dst.flags |= DST_HOST;
611                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
612         }
613         return rt;
614 }
615
/*
 * Input-path route lookup in one table (the callback ip6_route_input()
 * passes to fib6_rule_lookup()).
 *
 * @table: routing table to search.
 * @fl:    flow; destination, source and the incoming interface (iif) are
 *         used.
 * @flags: RT6_F_STRICT requests strict interface matching.
 *
 * The first pass asks rt6_select() for reachable routers only.  If that
 * ends at the "out" label the lookup is repeated once without the
 * reachability requirement.  A selected route that has no nexthop (and is
 * not RTF_NONEXTHOP) is copied into a host cache entry via rt6_alloc_cow()
 * and inserted; insertion is retried up to three times because the table
 * lock is dropped in between.
 *
 * Returns a route (possibly &ip6_null_entry) with a reference held and its
 * lastuse/__use statistics updated.
 */
616 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
617                                             struct flowi *fl, int flags)
618 {
619         struct fib6_node *fn;
620         struct rt6_info *rt, *nrt;
621         int strict = 0;
622         int attempts = 3;
623         int err;
624         int reachable = RT6_SELECT_F_REACHABLE;
625
626         if (flags & RT6_F_STRICT)
627                 strict = RT6_SELECT_F_IFACE;
628
629 relookup:
630         read_lock_bh(&table->tb6_lock);
631
632 restart_2:
633         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
634
635 restart:
636         rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
637         BACKTRACK();
        /* nothing usable, or already a cache entry: no copy needed */
638         if (rt == &ip6_null_entry ||
639             rt->rt6i_flags & RTF_CACHE)
640                 goto out;
641
        /* pin the route, then drop the lock before allocating */
642         dst_hold(&rt->u.dst);
643         read_unlock_bh(&table->tb6_lock);
644
645         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
646                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
647         else {
648 #if CLONE_OFFLINK_ROUTE
649                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
650 #else
                /* return the matched route itself, reference already held */
651                 goto out2;
652 #endif
653         }
654
        /* switch from the original route to the copy (or the null entry) */
655         dst_release(&rt->u.dst);
656         rt = nrt ? : &ip6_null_entry;
657
658         dst_hold(&rt->u.dst);
659         if (nrt) {
660                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
661                 if (!err)
662                         goto out2;
663         }
664
665         if (--attempts <= 0)
666                 goto out2;
667
668         /*
669          * Race condition! In the gap, when table->tb6_lock was
670          * released someone could insert this route.  Relookup.
671          */
672         dst_release(&rt->u.dst);
673         goto relookup;
674
675 out:
        /* first pass was restricted to reachable routers: retry unrestricted */
676         if (reachable) {
677                 reachable = 0;
678                 goto restart_2;
679         }
        /*
         * NOTE(review): when BACKTRACK() jumped here it had already taken a
         * reference on rt (which is &ip6_null_entry in that case), and
         * another one is taken below -- confirm this is intended.
         */
680         dst_hold(&rt->u.dst);
681         read_unlock_bh(&table->tb6_lock);
682 out2:
683         rt->u.dst.lastuse = jiffies;
684         rt->u.dst.__use++;
685
686         return rt;
687 }
688
689 void ip6_route_input(struct sk_buff *skb)
690 {
691         struct ipv6hdr *iph = skb->nh.ipv6h;
692         struct flowi fl = {
693                 .iif = skb->dev->ifindex,
694                 .nl_u = {
695                         .ip6_u = {
696                                 .daddr = iph->daddr,
697                                 .saddr = iph->saddr,
698                                 .flowlabel = (* (u32 *) iph)&IPV6_FLOWINFO_MASK,
699                         },
700                 },
701                 .proto = iph->nexthdr,
702         };
703         int flags = 0;
704
705         if (rt6_need_strict(&iph->daddr))
706                 flags |= RT6_F_STRICT;
707
708         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
709 }
710
/*
 * Output-path route lookup in one table (the callback ip6_route_output()
 * passes to fib6_rule_lookup()).
 *
 * @table: routing table to search.
 * @fl:    flow; destination, source and the outgoing interface (oif) are
 *         used.
 * @flags: RT6_F_STRICT requests strict interface matching.
 *
 * The first pass asks rt6_select() for reachable routers only.  If that
 * ends at the "out" label the lookup is repeated once without the
 * reachability requirement.  A selected route that has no nexthop (and is
 * not RTF_NONEXTHOP) is copied into a host cache entry via rt6_alloc_cow()
 * and inserted; insertion is retried up to three times because the table
 * lock is dropped in between.
 *
 * Returns a route (possibly &ip6_null_entry) with a reference held and its
 * lastuse/__use statistics updated.
 */
711 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
712                                              struct flowi *fl, int flags)
713 {
714         struct fib6_node *fn;
715         struct rt6_info *rt, *nrt;
716         int strict = 0;
717         int attempts = 3;
718         int err;
719         int reachable = RT6_SELECT_F_REACHABLE;
720
721         if (flags & RT6_F_STRICT)
722                 strict = RT6_SELECT_F_IFACE;
723
724 relookup:
725         read_lock_bh(&table->tb6_lock);
726
727 restart_2:
728         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
729
730 restart:
731         rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
732         BACKTRACK();
        /* nothing usable, or already a cache entry: no copy needed */
733         if (rt == &ip6_null_entry ||
734             rt->rt6i_flags & RTF_CACHE)
735                 goto out;
736
        /* pin the route, then drop the lock before allocating */
737         dst_hold(&rt->u.dst);
738         read_unlock_bh(&table->tb6_lock);
739
740         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
741                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
742         else {
743 #if CLONE_OFFLINK_ROUTE
744                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
745 #else
                /* return the matched route itself, reference already held */
746                 goto out2;
747 #endif
748         }
749
        /* switch from the original route to the copy (or the null entry) */
750         dst_release(&rt->u.dst);
751         rt = nrt ? : &ip6_null_entry;
752
753         dst_hold(&rt->u.dst);
754         if (nrt) {
755                 err = ip6_ins_rt(nrt, NULL, NULL, NULL);
756                 if (!err)
757                         goto out2;
758         }
759
760         if (--attempts <= 0)
761                 goto out2;
762
763         /*
764          * Race condition! In the gap, when table->tb6_lock was
765          * released someone could insert this route.  Relookup.
766          */
767         dst_release(&rt->u.dst);
768         goto relookup;
769
770 out:
        /* first pass was restricted to reachable routers: retry unrestricted */
771         if (reachable) {
772                 reachable = 0;
773                 goto restart_2;
774         }
        /*
         * NOTE(review): when BACKTRACK() jumped here it had already taken a
         * reference on rt (which is &ip6_null_entry in that case), and
         * another one is taken below -- confirm this is intended.
         */
775         dst_hold(&rt->u.dst);
776         read_unlock_bh(&table->tb6_lock);
777 out2:
778         rt->u.dst.lastuse = jiffies;
779         rt->u.dst.__use++;
780         return rt;
781 }
782
783 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
784 {
785         int flags = 0;
786
787         if (rt6_need_strict(&fl->fl6_dst))
788                 flags |= RT6_F_STRICT;
789
790         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
791 }
792
793
794 /*
795  *      Destination cache support functions
796  */
797
798 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
799 {
800         struct rt6_info *rt;
801
802         rt = (struct rt6_info *) dst;
803
804         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
805                 return dst;
806
807         return NULL;
808 }
809
810 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
811 {
812         struct rt6_info *rt = (struct rt6_info *) dst;
813
814         if (rt) {
815                 if (rt->rt6i_flags & RTF_CACHE)
816                         ip6_del_rt(rt, NULL, NULL, NULL);
817                 else
818                         dst_release(dst);
819         }
820         return NULL;
821 }
822
823 static void ip6_link_failure(struct sk_buff *skb)
824 {
825         struct rt6_info *rt;
826
827         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
828
829         rt = (struct rt6_info *) skb->dst;
830         if (rt) {
831                 if (rt->rt6i_flags&RTF_CACHE) {
832                         dst_set_expires(&rt->u.dst, 0);
833                         rt->rt6i_flags |= RTF_EXPIRES;
834                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
835                         rt->rt6i_node->fn_sernum = -1;
836         }
837 }
838
839 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
840 {
841         struct rt6_info *rt6 = (struct rt6_info*)dst;
842
843         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
844                 rt6->rt6i_flags |= RTF_MODIFIED;
845                 if (mtu < IPV6_MIN_MTU) {
846                         mtu = IPV6_MIN_MTU;
847                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
848                 }
849                 dst->metrics[RTAX_MTU-1] = mtu;
850                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
851         }
852 }
853
854 static int ipv6_get_mtu(struct net_device *dev);
855
856 static inline unsigned int ipv6_advmss(unsigned int mtu)
857 {
858         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
859
860         if (mtu < ip6_rt_min_advmss)
861                 mtu = ip6_rt_min_advmss;
862
863         /*
864          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and 
865          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
866          * IPV6_MAXPLEN is also valid and means: "any MSS, 
867          * rely only on pmtu discovery"
868          */
869         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
870                 mtu = IPV6_MAXPLEN;
871         return mtu;
872 }
873
874 static struct dst_entry *ndisc_dst_gc_list;
875 static DEFINE_SPINLOCK(ndisc_lock);
876
877 struct dst_entry *ndisc_dst_alloc(struct net_device *dev, 
878                                   struct neighbour *neigh,
879                                   struct in6_addr *addr,
880                                   int (*output)(struct sk_buff *))
881 {
882         struct rt6_info *rt;
883         struct inet6_dev *idev = in6_dev_get(dev);
884
885         if (unlikely(idev == NULL))
886                 return NULL;
887
888         rt = ip6_dst_alloc();
889         if (unlikely(rt == NULL)) {
890                 in6_dev_put(idev);
891                 goto out;
892         }
893
894         dev_hold(dev);
895         if (neigh)
896                 neigh_hold(neigh);
897         else
898                 neigh = ndisc_get_neigh(dev, addr);
899
900         rt->rt6i_dev      = dev;
901         rt->rt6i_idev     = idev;
902         rt->rt6i_nexthop  = neigh;
903         atomic_set(&rt->u.dst.__refcnt, 1);
904         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
905         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
906         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
907         rt->u.dst.output  = output;
908
909 #if 0   /* there's no chance to use these for ndisc */
910         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST 
911                                 ? DST_HOST 
912                                 : 0;
913         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
914         rt->rt6i_dst.plen = 128;
915 #endif
916
917         spin_lock_bh(&ndisc_lock);
918         rt->u.dst.next = ndisc_dst_gc_list;
919         ndisc_dst_gc_list = &rt->u.dst;
920         spin_unlock_bh(&ndisc_lock);
921
922         fib6_force_start_gc();
923
924 out:
925         return (struct dst_entry *)rt;
926 }
927
928 int ndisc_dst_gc(int *more)
929 {
930         struct dst_entry *dst, *next, **pprev;
931         int freed;
932
933         next = NULL;
934         freed = 0;
935
936         spin_lock_bh(&ndisc_lock);
937         pprev = &ndisc_dst_gc_list;
938
939         while ((dst = *pprev) != NULL) {
940                 if (!atomic_read(&dst->__refcnt)) {
941                         *pprev = dst->next;
942                         dst_free(dst);
943                         freed++;
944                 } else {
945                         pprev = &dst->next;
946                         (*more)++;
947                 }
948         }
949
950         spin_unlock_bh(&ndisc_lock);
951
952         return freed;
953 }
954
955 static int ip6_dst_gc(void)
956 {
957         static unsigned expire = 30*HZ;
958         static unsigned long last_gc;
959         unsigned long now = jiffies;
960
961         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
962             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
963                 goto out;
964
965         expire++;
966         fib6_run_gc(expire);
967         last_gc = now;
968         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
969                 expire = ip6_rt_gc_timeout>>1;
970
971 out:
972         expire -= expire>>ip6_rt_gc_elasticity;
973         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
974 }
975
976 /* Clean host part of a prefix. Not necessary in radix tree,
977    but results in cleaner routing tables.
978
979    Remove it only when all the things will work!
980  */
981
982 static int ipv6_get_mtu(struct net_device *dev)
983 {
984         int mtu = IPV6_MIN_MTU;
985         struct inet6_dev *idev;
986
987         idev = in6_dev_get(dev);
988         if (idev) {
989                 mtu = idev->cnf.mtu6;
990                 in6_dev_put(idev);
991         }
992         return mtu;
993 }
994
995 int ipv6_get_hoplimit(struct net_device *dev)
996 {
997         int hoplimit = ipv6_devconf.hop_limit;
998         struct inet6_dev *idev;
999
1000         idev = in6_dev_get(dev);
1001         if (idev) {
1002                 hoplimit = idev->cnf.hop_limit;
1003                 in6_dev_put(idev);
1004         }
1005         return hoplimit;
1006 }
1007
1008 /*
1009  *
1010  */
1011
1012 int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, 
1013                   void *_rtattr, struct netlink_skb_parms *req,
1014                   u32 table_id)
1015 {
1016         int err;
1017         struct rtmsg *r;
1018         struct rtattr **rta;
1019         struct rt6_info *rt = NULL;
1020         struct net_device *dev = NULL;
1021         struct inet6_dev *idev = NULL;
1022         struct fib6_table *table;
1023         int addr_type;
1024
1025         rta = (struct rtattr **) _rtattr;
1026
1027         if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128)
1028                 return -EINVAL;
1029 #ifndef CONFIG_IPV6_SUBTREES
1030         if (rtmsg->rtmsg_src_len)
1031                 return -EINVAL;
1032 #endif
1033         if (rtmsg->rtmsg_ifindex) {
1034                 err = -ENODEV;
1035                 dev = dev_get_by_index(rtmsg->rtmsg_ifindex);
1036                 if (!dev)
1037                         goto out;
1038                 idev = in6_dev_get(dev);
1039                 if (!idev)
1040                         goto out;
1041         }
1042
1043         if (rtmsg->rtmsg_metric == 0)
1044                 rtmsg->rtmsg_metric = IP6_RT_PRIO_USER;
1045
1046         table = fib6_new_table(table_id);
1047         if (table == NULL) {
1048                 err = -ENOBUFS;
1049                 goto out;
1050         }
1051
1052         rt = ip6_dst_alloc();
1053
1054         if (rt == NULL) {
1055                 err = -ENOMEM;
1056                 goto out;
1057         }
1058
1059         rt->u.dst.obsolete = -1;
1060         rt->rt6i_expires = jiffies + clock_t_to_jiffies(rtmsg->rtmsg_info);
1061         if (nlh && (r = NLMSG_DATA(nlh))) {
1062                 rt->rt6i_protocol = r->rtm_protocol;
1063         } else {
1064                 rt->rt6i_protocol = RTPROT_BOOT;
1065         }
1066
1067         addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst);
1068
1069         if (addr_type & IPV6_ADDR_MULTICAST)
1070                 rt->u.dst.input = ip6_mc_input;
1071         else
1072                 rt->u.dst.input = ip6_forward;
1073
1074         rt->u.dst.output = ip6_output;
1075
1076         ipv6_addr_prefix(&rt->rt6i_dst.addr, 
1077                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len);
1078         rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len;
1079         if (rt->rt6i_dst.plen == 128)
1080                rt->u.dst.flags = DST_HOST;
1081
1082 #ifdef CONFIG_IPV6_SUBTREES
1083         ipv6_addr_prefix(&rt->rt6i_src.addr, 
1084                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1085         rt->rt6i_src.plen = rtmsg->rtmsg_src_len;
1086 #endif
1087
1088         rt->rt6i_metric = rtmsg->rtmsg_metric;
1089
1090         /* We cannot add true routes via loopback here,
1091            they would result in kernel looping; promote them to reject routes
1092          */
1093         if ((rtmsg->rtmsg_flags&RTF_REJECT) ||
1094             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1095                 /* hold loopback dev/idev if we haven't done so. */
1096                 if (dev != &loopback_dev) {
1097                         if (dev) {
1098                                 dev_put(dev);
1099                                 in6_dev_put(idev);
1100                         }
1101                         dev = &loopback_dev;
1102                         dev_hold(dev);
1103                         idev = in6_dev_get(dev);
1104                         if (!idev) {
1105                                 err = -ENODEV;
1106                                 goto out;
1107                         }
1108                 }
1109                 rt->u.dst.output = ip6_pkt_discard_out;
1110                 rt->u.dst.input = ip6_pkt_discard;
1111                 rt->u.dst.error = -ENETUNREACH;
1112                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1113                 goto install_route;
1114         }
1115
1116         if (rtmsg->rtmsg_flags & RTF_GATEWAY) {
1117                 struct in6_addr *gw_addr;
1118                 int gwa_type;
1119
1120                 gw_addr = &rtmsg->rtmsg_gateway;
1121                 ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway);
1122                 gwa_type = ipv6_addr_type(gw_addr);
1123
1124                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1125                         struct rt6_info *grt;
1126
1127                         /* IPv6 strictly inhibits using not link-local
1128                            addresses as nexthop address.
1129                            Otherwise, router will not able to send redirects.
1130                            It is very good, but in some (rare!) circumstances
1131                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1132                            some exceptions. --ANK
1133                          */
1134                         err = -EINVAL;
1135                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1136                                 goto out;
1137
1138                         grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1);
1139
1140                         err = -EHOSTUNREACH;
1141                         if (grt == NULL)
1142                                 goto out;
1143                         if (dev) {
1144                                 if (dev != grt->rt6i_dev) {
1145                                         dst_release(&grt->u.dst);
1146                                         goto out;
1147                                 }
1148                         } else {
1149                                 dev = grt->rt6i_dev;
1150                                 idev = grt->rt6i_idev;
1151                                 dev_hold(dev);
1152                                 in6_dev_hold(grt->rt6i_idev);
1153                         }
1154                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1155                                 err = 0;
1156                         dst_release(&grt->u.dst);
1157
1158                         if (err)
1159                                 goto out;
1160                 }
1161                 err = -EINVAL;
1162                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1163                         goto out;
1164         }
1165
1166         err = -ENODEV;
1167         if (dev == NULL)
1168                 goto out;
1169
1170         if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) {
1171                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1172                 if (IS_ERR(rt->rt6i_nexthop)) {
1173                         err = PTR_ERR(rt->rt6i_nexthop);
1174                         rt->rt6i_nexthop = NULL;
1175                         goto out;
1176                 }
1177         }
1178
1179         rt->rt6i_flags = rtmsg->rtmsg_flags;
1180
1181 install_route:
1182         if (rta && rta[RTA_METRICS-1]) {
1183                 int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]);
1184                 struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]);
1185
1186                 while (RTA_OK(attr, attrlen)) {
1187                         unsigned flavor = attr->rta_type;
1188                         if (flavor) {
1189                                 if (flavor > RTAX_MAX) {
1190                                         err = -EINVAL;
1191                                         goto out;
1192                                 }
1193                                 rt->u.dst.metrics[flavor-1] =
1194                                         *(u32 *)RTA_DATA(attr);
1195                         }
1196                         attr = RTA_NEXT(attr, attrlen);
1197                 }
1198         }
1199
1200         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1201                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1202         if (!rt->u.dst.metrics[RTAX_MTU-1])
1203                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1204         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1205                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1206         rt->u.dst.dev = dev;
1207         rt->rt6i_idev = idev;
1208         rt->rt6i_table = table;
1209         return ip6_ins_rt(rt, nlh, _rtattr, req);
1210
1211 out:
1212         if (dev)
1213                 dev_put(dev);
1214         if (idev)
1215                 in6_dev_put(idev);
1216         if (rt)
1217                 dst_free((struct dst_entry *) rt);
1218         return err;
1219 }
1220
1221 int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr, struct netlink_skb_parms *req)
1222 {
1223         int err;
1224         struct fib6_table *table;
1225
1226         if (rt == &ip6_null_entry)
1227                 return -ENOENT;
1228
1229         table = rt->rt6i_table;
1230         write_lock_bh(&table->tb6_lock);
1231
1232         err = fib6_del(rt, nlh, _rtattr, req);
1233         dst_release(&rt->u.dst);
1234
1235         write_unlock_bh(&table->tb6_lock);
1236
1237         return err;
1238 }
1239
1240 static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh,
1241                          void *_rtattr, struct netlink_skb_parms *req,
1242                          u32 table_id)
1243 {
1244         struct fib6_table *table;
1245         struct fib6_node *fn;
1246         struct rt6_info *rt;
1247         int err = -ESRCH;
1248
1249         table = fib6_get_table(table_id);
1250         if (table == NULL)
1251                 return err;
1252
1253         read_lock_bh(&table->tb6_lock);
1254
1255         fn = fib6_locate(&table->tb6_root,
1256                          &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len,
1257                          &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len);
1258         
1259         if (fn) {
1260                 for (rt = fn->leaf; rt; rt = rt->u.next) {
1261                         if (rtmsg->rtmsg_ifindex &&
1262                             (rt->rt6i_dev == NULL ||
1263                              rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex))
1264                                 continue;
1265                         if (rtmsg->rtmsg_flags&RTF_GATEWAY &&
1266                             !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway))
1267                                 continue;
1268                         if (rtmsg->rtmsg_metric &&
1269                             rtmsg->rtmsg_metric != rt->rt6i_metric)
1270                                 continue;
1271                         dst_hold(&rt->u.dst);
1272                         read_unlock_bh(&table->tb6_lock);
1273
1274                         return ip6_del_rt(rt, nlh, _rtattr, req);
1275                 }
1276         }
1277         read_unlock_bh(&table->tb6_lock);
1278
1279         return err;
1280 }
1281
1282 /*
1283  *      Handle redirects
1284  */
1285 void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr,
1286                   struct neighbour *neigh, u8 *lladdr, int on_link)
1287 {
1288         struct rt6_info *rt, *nrt = NULL;
1289         struct fib6_node *fn;
1290         struct fib6_table *table;
1291         struct netevent_redirect netevent;
1292
1293         /* TODO: Very lazy, might need to check all tables */
1294         table = fib6_get_table(RT6_TABLE_MAIN);
1295         if (table == NULL)
1296                 return;
1297
1298         /*
1299          * Get the "current" route for this destination and
1300          * check if the redirect has come from approriate router.
1301          *
1302          * RFC 2461 specifies that redirects should only be
1303          * accepted if they come from the nexthop to the target.
1304          * Due to the way the routes are chosen, this notion
1305          * is a bit fuzzy and one might need to check all possible
1306          * routes.
1307          */
1308
1309         read_lock_bh(&table->tb6_lock);
1310         fn = fib6_lookup(&table->tb6_root, dest, NULL);
1311 restart:
1312         for (rt = fn->leaf; rt; rt = rt->u.next) {
1313                 /*
1314                  * Current route is on-link; redirect is always invalid.
1315                  *
1316                  * Seems, previous statement is not true. It could
1317                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1318                  * But then router serving it might decide, that we should
1319                  * know truth 8)8) --ANK (980726).
1320                  */
1321                 if (rt6_check_expired(rt))
1322                         continue;
1323                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1324                         continue;
1325                 if (neigh->dev != rt->rt6i_dev)
1326                         continue;
1327                 if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway))
1328                         continue;
1329                 break;
1330         }
1331         if (rt)
1332                 dst_hold(&rt->u.dst);
1333         else if (rt6_need_strict(dest)) {
1334                 while ((fn = fn->parent) != NULL) {
1335                         if (fn->fn_flags & RTN_ROOT)
1336                                 break;
1337                         if (fn->fn_flags & RTN_RTINFO)
1338                                 goto restart;
1339                 }
1340         }
1341         read_unlock_bh(&table->tb6_lock);
1342
1343         if (!rt) {
1344                 if (net_ratelimit())
1345                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1346                                "for redirect target\n");
1347                 return;
1348         }
1349
1350         /*
1351          *      We have finally decided to accept it.
1352          */
1353
1354         neigh_update(neigh, lladdr, NUD_STALE, 
1355                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1356                      NEIGH_UPDATE_F_OVERRIDE|
1357                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1358                                      NEIGH_UPDATE_F_ISROUTER))
1359                      );
1360
1361         /*
1362          * Redirect received -> path was valid.
1363          * Look, redirects are sent only in response to data packets,
1364          * so that this nexthop apparently is reachable. --ANK
1365          */
1366         dst_confirm(&rt->u.dst);
1367
1368         /* Duplicate redirect: silently ignore. */
1369         if (neigh == rt->u.dst.neighbour)
1370                 goto out;
1371
1372         nrt = ip6_rt_copy(rt);
1373         if (nrt == NULL)
1374                 goto out;
1375
1376         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1377         if (on_link)
1378                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1379
1380         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1381         nrt->rt6i_dst.plen = 128;
1382         nrt->u.dst.flags |= DST_HOST;
1383
1384         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1385         nrt->rt6i_nexthop = neigh_clone(neigh);
1386         /* Reset pmtu, it may be better */
1387         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1388         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1389
1390         if (ip6_ins_rt(nrt, NULL, NULL, NULL))
1391                 goto out;
1392
1393         netevent.old = &rt->u.dst;
1394         netevent.new = &nrt->u.dst;
1395         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1396
1397         if (rt->rt6i_flags&RTF_CACHE) {
1398                 ip6_del_rt(rt, NULL, NULL, NULL);
1399                 return;
1400         }
1401
1402 out:
1403         dst_release(&rt->u.dst);
1404         return;
1405 }
1406
1407 /*
1408  *      Handle ICMP "packet too big" messages
1409  *      i.e. Path MTU discovery
1410  */
1411
1412 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1413                         struct net_device *dev, u32 pmtu)
1414 {
1415         struct rt6_info *rt, *nrt;
1416         int allfrag = 0;
1417
1418         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1419         if (rt == NULL)
1420                 return;
1421
1422         if (pmtu >= dst_mtu(&rt->u.dst))
1423                 goto out;
1424
1425         if (pmtu < IPV6_MIN_MTU) {
1426                 /*
1427                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link 
1428                  * MTU (1280) and a fragment header should always be included
1429                  * after a node receiving Too Big message reporting PMTU is
1430                  * less than the IPv6 Minimum Link MTU.
1431                  */
1432                 pmtu = IPV6_MIN_MTU;
1433                 allfrag = 1;
1434         }
1435
1436         /* New mtu received -> path was valid.
1437            They are sent only in response to data packets,
1438            so that this nexthop apparently is reachable. --ANK
1439          */
1440         dst_confirm(&rt->u.dst);
1441
1442         /* Host route. If it is static, it would be better
1443            not to override it, but add new one, so that
1444            when cache entry will expire old pmtu
1445            would return automatically.
1446          */
1447         if (rt->rt6i_flags & RTF_CACHE) {
1448                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1449                 if (allfrag)
1450                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1451                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1452                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1453                 goto out;
1454         }
1455
1456         /* Network route.
1457            Two cases are possible:
1458            1. It is connected route. Action: COW
1459            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1460          */
1461         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1462                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1463         else
1464                 nrt = rt6_alloc_clone(rt, daddr);
1465
1466         if (nrt) {
1467                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1468                 if (allfrag)
1469                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1470
1471                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1472                  * happened within 5 mins, the recommended timer is 10 mins.
1473                  * Here this route expiration time is set to ip6_rt_mtu_expires
1474                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1475                  * and detecting PMTU increase will be automatically happened.
1476                  */
1477                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1478                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1479
1480                 ip6_ins_rt(nrt, NULL, NULL, NULL);
1481         }
1482 out:
1483         dst_release(&rt->u.dst);
1484 }
1485
1486 /*
1487  *      Misc support functions
1488  */
1489
1490 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1491 {
1492         struct rt6_info *rt = ip6_dst_alloc();
1493
1494         if (rt) {
1495                 rt->u.dst.input = ort->u.dst.input;
1496                 rt->u.dst.output = ort->u.dst.output;
1497
1498                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1499                 rt->u.dst.dev = ort->u.dst.dev;
1500                 if (rt->u.dst.dev)
1501                         dev_hold(rt->u.dst.dev);
1502                 rt->rt6i_idev = ort->rt6i_idev;
1503                 if (rt->rt6i_idev)
1504                         in6_dev_hold(rt->rt6i_idev);
1505                 rt->u.dst.lastuse = jiffies;
1506                 rt->rt6i_expires = 0;
1507
1508                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1509                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1510                 rt->rt6i_metric = 0;
1511
1512                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1513 #ifdef CONFIG_IPV6_SUBTREES
1514                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1515 #endif
1516                 rt->rt6i_table = ort->rt6i_table;
1517         }
1518         return rt;
1519 }
1520
1521 #ifdef CONFIG_IPV6_ROUTE_INFO
1522 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1523                                            struct in6_addr *gwaddr, int ifindex)
1524 {
1525         struct fib6_node *fn;
1526         struct rt6_info *rt = NULL;
1527         struct fib6_table *table;
1528
1529         table = fib6_get_table(RT6_TABLE_INFO);
1530         if (table == NULL)
1531                 return NULL;
1532
1533         write_lock_bh(&table->tb6_lock);
1534         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1535         if (!fn)
1536                 goto out;
1537
1538         for (rt = fn->leaf; rt; rt = rt->u.next) {
1539                 if (rt->rt6i_dev->ifindex != ifindex)
1540                         continue;
1541                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1542                         continue;
1543                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1544                         continue;
1545                 dst_hold(&rt->u.dst);
1546                 break;
1547         }
1548 out:
1549         write_unlock_bh(&table->tb6_lock);
1550         return rt;
1551 }
1552
1553 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1554                                            struct in6_addr *gwaddr, int ifindex,
1555                                            unsigned pref)
1556 {
1557         struct in6_rtmsg rtmsg;
1558
1559         memset(&rtmsg, 0, sizeof(rtmsg));
1560         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1561         ipv6_addr_copy(&rtmsg.rtmsg_dst, prefix);
1562         rtmsg.rtmsg_dst_len = prefixlen;
1563         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1564         rtmsg.rtmsg_metric = 1024;
1565         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref);
1566         /* We should treat it as a default route if prefix length is 0. */
1567         if (!prefixlen)
1568                 rtmsg.rtmsg_flags |= RTF_DEFAULT;
1569         rtmsg.rtmsg_ifindex = ifindex;
1570
1571         ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_INFO);
1572
1573         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1574 }
1575 #endif
1576
1577 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1578 {       
1579         struct rt6_info *rt;
1580         struct fib6_table *table;
1581
1582         table = fib6_get_table(RT6_TABLE_DFLT);
1583         if (table == NULL)
1584                 return NULL;
1585
1586         write_lock_bh(&table->tb6_lock);
1587         for (rt = table->tb6_root.leaf; rt; rt=rt->u.next) {
1588                 if (dev == rt->rt6i_dev &&
1589                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1590                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1591                         break;
1592         }
1593         if (rt)
1594                 dst_hold(&rt->u.dst);
1595         write_unlock_bh(&table->tb6_lock);
1596         return rt;
1597 }
1598
1599 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1600                                      struct net_device *dev,
1601                                      unsigned int pref)
1602 {
1603         struct in6_rtmsg rtmsg;
1604
1605         memset(&rtmsg, 0, sizeof(struct in6_rtmsg));
1606         rtmsg.rtmsg_type = RTMSG_NEWROUTE;
1607         ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr);
1608         rtmsg.rtmsg_metric = 1024;
1609         rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES |
1610                             RTF_PREF(pref);
1611
1612         rtmsg.rtmsg_ifindex = dev->ifindex;
1613
1614         ip6_route_add(&rtmsg, NULL, NULL, NULL, RT6_TABLE_DFLT);
1615         return rt6_get_dflt_router(gwaddr, dev);
1616 }
1617
1618 void rt6_purge_dflt_routers(void)
1619 {
1620         struct rt6_info *rt;
1621         struct fib6_table *table;
1622
1623         /* NOTE: Keep consistent with rt6_get_dflt_router */
1624         table = fib6_get_table(RT6_TABLE_DFLT);
1625         if (table == NULL)
1626                 return;
1627
1628 restart:
1629         read_lock_bh(&table->tb6_lock);
1630         for (rt = table->tb6_root.leaf; rt; rt = rt->u.next) {
1631                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1632                         dst_hold(&rt->u.dst);
1633                         read_unlock_bh(&table->tb6_lock);
1634                         ip6_del_rt(rt, NULL, NULL, NULL);
1635                         goto restart;
1636                 }
1637         }
1638         read_unlock_bh(&table->tb6_lock);
1639 }
1640
1641 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1642 {
1643         struct in6_rtmsg rtmsg;
1644         int err;
1645
1646         switch(cmd) {
1647         case SIOCADDRT:         /* Add a route */
1648         case SIOCDELRT:         /* Delete a route */
1649                 if (!capable(CAP_NET_ADMIN))
1650                         return -EPERM;
1651                 err = copy_from_user(&rtmsg, arg,
1652                                      sizeof(struct in6_rtmsg));
1653                 if (err)
1654                         return -EFAULT;
1655                         
1656                 rtnl_lock();
1657                 switch (cmd) {
1658                 case SIOCADDRT:
1659                         err = ip6_route_add(&rtmsg, NULL, NULL, NULL,
1660                                             RT6_TABLE_MAIN);
1661                         break;
1662                 case SIOCDELRT:
1663                         err = ip6_route_del(&rtmsg, NULL, NULL, NULL,
1664                                             RT6_TABLE_MAIN);
1665                         break;
1666                 default:
1667                         err = -EINVAL;
1668                 }
1669                 rtnl_unlock();
1670
1671                 return err;
1672         };
1673
1674         return -EINVAL;
1675 }
1676
1677 /*
1678  *      Drop the packet on the floor
1679  */
1680
1681 static int ip6_pkt_discard(struct sk_buff *skb)
1682 {
1683         int type = ipv6_addr_type(&skb->nh.ipv6h->daddr);
1684         if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED)
1685                 IP6_INC_STATS(IPSTATS_MIB_INADDRERRORS);
1686
1687         IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
1688         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev);
1689         kfree_skb(skb);
1690         return 0;
1691 }
1692
1693 static int ip6_pkt_discard_out(struct sk_buff *skb)
1694 {
1695         skb->dev = skb->dst->dev;
1696         return ip6_pkt_discard(skb);
1697 }
1698
1699 /*
1700  *      Allocate a dst for local (unicast / anycast) address.
1701  */
1702
1703 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1704                                     const struct in6_addr *addr,
1705                                     int anycast)
1706 {
1707         struct rt6_info *rt = ip6_dst_alloc();
1708
1709         if (rt == NULL)
1710                 return ERR_PTR(-ENOMEM);
1711
1712         dev_hold(&loopback_dev);
1713         in6_dev_hold(idev);
1714
1715         rt->u.dst.flags = DST_HOST;
1716         rt->u.dst.input = ip6_input;
1717         rt->u.dst.output = ip6_output;
1718         rt->rt6i_dev = &loopback_dev;
1719         rt->rt6i_idev = idev;
1720         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1721         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1722         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1723         rt->u.dst.obsolete = -1;
1724
1725         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1726         if (anycast)
1727                 rt->rt6i_flags |= RTF_ANYCAST;
1728         else
1729                 rt->rt6i_flags |= RTF_LOCAL;
1730         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1731         if (rt->rt6i_nexthop == NULL) {
1732                 dst_free((struct dst_entry *) rt);
1733                 return ERR_PTR(-ENOMEM);
1734         }
1735
1736         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1737         rt->rt6i_dst.plen = 128;
1738         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1739
1740         atomic_set(&rt->u.dst.__refcnt, 1);
1741
1742         return rt;
1743 }
1744
1745 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1746 {
1747         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1748             rt != &ip6_null_entry) {
1749                 RT6_TRACE("deleted by ifdown %p\n", rt);
1750                 return -1;
1751         }
1752         return 0;
1753 }
1754
1755 void rt6_ifdown(struct net_device *dev)
1756 {
1757         fib6_clean_all(fib6_ifdown, 0, dev);
1758 }
1759
/* Argument bundle passed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
1765
1766 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1767 {
1768         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1769         struct inet6_dev *idev;
1770
1771         /* In IPv6 pmtu discovery is not optional,
1772            so that RTAX_MTU lock cannot disable it.
1773            We still use this lock to block changes
1774            caused by addrconf/ndisc.
1775         */
1776
1777         idev = __in6_dev_get(arg->dev);
1778         if (idev == NULL)
1779                 return 0;
1780
1781         /* For administrative MTU increase, there is no way to discover
1782            IPv6 PMTU increase, so PMTU increase should be updated here.
1783            Since RFC 1981 doesn't include administrative MTU increase
1784            update PMTU increase is a MUST. (i.e. jumbo frame)
1785          */
1786         /*
1787            If new MTU is less than route PMTU, this new MTU will be the
1788            lowest MTU in the path, update the route PMTU to reflect PMTU
1789            decreases; if new MTU is greater than route PMTU, and the
1790            old MTU is the lowest MTU in the path, update the route PMTU
1791            to reflect the increase. In this case if the other nodes' MTU
1792            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1793            PMTU discouvery.
1794          */
1795         if (rt->rt6i_dev == arg->dev &&
1796             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1797             (dst_mtu(&rt->u.dst) > arg->mtu ||
1798              (dst_mtu(&rt->u.dst) < arg->mtu &&
1799               dst_mtu(&rt->u.dst) == idev->cnf.mtu6)))
1800                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1801         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1802         return 0;
1803 }
1804
1805 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1806 {
1807         struct rt6_mtu_change_arg arg = {
1808                 .dev = dev,
1809                 .mtu = mtu,
1810         };
1811
1812         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1813 }
1814
1815 static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta,
1816                               struct in6_rtmsg *rtmsg)
1817 {
1818         memset(rtmsg, 0, sizeof(*rtmsg));
1819
1820         rtmsg->rtmsg_dst_len = r->rtm_dst_len;
1821         rtmsg->rtmsg_src_len = r->rtm_src_len;
1822         rtmsg->rtmsg_flags = RTF_UP;
1823         if (r->rtm_type == RTN_UNREACHABLE)
1824                 rtmsg->rtmsg_flags |= RTF_REJECT;
1825
1826         if (rta[RTA_GATEWAY-1]) {
1827                 if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16))
1828                         return -EINVAL;
1829                 memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16);
1830                 rtmsg->rtmsg_flags |= RTF_GATEWAY;
1831         }
1832         if (rta[RTA_DST-1]) {
1833                 if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3))
1834                         return -EINVAL;
1835                 memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3));
1836         }
1837         if (rta[RTA_SRC-1]) {
1838                 if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3))
1839                         return -EINVAL;
1840                 memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3));
1841         }
1842         if (rta[RTA_OIF-1]) {
1843                 if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int)))
1844                         return -EINVAL;
1845                 memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
1846         }
1847         if (rta[RTA_PRIORITY-1]) {
1848                 if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4))
1849                         return -EINVAL;
1850                 memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
1851         }
1852         return 0;
1853 }
1854
1855 int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1856 {
1857         struct rtmsg *r = NLMSG_DATA(nlh);
1858         struct in6_rtmsg rtmsg;
1859
1860         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1861                 return -EINVAL;
1862         return ip6_route_del(&rtmsg, nlh, arg, &NETLINK_CB(skb), r->rtm_table);
1863 }
1864
1865 int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
1866 {
1867         struct rtmsg *r = NLMSG_DATA(nlh);
1868         struct in6_rtmsg rtmsg;
1869
1870         if (inet6_rtm_to_rtmsg(r, arg, &rtmsg))
1871                 return -EINVAL;
1872         return ip6_route_add(&rtmsg, nlh, arg, &NETLINK_CB(skb), r->rtm_table);
1873 }
1874
/* Per-call context handed to rt6_dump_route() while dumping the FIB. */
struct rt6_rtnl_dump_arg {
	struct sk_buff *skb;		/* buffer the dump is written into */
	struct netlink_callback *cb;	/* netlink dump callback state */
};
1880
1881 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
1882                          struct in6_addr *dst, struct in6_addr *src,
1883                          int iif, int type, u32 pid, u32 seq,
1884                          int prefix, unsigned int flags)
1885 {
1886         struct rtmsg *rtm;
1887         struct nlmsghdr  *nlh;
1888         unsigned char    *b = skb->tail;
1889         struct rta_cacheinfo ci;
1890
1891         if (prefix) {   /* user wants prefix routes only */
1892                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
1893                         /* success since this is not a prefix route */
1894                         return 1;
1895                 }
1896         }
1897
1898         nlh = NLMSG_NEW(skb, pid, seq, type, sizeof(*rtm), flags);
1899         rtm = NLMSG_DATA(nlh);
1900         rtm->rtm_family = AF_INET6;
1901         rtm->rtm_dst_len = rt->rt6i_dst.plen;
1902         rtm->rtm_src_len = rt->rt6i_src.plen;
1903         rtm->rtm_tos = 0;
1904         if (rt->rt6i_table)
1905                 rtm->rtm_table = rt->rt6i_table->tb6_id;
1906         else
1907                 rtm->rtm_table = RT6_TABLE_UNSPEC;
1908         if (rt->rt6i_flags&RTF_REJECT)
1909                 rtm->rtm_type = RTN_UNREACHABLE;
1910         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
1911                 rtm->rtm_type = RTN_LOCAL;
1912         else
1913                 rtm->rtm_type = RTN_UNICAST;
1914         rtm->rtm_flags = 0;
1915         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
1916         rtm->rtm_protocol = rt->rt6i_protocol;
1917         if (rt->rt6i_flags&RTF_DYNAMIC)
1918                 rtm->rtm_protocol = RTPROT_REDIRECT;
1919         else if (rt->rt6i_flags & RTF_ADDRCONF)
1920                 rtm->rtm_protocol = RTPROT_KERNEL;
1921         else if (rt->rt6i_flags&RTF_DEFAULT)
1922                 rtm->rtm_protocol = RTPROT_RA;
1923
1924         if (rt->rt6i_flags&RTF_CACHE)
1925                 rtm->rtm_flags |= RTM_F_CLONED;
1926
1927         if (dst) {
1928                 RTA_PUT(skb, RTA_DST, 16, dst);
1929                 rtm->rtm_dst_len = 128;
1930         } else if (rtm->rtm_dst_len)
1931                 RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
1932 #ifdef CONFIG_IPV6_SUBTREES
1933         if (src) {
1934                 RTA_PUT(skb, RTA_SRC, 16, src);
1935                 rtm->rtm_src_len = 128;
1936         } else if (rtm->rtm_src_len)
1937                 RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
1938 #endif
1939         if (iif)
1940                 RTA_PUT(skb, RTA_IIF, 4, &iif);
1941         else if (dst) {
1942                 struct in6_addr saddr_buf;
1943                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
1944                         RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
1945         }
1946         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
1947                 goto rtattr_failure;
1948         if (rt->u.dst.neighbour)
1949                 RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
1950         if (rt->u.dst.dev)
1951                 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex);
1952         RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric);
1953         ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
1954         if (rt->rt6i_expires)
1955                 ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies);
1956         else
1957                 ci.rta_expires = 0;
1958         ci.rta_used = rt->u.dst.__use;
1959         ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt);
1960         ci.rta_error = rt->u.dst.error;
1961         ci.rta_id = 0;
1962         ci.rta_ts = 0;
1963         ci.rta_tsage = 0;
1964         RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
1965         nlh->nlmsg_len = skb->tail - b;
1966         return skb->len;
1967
1968 nlmsg_failure:
1969 rtattr_failure:
1970         skb_trim(skb, b - skb->data);
1971         return -1;
1972 }
1973
1974 static int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1975 {
1976         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1977         int prefix;
1978
1979         if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) {
1980                 struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh);
1981                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
1982         } else
1983                 prefix = 0;
1984
1985         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
1986                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
1987                      prefix, NLM_F_MULTI);
1988 }
1989
1990 static int fib6_dump_node(struct fib6_walker_t *w)
1991 {
1992         int res;
1993         struct rt6_info *rt;
1994
1995         for (rt = w->leaf; rt; rt = rt->u.next) {
1996                 res = rt6_dump_route(rt, w->args);
1997                 if (res < 0) {
1998                         /* Frame is full, suspend walking */
1999                         w->leaf = rt;
2000                         return 1;
2001                 }
2002                 BUG_TRAP(res!=0);
2003         }
2004         w->leaf = NULL;
2005         return 0;
2006 }
2007
2008 static void fib6_dump_end(struct netlink_callback *cb)
2009 {
2010         struct fib6_walker_t *w = (void*)cb->args[0];
2011
2012         if (w) {
2013                 cb->args[0] = 0;
2014                 kfree(w);
2015         }
2016         cb->done = (void*)cb->args[1];
2017         cb->args[1] = 0;
2018 }
2019
2020 static int fib6_dump_done(struct netlink_callback *cb)
2021 {
2022         fib6_dump_end(cb);
2023         return cb->done ? cb->done(cb) : 0;
2024 }
2025
2026 int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
2027 {
2028         struct fib6_table *table;
2029         struct rt6_rtnl_dump_arg arg;
2030         struct fib6_walker_t *w;
2031         int i, res = 0;
2032
2033         arg.skb = skb;
2034         arg.cb = cb;
2035
2036         /*
2037          * cb->args[0] = pointer to walker structure
2038          * cb->args[1] = saved cb->done() pointer
2039          * cb->args[2] = current table being dumped
2040          */
2041
2042         w = (void*)cb->args[0];
2043         if (w == NULL) {
2044                 /* New dump:
2045                  * 
2046                  * 1. hook callback destructor.
2047                  */
2048                 cb->args[1] = (long)cb->done;
2049                 cb->done = fib6_dump_done;
2050
2051                 /*
2052                  * 2. allocate and initialize walker.
2053                  */
2054                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
2055                 if (w == NULL)
2056                         return -ENOMEM;
2057                 w->func = fib6_dump_node;
2058                 w->args = &arg;
2059                 cb->args[0] = (long)w;
2060                 cb->args[2] = FIB6_TABLE_MIN;
2061         } else {
2062                 w->args = &arg;
2063                 i = cb->args[2];
2064                 if (i > FIB6_TABLE_MAX)
2065                         goto end;
2066
2067                 table = fib6_get_table(i);
2068                 if (table != NULL) {
2069                         read_lock_bh(&table->tb6_lock);
2070                         w->root = &table->tb6_root;
2071                         res = fib6_walk_continue(w);
2072                         read_unlock_bh(&table->tb6_lock);
2073                         if (res != 0) {
2074                                 if (res < 0)
2075                                         fib6_walker_unlink(w);
2076                                 goto end;
2077                         }
2078                 }
2079
2080                 fib6_walker_unlink(w);
2081                 cb->args[2] = ++i;
2082         }
2083
2084         for (i = cb->args[2]; i <= FIB6_TABLE_MAX; i++) {
2085                 table = fib6_get_table(i);
2086                 if (table == NULL)
2087                         continue;
2088
2089                 read_lock_bh(&table->tb6_lock);
2090                 w->root = &table->tb6_root;
2091                 res = fib6_walk(w);
2092                 read_unlock_bh(&table->tb6_lock);
2093                 if (res)
2094                         break;
2095         }
2096 end:
2097         cb->args[2] = i;
2098
2099         res = res < 0 ? res : skb->len;
2100         /* res < 0 is an error. (really, impossible)
2101            res == 0 means that dump is complete, but skb still can contain data.
2102            res > 0 dump is not complete, but frame is full.
2103          */
2104         /* Destroy walker, if dump of this table is complete. */
2105         if (res <= 0)
2106                 fib6_dump_end(cb);
2107         return res;
2108 }
2109
2110 int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2111 {
2112         struct rtattr **rta = arg;
2113         int iif = 0;
2114         int err = -ENOBUFS;
2115         struct sk_buff *skb;
2116         struct flowi fl;
2117         struct rt6_info *rt;
2118
2119         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2120         if (skb == NULL)
2121                 goto out;
2122
2123         /* Reserve room for dummy headers, this skb can pass
2124            through good chunk of routing engine.
2125          */
2126         skb->mac.raw = skb->data;
2127         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2128
2129         memset(&fl, 0, sizeof(fl));
2130         if (rta[RTA_SRC-1])
2131                 ipv6_addr_copy(&fl.fl6_src,
2132                                (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1]));
2133         if (rta[RTA_DST-1])
2134                 ipv6_addr_copy(&fl.fl6_dst,
2135                                (struct in6_addr*)RTA_DATA(rta[RTA_DST-1]));
2136
2137         if (rta[RTA_IIF-1])
2138                 memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int));
2139
2140         if (iif) {
2141                 struct net_device *dev;
2142                 dev = __dev_get_by_index(iif);
2143                 if (!dev) {
2144                         err = -ENODEV;
2145                         goto out_free;
2146                 }
2147         }
2148
2149         fl.oif = 0;
2150         if (rta[RTA_OIF-1])
2151                 memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int));
2152
2153         rt = (struct rt6_info*)ip6_route_output(NULL, &fl);
2154
2155         skb->dst = &rt->u.dst;
2156
2157         NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
2158         err = rt6_fill_node(skb, rt, 
2159                             &fl.fl6_dst, &fl.fl6_src,
2160                             iif,
2161                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2162                             nlh->nlmsg_seq, 0, 0);
2163         if (err < 0) {
2164                 err = -EMSGSIZE;
2165                 goto out_free;
2166         }
2167
2168         err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
2169         if (err > 0)
2170                 err = 0;
2171 out:
2172         return err;
2173 out_free:
2174         kfree_skb(skb);
2175         goto out;       
2176 }
2177
2178 void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh, 
2179                         struct netlink_skb_parms *req)
2180 {
2181         struct sk_buff *skb;
2182         int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
2183         u32 pid = current->pid;
2184         u32 seq = 0;
2185
2186         if (req)
2187                 pid = req->pid;
2188         if (nlh)
2189                 seq = nlh->nlmsg_seq;
2190         
2191         skb = alloc_skb(size, gfp_any());
2192         if (!skb) {
2193                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, ENOBUFS);
2194                 return;
2195         }
2196         if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0) < 0) {
2197                 kfree_skb(skb);
2198                 netlink_set_err(rtnl, 0, RTNLGRP_IPV6_ROUTE, EINVAL);
2199                 return;
2200         }
2201         NETLINK_CB(skb).dst_group = RTNLGRP_IPV6_ROUTE;
2202         netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_ROUTE, gfp_any());
2203 }
2204
2205 /*
2206  *      /proc
2207  */
2208
2209 #ifdef CONFIG_PROC_FS
2210
/* Length of one route record as used for offset arithmetic in /proc. */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

/* Cursor shared by rt6_proc_info() and rt6_info_route(). */
struct rt6_proc_arg {
	char *buffer;	/* output buffer */
	int offset;	/* requested read offset */
	int length;	/* requested read length */
	int skip;	/* records skipped so far to reach offset */
	int len;	/* bytes written into buffer so far */
};
2221
2222 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2223 {
2224         struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg;
2225         int i;
2226
2227         if (arg->skip < arg->offset / RT6_INFO_LEN) {
2228                 arg->skip++;
2229                 return 0;
2230         }
2231
2232         if (arg->len >= arg->length)
2233                 return 0;
2234
2235         for (i=0; i<16; i++) {
2236                 sprintf(arg->buffer + arg->len, "%02x",
2237                         rt->rt6i_dst.addr.s6_addr[i]);
2238                 arg->len += 2;
2239         }
2240         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2241                             rt->rt6i_dst.plen);
2242
2243 #ifdef CONFIG_IPV6_SUBTREES
2244         for (i=0; i<16; i++) {
2245                 sprintf(arg->buffer + arg->len, "%02x",
2246                         rt->rt6i_src.addr.s6_addr[i]);
2247                 arg->len += 2;
2248         }
2249         arg->len += sprintf(arg->buffer + arg->len, " %02x ",
2250                             rt->rt6i_src.plen);
2251 #else
2252         sprintf(arg->buffer + arg->len,
2253                 "00000000000000000000000000000000 00 ");
2254         arg->len += 36;
2255 #endif
2256
2257         if (rt->rt6i_nexthop) {
2258                 for (i=0; i<16; i++) {
2259                         sprintf(arg->buffer + arg->len, "%02x",
2260                                 rt->rt6i_nexthop->primary_key[i]);
2261                         arg->len += 2;
2262                 }
2263         } else {
2264                 sprintf(arg->buffer + arg->len,
2265                         "00000000000000000000000000000000");
2266                 arg->len += 32;
2267         }
2268         arg->len += sprintf(arg->buffer + arg->len,
2269                             " %08x %08x %08x %08x %8s\n",
2270                             rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2271                             rt->u.dst.__use, rt->rt6i_flags, 
2272                             rt->rt6i_dev ? rt->rt6i_dev->name : "");
2273         return 0;
2274 }
2275
2276 static int rt6_proc_info(char *buffer, char **start, off_t offset, int length)
2277 {
2278         struct rt6_proc_arg arg = {
2279                 .buffer = buffer,
2280                 .offset = offset,
2281                 .length = length,
2282         };
2283
2284         fib6_clean_all(rt6_info_route, 0, &arg);
2285
2286         *start = buffer;
2287         if (offset)
2288                 *start += offset % RT6_INFO_LEN;
2289
2290         arg.len -= offset % RT6_INFO_LEN;
2291
2292         if (arg.len > length)
2293                 arg.len = length;
2294         if (arg.len < 0)
2295                 arg.len = 0;
2296
2297         return arg.len;
2298 }
2299
2300 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2301 {
2302         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2303                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2304                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2305                       rt6_stats.fib_rt_cache,
2306                       atomic_read(&ip6_dst_ops.entries),
2307                       rt6_stats.fib_discarded_routes);
2308
2309         return 0;
2310 }
2311
2312 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2313 {
2314         return single_open(file, rt6_stats_seq_show, NULL);
2315 }
2316
2317 static struct file_operations rt6_stats_seq_fops = {
2318         .owner   = THIS_MODULE,
2319         .open    = rt6_stats_seq_open,
2320         .read    = seq_read,
2321         .llseek  = seq_lseek,
2322         .release = single_release,
2323 };
2324 #endif  /* CONFIG_PROC_FS */
2325
2326 #ifdef CONFIG_SYSCTL
2327
2328 static int flush_delay;
2329
2330 static
2331 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2332                               void __user *buffer, size_t *lenp, loff_t *ppos)
2333 {
2334         if (write) {
2335                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2336                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2337                 return 0;
2338         } else
2339                 return -EINVAL;
2340 }
2341
2342 ctl_table ipv6_route_table[] = {
2343         {
2344                 .ctl_name       =       NET_IPV6_ROUTE_FLUSH, 
2345                 .procname       =       "flush",
2346                 .data           =       &flush_delay,
2347                 .maxlen         =       sizeof(int),
2348                 .mode           =       0200,
2349                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2350         },
2351         {
2352                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2353                 .procname       =       "gc_thresh",
2354                 .data           =       &ip6_dst_ops.gc_thresh,
2355                 .maxlen         =       sizeof(int),
2356                 .mode           =       0644,
2357                 .proc_handler   =       &proc_dointvec,
2358         },
2359         {
2360                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2361                 .procname       =       "max_size",
2362                 .data           =       &ip6_rt_max_size,
2363                 .maxlen         =       sizeof(int),
2364                 .mode           =       0644,
2365                 .proc_handler   =       &proc_dointvec,
2366         },
2367         {
2368                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2369                 .procname       =       "gc_min_interval",
2370                 .data           =       &ip6_rt_gc_min_interval,
2371                 .maxlen         =       sizeof(int),
2372                 .mode           =       0644,
2373                 .proc_handler   =       &proc_dointvec_jiffies,
2374                 .strategy       =       &sysctl_jiffies,
2375         },
2376         {
2377                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2378                 .procname       =       "gc_timeout",
2379                 .data           =       &ip6_rt_gc_timeout,
2380                 .maxlen         =       sizeof(int),
2381                 .mode           =       0644,
2382                 .proc_handler   =       &proc_dointvec_jiffies,
2383                 .strategy       =       &sysctl_jiffies,
2384         },
2385         {
2386                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2387                 .procname       =       "gc_interval",
2388                 .data           =       &ip6_rt_gc_interval,
2389                 .maxlen         =       sizeof(int),
2390                 .mode           =       0644,
2391                 .proc_handler   =       &proc_dointvec_jiffies,
2392                 .strategy       =       &sysctl_jiffies,
2393         },
2394         {
2395                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2396                 .procname       =       "gc_elasticity",
2397                 .data           =       &ip6_rt_gc_elasticity,
2398                 .maxlen         =       sizeof(int),
2399                 .mode           =       0644,
2400                 .proc_handler   =       &proc_dointvec_jiffies,
2401                 .strategy       =       &sysctl_jiffies,
2402         },
2403         {
2404                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2405                 .procname       =       "mtu_expires",
2406                 .data           =       &ip6_rt_mtu_expires,
2407                 .maxlen         =       sizeof(int),
2408                 .mode           =       0644,
2409                 .proc_handler   =       &proc_dointvec_jiffies,
2410                 .strategy       =       &sysctl_jiffies,
2411         },
2412         {
2413                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2414                 .procname       =       "min_adv_mss",
2415                 .data           =       &ip6_rt_min_advmss,
2416                 .maxlen         =       sizeof(int),
2417                 .mode           =       0644,
2418                 .proc_handler   =       &proc_dointvec_jiffies,
2419                 .strategy       =       &sysctl_jiffies,
2420         },
2421         {
2422                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2423                 .procname       =       "gc_min_interval_ms",
2424                 .data           =       &ip6_rt_gc_min_interval,
2425                 .maxlen         =       sizeof(int),
2426                 .mode           =       0644,
2427                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2428                 .strategy       =       &sysctl_ms_jiffies,
2429         },
2430         { .ctl_name = 0 }
2431 };
2432
2433 #endif
2434
2435 void __init ip6_route_init(void)
2436 {
2437         struct proc_dir_entry *p;
2438
2439         ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache",
2440                                                      sizeof(struct rt6_info),
2441                                                      0, SLAB_HWCACHE_ALIGN,
2442                                                      NULL, NULL);
2443         if (!ip6_dst_ops.kmem_cachep)
2444                 panic("cannot create ip6_dst_cache");
2445
2446         fib6_init();
2447 #ifdef  CONFIG_PROC_FS
2448         p = proc_net_create("ipv6_route", 0, rt6_proc_info);
2449         if (p)
2450                 p->owner = THIS_MODULE;
2451
2452         proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2453 #endif
2454 #ifdef CONFIG_XFRM
2455         xfrm6_init();
2456 #endif
2457 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2458         fib6_rules_init();
2459 #endif
2460 }
2461
2462 void ip6_route_cleanup(void)
2463 {
2464 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2465         fib6_rules_cleanup();
2466 #endif
2467 #ifdef CONFIG_PROC_FS
2468         proc_net_remove("ipv6_route");
2469         proc_net_remove("rt6_stats");
2470 #endif
2471 #ifdef CONFIG_XFRM
2472         xfrm6_fini();
2473 #endif
2474         rt6_ifdown(NULL);
2475         fib6_gc_cleanup();
2476         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2477 }