[NET]: Eliminate duplicate copies of dst_discard
[linux-2.6.git] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/* Tracing is compiled in only when RT6_DEBUG is 3 or higher. */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/* When non-zero, ip6_pol_route() clones off-link routes via rt6_alloc_clone(). */
#define CLONE_OFFLINK_ROUTE 0
75
76 static int ip6_rt_max_size = 4096;
77 static int ip6_rt_gc_min_interval = HZ / 2;
78 static int ip6_rt_gc_timeout = 60*HZ;
79 int ip6_rt_gc_interval = 30*HZ;
80 static int ip6_rt_gc_elasticity = 9;
81 static int ip6_rt_mtu_expires = 10*60*HZ;
82 static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
83
84 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
85 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
86 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
87 static void             ip6_dst_destroy(struct dst_entry *);
88 static void             ip6_dst_ifdown(struct dst_entry *,
89                                        struct net_device *dev, int how);
90 static int               ip6_dst_gc(void);
91
92 static int              ip6_pkt_discard(struct sk_buff *skb);
93 static int              ip6_pkt_discard_out(struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
96
97 #ifdef CONFIG_IPV6_ROUTE_INFO
98 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
99                                            struct in6_addr *gwaddr, int ifindex,
100                                            unsigned pref);
101 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
102                                            struct in6_addr *gwaddr, int ifindex);
103 #endif
104
105 static struct dst_ops ip6_dst_ops = {
106         .family                 =       AF_INET6,
107         .protocol               =       __constant_htons(ETH_P_IPV6),
108         .gc                     =       ip6_dst_gc,
109         .gc_thresh              =       1024,
110         .check                  =       ip6_dst_check,
111         .destroy                =       ip6_dst_destroy,
112         .ifdown                 =       ip6_dst_ifdown,
113         .negative_advice        =       ip6_negative_advice,
114         .link_failure           =       ip6_link_failure,
115         .update_pmtu            =       ip6_rt_update_pmtu,
116         .entry_size             =       sizeof(struct rt6_info),
117 };
118
119 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
120 {
121 }
122
123 static struct dst_ops ip6_dst_blackhole_ops = {
124         .family                 =       AF_INET6,
125         .protocol               =       __constant_htons(ETH_P_IPV6),
126         .destroy                =       ip6_dst_destroy,
127         .check                  =       ip6_dst_check,
128         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
129         .entry_size             =       sizeof(struct rt6_info),
130 };
131
132 struct rt6_info ip6_null_entry = {
133         .u = {
134                 .dst = {
135                         .__refcnt       = ATOMIC_INIT(1),
136                         .__use          = 1,
137                         .obsolete       = -1,
138                         .error          = -ENETUNREACH,
139                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
140                         .input          = ip6_pkt_discard,
141                         .output         = ip6_pkt_discard_out,
142                         .ops            = &ip6_dst_ops,
143                         .path           = (struct dst_entry*)&ip6_null_entry,
144                 }
145         },
146         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
147         .rt6i_metric    = ~(u32) 0,
148         .rt6i_ref       = ATOMIC_INIT(1),
149 };
150
151 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
152
/* Handlers installed on ip6_prohibit_entry; defined later in this file. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
155
156 struct rt6_info ip6_prohibit_entry = {
157         .u = {
158                 .dst = {
159                         .__refcnt       = ATOMIC_INIT(1),
160                         .__use          = 1,
161                         .obsolete       = -1,
162                         .error          = -EACCES,
163                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
164                         .input          = ip6_pkt_prohibit,
165                         .output         = ip6_pkt_prohibit_out,
166                         .ops            = &ip6_dst_ops,
167                         .path           = (struct dst_entry*)&ip6_prohibit_entry,
168                 }
169         },
170         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
171         .rt6i_metric    = ~(u32) 0,
172         .rt6i_ref       = ATOMIC_INIT(1),
173 };
174
175 struct rt6_info ip6_blk_hole_entry = {
176         .u = {
177                 .dst = {
178                         .__refcnt       = ATOMIC_INIT(1),
179                         .__use          = 1,
180                         .obsolete       = -1,
181                         .error          = -EINVAL,
182                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
183                         .input          = dst_discard,
184                         .output         = dst_discard,
185                         .ops            = &ip6_dst_ops,
186                         .path           = (struct dst_entry*)&ip6_blk_hole_entry,
187                 }
188         },
189         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
190         .rt6i_metric    = ~(u32) 0,
191         .rt6i_ref       = ATOMIC_INIT(1),
192 };
193
194 #endif
195
196 /* allocate dst with ip6_dst_ops */
197 static __inline__ struct rt6_info *ip6_dst_alloc(void)
198 {
199         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
200 }
201
202 static void ip6_dst_destroy(struct dst_entry *dst)
203 {
204         struct rt6_info *rt = (struct rt6_info *)dst;
205         struct inet6_dev *idev = rt->rt6i_idev;
206
207         if (idev != NULL) {
208                 rt->rt6i_idev = NULL;
209                 in6_dev_put(idev);
210         }
211 }
212
213 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
214                            int how)
215 {
216         struct rt6_info *rt = (struct rt6_info *)dst;
217         struct inet6_dev *idev = rt->rt6i_idev;
218
219         if (dev != init_net.loopback_dev && idev != NULL && idev->dev == dev) {
220                 struct inet6_dev *loopback_idev = in6_dev_get(init_net.loopback_dev);
221                 if (loopback_idev != NULL) {
222                         rt->rt6i_idev = loopback_idev;
223                         in6_dev_put(idev);
224                 }
225         }
226 }
227
228 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
229 {
230         return (rt->rt6i_flags & RTF_EXPIRES &&
231                 time_after(jiffies, rt->rt6i_expires));
232 }
233
234 static inline int rt6_need_strict(struct in6_addr *daddr)
235 {
236         return (ipv6_addr_type(daddr) &
237                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
238 }
239
240 /*
241  *      Route lookup. Any table->tb6_lock is implied.
242  */
243
244 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
245                                                     int oif,
246                                                     int strict)
247 {
248         struct rt6_info *local = NULL;
249         struct rt6_info *sprt;
250
251         if (oif) {
252                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
253                         struct net_device *dev = sprt->rt6i_dev;
254                         if (dev->ifindex == oif)
255                                 return sprt;
256                         if (dev->flags & IFF_LOOPBACK) {
257                                 if (sprt->rt6i_idev == NULL ||
258                                     sprt->rt6i_idev->dev->ifindex != oif) {
259                                         if (strict && oif)
260                                                 continue;
261                                         if (local && (!oif ||
262                                                       local->rt6i_idev->dev->ifindex == oif))
263                                                 continue;
264                                 }
265                                 local = sprt;
266                         }
267                 }
268
269                 if (local)
270                         return local;
271
272                 if (strict)
273                         return &ip6_null_entry;
274         }
275         return rt;
276 }
277
278 #ifdef CONFIG_IPV6_ROUTER_PREF
279 static void rt6_probe(struct rt6_info *rt)
280 {
281         struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
282         /*
283          * Okay, this does not seem to be appropriate
284          * for now, however, we need to check if it
285          * is really so; aka Router Reachability Probing.
286          *
287          * Router Reachability Probe MUST be rate-limited
288          * to no more than one per minute.
289          */
290         if (!neigh || (neigh->nud_state & NUD_VALID))
291                 return;
292         read_lock_bh(&neigh->lock);
293         if (!(neigh->nud_state & NUD_VALID) &&
294             time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
295                 struct in6_addr mcaddr;
296                 struct in6_addr *target;
297
298                 neigh->updated = jiffies;
299                 read_unlock_bh(&neigh->lock);
300
301                 target = (struct in6_addr *)&neigh->primary_key;
302                 addrconf_addr_solict_mult(target, &mcaddr);
303                 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
304         } else
305                 read_unlock_bh(&neigh->lock);
306 }
307 #else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
312 #endif
313
314 /*
315  * Default Router Selection (RFC 2461 6.3.6)
316  */
317 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
318 {
319         struct net_device *dev = rt->rt6i_dev;
320         if (!oif || dev->ifindex == oif)
321                 return 2;
322         if ((dev->flags & IFF_LOOPBACK) &&
323             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
324                 return 1;
325         return 0;
326 }
327
328 static inline int rt6_check_neigh(struct rt6_info *rt)
329 {
330         struct neighbour *neigh = rt->rt6i_nexthop;
331         int m;
332         if (rt->rt6i_flags & RTF_NONEXTHOP ||
333             !(rt->rt6i_flags & RTF_GATEWAY))
334                 m = 1;
335         else if (neigh) {
336                 read_lock_bh(&neigh->lock);
337                 if (neigh->nud_state & NUD_VALID)
338                         m = 2;
339 #ifdef CONFIG_IPV6_ROUTER_PREF
340                 else if (neigh->nud_state & NUD_FAILED)
341                         m = 0;
342 #endif
343                 else
344                         m = 1;
345                 read_unlock_bh(&neigh->lock);
346         } else
347                 m = 0;
348         return m;
349 }
350
351 static int rt6_score_route(struct rt6_info *rt, int oif,
352                            int strict)
353 {
354         int m, n;
355
356         m = rt6_check_dev(rt, oif);
357         if (!m && (strict & RT6_LOOKUP_F_IFACE))
358                 return -1;
359 #ifdef CONFIG_IPV6_ROUTER_PREF
360         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
361 #endif
362         n = rt6_check_neigh(rt);
363         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
364                 return -1;
365         return m;
366 }
367
368 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
369                                    int *mpri, struct rt6_info *match)
370 {
371         int m;
372
373         if (rt6_check_expired(rt))
374                 goto out;
375
376         m = rt6_score_route(rt, oif, strict);
377         if (m < 0)
378                 goto out;
379
380         if (m > *mpri) {
381                 if (strict & RT6_LOOKUP_F_REACHABLE)
382                         rt6_probe(match);
383                 *mpri = m;
384                 match = rt;
385         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
386                 rt6_probe(rt);
387         }
388
389 out:
390         return match;
391 }
392
393 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
394                                      struct rt6_info *rr_head,
395                                      u32 metric, int oif, int strict)
396 {
397         struct rt6_info *rt, *match;
398         int mpri = -1;
399
400         match = NULL;
401         for (rt = rr_head; rt && rt->rt6i_metric == metric;
402              rt = rt->u.dst.rt6_next)
403                 match = find_match(rt, oif, strict, &mpri, match);
404         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
405              rt = rt->u.dst.rt6_next)
406                 match = find_match(rt, oif, strict, &mpri, match);
407
408         return match;
409 }
410
411 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
412 {
413         struct rt6_info *match, *rt0;
414
415         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
416                   __FUNCTION__, fn->leaf, oif);
417
418         rt0 = fn->rr_ptr;
419         if (!rt0)
420                 fn->rr_ptr = rt0 = fn->leaf;
421
422         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
423
424         if (!match &&
425             (strict & RT6_LOOKUP_F_REACHABLE)) {
426                 struct rt6_info *next = rt0->u.dst.rt6_next;
427
428                 /* no entries matched; do round-robin */
429                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
430                         next = fn->leaf;
431
432                 if (next != rt0)
433                         fn->rr_ptr = next;
434         }
435
436         RT6_TRACE("%s() => %p\n",
437                   __FUNCTION__, match);
438
439         return (match ? match : &ip6_null_entry);
440 }
441
442 #ifdef CONFIG_IPV6_ROUTE_INFO
443 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
444                   struct in6_addr *gwaddr)
445 {
446         struct route_info *rinfo = (struct route_info *) opt;
447         struct in6_addr prefix_buf, *prefix;
448         unsigned int pref;
449         u32 lifetime;
450         struct rt6_info *rt;
451
452         if (len < sizeof(struct route_info)) {
453                 return -EINVAL;
454         }
455
456         /* Sanity check for prefix_len and length */
457         if (rinfo->length > 3) {
458                 return -EINVAL;
459         } else if (rinfo->prefix_len > 128) {
460                 return -EINVAL;
461         } else if (rinfo->prefix_len > 64) {
462                 if (rinfo->length < 2) {
463                         return -EINVAL;
464                 }
465         } else if (rinfo->prefix_len > 0) {
466                 if (rinfo->length < 1) {
467                         return -EINVAL;
468                 }
469         }
470
471         pref = rinfo->route_pref;
472         if (pref == ICMPV6_ROUTER_PREF_INVALID)
473                 pref = ICMPV6_ROUTER_PREF_MEDIUM;
474
475         lifetime = ntohl(rinfo->lifetime);
476         if (lifetime == 0xffffffff) {
477                 /* infinity */
478         } else if (lifetime > 0x7fffffff/HZ) {
479                 /* Avoid arithmetic overflow */
480                 lifetime = 0x7fffffff/HZ - 1;
481         }
482
483         if (rinfo->length == 3)
484                 prefix = (struct in6_addr *)rinfo->prefix;
485         else {
486                 /* this function is safe */
487                 ipv6_addr_prefix(&prefix_buf,
488                                  (struct in6_addr *)rinfo->prefix,
489                                  rinfo->prefix_len);
490                 prefix = &prefix_buf;
491         }
492
493         rt = rt6_get_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex);
494
495         if (rt && !lifetime) {
496                 ip6_del_rt(rt);
497                 rt = NULL;
498         }
499
500         if (!rt && lifetime)
501                 rt = rt6_add_route_info(prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
502                                         pref);
503         else if (rt)
504                 rt->rt6i_flags = RTF_ROUTEINFO |
505                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
506
507         if (rt) {
508                 if (lifetime == 0xffffffff) {
509                         rt->rt6i_flags &= ~RTF_EXPIRES;
510                 } else {
511                         rt->rt6i_expires = jiffies + HZ * lifetime;
512                         rt->rt6i_flags |= RTF_EXPIRES;
513                 }
514                 dst_release(&rt->u.dst);
515         }
516         return 0;
517 }
518 #endif
519
/*
 * If the current selection is ip6_null_entry, climb the fib6 tree looking
 * for a node that carries route information and retry from there.
 *
 * The macro is not self-contained: it reads and writes the local variables
 * 'rt' and 'fn' of the enclosing function and jumps to that function's
 * 'out' label (top-level root reached) or 'restart' label (candidate node
 * found).
 */
#define BACKTRACK(saddr) \
do { \
        if (rt == &ip6_null_entry) { \
                struct fib6_node *pn; \
                for (;;) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
537
538 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
539                                              struct flowi *fl, int flags)
540 {
541         struct fib6_node *fn;
542         struct rt6_info *rt;
543
544         read_lock_bh(&table->tb6_lock);
545         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
546 restart:
547         rt = fn->leaf;
548         rt = rt6_device_match(rt, fl->oif, flags);
549         BACKTRACK(&fl->fl6_src);
550 out:
551         dst_use(&rt->u.dst, jiffies);
552         read_unlock_bh(&table->tb6_lock);
553         return rt;
554
555 }
556
557 struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr,
558                             int oif, int strict)
559 {
560         struct flowi fl = {
561                 .oif = oif,
562                 .nl_u = {
563                         .ip6_u = {
564                                 .daddr = *daddr,
565                         },
566                 },
567         };
568         struct dst_entry *dst;
569         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
570
571         if (saddr) {
572                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
573                 flags |= RT6_LOOKUP_F_HAS_SADDR;
574         }
575
576         dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_lookup);
577         if (dst->error == 0)
578                 return (struct rt6_info *) dst;
579
580         dst_release(dst);
581
582         return NULL;
583 }
584
585 EXPORT_SYMBOL(rt6_lookup);
586
587 /* ip6_ins_rt is called with FREE table->tb6_lock.
588    It takes new route entry, the addition fails by any reason the
589    route is freed. In any case, if caller does not hold it, it may
590    be destroyed.
591  */
592
593 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
594 {
595         int err;
596         struct fib6_table *table;
597
598         table = rt->rt6i_table;
599         write_lock_bh(&table->tb6_lock);
600         err = fib6_add(&table->tb6_root, rt, info);
601         write_unlock_bh(&table->tb6_lock);
602
603         return err;
604 }
605
606 int ip6_ins_rt(struct rt6_info *rt)
607 {
608         return __ip6_ins_rt(rt, NULL);
609 }
610
611 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
612                                       struct in6_addr *saddr)
613 {
614         struct rt6_info *rt;
615
616         /*
617          *      Clone the route.
618          */
619
620         rt = ip6_rt_copy(ort);
621
622         if (rt) {
623                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
624                         if (rt->rt6i_dst.plen != 128 &&
625                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
626                                 rt->rt6i_flags |= RTF_ANYCAST;
627                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
628                 }
629
630                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
631                 rt->rt6i_dst.plen = 128;
632                 rt->rt6i_flags |= RTF_CACHE;
633                 rt->u.dst.flags |= DST_HOST;
634
635 #ifdef CONFIG_IPV6_SUBTREES
636                 if (rt->rt6i_src.plen && saddr) {
637                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
638                         rt->rt6i_src.plen = 128;
639                 }
640 #endif
641
642                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
643
644         }
645
646         return rt;
647 }
648
649 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
650 {
651         struct rt6_info *rt = ip6_rt_copy(ort);
652         if (rt) {
653                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
654                 rt->rt6i_dst.plen = 128;
655                 rt->rt6i_flags |= RTF_CACHE;
656                 rt->u.dst.flags |= DST_HOST;
657                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
658         }
659         return rt;
660 }
661
662 static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif,
663                                             struct flowi *fl, int flags)
664 {
665         struct fib6_node *fn;
666         struct rt6_info *rt, *nrt;
667         int strict = 0;
668         int attempts = 3;
669         int err;
670         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
671
672         strict |= flags & RT6_LOOKUP_F_IFACE;
673
674 relookup:
675         read_lock_bh(&table->tb6_lock);
676
677 restart_2:
678         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
679
680 restart:
681         rt = rt6_select(fn, oif, strict | reachable);
682         BACKTRACK(&fl->fl6_src);
683         if (rt == &ip6_null_entry ||
684             rt->rt6i_flags & RTF_CACHE)
685                 goto out;
686
687         dst_hold(&rt->u.dst);
688         read_unlock_bh(&table->tb6_lock);
689
690         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
691                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
692         else {
693 #if CLONE_OFFLINK_ROUTE
694                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
695 #else
696                 goto out2;
697 #endif
698         }
699
700         dst_release(&rt->u.dst);
701         rt = nrt ? : &ip6_null_entry;
702
703         dst_hold(&rt->u.dst);
704         if (nrt) {
705                 err = ip6_ins_rt(nrt);
706                 if (!err)
707                         goto out2;
708         }
709
710         if (--attempts <= 0)
711                 goto out2;
712
713         /*
714          * Race condition! In the gap, when table->tb6_lock was
715          * released someone could insert this route.  Relookup.
716          */
717         dst_release(&rt->u.dst);
718         goto relookup;
719
720 out:
721         if (reachable) {
722                 reachable = 0;
723                 goto restart_2;
724         }
725         dst_hold(&rt->u.dst);
726         read_unlock_bh(&table->tb6_lock);
727 out2:
728         rt->u.dst.lastuse = jiffies;
729         rt->u.dst.__use++;
730
731         return rt;
732 }
733
734 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
735                                             struct flowi *fl, int flags)
736 {
737         return ip6_pol_route(table, fl->iif, fl, flags);
738 }
739
740 void ip6_route_input(struct sk_buff *skb)
741 {
742         struct ipv6hdr *iph = ipv6_hdr(skb);
743         int flags = RT6_LOOKUP_F_HAS_SADDR;
744         struct flowi fl = {
745                 .iif = skb->dev->ifindex,
746                 .nl_u = {
747                         .ip6_u = {
748                                 .daddr = iph->daddr,
749                                 .saddr = iph->saddr,
750                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
751                         },
752                 },
753                 .mark = skb->mark,
754                 .proto = iph->nexthdr,
755         };
756
757         if (rt6_need_strict(&iph->daddr))
758                 flags |= RT6_LOOKUP_F_IFACE;
759
760         skb->dst = fib6_rule_lookup(&fl, flags, ip6_pol_route_input);
761 }
762
763 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
764                                              struct flowi *fl, int flags)
765 {
766         return ip6_pol_route(table, fl->oif, fl, flags);
767 }
768
769 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
770 {
771         int flags = 0;
772
773         if (rt6_need_strict(&fl->fl6_dst))
774                 flags |= RT6_LOOKUP_F_IFACE;
775
776         if (!ipv6_addr_any(&fl->fl6_src))
777                 flags |= RT6_LOOKUP_F_HAS_SADDR;
778
779         return fib6_rule_lookup(fl, flags, ip6_pol_route_output);
780 }
781
782 EXPORT_SYMBOL(ip6_route_output);
783
784 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
785 {
786         struct rt6_info *ort = (struct rt6_info *) *dstp;
787         struct rt6_info *rt = (struct rt6_info *)
788                 dst_alloc(&ip6_dst_blackhole_ops);
789         struct dst_entry *new = NULL;
790
791         if (rt) {
792                 new = &rt->u.dst;
793
794                 atomic_set(&new->__refcnt, 1);
795                 new->__use = 1;
796                 new->input = dst_discard;
797                 new->output = dst_discard;
798
799                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
800                 new->dev = ort->u.dst.dev;
801                 if (new->dev)
802                         dev_hold(new->dev);
803                 rt->rt6i_idev = ort->rt6i_idev;
804                 if (rt->rt6i_idev)
805                         in6_dev_hold(rt->rt6i_idev);
806                 rt->rt6i_expires = 0;
807
808                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
809                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
810                 rt->rt6i_metric = 0;
811
812                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
813 #ifdef CONFIG_IPV6_SUBTREES
814                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
815 #endif
816
817                 dst_free(new);
818         }
819
820         dst_release(*dstp);
821         *dstp = new;
822         return (new ? 0 : -ENOMEM);
823 }
824 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
825
826 /*
827  *      Destination cache support functions
828  */
829
830 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
831 {
832         struct rt6_info *rt;
833
834         rt = (struct rt6_info *) dst;
835
836         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
837                 return dst;
838
839         return NULL;
840 }
841
842 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
843 {
844         struct rt6_info *rt = (struct rt6_info *) dst;
845
846         if (rt) {
847                 if (rt->rt6i_flags & RTF_CACHE)
848                         ip6_del_rt(rt);
849                 else
850                         dst_release(dst);
851         }
852         return NULL;
853 }
854
855 static void ip6_link_failure(struct sk_buff *skb)
856 {
857         struct rt6_info *rt;
858
859         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
860
861         rt = (struct rt6_info *) skb->dst;
862         if (rt) {
863                 if (rt->rt6i_flags&RTF_CACHE) {
864                         dst_set_expires(&rt->u.dst, 0);
865                         rt->rt6i_flags |= RTF_EXPIRES;
866                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
867                         rt->rt6i_node->fn_sernum = -1;
868         }
869 }
870
871 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
872 {
873         struct rt6_info *rt6 = (struct rt6_info*)dst;
874
875         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
876                 rt6->rt6i_flags |= RTF_MODIFIED;
877                 if (mtu < IPV6_MIN_MTU) {
878                         mtu = IPV6_MIN_MTU;
879                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
880                 }
881                 dst->metrics[RTAX_MTU-1] = mtu;
882                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
883         }
884 }
885
886 static int ipv6_get_mtu(struct net_device *dev);
887
888 static inline unsigned int ipv6_advmss(unsigned int mtu)
889 {
890         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
891
892         if (mtu < ip6_rt_min_advmss)
893                 mtu = ip6_rt_min_advmss;
894
895         /*
896          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
897          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
898          * IPV6_MAXPLEN is also valid and means: "any MSS,
899          * rely only on pmtu discovery"
900          */
901         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
902                 mtu = IPV6_MAXPLEN;
903         return mtu;
904 }
905
/*
 * Singly linked list (through dst.next) of the dsts handed out by
 * ndisc_dst_alloc(); ndisc_lock serialises access to it.
 */
static DEFINE_SPINLOCK(ndisc_lock);
static struct dst_entry *ndisc_dst_gc_list;
908
909 struct dst_entry *ndisc_dst_alloc(struct net_device *dev,
910                                   struct neighbour *neigh,
911                                   struct in6_addr *addr,
912                                   int (*output)(struct sk_buff *))
913 {
914         struct rt6_info *rt;
915         struct inet6_dev *idev = in6_dev_get(dev);
916
917         if (unlikely(idev == NULL))
918                 return NULL;
919
920         rt = ip6_dst_alloc();
921         if (unlikely(rt == NULL)) {
922                 in6_dev_put(idev);
923                 goto out;
924         }
925
926         dev_hold(dev);
927         if (neigh)
928                 neigh_hold(neigh);
929         else
930                 neigh = ndisc_get_neigh(dev, addr);
931
932         rt->rt6i_dev      = dev;
933         rt->rt6i_idev     = idev;
934         rt->rt6i_nexthop  = neigh;
935         atomic_set(&rt->u.dst.__refcnt, 1);
936         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
937         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
938         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
939         rt->u.dst.output  = output;
940
941 #if 0   /* there's no chance to use these for ndisc */
942         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
943                                 ? DST_HOST
944                                 : 0;
945         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
946         rt->rt6i_dst.plen = 128;
947 #endif
948
949         spin_lock_bh(&ndisc_lock);
950         rt->u.dst.next = ndisc_dst_gc_list;
951         ndisc_dst_gc_list = &rt->u.dst;
952         spin_unlock_bh(&ndisc_lock);
953
954         fib6_force_start_gc();
955
956 out:
957         return &rt->u.dst;
958 }
959
960 int ndisc_dst_gc(int *more)
961 {
962         struct dst_entry *dst, *next, **pprev;
963         int freed;
964
965         next = NULL;
966         freed = 0;
967
968         spin_lock_bh(&ndisc_lock);
969         pprev = &ndisc_dst_gc_list;
970
971         while ((dst = *pprev) != NULL) {
972                 if (!atomic_read(&dst->__refcnt)) {
973                         *pprev = dst->next;
974                         dst_free(dst);
975                         freed++;
976                 } else {
977                         pprev = &dst->next;
978                         (*more)++;
979                 }
980         }
981
982         spin_unlock_bh(&ndisc_lock);
983
984         return freed;
985 }
986
987 static int ip6_dst_gc(void)
988 {
989         static unsigned expire = 30*HZ;
990         static unsigned long last_gc;
991         unsigned long now = jiffies;
992
993         if (time_after(last_gc + ip6_rt_gc_min_interval, now) &&
994             atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size)
995                 goto out;
996
997         expire++;
998         fib6_run_gc(expire);
999         last_gc = now;
1000         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1001                 expire = ip6_rt_gc_timeout>>1;
1002
1003 out:
1004         expire -= expire>>ip6_rt_gc_elasticity;
1005         return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size);
1006 }
1007
1008 /* Clean host part of a prefix. Not necessary in radix tree,
1009    but results in cleaner routing tables.
1010
1011    Remove it only when all the things will work!
1012  */
1013
1014 static int ipv6_get_mtu(struct net_device *dev)
1015 {
1016         int mtu = IPV6_MIN_MTU;
1017         struct inet6_dev *idev;
1018
1019         idev = in6_dev_get(dev);
1020         if (idev) {
1021                 mtu = idev->cnf.mtu6;
1022                 in6_dev_put(idev);
1023         }
1024         return mtu;
1025 }
1026
1027 int ipv6_get_hoplimit(struct net_device *dev)
1028 {
1029         int hoplimit = ipv6_devconf.hop_limit;
1030         struct inet6_dev *idev;
1031
1032         idev = in6_dev_get(dev);
1033         if (idev) {
1034                 hoplimit = idev->cnf.hop_limit;
1035                 in6_dev_put(idev);
1036         }
1037         return hoplimit;
1038 }
1039
1040 /*
1041  *
1042  */
1043
1044 int ip6_route_add(struct fib6_config *cfg)
1045 {
1046         int err;
1047         struct rt6_info *rt = NULL;
1048         struct net_device *dev = NULL;
1049         struct inet6_dev *idev = NULL;
1050         struct fib6_table *table;
1051         int addr_type;
1052
1053         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1054                 return -EINVAL;
1055 #ifndef CONFIG_IPV6_SUBTREES
1056         if (cfg->fc_src_len)
1057                 return -EINVAL;
1058 #endif
1059         if (cfg->fc_ifindex) {
1060                 err = -ENODEV;
1061                 dev = dev_get_by_index(&init_net, cfg->fc_ifindex);
1062                 if (!dev)
1063                         goto out;
1064                 idev = in6_dev_get(dev);
1065                 if (!idev)
1066                         goto out;
1067         }
1068
1069         if (cfg->fc_metric == 0)
1070                 cfg->fc_metric = IP6_RT_PRIO_USER;
1071
1072         table = fib6_new_table(cfg->fc_table);
1073         if (table == NULL) {
1074                 err = -ENOBUFS;
1075                 goto out;
1076         }
1077
1078         rt = ip6_dst_alloc();
1079
1080         if (rt == NULL) {
1081                 err = -ENOMEM;
1082                 goto out;
1083         }
1084
1085         rt->u.dst.obsolete = -1;
1086         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1087
1088         if (cfg->fc_protocol == RTPROT_UNSPEC)
1089                 cfg->fc_protocol = RTPROT_BOOT;
1090         rt->rt6i_protocol = cfg->fc_protocol;
1091
1092         addr_type = ipv6_addr_type(&cfg->fc_dst);
1093
1094         if (addr_type & IPV6_ADDR_MULTICAST)
1095                 rt->u.dst.input = ip6_mc_input;
1096         else
1097                 rt->u.dst.input = ip6_forward;
1098
1099         rt->u.dst.output = ip6_output;
1100
1101         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1102         rt->rt6i_dst.plen = cfg->fc_dst_len;
1103         if (rt->rt6i_dst.plen == 128)
1104                rt->u.dst.flags = DST_HOST;
1105
1106 #ifdef CONFIG_IPV6_SUBTREES
1107         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1108         rt->rt6i_src.plen = cfg->fc_src_len;
1109 #endif
1110
1111         rt->rt6i_metric = cfg->fc_metric;
1112
1113         /* We cannot add true routes via loopback here,
1114            they would result in kernel looping; promote them to reject routes
1115          */
1116         if ((cfg->fc_flags & RTF_REJECT) ||
1117             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1118                 /* hold loopback dev/idev if we haven't done so. */
1119                 if (dev != init_net.loopback_dev) {
1120                         if (dev) {
1121                                 dev_put(dev);
1122                                 in6_dev_put(idev);
1123                         }
1124                         dev = init_net.loopback_dev;
1125                         dev_hold(dev);
1126                         idev = in6_dev_get(dev);
1127                         if (!idev) {
1128                                 err = -ENODEV;
1129                                 goto out;
1130                         }
1131                 }
1132                 rt->u.dst.output = ip6_pkt_discard_out;
1133                 rt->u.dst.input = ip6_pkt_discard;
1134                 rt->u.dst.error = -ENETUNREACH;
1135                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1136                 goto install_route;
1137         }
1138
1139         if (cfg->fc_flags & RTF_GATEWAY) {
1140                 struct in6_addr *gw_addr;
1141                 int gwa_type;
1142
1143                 gw_addr = &cfg->fc_gateway;
1144                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1145                 gwa_type = ipv6_addr_type(gw_addr);
1146
1147                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1148                         struct rt6_info *grt;
1149
1150                         /* IPv6 strictly inhibits using not link-local
1151                            addresses as nexthop address.
1152                            Otherwise, router will not able to send redirects.
1153                            It is very good, but in some (rare!) circumstances
1154                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1155                            some exceptions. --ANK
1156                          */
1157                         err = -EINVAL;
1158                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1159                                 goto out;
1160
1161                         grt = rt6_lookup(gw_addr, NULL, cfg->fc_ifindex, 1);
1162
1163                         err = -EHOSTUNREACH;
1164                         if (grt == NULL)
1165                                 goto out;
1166                         if (dev) {
1167                                 if (dev != grt->rt6i_dev) {
1168                                         dst_release(&grt->u.dst);
1169                                         goto out;
1170                                 }
1171                         } else {
1172                                 dev = grt->rt6i_dev;
1173                                 idev = grt->rt6i_idev;
1174                                 dev_hold(dev);
1175                                 in6_dev_hold(grt->rt6i_idev);
1176                         }
1177                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1178                                 err = 0;
1179                         dst_release(&grt->u.dst);
1180
1181                         if (err)
1182                                 goto out;
1183                 }
1184                 err = -EINVAL;
1185                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1186                         goto out;
1187         }
1188
1189         err = -ENODEV;
1190         if (dev == NULL)
1191                 goto out;
1192
1193         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1194                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1195                 if (IS_ERR(rt->rt6i_nexthop)) {
1196                         err = PTR_ERR(rt->rt6i_nexthop);
1197                         rt->rt6i_nexthop = NULL;
1198                         goto out;
1199                 }
1200         }
1201
1202         rt->rt6i_flags = cfg->fc_flags;
1203
1204 install_route:
1205         if (cfg->fc_mx) {
1206                 struct nlattr *nla;
1207                 int remaining;
1208
1209                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1210                         int type = nla_type(nla);
1211
1212                         if (type) {
1213                                 if (type > RTAX_MAX) {
1214                                         err = -EINVAL;
1215                                         goto out;
1216                                 }
1217
1218                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1219                         }
1220                 }
1221         }
1222
1223         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1224                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1225         if (!rt->u.dst.metrics[RTAX_MTU-1])
1226                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1227         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1228                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1229         rt->u.dst.dev = dev;
1230         rt->rt6i_idev = idev;
1231         rt->rt6i_table = table;
1232         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1233
1234 out:
1235         if (dev)
1236                 dev_put(dev);
1237         if (idev)
1238                 in6_dev_put(idev);
1239         if (rt)
1240                 dst_free(&rt->u.dst);
1241         return err;
1242 }
1243
1244 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1245 {
1246         int err;
1247         struct fib6_table *table;
1248
1249         if (rt == &ip6_null_entry)
1250                 return -ENOENT;
1251
1252         table = rt->rt6i_table;
1253         write_lock_bh(&table->tb6_lock);
1254
1255         err = fib6_del(rt, info);
1256         dst_release(&rt->u.dst);
1257
1258         write_unlock_bh(&table->tb6_lock);
1259
1260         return err;
1261 }
1262
1263 int ip6_del_rt(struct rt6_info *rt)
1264 {
1265         return __ip6_del_rt(rt, NULL);
1266 }
1267
1268 static int ip6_route_del(struct fib6_config *cfg)
1269 {
1270         struct fib6_table *table;
1271         struct fib6_node *fn;
1272         struct rt6_info *rt;
1273         int err = -ESRCH;
1274
1275         table = fib6_get_table(cfg->fc_table);
1276         if (table == NULL)
1277                 return err;
1278
1279         read_lock_bh(&table->tb6_lock);
1280
1281         fn = fib6_locate(&table->tb6_root,
1282                          &cfg->fc_dst, cfg->fc_dst_len,
1283                          &cfg->fc_src, cfg->fc_src_len);
1284
1285         if (fn) {
1286                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1287                         if (cfg->fc_ifindex &&
1288                             (rt->rt6i_dev == NULL ||
1289                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1290                                 continue;
1291                         if (cfg->fc_flags & RTF_GATEWAY &&
1292                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1293                                 continue;
1294                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1295                                 continue;
1296                         dst_hold(&rt->u.dst);
1297                         read_unlock_bh(&table->tb6_lock);
1298
1299                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1300                 }
1301         }
1302         read_unlock_bh(&table->tb6_lock);
1303
1304         return err;
1305 }
1306
1307 /*
1308  *      Handle redirects
1309  */
1310 struct ip6rd_flowi {
1311         struct flowi fl;
1312         struct in6_addr gateway;
1313 };
1314
1315 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1316                                              struct flowi *fl,
1317                                              int flags)
1318 {
1319         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1320         struct rt6_info *rt;
1321         struct fib6_node *fn;
1322
1323         /*
1324          * Get the "current" route for this destination and
1325          * check if the redirect has come from approriate router.
1326          *
1327          * RFC 2461 specifies that redirects should only be
1328          * accepted if they come from the nexthop to the target.
1329          * Due to the way the routes are chosen, this notion
1330          * is a bit fuzzy and one might need to check all possible
1331          * routes.
1332          */
1333
1334         read_lock_bh(&table->tb6_lock);
1335         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1336 restart:
1337         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1338                 /*
1339                  * Current route is on-link; redirect is always invalid.
1340                  *
1341                  * Seems, previous statement is not true. It could
1342                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1343                  * But then router serving it might decide, that we should
1344                  * know truth 8)8) --ANK (980726).
1345                  */
1346                 if (rt6_check_expired(rt))
1347                         continue;
1348                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1349                         continue;
1350                 if (fl->oif != rt->rt6i_dev->ifindex)
1351                         continue;
1352                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1353                         continue;
1354                 break;
1355         }
1356
1357         if (!rt)
1358                 rt = &ip6_null_entry;
1359         BACKTRACK(&fl->fl6_src);
1360 out:
1361         dst_hold(&rt->u.dst);
1362
1363         read_unlock_bh(&table->tb6_lock);
1364
1365         return rt;
1366 };
1367
1368 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1369                                            struct in6_addr *src,
1370                                            struct in6_addr *gateway,
1371                                            struct net_device *dev)
1372 {
1373         int flags = RT6_LOOKUP_F_HAS_SADDR;
1374         struct ip6rd_flowi rdfl = {
1375                 .fl = {
1376                         .oif = dev->ifindex,
1377                         .nl_u = {
1378                                 .ip6_u = {
1379                                         .daddr = *dest,
1380                                         .saddr = *src,
1381                                 },
1382                         },
1383                 },
1384                 .gateway = *gateway,
1385         };
1386
1387         if (rt6_need_strict(dest))
1388                 flags |= RT6_LOOKUP_F_IFACE;
1389
1390         return (struct rt6_info *)fib6_rule_lookup((struct flowi *)&rdfl, flags, __ip6_route_redirect);
1391 }
1392
1393 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1394                   struct in6_addr *saddr,
1395                   struct neighbour *neigh, u8 *lladdr, int on_link)
1396 {
1397         struct rt6_info *rt, *nrt = NULL;
1398         struct netevent_redirect netevent;
1399
1400         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1401
1402         if (rt == &ip6_null_entry) {
1403                 if (net_ratelimit())
1404                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1405                                "for redirect target\n");
1406                 goto out;
1407         }
1408
1409         /*
1410          *      We have finally decided to accept it.
1411          */
1412
1413         neigh_update(neigh, lladdr, NUD_STALE,
1414                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1415                      NEIGH_UPDATE_F_OVERRIDE|
1416                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1417                                      NEIGH_UPDATE_F_ISROUTER))
1418                      );
1419
1420         /*
1421          * Redirect received -> path was valid.
1422          * Look, redirects are sent only in response to data packets,
1423          * so that this nexthop apparently is reachable. --ANK
1424          */
1425         dst_confirm(&rt->u.dst);
1426
1427         /* Duplicate redirect: silently ignore. */
1428         if (neigh == rt->u.dst.neighbour)
1429                 goto out;
1430
1431         nrt = ip6_rt_copy(rt);
1432         if (nrt == NULL)
1433                 goto out;
1434
1435         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1436         if (on_link)
1437                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1438
1439         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1440         nrt->rt6i_dst.plen = 128;
1441         nrt->u.dst.flags |= DST_HOST;
1442
1443         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1444         nrt->rt6i_nexthop = neigh_clone(neigh);
1445         /* Reset pmtu, it may be better */
1446         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1447         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1448
1449         if (ip6_ins_rt(nrt))
1450                 goto out;
1451
1452         netevent.old = &rt->u.dst;
1453         netevent.new = &nrt->u.dst;
1454         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1455
1456         if (rt->rt6i_flags&RTF_CACHE) {
1457                 ip6_del_rt(rt);
1458                 return;
1459         }
1460
1461 out:
1462         dst_release(&rt->u.dst);
1463         return;
1464 }
1465
1466 /*
1467  *      Handle ICMP "packet too big" messages
1468  *      i.e. Path MTU discovery
1469  */
1470
1471 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1472                         struct net_device *dev, u32 pmtu)
1473 {
1474         struct rt6_info *rt, *nrt;
1475         int allfrag = 0;
1476
1477         rt = rt6_lookup(daddr, saddr, dev->ifindex, 0);
1478         if (rt == NULL)
1479                 return;
1480
1481         if (pmtu >= dst_mtu(&rt->u.dst))
1482                 goto out;
1483
1484         if (pmtu < IPV6_MIN_MTU) {
1485                 /*
1486                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1487                  * MTU (1280) and a fragment header should always be included
1488                  * after a node receiving Too Big message reporting PMTU is
1489                  * less than the IPv6 Minimum Link MTU.
1490                  */
1491                 pmtu = IPV6_MIN_MTU;
1492                 allfrag = 1;
1493         }
1494
1495         /* New mtu received -> path was valid.
1496            They are sent only in response to data packets,
1497            so that this nexthop apparently is reachable. --ANK
1498          */
1499         dst_confirm(&rt->u.dst);
1500
1501         /* Host route. If it is static, it would be better
1502            not to override it, but add new one, so that
1503            when cache entry will expire old pmtu
1504            would return automatically.
1505          */
1506         if (rt->rt6i_flags & RTF_CACHE) {
1507                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1508                 if (allfrag)
1509                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1510                 dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires);
1511                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1512                 goto out;
1513         }
1514
1515         /* Network route.
1516            Two cases are possible:
1517            1. It is connected route. Action: COW
1518            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1519          */
1520         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1521                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1522         else
1523                 nrt = rt6_alloc_clone(rt, daddr);
1524
1525         if (nrt) {
1526                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1527                 if (allfrag)
1528                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1529
1530                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1531                  * happened within 5 mins, the recommended timer is 10 mins.
1532                  * Here this route expiration time is set to ip6_rt_mtu_expires
1533                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1534                  * and detecting PMTU increase will be automatically happened.
1535                  */
1536                 dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires);
1537                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1538
1539                 ip6_ins_rt(nrt);
1540         }
1541 out:
1542         dst_release(&rt->u.dst);
1543 }
1544
1545 /*
1546  *      Misc support functions
1547  */
1548
1549 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1550 {
1551         struct rt6_info *rt = ip6_dst_alloc();
1552
1553         if (rt) {
1554                 rt->u.dst.input = ort->u.dst.input;
1555                 rt->u.dst.output = ort->u.dst.output;
1556
1557                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1558                 rt->u.dst.error = ort->u.dst.error;
1559                 rt->u.dst.dev = ort->u.dst.dev;
1560                 if (rt->u.dst.dev)
1561                         dev_hold(rt->u.dst.dev);
1562                 rt->rt6i_idev = ort->rt6i_idev;
1563                 if (rt->rt6i_idev)
1564                         in6_dev_hold(rt->rt6i_idev);
1565                 rt->u.dst.lastuse = jiffies;
1566                 rt->rt6i_expires = 0;
1567
1568                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1569                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1570                 rt->rt6i_metric = 0;
1571
1572                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1573 #ifdef CONFIG_IPV6_SUBTREES
1574                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1575 #endif
1576                 rt->rt6i_table = ort->rt6i_table;
1577         }
1578         return rt;
1579 }
1580
1581 #ifdef CONFIG_IPV6_ROUTE_INFO
1582 static struct rt6_info *rt6_get_route_info(struct in6_addr *prefix, int prefixlen,
1583                                            struct in6_addr *gwaddr, int ifindex)
1584 {
1585         struct fib6_node *fn;
1586         struct rt6_info *rt = NULL;
1587         struct fib6_table *table;
1588
1589         table = fib6_get_table(RT6_TABLE_INFO);
1590         if (table == NULL)
1591                 return NULL;
1592
1593         write_lock_bh(&table->tb6_lock);
1594         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1595         if (!fn)
1596                 goto out;
1597
1598         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1599                 if (rt->rt6i_dev->ifindex != ifindex)
1600                         continue;
1601                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1602                         continue;
1603                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1604                         continue;
1605                 dst_hold(&rt->u.dst);
1606                 break;
1607         }
1608 out:
1609         write_unlock_bh(&table->tb6_lock);
1610         return rt;
1611 }
1612
1613 static struct rt6_info *rt6_add_route_info(struct in6_addr *prefix, int prefixlen,
1614                                            struct in6_addr *gwaddr, int ifindex,
1615                                            unsigned pref)
1616 {
1617         struct fib6_config cfg = {
1618                 .fc_table       = RT6_TABLE_INFO,
1619                 .fc_metric      = 1024,
1620                 .fc_ifindex     = ifindex,
1621                 .fc_dst_len     = prefixlen,
1622                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1623                                   RTF_UP | RTF_PREF(pref),
1624         };
1625
1626         ipv6_addr_copy(&cfg.fc_dst, prefix);
1627         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1628
1629         /* We should treat it as a default route if prefix length is 0. */
1630         if (!prefixlen)
1631                 cfg.fc_flags |= RTF_DEFAULT;
1632
1633         ip6_route_add(&cfg);
1634
1635         return rt6_get_route_info(prefix, prefixlen, gwaddr, ifindex);
1636 }
1637 #endif
1638
1639 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1640 {
1641         struct rt6_info *rt;
1642         struct fib6_table *table;
1643
1644         table = fib6_get_table(RT6_TABLE_DFLT);
1645         if (table == NULL)
1646                 return NULL;
1647
1648         write_lock_bh(&table->tb6_lock);
1649         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1650                 if (dev == rt->rt6i_dev &&
1651                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1652                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1653                         break;
1654         }
1655         if (rt)
1656                 dst_hold(&rt->u.dst);
1657         write_unlock_bh(&table->tb6_lock);
1658         return rt;
1659 }
1660
1661 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1662                                      struct net_device *dev,
1663                                      unsigned int pref)
1664 {
1665         struct fib6_config cfg = {
1666                 .fc_table       = RT6_TABLE_DFLT,
1667                 .fc_metric      = 1024,
1668                 .fc_ifindex     = dev->ifindex,
1669                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1670                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1671         };
1672
1673         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1674
1675         ip6_route_add(&cfg);
1676
1677         return rt6_get_dflt_router(gwaddr, dev);
1678 }
1679
1680 void rt6_purge_dflt_routers(void)
1681 {
1682         struct rt6_info *rt;
1683         struct fib6_table *table;
1684
1685         /* NOTE: Keep consistent with rt6_get_dflt_router */
1686         table = fib6_get_table(RT6_TABLE_DFLT);
1687         if (table == NULL)
1688                 return;
1689
1690 restart:
1691         read_lock_bh(&table->tb6_lock);
1692         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1693                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1694                         dst_hold(&rt->u.dst);
1695                         read_unlock_bh(&table->tb6_lock);
1696                         ip6_del_rt(rt);
1697                         goto restart;
1698                 }
1699         }
1700         read_unlock_bh(&table->tb6_lock);
1701 }
1702
1703 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1704                                  struct fib6_config *cfg)
1705 {
1706         memset(cfg, 0, sizeof(*cfg));
1707
1708         cfg->fc_table = RT6_TABLE_MAIN;
1709         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1710         cfg->fc_metric = rtmsg->rtmsg_metric;
1711         cfg->fc_expires = rtmsg->rtmsg_info;
1712         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1713         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1714         cfg->fc_flags = rtmsg->rtmsg_flags;
1715
1716         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1717         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1718         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1719 }
1720
1721 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1722 {
1723         struct fib6_config cfg;
1724         struct in6_rtmsg rtmsg;
1725         int err;
1726
1727         switch(cmd) {
1728         case SIOCADDRT:         /* Add a route */
1729         case SIOCDELRT:         /* Delete a route */
1730                 if (!capable(CAP_NET_ADMIN))
1731                         return -EPERM;
1732                 err = copy_from_user(&rtmsg, arg,
1733                                      sizeof(struct in6_rtmsg));
1734                 if (err)
1735                         return -EFAULT;
1736
1737                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1738
1739                 rtnl_lock();
1740                 switch (cmd) {
1741                 case SIOCADDRT:
1742                         err = ip6_route_add(&cfg);
1743                         break;
1744                 case SIOCDELRT:
1745                         err = ip6_route_del(&cfg);
1746                         break;
1747                 default:
1748                         err = -EINVAL;
1749                 }
1750                 rtnl_unlock();
1751
1752                 return err;
1753         }
1754
1755         return -EINVAL;
1756 }
1757
1758 /*
1759  *      Drop the packet on the floor
1760  */
1761
1762 static inline int ip6_pkt_drop(struct sk_buff *skb, int code,
1763                                int ipstats_mib_noroutes)
1764 {
1765         int type;
1766         switch (ipstats_mib_noroutes) {
1767         case IPSTATS_MIB_INNOROUTES:
1768                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1769                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1770                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1771                         break;
1772                 }
1773                 /* FALLTHROUGH */
1774         case IPSTATS_MIB_OUTNOROUTES:
1775                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1776                 break;
1777         }
1778         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1779         kfree_skb(skb);
1780         return 0;
1781 }
1782
1783 static int ip6_pkt_discard(struct sk_buff *skb)
1784 {
1785         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1786 }
1787
1788 static int ip6_pkt_discard_out(struct sk_buff *skb)
1789 {
1790         skb->dev = skb->dst->dev;
1791         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1792 }
1793
1794 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1795
1796 static int ip6_pkt_prohibit(struct sk_buff *skb)
1797 {
1798         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1799 }
1800
1801 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1802 {
1803         skb->dev = skb->dst->dev;
1804         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1805 }
1806
1807 #endif
1808
1809 /*
1810  *      Allocate a dst for local (unicast / anycast) address.
1811  */
1812
1813 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1814                                     const struct in6_addr *addr,
1815                                     int anycast)
1816 {
1817         struct rt6_info *rt = ip6_dst_alloc();
1818
1819         if (rt == NULL)
1820                 return ERR_PTR(-ENOMEM);
1821
1822         dev_hold(init_net.loopback_dev);
1823         in6_dev_hold(idev);
1824
1825         rt->u.dst.flags = DST_HOST;
1826         rt->u.dst.input = ip6_input;
1827         rt->u.dst.output = ip6_output;
1828         rt->rt6i_dev = init_net.loopback_dev;
1829         rt->rt6i_idev = idev;
1830         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1831         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1832         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1833         rt->u.dst.obsolete = -1;
1834
1835         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1836         if (anycast)
1837                 rt->rt6i_flags |= RTF_ANYCAST;
1838         else
1839                 rt->rt6i_flags |= RTF_LOCAL;
1840         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1841         if (rt->rt6i_nexthop == NULL) {
1842                 dst_free(&rt->u.dst);
1843                 return ERR_PTR(-ENOMEM);
1844         }
1845
1846         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1847         rt->rt6i_dst.plen = 128;
1848         rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
1849
1850         atomic_set(&rt->u.dst.__refcnt, 1);
1851
1852         return rt;
1853 }
1854
1855 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1856 {
1857         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1858             rt != &ip6_null_entry) {
1859                 RT6_TRACE("deleted by ifdown %p\n", rt);
1860                 return -1;
1861         }
1862         return 0;
1863 }
1864
1865 void rt6_ifdown(struct net_device *dev)
1866 {
1867         fib6_clean_all(fib6_ifdown, 0, dev);
1868 }
1869
/* Argument bundle handed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
        struct net_device *dev;         /* device the MTU change applies to */
        unsigned int mtu;               /* the new MTU */
};
1875
1876 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1877 {
1878         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1879         struct inet6_dev *idev;
1880
1881         /* In IPv6 pmtu discovery is not optional,
1882            so that RTAX_MTU lock cannot disable it.
1883            We still use this lock to block changes
1884            caused by addrconf/ndisc.
1885         */
1886
1887         idev = __in6_dev_get(arg->dev);
1888         if (idev == NULL)
1889                 return 0;
1890
1891         /* For administrative MTU increase, there is no way to discover
1892            IPv6 PMTU increase, so PMTU increase should be updated here.
1893            Since RFC 1981 doesn't include administrative MTU increase
1894            update PMTU increase is a MUST. (i.e. jumbo frame)
1895          */
1896         /*
1897            If new MTU is less than route PMTU, this new MTU will be the
1898            lowest MTU in the path, update the route PMTU to reflect PMTU
1899            decreases; if new MTU is greater than route PMTU, and the
1900            old MTU is the lowest MTU in the path, update the route PMTU
1901            to reflect the increase. In this case if the other nodes' MTU
1902            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1903            PMTU discouvery.
1904          */
1905         if (rt->rt6i_dev == arg->dev &&
1906             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1907             (dst_mtu(&rt->u.dst) > arg->mtu ||
1908              (dst_mtu(&rt->u.dst) < arg->mtu &&
1909               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1910                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1911                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1912         }
1913         return 0;
1914 }
1915
1916 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1917 {
1918         struct rt6_mtu_change_arg arg = {
1919                 .dev = dev,
1920                 .mtu = mtu,
1921         };
1922
1923         fib6_clean_all(rt6_mtu_change_route, 0, &arg);
1924 }
1925
1926 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1927         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1928         [RTA_OIF]               = { .type = NLA_U32 },
1929         [RTA_IIF]               = { .type = NLA_U32 },
1930         [RTA_PRIORITY]          = { .type = NLA_U32 },
1931         [RTA_METRICS]           = { .type = NLA_NESTED },
1932 };
1933
1934 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1935                               struct fib6_config *cfg)
1936 {
1937         struct rtmsg *rtm;
1938         struct nlattr *tb[RTA_MAX+1];
1939         int err;
1940
1941         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1942         if (err < 0)
1943                 goto errout;
1944
1945         err = -EINVAL;
1946         rtm = nlmsg_data(nlh);
1947         memset(cfg, 0, sizeof(*cfg));
1948
1949         cfg->fc_table = rtm->rtm_table;
1950         cfg->fc_dst_len = rtm->rtm_dst_len;
1951         cfg->fc_src_len = rtm->rtm_src_len;
1952         cfg->fc_flags = RTF_UP;
1953         cfg->fc_protocol = rtm->rtm_protocol;
1954
1955         if (rtm->rtm_type == RTN_UNREACHABLE)
1956                 cfg->fc_flags |= RTF_REJECT;
1957
1958         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1959         cfg->fc_nlinfo.nlh = nlh;
1960
1961         if (tb[RTA_GATEWAY]) {
1962                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1963                 cfg->fc_flags |= RTF_GATEWAY;
1964         }
1965
1966         if (tb[RTA_DST]) {
1967                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1968
1969                 if (nla_len(tb[RTA_DST]) < plen)
1970                         goto errout;
1971
1972                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1973         }
1974
1975         if (tb[RTA_SRC]) {
1976                 int plen = (rtm->rtm_src_len + 7) >> 3;
1977
1978                 if (nla_len(tb[RTA_SRC]) < plen)
1979                         goto errout;
1980
1981                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1982         }
1983
1984         if (tb[RTA_OIF])
1985                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
1986
1987         if (tb[RTA_PRIORITY])
1988                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
1989
1990         if (tb[RTA_METRICS]) {
1991                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
1992                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1993         }
1994
1995         if (tb[RTA_TABLE])
1996                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
1997
1998         err = 0;
1999 errout:
2000         return err;
2001 }
2002
2003 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2004 {
2005         struct fib6_config cfg;
2006         int err;
2007
2008         err = rtm_to_fib6_config(skb, nlh, &cfg);
2009         if (err < 0)
2010                 return err;
2011
2012         return ip6_route_del(&cfg);
2013 }
2014
2015 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2016 {
2017         struct fib6_config cfg;
2018         int err;
2019
2020         err = rtm_to_fib6_config(skb, nlh, &cfg);
2021         if (err < 0)
2022                 return err;
2023
2024         return ip6_route_add(&cfg);
2025 }
2026
2027 static inline size_t rt6_nlmsg_size(void)
2028 {
2029         return NLMSG_ALIGN(sizeof(struct rtmsg))
2030                + nla_total_size(16) /* RTA_SRC */
2031                + nla_total_size(16) /* RTA_DST */
2032                + nla_total_size(16) /* RTA_GATEWAY */
2033                + nla_total_size(16) /* RTA_PREFSRC */
2034                + nla_total_size(4) /* RTA_TABLE */
2035                + nla_total_size(4) /* RTA_IIF */
2036                + nla_total_size(4) /* RTA_OIF */
2037                + nla_total_size(4) /* RTA_PRIORITY */
2038                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2039                + nla_total_size(sizeof(struct rta_cacheinfo));
2040 }
2041
2042 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2043                          struct in6_addr *dst, struct in6_addr *src,
2044                          int iif, int type, u32 pid, u32 seq,
2045                          int prefix, unsigned int flags)
2046 {
2047         struct rtmsg *rtm;
2048         struct nlmsghdr *nlh;
2049         long expires;
2050         u32 table;
2051
2052         if (prefix) {   /* user wants prefix routes only */
2053                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2054                         /* success since this is not a prefix route */
2055                         return 1;
2056                 }
2057         }
2058
2059         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2060         if (nlh == NULL)
2061                 return -EMSGSIZE;
2062
2063         rtm = nlmsg_data(nlh);
2064         rtm->rtm_family = AF_INET6;
2065         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2066         rtm->rtm_src_len = rt->rt6i_src.plen;
2067         rtm->rtm_tos = 0;
2068         if (rt->rt6i_table)
2069                 table = rt->rt6i_table->tb6_id;
2070         else
2071                 table = RT6_TABLE_UNSPEC;
2072         rtm->rtm_table = table;
2073         NLA_PUT_U32(skb, RTA_TABLE, table);
2074         if (rt->rt6i_flags&RTF_REJECT)
2075                 rtm->rtm_type = RTN_UNREACHABLE;
2076         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2077                 rtm->rtm_type = RTN_LOCAL;
2078         else
2079                 rtm->rtm_type = RTN_UNICAST;
2080         rtm->rtm_flags = 0;
2081         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2082         rtm->rtm_protocol = rt->rt6i_protocol;
2083         if (rt->rt6i_flags&RTF_DYNAMIC)
2084                 rtm->rtm_protocol = RTPROT_REDIRECT;
2085         else if (rt->rt6i_flags & RTF_ADDRCONF)
2086                 rtm->rtm_protocol = RTPROT_KERNEL;
2087         else if (rt->rt6i_flags&RTF_DEFAULT)
2088                 rtm->rtm_protocol = RTPROT_RA;
2089
2090         if (rt->rt6i_flags&RTF_CACHE)
2091                 rtm->rtm_flags |= RTM_F_CLONED;
2092
2093         if (dst) {
2094                 NLA_PUT(skb, RTA_DST, 16, dst);
2095                 rtm->rtm_dst_len = 128;
2096         } else if (rtm->rtm_dst_len)
2097                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2098 #ifdef CONFIG_IPV6_SUBTREES
2099         if (src) {
2100                 NLA_PUT(skb, RTA_SRC, 16, src);
2101                 rtm->rtm_src_len = 128;
2102         } else if (rtm->rtm_src_len)
2103                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2104 #endif
2105         if (iif)
2106                 NLA_PUT_U32(skb, RTA_IIF, iif);
2107         else if (dst) {
2108                 struct in6_addr saddr_buf;
2109                 if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0)
2110                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2111         }
2112
2113         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2114                 goto nla_put_failure;
2115
2116         if (rt->u.dst.neighbour)
2117                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2118
2119         if (rt->u.dst.dev)
2120                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2121
2122         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2123
2124         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2125         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2126                                expires, rt->u.dst.error) < 0)
2127                 goto nla_put_failure;
2128
2129         return nlmsg_end(skb, nlh);
2130
2131 nla_put_failure:
2132         nlmsg_cancel(skb, nlh);
2133         return -EMSGSIZE;
2134 }
2135
2136 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2137 {
2138         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2139         int prefix;
2140
2141         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2142                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2143                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2144         } else
2145                 prefix = 0;
2146
2147         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2148                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2149                      prefix, NLM_F_MULTI);
2150 }
2151
2152 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2153 {
2154         struct nlattr *tb[RTA_MAX+1];
2155         struct rt6_info *rt;
2156         struct sk_buff *skb;
2157         struct rtmsg *rtm;
2158         struct flowi fl;
2159         int err, iif = 0;
2160
2161         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2162         if (err < 0)
2163                 goto errout;
2164
2165         err = -EINVAL;
2166         memset(&fl, 0, sizeof(fl));
2167
2168         if (tb[RTA_SRC]) {
2169                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2170                         goto errout;
2171
2172                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2173         }
2174
2175         if (tb[RTA_DST]) {
2176                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2177                         goto errout;
2178
2179                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2180         }
2181
2182         if (tb[RTA_IIF])
2183                 iif = nla_get_u32(tb[RTA_IIF]);
2184
2185         if (tb[RTA_OIF])
2186                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2187
2188         if (iif) {
2189                 struct net_device *dev;
2190                 dev = __dev_get_by_index(&init_net, iif);
2191                 if (!dev) {
2192                         err = -ENODEV;
2193                         goto errout;
2194                 }
2195         }
2196
2197         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2198         if (skb == NULL) {
2199                 err = -ENOBUFS;
2200                 goto errout;
2201         }
2202
2203         /* Reserve room for dummy headers, this skb can pass
2204            through good chunk of routing engine.
2205          */
2206         skb_reset_mac_header(skb);
2207         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2208
2209         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2210         skb->dst = &rt->u.dst;
2211
2212         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2213                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2214                             nlh->nlmsg_seq, 0, 0);
2215         if (err < 0) {
2216                 kfree_skb(skb);
2217                 goto errout;
2218         }
2219
2220         err = rtnl_unicast(skb, NETLINK_CB(in_skb).pid);
2221 errout:
2222         return err;
2223 }
2224
2225 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2226 {
2227         struct sk_buff *skb;
2228         u32 pid = 0, seq = 0;
2229         struct nlmsghdr *nlh = NULL;
2230         int err = -ENOBUFS;
2231
2232         if (info) {
2233                 pid = info->pid;
2234                 nlh = info->nlh;
2235                 if (nlh)
2236                         seq = nlh->nlmsg_seq;
2237         }
2238
2239         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2240         if (skb == NULL)
2241                 goto errout;
2242
2243         err = rt6_fill_node(skb, rt, NULL, NULL, 0, event, pid, seq, 0, 0);
2244         if (err < 0) {
2245                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2246                 WARN_ON(err == -EMSGSIZE);
2247                 kfree_skb(skb);
2248                 goto errout;
2249         }
2250         err = rtnl_notify(skb, pid, RTNLGRP_IPV6_ROUTE, nlh, gfp_any());
2251 errout:
2252         if (err < 0)
2253                 rtnl_set_sk_err(RTNLGRP_IPV6_ROUTE, err);
2254 }
2255
2256 /*
2257  *      /proc
2258  */
2259
2260 #ifdef CONFIG_PROC_FS
2261
/* NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is referenced
 * in the visible part of this file -- confirm before relying on them.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
        char *buffer;
        int offset;
        int length;
        int skip;
        int len;
};
2272
2273 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2274 {
2275         struct seq_file *m = p_arg;
2276
2277         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2278                    rt->rt6i_dst.plen);
2279
2280 #ifdef CONFIG_IPV6_SUBTREES
2281         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2282                    rt->rt6i_src.plen);
2283 #else
2284         seq_puts(m, "00000000000000000000000000000000 00 ");
2285 #endif
2286
2287         if (rt->rt6i_nexthop) {
2288                 seq_printf(m, NIP6_SEQFMT,
2289                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2290         } else {
2291                 seq_puts(m, "00000000000000000000000000000000");
2292         }
2293         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2294                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2295                    rt->u.dst.__use, rt->rt6i_flags,
2296                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2297         return 0;
2298 }
2299
2300 static int ipv6_route_show(struct seq_file *m, void *v)
2301 {
2302         fib6_clean_all(rt6_info_route, 0, m);
2303         return 0;
2304 }
2305
2306 static int ipv6_route_open(struct inode *inode, struct file *file)
2307 {
2308         return single_open(file, ipv6_route_show, NULL);
2309 }
2310
2311 static const struct file_operations ipv6_route_proc_fops = {
2312         .owner          = THIS_MODULE,
2313         .open           = ipv6_route_open,
2314         .read           = seq_read,
2315         .llseek         = seq_lseek,
2316         .release        = single_release,
2317 };
2318
2319 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2320 {
2321         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2322                       rt6_stats.fib_nodes, rt6_stats.fib_route_nodes,
2323                       rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries,
2324                       rt6_stats.fib_rt_cache,
2325                       atomic_read(&ip6_dst_ops.entries),
2326                       rt6_stats.fib_discarded_routes);
2327
2328         return 0;
2329 }
2330
2331 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2332 {
2333         return single_open(file, rt6_stats_seq_show, NULL);
2334 }
2335
2336 static const struct file_operations rt6_stats_seq_fops = {
2337         .owner   = THIS_MODULE,
2338         .open    = rt6_stats_seq_open,
2339         .read    = seq_read,
2340         .llseek  = seq_lseek,
2341         .release = single_release,
2342 };
2343 #endif  /* CONFIG_PROC_FS */
2344
2345 #ifdef CONFIG_SYSCTL
2346
2347 static int flush_delay;
2348
2349 static
2350 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2351                               void __user *buffer, size_t *lenp, loff_t *ppos)
2352 {
2353         if (write) {
2354                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2355                 fib6_run_gc(flush_delay <= 0 ? ~0UL : (unsigned long)flush_delay);
2356                 return 0;
2357         } else
2358                 return -EINVAL;
2359 }
2360
2361 ctl_table ipv6_route_table[] = {
2362         {
2363                 .procname       =       "flush",
2364                 .data           =       &flush_delay,
2365                 .maxlen         =       sizeof(int),
2366                 .mode           =       0200,
2367                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2368         },
2369         {
2370                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2371                 .procname       =       "gc_thresh",
2372                 .data           =       &ip6_dst_ops.gc_thresh,
2373                 .maxlen         =       sizeof(int),
2374                 .mode           =       0644,
2375                 .proc_handler   =       &proc_dointvec,
2376         },
2377         {
2378                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2379                 .procname       =       "max_size",
2380                 .data           =       &ip6_rt_max_size,
2381                 .maxlen         =       sizeof(int),
2382                 .mode           =       0644,
2383                 .proc_handler   =       &proc_dointvec,
2384         },
2385         {
2386                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2387                 .procname       =       "gc_min_interval",
2388                 .data           =       &ip6_rt_gc_min_interval,
2389                 .maxlen         =       sizeof(int),
2390                 .mode           =       0644,
2391                 .proc_handler   =       &proc_dointvec_jiffies,
2392                 .strategy       =       &sysctl_jiffies,
2393         },
2394         {
2395                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2396                 .procname       =       "gc_timeout",
2397                 .data           =       &ip6_rt_gc_timeout,
2398                 .maxlen         =       sizeof(int),
2399                 .mode           =       0644,
2400                 .proc_handler   =       &proc_dointvec_jiffies,
2401                 .strategy       =       &sysctl_jiffies,
2402         },
2403         {
2404                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2405                 .procname       =       "gc_interval",
2406                 .data           =       &ip6_rt_gc_interval,
2407                 .maxlen         =       sizeof(int),
2408                 .mode           =       0644,
2409                 .proc_handler   =       &proc_dointvec_jiffies,
2410                 .strategy       =       &sysctl_jiffies,
2411         },
2412         {
2413                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2414                 .procname       =       "gc_elasticity",
2415                 .data           =       &ip6_rt_gc_elasticity,
2416                 .maxlen         =       sizeof(int),
2417                 .mode           =       0644,
2418                 .proc_handler   =       &proc_dointvec_jiffies,
2419                 .strategy       =       &sysctl_jiffies,
2420         },
2421         {
2422                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2423                 .procname       =       "mtu_expires",
2424                 .data           =       &ip6_rt_mtu_expires,
2425                 .maxlen         =       sizeof(int),
2426                 .mode           =       0644,
2427                 .proc_handler   =       &proc_dointvec_jiffies,
2428                 .strategy       =       &sysctl_jiffies,
2429         },
2430         {
2431                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2432                 .procname       =       "min_adv_mss",
2433                 .data           =       &ip6_rt_min_advmss,
2434                 .maxlen         =       sizeof(int),
2435                 .mode           =       0644,
2436                 .proc_handler   =       &proc_dointvec_jiffies,
2437                 .strategy       =       &sysctl_jiffies,
2438         },
2439         {
2440                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2441                 .procname       =       "gc_min_interval_ms",
2442                 .data           =       &ip6_rt_gc_min_interval,
2443                 .maxlen         =       sizeof(int),
2444                 .mode           =       0644,
2445                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2446                 .strategy       =       &sysctl_ms_jiffies,
2447         },
2448         { .ctl_name = 0 }
2449 };
2450
2451 #endif
2452
2453 void __init ip6_route_init(void)
2454 {
2455         ip6_dst_ops.kmem_cachep =
2456                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2457                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2458         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2459
2460         fib6_init();
2461         proc_net_fops_create(&init_net, "ipv6_route", 0, &ipv6_route_proc_fops);
2462         proc_net_fops_create(&init_net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2463 #ifdef CONFIG_XFRM
2464         xfrm6_init();
2465 #endif
2466 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2467         fib6_rules_init();
2468 #endif
2469
2470         __rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL);
2471         __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL);
2472         __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL);
2473 }
2474
2475 void ip6_route_cleanup(void)
2476 {
2477 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2478         fib6_rules_cleanup();
2479 #endif
2480 #ifdef CONFIG_PROC_FS
2481         proc_net_remove(&init_net, "ipv6_route");
2482         proc_net_remove(&init_net, "rt6_stats");
2483 #endif
2484 #ifdef CONFIG_XFRM
2485         xfrm6_fini();
2486 #endif
2487         rt6_ifdown(NULL);
2488         fib6_gc_cleanup();
2489         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2490 }