2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
58 #include <net/netevent.h>
59 #include <net/netlink.h>
61 #include <asm/uaccess.h>
64 #include <linux/sysctl.h>
/*
 * Forward declarations for the static helpers and dst_ops callbacks
 * defined later in this file.  NOTE(review): this extract is
 * line-sampled (original line numbers are fused into each line and
 * skip values), so some declarations here are visibly truncated.
 */
67 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
68 const struct in6_addr *dest);
69 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
70 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
71 static unsigned int ip6_mtu(const struct dst_entry *dst);
72 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
73 static void ip6_dst_destroy(struct dst_entry *);
74 static void ip6_dst_ifdown(struct dst_entry *,
75 struct net_device *dev, int how);
76 static int ip6_dst_gc(struct dst_ops *ops);
78 static int ip6_pkt_discard(struct sk_buff *skb);
79 static int ip6_pkt_discard_out(struct sk_buff *skb);
80 static void ip6_link_failure(struct sk_buff *skb);
81 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/* Route Information Option (RFC 4191) support, only with CONFIG_IPV6_ROUTE_INFO. */
83 #ifdef CONFIG_IPV6_ROUTE_INFO
84 static struct rt6_info *rt6_add_route_info(struct net *net,
85 const struct in6_addr *prefix, int prefixlen,
86 const struct in6_addr *gwaddr, int ifindex,
88 static struct rt6_info *rt6_get_route_info(struct net *net,
89 const struct in6_addr *prefix, int prefixlen,
90 const struct in6_addr *gwaddr, int ifindex);
/*
 * dst_ops->cow_metrics callback: copy-on-write the route metrics into
 * per-peer storage when a read-only metrics block is about to be
 * modified.  Uses cmpxchg() on dst->_metrics so concurrent writers race
 * safely; the loser re-reads the winner's pointer.
 * (sampled extract; some statements/braces are missing)
 */
93 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
95 struct rt6_info *rt = (struct rt6_info *) dst;
96 struct inet_peer *peer;
/* Only host routes carry private metrics worth COW-ing. */
99 if (!(rt->dst.flags & DST_HOST))
102 peer = rt6_get_peer_create(rt);
104 u32 *old_p = __DST_METRICS_PTR(old);
105 unsigned long prev, new;
/* Freshly created peer: seed its metrics from the old block. */
108 if (inet_metrics_new(peer))
109 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
111 new = (unsigned long) p;
112 prev = cmpxchg(&dst->_metrics, old, new);
/* Lost the race: adopt whatever the winner installed. */
115 p = __DST_METRICS_PTR(prev);
116 if (prev & DST_METRICS_READ_ONLY)
/*
 * Pick the address to resolve for this route's next hop: the gateway
 * when one is set, otherwise (per the visible fall-through) the
 * original destination address.
 */
123 static inline const void *choose_neigh_daddr(struct rt6_info *rt, const void *daddr)
125 struct in6_addr *p = &rt->rt6i_gateway;
127 if (!ipv6_addr_any(p))
128 return (const void *) p;
/*
 * dst_ops->neigh_lookup callback: find (or, on miss, create) the ND
 * neighbour entry for this dst's next hop on dst->dev.
 */
132 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
134 struct rt6_info *rt = (struct rt6_info *) dst;
137 daddr = choose_neigh_daddr(rt, daddr);
138 n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
/* Not cached: allocate a fresh neighbour entry. */
141 return neigh_create(&nd_tbl, daddr, dst->dev);
/*
 * Attach a neighbour entry for rt's gateway to rt->dst, creating the
 * entry when it is not already in the ND table.
 */
144 static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
146 struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
148 n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
152 dst_set_neighbour(&rt->dst, n);
/*
 * Template dst_ops for ordinary IPv6 routes; copied into each netns'
 * ipv6.ip6_dst_ops.  Wires the callbacks defined in this file.
 */
157 static struct dst_ops ip6_dst_ops_template = {
159 .protocol = cpu_to_be16(ETH_P_IPV6),
162 .check = ip6_dst_check,
163 .default_advmss = ip6_default_advmss,
165 .cow_metrics = ipv6_cow_metrics,
166 .destroy = ip6_dst_destroy,
167 .ifdown = ip6_dst_ifdown,
168 .negative_advice = ip6_negative_advice,
169 .link_failure = ip6_link_failure,
170 .update_pmtu = ip6_rt_update_pmtu,
171 .local_out = __ip6_local_out,
172 .neigh_lookup = ip6_neigh_lookup,
/*
 * Blackhole dst support: routes handed out by ip6_blackhole_route()
 * silently discard traffic.  Their ops deliberately use no-op
 * update_pmtu/cow_metrics callbacks (bodies elided in this extract).
 */
/* MTU for a blackhole dst: the raw metric, or the device MTU when unset. */
175 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
177 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
179 return mtu ? : dst->dev->mtu;
/* Intentional no-op: blackhole routes ignore PMTU updates. */
182 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
186 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
192 static struct dst_ops ip6_dst_blackhole_ops = {
194 .protocol = cpu_to_be16(ETH_P_IPV6),
195 .destroy = ip6_dst_destroy,
196 .check = ip6_dst_check,
197 .mtu = ip6_blackhole_mtu,
198 .default_advmss = ip6_default_advmss,
199 .update_pmtu = ip6_rt_blackhole_update_pmtu,
200 .cow_metrics = ip6_rt_blackhole_cow_metrics,
201 .neigh_lookup = ip6_neigh_lookup,
/* Default metrics for the template routes below: hop limit pinned at 255. */
204 static const u32 ip6_template_metrics[RTAX_MAX] = {
205 [RTAX_HOPLIMIT - 1] = 255,
/*
 * The "null" route returned when no route matches: rejects packets
 * with -ENETUNREACH.  Permanently referenced (refcount starts at 1)
 * and never selected over real routes (metric ~0).
 */
208 static struct rt6_info ip6_null_entry_template = {
210 .__refcnt = ATOMIC_INIT(1),
213 .error = -ENETUNREACH,
214 .input = ip6_pkt_discard,
215 .output = ip6_pkt_discard_out,
217 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
218 .rt6i_protocol = RTPROT_KERNEL,
219 .rt6i_metric = ~(u32) 0,
220 .rt6i_ref = ATOMIC_INIT(1),
/*
 * With policy routing (multiple tables) two more terminal routes
 * exist: "prohibit" (administratively rejects) and "blackhole"
 * (silently discards via dst_discard).  Both mirror the null entry's
 * permanent refcount and worst-possible metric.
 */
223 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
225 static int ip6_pkt_prohibit(struct sk_buff *skb);
226 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
228 static struct rt6_info ip6_prohibit_entry_template = {
230 .__refcnt = ATOMIC_INIT(1),
234 .input = ip6_pkt_prohibit,
235 .output = ip6_pkt_prohibit_out,
237 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
238 .rt6i_protocol = RTPROT_KERNEL,
239 .rt6i_metric = ~(u32) 0,
240 .rt6i_ref = ATOMIC_INIT(1),
243 static struct rt6_info ip6_blk_hole_entry_template = {
245 .__refcnt = ATOMIC_INIT(1),
249 .input = dst_discard,
250 .output = dst_discard,
252 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
253 .rt6i_protocol = RTPROT_KERNEL,
254 .rt6i_metric = ~(u32) 0,
255 .rt6i_ref = ATOMIC_INIT(1),
260 /* allocate dst with ip6_dst_ops */
/*
 * Allocate a new rt6_info from this netns' IPv6 dst_ops and zero the
 * rt6_info-specific tail (everything past the embedded dst_entry),
 * then initialise the inetpeer pointer from the netns peer base.
 */
261 static inline struct rt6_info *ip6_dst_alloc(struct net *net,
262 struct net_device *dev,
265 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
/* Zero from rt6i_table to the end of the struct; dst_alloc set up the head. */
269 memset(&rt->rt6i_table, 0,
270 sizeof(*rt) - sizeof(struct dst_entry));
271 rt6_init_peer(rt, net->ipv6.peers);
/*
 * dst_ops->destroy callback: release everything a rt6_info holds —
 * non-host metrics, the inet6_dev reference, the "from" parent dst,
 * and the inetpeer (release paths partly elided in this extract).
 */
276 static void ip6_dst_destroy(struct dst_entry *dst)
278 struct rt6_info *rt = (struct rt6_info *)dst;
279 struct inet6_dev *idev = rt->rt6i_idev;
/* Host routes share metrics with the peer; only non-host ones own them. */
281 if (!(rt->dst.flags & DST_HOST))
282 dst_destroy_metrics_generic(dst);
285 rt->rt6i_idev = NULL;
/* dst->from is only held while we inherit expiry from a parent route. */
289 if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
290 dst_release(dst->from);
292 if (rt6_has_peer(rt)) {
293 struct inet_peer *peer = rt6_peer_ptr(rt);
/* Global generation counter: bumped elsewhere to invalidate cached peers. */
298 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
300 static u32 rt6_peer_genid(void)
302 return atomic_read(&__rt6_peer_genid);
/*
 * Look up (optionally create) the inet_peer for this route's
 * destination and attach it, stamping the current peer generation.
 */
305 void rt6_bind_peer(struct rt6_info *rt, int create)
307 struct inet_peer_base *base;
308 struct inet_peer *peer;
310 base = inetpeer_base_ptr(rt->_rt6i_peer);
314 peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
/* rt6_set_peer() failing means someone else won the race to install one. */
315 if (!rt6_set_peer(rt, peer))
318 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops->ifdown callback: when the route's device goes away,
 * re-point rt6i_idev at the netns loopback device so the rt6_info
 * stays valid until it is garbage-collected.
 */
321 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
324 struct rt6_info *rt = (struct rt6_info *)dst;
325 struct inet6_dev *idev = rt->rt6i_idev;
326 struct net_device *loopback_dev =
327 dev_net(dev)->loopback_dev;
329 if (dev != loopback_dev && idev && idev->dev == dev) {
330 struct inet6_dev *loopback_idev =
331 in6_dev_get(loopback_dev);
333 rt->rt6i_idev = loopback_idev;
/*
 * Has this route expired?  Either it carries its own RTF_EXPIRES
 * deadline, or it inherits expiry from the parent route in dst.from.
 */
339 static bool rt6_check_expired(const struct rt6_info *rt)
341 struct rt6_info *ort = NULL;
343 if (rt->rt6i_flags & RTF_EXPIRES) {
344 if (time_after(jiffies, rt->dst.expires))
346 } else if (rt->dst.from) {
347 ort = (struct rt6_info *) rt->dst.from;
348 return (ort->rt6i_flags & RTF_EXPIRES) &&
349 time_after(jiffies, ort->dst.expires);
/*
 * Destinations needing strict (interface-bound) routing: multicast,
 * link-local, and loopback addresses.
 */
354 static bool rt6_need_strict(const struct in6_addr *daddr)
356 return ipv6_addr_type(daddr) &
357 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
361 * Route lookup. Any table->tb6_lock is implied.
/*
 * Walk the rt6_info chain for a fib node and pick the entry matching
 * the requested output interface (oif) and/or source address.
 * Loopback entries are remembered as a fallback ("local").  With
 * RT6_LOOKUP_F_IFACE and no match, the null entry is returned.
 */
364 static inline struct rt6_info *rt6_device_match(struct net *net,
366 const struct in6_addr *saddr,
370 struct rt6_info *local = NULL;
371 struct rt6_info *sprt;
/* No constraints at all: the head of the chain is fine. */
373 if (!oif && ipv6_addr_any(saddr))
376 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
377 struct net_device *dev = sprt->dst.dev;
380 if (dev->ifindex == oif)
382 if (dev->flags & IFF_LOOPBACK) {
383 if (!sprt->rt6i_idev ||
384 sprt->rt6i_idev->dev->ifindex != oif) {
385 if (flags & RT6_LOOKUP_F_IFACE && oif)
387 if (local && (!oif ||
388 local->rt6i_idev->dev->ifindex == oif))
/* Match by source address ownership on this device. */
394 if (ipv6_chk_addr(net, saddr, dev,
395 flags & RT6_LOOKUP_F_IFACE))
404 if (flags & RT6_LOOKUP_F_IFACE)
405 return net->ipv6.ip6_null_entry;
411 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing (RFC 4191): when a router's neighbour
 * entry is not in a VALID state and the per-device probe interval has
 * elapsed, send a unicast-solicit NS to nudge reachability detection.
 */
412 static void rt6_probe(struct rt6_info *rt)
414 struct neighbour *neigh;
416 * Okay, this does not seem to be appropriate
417 * for now, however, we need to check if it
418 * is really so; aka Router Reachability Probing.
420 * Router Reachability Probe MUST be rate-limited
421 * to no more than one per minute.
424 neigh = rt ? dst_get_neighbour_noref(&rt->dst) : NULL;
425 if (!neigh || (neigh->nud_state & NUD_VALID))
427 read_lock_bh(&neigh->lock);
428 if (!(neigh->nud_state & NUD_VALID) &&
429 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
430 struct in6_addr mcaddr;
431 struct in6_addr *target;
/* Stamp 'updated' first so concurrent probers are rate-limited. */
433 neigh->updated = jiffies;
434 read_unlock_bh(&neigh->lock);
436 target = (struct in6_addr *)&neigh->primary_key;
437 addrconf_addr_solict_mult(target, &mcaddr);
438 ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
440 read_unlock_bh(&neigh->lock);
/* Without CONFIG_IPV6_ROUTER_PREF, probing is a no-op. */
446 static inline void rt6_probe(struct rt6_info *rt)
452 * Default Router Selection (RFC 2461 6.3.6)
/*
 * Score a route's device against the requested oif: matches directly,
 * or via the loopback alias of the owning inet6_dev.
 */
454 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
456 struct net_device *dev = rt->dst.dev;
457 if (!oif || dev->ifindex == oif)
459 if ((dev->flags & IFF_LOOPBACK) &&
460 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * Score the route's next-hop reachability from its neighbour entry:
 * non-gateway / NONEXTHOP routes need no neighbour, otherwise NUD
 * state decides (NUD_FAILED handled specially under ROUTER_PREF).
 */
465 static inline int rt6_check_neigh(struct rt6_info *rt)
467 struct neighbour *neigh;
471 neigh = dst_get_neighbour_noref(&rt->dst);
472 if (rt->rt6i_flags & RTF_NONEXTHOP ||
473 !(rt->rt6i_flags & RTF_GATEWAY))
476 read_lock_bh(&neigh->lock);
477 if (neigh->nud_state & NUD_VALID)
479 #ifdef CONFIG_IPV6_ROUTER_PREF
480 else if (neigh->nud_state & NUD_FAILED)
485 read_unlock_bh(&neigh->lock);
/*
 * Combine the device match, router-preference bits (when configured),
 * and neighbour reachability into a single comparable score.  A score
 * of -1 (implied by the strict checks) disqualifies the route.
 */
492 static int rt6_score_route(struct rt6_info *rt, int oif,
497 m = rt6_check_dev(rt, oif);
498 if (!m && (strict & RT6_LOOKUP_F_IFACE))
500 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Fold RFC 4191 router preference into bits above the device match. */
501 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
503 n = rt6_check_neigh(rt);
504 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * Keep the better of (match, *mpri) vs. this candidate rt; skips
 * expired routes and may trigger rt6_probe() under F_REACHABLE
 * (probe call elided in this extract).
 */
509 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
510 int *mpri, struct rt6_info *match)
514 if (rt6_check_expired(rt))
517 m = rt6_score_route(rt, oif, strict);
522 if (strict & RT6_LOOKUP_F_REACHABLE)
526 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * Scan all routes of the given metric in round-robin order: first from
 * rr_head to the end of the equal-metric run, then from the leaf back
 * up to rr_head, keeping the best-scoring candidate.
 */
534 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
535 struct rt6_info *rr_head,
536 u32 metric, int oif, int strict)
538 struct rt6_info *rt, *match;
542 for (rt = rr_head; rt && rt->rt6i_metric == metric;
543 rt = rt->dst.rt6_next)
544 match = find_match(rt, oif, strict, &mpri, match);
545 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
546 rt = rt->dst.rt6_next)
547 match = find_match(rt, oif, strict, &mpri, match);
/*
 * Default router selection: pick the best route at this fib node,
 * rotating fn->rr_ptr when nothing was reachable so successive
 * lookups round-robin over equal-metric routers (RFC 4861 §6.3.6).
 */
552 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
554 struct rt6_info *match, *rt0;
559 fn->rr_ptr = rt0 = fn->leaf;
561 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
564 (strict & RT6_LOOKUP_F_REACHABLE)) {
565 struct rt6_info *next = rt0->dst.rt6_next;
567 /* no entries matched; do round-robin */
568 if (!next || next->rt6i_metric != rt0->rt6i_metric)
575 net = dev_net(rt0->dst.dev);
576 return match ? match : net->ipv6.ip6_null_entry;
579 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information Option from a Router Advertisement
 * (RFC 4191): validate length/prefix_len, then add, refresh or delete
 * the corresponding RTF_ROUTEINFO route with the advertised preference
 * and lifetime.
 */
580 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
581 const struct in6_addr *gwaddr)
583 struct net *net = dev_net(dev);
584 struct route_info *rinfo = (struct route_info *) opt;
585 struct in6_addr prefix_buf, *prefix;
587 unsigned long lifetime;
590 if (len < sizeof(struct route_info)) {
594 /* Sanity check for prefix_len and length */
595 if (rinfo->length > 3) {
597 } else if (rinfo->prefix_len > 128) {
599 } else if (rinfo->prefix_len > 64) {
600 if (rinfo->length < 2) {
603 } else if (rinfo->prefix_len > 0) {
604 if (rinfo->length < 1) {
609 pref = rinfo->route_pref;
610 if (pref == ICMPV6_ROUTER_PREF_INVALID)
613 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 means the full 128-bit prefix is present in the option. */
615 if (rinfo->length == 3)
616 prefix = (struct in6_addr *)rinfo->prefix;
618 /* this function is safe */
619 ipv6_addr_prefix(&prefix_buf,
620 (struct in6_addr *)rinfo->prefix,
622 prefix = &prefix_buf;
625 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Zero lifetime: the router is withdrawing this route. */
628 if (rt && !lifetime) {
634 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
637 rt->rt6i_flags = RTF_ROUTEINFO |
638 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* Infinite lifetime clears expiry; otherwise (re)arm it. */
641 if (!addrconf_finite_timeout(lifetime))
642 rt6_clean_expires(rt);
644 rt6_set_expires(rt, jiffies + HZ * lifetime);
646 dst_release(&rt->dst);
/*
 * Lookup-retry helper shared by the route-lookup functions: when the
 * match was the null entry, walk back up the fib tree (descending into
 * source-address subtrees where present) and retry from any ancestor
 * that still carries route info.  Assumes 'rt', 'fn' and a 'restart'
 * label in the caller (macro tail elided in this extract).
 */
652 #define BACKTRACK(__net, saddr) \
654 if (rt == __net->ipv6.ip6_null_entry) { \
655 struct fib6_node *pn; \
657 if (fn->fn_flags & RTN_TL_ROOT) \
660 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
661 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
664 if (fn->fn_flags & RTN_RTINFO) \
/*
 * Simple (no-cloning) route lookup under tb6_lock: find the fib node,
 * filter by device/source via rt6_device_match, backtrack on a null
 * match, and take a usage reference before unlocking.
 */
670 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
671 struct fib6_table *table,
672 struct flowi6 *fl6, int flags)
674 struct fib6_node *fn;
677 read_lock_bh(&table->tb6_lock);
678 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
681 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
682 BACKTRACK(net, &fl6->saddr);
/* dst_use() bumps the refcount and updates lastuse in one step. */
684 dst_use(&rt->dst, jiffies);
685 read_unlock_bh(&table->tb6_lock);
/* Public wrapper: dispatch the lookup through the fib-rules engine. */
690 struct dst_entry * ip6_route_lookup(struct net *net, struct flowi6 *fl6,
693 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
695 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/*
 * Convenience lookup by destination/source/oif.  'strict' maps to
 * RT6_LOOKUP_F_IFACE; a non-NULL saddr adds F_HAS_SADDR.  Returns a
 * referenced rt6_info (error handling elided in this extract).
 */
697 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
698 const struct in6_addr *saddr, int oif, int strict)
700 struct flowi6 fl6 = {
704 struct dst_entry *dst;
705 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
708 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
709 flags |= RT6_LOOKUP_F_HAS_SADDR;
712 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
714 return (struct rt6_info *) dst;
721 EXPORT_SYMBOL(rt6_lookup);
723 /* ip6_ins_rt is called with FREE table->tb6_lock.
724 It takes new route entry, the addition fails by any reason the
725 route is freed. In any case, if caller does not hold it, it may
/* Insert rt into its fib table under the table write lock. */
729 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
732 struct fib6_table *table;
734 table = rt->rt6i_table;
735 write_lock_bh(&table->tb6_lock);
736 err = fib6_add(&table->tb6_root, rt, info);
737 write_unlock_bh(&table->tb6_lock);
/* Public insert: builds a default nl_info for the route's netns. */
742 int ip6_ins_rt(struct rt6_info *rt)
744 struct nl_info info = {
745 .nl_net = dev_net(rt->dst.dev),
747 return __ip6_ins_rt(rt, &info);
/*
 * Clone a fib route into a per-destination RTF_CACHE entry and bind a
 * neighbour to it.  On neighbour-table overflow it retries once (when
 * not in softirq) after forcing a GC pass with temporarily aggressive
 * gc sysctl settings, then restores them.
 */
750 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
751 const struct in6_addr *daddr,
752 const struct in6_addr *saddr)
760 rt = ip6_rt_copy(ort, daddr);
/* Retrying GC would sleep; allow it only outside softirq context. */
763 int attempts = !in_softirq();
765 if (!(rt->rt6i_flags & RTF_GATEWAY)) {
/* Cloning a prefix route for its own address makes an anycast entry. */
766 if (ort->rt6i_dst.plen != 128 &&
767 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
768 rt->rt6i_flags |= RTF_ANYCAST;
769 rt->rt6i_gateway = *daddr;
772 rt->rt6i_flags |= RTF_CACHE;
774 #ifdef CONFIG_IPV6_SUBTREES
775 if (rt->rt6i_src.plen && saddr) {
776 rt->rt6i_src.addr = *saddr;
777 rt->rt6i_src.plen = 128;
782 if (rt6_bind_neighbour(rt, rt->dst.dev)) {
783 struct net *net = dev_net(rt->dst.dev);
784 int saved_rt_min_interval =
785 net->ipv6.sysctl.ip6_rt_gc_min_interval;
786 int saved_rt_elasticity =
787 net->ipv6.sysctl.ip6_rt_gc_elasticity;
789 if (attempts-- > 0) {
/* Make GC maximally aggressive for this one forced pass. */
790 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
791 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
793 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
795 net->ipv6.sysctl.ip6_rt_gc_elasticity =
797 net->ipv6.sysctl.ip6_rt_gc_min_interval =
798 saved_rt_min_interval;
802 net_warn_ratelimited("Neighbour table overflow\n");
/*
 * Cheaper clone used when no neighbour resolution is needed: copy the
 * route, mark it RTF_CACHE and share the original's neighbour entry.
 */
811 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
812 const struct in6_addr *daddr)
814 struct rt6_info *rt = ip6_rt_copy(ort, daddr);
817 rt->rt6i_flags |= RTF_CACHE;
818 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_noref_raw(&ort->dst)));
/*
 * Core policy-routing lookup with cloning: select the best route,
 * and if it is neither the null entry nor already a cached clone,
 * create a per-destination clone (cow when a neighbour is needed)
 * outside the table lock and insert it, handling the insert race by
 * re-looking up.  Reachability scoring is skipped when forwarding is
 * enabled ('reachable' == 0).
 */
823 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
824 struct flowi6 *fl6, int flags)
826 struct fib6_node *fn;
827 struct rt6_info *rt, *nrt;
831 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
833 strict |= flags & RT6_LOOKUP_F_IFACE;
836 read_lock_bh(&table->tb6_lock);
839 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
842 rt = rt6_select(fn, oif, strict | reachable);
844 BACKTRACK(net, &fl6->saddr);
845 if (rt == net->ipv6.ip6_null_entry ||
846 rt->rt6i_flags & RTF_CACHE)
/* Drop the lock before the (possibly allocating) clone step. */
850 read_unlock_bh(&table->tb6_lock);
852 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
853 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
854 else if (!(rt->dst.flags & DST_HOST))
855 nrt = rt6_alloc_clone(rt, &fl6->daddr);
859 dst_release(&rt->dst);
860 rt = nrt ? : net->ipv6.ip6_null_entry;
864 err = ip6_ins_rt(nrt);
873 * Race condition! In the gap, when table->tb6_lock was
874 * released someone could insert this route. Relookup.
876 dst_release(&rt->dst);
885 read_unlock_bh(&table->tb6_lock);
887 rt->dst.lastuse = jiffies;
/* Input-path policy lookup: route on the incoming interface (iif). */
893 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
894 struct flowi6 *fl6, int flags)
896 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/*
 * Input lookup wrapper: strict destinations force interface binding,
 * except on PIM register pseudo-devices.
 */
899 static struct dst_entry *ip6_route_input_lookup(struct net *net,
900 struct net_device *dev,
901 struct flowi6 *fl6, int flags)
903 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
904 flags |= RT6_LOOKUP_F_IFACE;
906 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
/*
 * Route an incoming skb: build a flowi6 from its IPv6 header and
 * attach the resulting dst to the skb.
 */
909 void ip6_route_input(struct sk_buff *skb)
911 const struct ipv6hdr *iph = ipv6_hdr(skb);
912 struct net *net = dev_net(skb->dev);
913 int flags = RT6_LOOKUP_F_HAS_SADDR;
914 struct flowi6 fl6 = {
915 .flowi6_iif = skb->dev->ifindex,
/* First 32 bits of the IPv6 header carry version + traffic class + flow label. */
918 .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
919 .flowi6_mark = skb->mark,
920 .flowi6_proto = iph->nexthdr,
923 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Output-path policy lookup: route on the outgoing interface (oif). */
926 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
927 struct flowi6 *fl6, int flags)
929 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/*
 * Public output-route lookup: socket device binding or strict
 * destinations force F_IFACE; a set source address adds F_HAS_SADDR;
 * socket source-address preferences are folded into the flags.
 */
932 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
937 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
938 flags |= RT6_LOOKUP_F_IFACE;
940 if (!ipv6_addr_any(&fl6->saddr))
941 flags |= RT6_LOOKUP_F_HAS_SADDR;
943 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
945 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
948 EXPORT_SYMBOL(ip6_route_output);
/*
 * Convert a real route into a blackhole copy (used by xfrm when a
 * policy says "hold packets"): duplicate the route's identity fields
 * onto a dst with ip6_dst_blackhole_ops whose input/output simply
 * discard.  Consumes the reference on dst_orig.
 */
950 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
952 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
953 struct dst_entry *new = NULL;
955 rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
957 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
958 rt6_init_peer(rt, net->ipv6.peers);
963 new->input = dst_discard;
964 new->output = dst_discard;
/* Read-only metrics can be shared by pointer; writable ones are copied. */
966 if (dst_metrics_read_only(&ort->dst))
967 new->_metrics = ort->dst._metrics;
969 dst_copy_metrics(new, &ort->dst);
970 rt->rt6i_idev = ort->rt6i_idev;
972 in6_dev_hold(rt->rt6i_idev);
974 rt->rt6i_gateway = ort->rt6i_gateway;
975 rt->rt6i_flags = ort->rt6i_flags;
976 rt6_clean_expires(rt);
979 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
980 #ifdef CONFIG_IPV6_SUBTREES
981 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
987 dst_release(dst_orig);
988 return new ? new : ERR_PTR(-ENOMEM);
992 * Destination cache support functions
/*
 * dst_ops->check callback: a cached dst is still valid if its fib node
 * serial number matches the caller's cookie; a stale peer generation
 * triggers a peer rebind before revalidating.
 */
995 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
999 rt = (struct rt6_info *) dst;
1001 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
1002 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
1003 if (!rt6_has_peer(rt))
1004 rt6_bind_peer(rt, 0);
1005 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops->negative_advice callback: drop expired cached clones
 * (deletion path elided in this extract).
 */
1012 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1014 struct rt6_info *rt = (struct rt6_info *) dst;
1017 if (rt->rt6i_flags & RTF_CACHE) {
1018 if (rt6_check_expired(rt)) {
/*
 * dst_ops->link_failure callback: report unreachability via ICMPv6
 * and invalidate the offending route — expire a cached clone
 * immediately, or bump the fib node serial for a default route.
 */
1030 static void ip6_link_failure(struct sk_buff *skb)
1032 struct rt6_info *rt;
1034 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1036 rt = (struct rt6_info *) skb_dst(skb);
1038 if (rt->rt6i_flags & RTF_CACHE)
1039 rt6_update_expires(rt, 0);
1040 else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1041 rt->rt6i_node->fn_sernum = -1;
/*
 * dst_ops->update_pmtu callback: record a smaller path MTU on a host
 * route; below IPV6_MIN_MTU (1280) the MTU is clamped and ALLFRAG is
 * set so every packet carries a fragment header.
 */
1045 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1047 struct rt6_info *rt6 = (struct rt6_info*)dst;
1049 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1050 rt6->rt6i_flags |= RTF_MODIFIED;
1051 if (mtu < IPV6_MIN_MTU) {
1052 u32 features = dst_metric(dst, RTAX_FEATURES);
1054 features |= RTAX_FEATURE_ALLFRAG;
1055 dst_metric_set(dst, RTAX_FEATURES, features);
1057 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * dst_ops->default_advmss callback: advertised MSS = path MTU minus
 * IPv6+TCP headers, clamped between the sysctl minimum and the
 * IPV6_MAXPLEN-derived ceiling explained below.
 */
1061 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1063 struct net_device *dev = dst->dev;
1064 unsigned int mtu = dst_mtu(dst);
1065 struct net *net = dev_net(dev);
1067 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1069 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1070 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1073 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1074 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1075 * IPV6_MAXPLEN is also valid and means: "any MSS,
1076 * rely only on pmtu discovery"
1078 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * dst_ops->mtu callback: the raw MTU metric when set, otherwise the
 * device's inet6_dev cnf.mtu6 (fallback path elided in this extract).
 */
1083 static unsigned int ip6_mtu(const struct dst_entry *dst)
1085 struct inet6_dev *idev;
1086 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1094 idev = __in6_dev_get(dst->dev);
1096 mtu = idev->cnf.mtu6;
/*
 * ICMPv6 keeps its transient dsts on a private, spinlock-protected
 * singly linked list (not in the fib tree) so they can be reaped by
 * icmp6_dst_gc()/icmp6_clean_all() below.
 */
1102 static struct dst_entry *icmp6_dst_gc_list;
1103 static DEFINE_SPINLOCK(icmp6_dst_lock);
/*
 * Allocate a one-shot host dst for sending an ICMPv6 packet: resolve
 * the neighbour, fill in the destination, pin hoplimit to 255, chain
 * the dst onto icmp6_dst_gc_list, then pass it through xfrm_lookup.
 */
1105 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1106 struct neighbour *neigh,
1109 struct dst_entry *dst;
1110 struct rt6_info *rt;
1111 struct inet6_dev *idev = in6_dev_get(dev);
1112 struct net *net = dev_net(dev);
1114 if (unlikely(!idev))
1115 return ERR_PTR(-ENODEV);
1117 rt = ip6_dst_alloc(net, dev, 0);
1118 if (unlikely(!rt)) {
1120 dst = ERR_PTR(-ENOMEM);
1127 neigh = ip6_neigh_lookup(&rt->dst, &fl6->daddr);
1128 if (IS_ERR(neigh)) {
1131 return ERR_CAST(neigh);
1135 rt->dst.flags |= DST_HOST;
1136 rt->dst.output = ip6_output;
1137 dst_set_neighbour(&rt->dst, neigh);
1138 atomic_set(&rt->dst.__refcnt, 1);
1139 rt->rt6i_dst.addr = fl6->daddr;
1140 rt->rt6i_dst.plen = 128;
1141 rt->rt6i_idev = idev;
/* ND/ICMP control traffic is sent with hop limit 255. */
1142 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1144 spin_lock_bh(&icmp6_dst_lock);
1145 rt->dst.next = icmp6_dst_gc_list;
1146 icmp6_dst_gc_list = &rt->dst;
1147 spin_unlock_bh(&icmp6_dst_lock);
1149 fib6_force_start_gc(net);
1151 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* Reap unreferenced entries from the ICMPv6 dst list. */
1157 int icmp6_dst_gc(void)
1159 struct dst_entry *dst, **pprev;
1162 spin_lock_bh(&icmp6_dst_lock);
1163 pprev = &icmp6_dst_gc_list;
1165 while ((dst = *pprev) != NULL) {
1166 if (!atomic_read(&dst->__refcnt)) {
1175 spin_unlock_bh(&icmp6_dst_lock);
/* Remove every list entry for which func(rt, arg) returns nonzero. */
1180 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1183 struct dst_entry *dst, **pprev;
1185 spin_lock_bh(&icmp6_dst_lock);
1186 pprev = &icmp6_dst_gc_list;
1187 while ((dst = *pprev) != NULL) {
1188 struct rt6_info *rt = (struct rt6_info *) dst;
1189 if (func(rt, arg)) {
1196 spin_unlock_bh(&icmp6_dst_lock);
/*
 * dst_ops->gc callback: run fib6 garbage collection, rate-limited by
 * ip6_rt_gc_min_interval unless the dst count already exceeds
 * ip6_rt_max_size.  ip6_rt_gc_expire adapts: reset after a pass that
 * gets below gc_thresh, decayed by the elasticity shift otherwise.
 * Returns nonzero when the table is still over the size limit.
 */
1199 static int ip6_dst_gc(struct dst_ops *ops)
1201 unsigned long now = jiffies;
1202 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1203 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1204 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1205 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1206 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1207 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1210 entries = dst_entries_get_fast(ops);
1211 if (time_after(rt_last_gc + rt_min_interval, now) &&
1212 entries <= rt_max_size)
1215 net->ipv6.ip6_rt_gc_expire++;
1216 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1217 net->ipv6.ip6_rt_last_gc = now;
1218 entries = dst_entries_get_slow(ops);
1219 if (entries < ops->gc_thresh)
1220 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1222 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1223 return entries > rt_max_size;
1226 /* Clean host part of a prefix. Not necessary in radix tree,
1227 but results in cleaner routing tables.
1229 Remove it only when all the things will work!
/*
 * Effective hop limit for a dst: the route metric when set, else the
 * device's per-interface hop_limit, else the netns-wide default.
 */
1232 int ip6_dst_hoplimit(struct dst_entry *dst)
1234 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1235 if (hoplimit == 0) {
1236 struct net_device *dev = dst->dev;
1237 struct inet6_dev *idev;
1240 idev = __in6_dev_get(dev);
1242 hoplimit = idev->cnf.hop_limit;
1244 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1249 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * Add a route from a netlink/ioctl fib6_config: validate prefix
 * lengths, resolve the device and table, allocate the rt6_info, fill
 * in expiry/protocol/dst handlers/metrics, validate the gateway (must
 * be link-local unless the device is point-to-point-like), resolve
 * prefsrc and the neighbour, then insert via __ip6_ins_rt().
 * NOTE(review): extract is line-sampled — error/cleanup paths are
 * largely missing here.
 */
1255 int ip6_route_add(struct fib6_config *cfg)
1258 struct net *net = cfg->fc_nlinfo.nl_net;
1259 struct rt6_info *rt = NULL;
1260 struct net_device *dev = NULL;
1261 struct inet6_dev *idev = NULL;
1262 struct fib6_table *table;
1265 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
/* Source-routed entries require CONFIG_IPV6_SUBTREES. */
1267 #ifndef CONFIG_IPV6_SUBTREES
1268 if (cfg->fc_src_len)
1271 if (cfg->fc_ifindex) {
1273 dev = dev_get_by_index(net, cfg->fc_ifindex);
1276 idev = in6_dev_get(dev);
1281 if (cfg->fc_metric == 0)
1282 cfg->fc_metric = IP6_RT_PRIO_USER;
/* Without NLM_F_CREATE, only look up an existing table (warn + create kept for compat). */
1285 if (cfg->fc_nlinfo.nlh &&
1286 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1287 table = fib6_get_table(net, cfg->fc_table);
1289 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1290 table = fib6_new_table(net, cfg->fc_table);
1293 table = fib6_new_table(net, cfg->fc_table);
1299 rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT);
1306 rt->dst.obsolete = -1;
1308 if (cfg->fc_flags & RTF_EXPIRES)
1309 rt6_set_expires(rt, jiffies +
1310 clock_t_to_jiffies(cfg->fc_expires));
1312 rt6_clean_expires(rt);
1314 if (cfg->fc_protocol == RTPROT_UNSPEC)
1315 cfg->fc_protocol = RTPROT_BOOT;
1316 rt->rt6i_protocol = cfg->fc_protocol;
1318 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* Choose the input handler by destination class: mcast / local / forward. */
1320 if (addr_type & IPV6_ADDR_MULTICAST)
1321 rt->dst.input = ip6_mc_input;
1322 else if (cfg->fc_flags & RTF_LOCAL)
1323 rt->dst.input = ip6_input;
1325 rt->dst.input = ip6_forward;
1327 rt->dst.output = ip6_output;
1329 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1330 rt->rt6i_dst.plen = cfg->fc_dst_len;
1331 if (rt->rt6i_dst.plen == 128)
1332 rt->dst.flags |= DST_HOST;
/* Non-host routes with metrics need their own writable metrics block. */
1334 if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1335 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1340 dst_init_metrics(&rt->dst, metrics, 0);
1342 #ifdef CONFIG_IPV6_SUBTREES
1343 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1344 rt->rt6i_src.plen = cfg->fc_src_len;
1347 rt->rt6i_metric = cfg->fc_metric;
1349 /* We cannot add true routes via loopback here,
1350 they would result in kernel looping; promote them to reject routes
1352 if ((cfg->fc_flags & RTF_REJECT) ||
1353 (dev && (dev->flags & IFF_LOOPBACK) &&
1354 !(addr_type & IPV6_ADDR_LOOPBACK) &&
1355 !(cfg->fc_flags & RTF_LOCAL))) {
1356 /* hold loopback dev/idev if we haven't done so. */
1357 if (dev != net->loopback_dev) {
1362 dev = net->loopback_dev;
1364 idev = in6_dev_get(dev);
1370 rt->dst.output = ip6_pkt_discard_out;
1371 rt->dst.input = ip6_pkt_discard;
1372 rt->dst.error = -ENETUNREACH;
1373 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1377 if (cfg->fc_flags & RTF_GATEWAY) {
1378 const struct in6_addr *gw_addr;
1381 gw_addr = &cfg->fc_gateway;
1382 rt->rt6i_gateway = *gw_addr;
1383 gwa_type = ipv6_addr_type(gw_addr);
1385 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1386 struct rt6_info *grt;
1388 /* IPv6 strictly inhibits using not link-local
1389 addresses as nexthop address.
1390 Otherwise, router will not able to send redirects.
1391 It is very good, but in some (rare!) circumstances
1392 (SIT, PtP, NBMA NOARP links) it is handy to allow
1393 some exceptions. --ANK
1396 if (!(gwa_type & IPV6_ADDR_UNICAST))
/* A non-link-local gateway must itself be reachable via a direct route. */
1399 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1401 err = -EHOSTUNREACH;
1405 if (dev != grt->dst.dev) {
1406 dst_release(&grt->dst);
1411 idev = grt->rt6i_idev;
1413 in6_dev_hold(grt->rt6i_idev);
1415 if (!(grt->rt6i_flags & RTF_GATEWAY))
1417 dst_release(&grt->dst);
1423 if (!dev || (dev->flags & IFF_LOOPBACK))
/* Preferred source address, if given, must be assigned on the device. */
1431 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1432 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1436 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1437 rt->rt6i_prefsrc.plen = 128;
1439 rt->rt6i_prefsrc.plen = 0;
1441 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1442 err = rt6_bind_neighbour(rt, dev);
1447 rt->rt6i_flags = cfg->fc_flags;
/* Apply any RTAX_* metrics supplied as netlink attributes. */
1454 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1455 int type = nla_type(nla);
1458 if (type > RTAX_MAX) {
1463 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1469 rt->rt6i_idev = idev;
1470 rt->rt6i_table = table;
1472 cfg->fc_nlinfo.nl_net = dev_net(dev);
1474 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * Delete rt from its table under the write lock; the null entry is
 * never deletable.  Drops the caller's reference on rt.
 */
1486 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1489 struct fib6_table *table;
1490 struct net *net = dev_net(rt->dst.dev);
1492 if (rt == net->ipv6.ip6_null_entry)
1495 table = rt->rt6i_table;
1496 write_lock_bh(&table->tb6_lock);
1498 err = fib6_del(rt, info);
1499 dst_release(&rt->dst);
1501 write_unlock_bh(&table->tb6_lock);
/* Public delete: builds a default nl_info for the route's netns. */
1506 int ip6_del_rt(struct rt6_info *rt)
1508 struct nl_info info = {
1509 .nl_net = dev_net(rt->dst.dev),
1511 return __ip6_del_rt(rt, &info);
/*
 * Delete by fib6_config: locate the exact fib node for dst/src prefix
 * and remove the first leaf entry matching the optional ifindex,
 * gateway and metric constraints.
 */
1514 static int ip6_route_del(struct fib6_config *cfg)
1516 struct fib6_table *table;
1517 struct fib6_node *fn;
1518 struct rt6_info *rt;
1521 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1525 read_lock_bh(&table->tb6_lock);
1527 fn = fib6_locate(&table->tb6_root,
1528 &cfg->fc_dst, cfg->fc_dst_len,
1529 &cfg->fc_src, cfg->fc_src_len);
1532 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1533 if (cfg->fc_ifindex &&
1535 rt->dst.dev->ifindex != cfg->fc_ifindex))
1537 if (cfg->fc_flags & RTF_GATEWAY &&
1538 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1540 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
/* Found it: __ip6_del_rt consumes the reference taken on rt. */
1543 read_unlock_bh(&table->tb6_lock);
1545 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1548 read_unlock_bh(&table->tb6_lock);
1556 struct ip6rd_flowi {
1558 struct in6_addr gateway;
/*
 * __ip6_route_redirect() - find the route an ICMPv6 redirect applies to.
 * Looks up the current route for the destination and accepts the redirect
 * only if it came from that route's next hop (RFC 4861 / old RFC 2461).
 * NOTE(review): 'continue'/'break' lines and the out label are elided.
 */
1561 static struct rt6_info *__ip6_route_redirect(struct net *net,
1562 struct fib6_table *table,
1566 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1567 struct rt6_info *rt;
1568 struct fib6_node *fn;
1571 * Get the "current" route for this destination and
1572 * check if the redirect has come from approriate router.
1574 * RFC 2461 specifies that redirects should only be
1575 * accepted if they come from the nexthop to the target.
1576 * Due to the way the routes are chosen, this notion
1577 * is a bit fuzzy and one might need to check all possible
1581 read_lock_bh(&table->tb6_lock);
1582 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1584 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1586 * Current route is on-link; redirect is always invalid.
1588 * Seems, previous statement is not true. It could
1589 * be node, which looks for us as on-link (f.e. proxy ndisc)
1590 * But then router serving it might decide, that we should
1591 * know truth 8)8) --ANK (980726).
/* skip expired, on-link, wrong-interface and wrong-gateway candidates */
1593 if (rt6_check_expired(rt))
1595 if (!(rt->rt6i_flags & RTF_GATEWAY))
1597 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1599 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
/* no acceptable route: fall back to the null entry / backtrack */
1605 rt = net->ipv6.ip6_null_entry;
1606 BACKTRACK(net, &fl6->saddr);
1610 read_unlock_bh(&table->tb6_lock);
/*
 * ip6_route_redirect() - build an ip6rd_flowi for the redirect and run the
 * FIB-rules lookup with __ip6_route_redirect() as the table callback.
 * NOTE(review): the flowi6 daddr/saddr initializers are elided here.
 */
1615 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1616 const struct in6_addr *src,
1617 const struct in6_addr *gateway,
1618 struct net_device *dev)
1620 int flags = RT6_LOOKUP_F_HAS_SADDR;
1621 struct net *net = dev_net(dev);
1622 struct ip6rd_flowi rdfl = {
1624 .flowi6_oif = dev->ifindex,
1630 rdfl.gateway = *gateway;
/* link-local / multicast destinations must match the interface strictly */
1632 if (rt6_need_strict(dest))
1633 flags |= RT6_LOOKUP_F_IFACE;
1635 return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1636 flags, __ip6_route_redirect);
/*
 * rt6_redirect() - process an accepted ICMPv6 redirect: update the
 * neighbour cache, clone the matched route into a cache entry pointing
 * at the new gateway, insert it, and notify netevent listeners.
 * NOTE(review): 'goto out', label and release lines are elided in this
 * extract -- verify control flow against upstream.
 */
1639 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1640 const struct in6_addr *saddr,
1641 struct neighbour *neigh, u8 *lladdr, int on_link)
1643 struct rt6_info *rt, *nrt = NULL;
1644 struct netevent_redirect netevent;
1645 struct net *net = dev_net(neigh->dev);
1647 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1649 if (rt == net->ipv6.ip6_null_entry) {
1650 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1655 * We have finally decided to accept it.
/* record the router's link-layer address; mark it IsRouter unless the
 * redirect says the target is on-link */
1658 neigh_update(neigh, lladdr, NUD_STALE,
1659 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1660 NEIGH_UPDATE_F_OVERRIDE|
1661 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1662 NEIGH_UPDATE_F_ISROUTER))
1666 * Redirect received -> path was valid.
1667 * Look, redirects are sent only in response to data packets,
1668 * so that this nexthop apparently is reachable. --ANK
1670 dst_confirm(&rt->dst);
1672 /* Duplicate redirect: silently ignore. */
1673 if (neigh == dst_get_neighbour_noref_raw(&rt->dst))
1676 nrt = ip6_rt_copy(rt, dest);
1680 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1682 nrt->rt6i_flags &= ~RTF_GATEWAY;
1684 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1685 dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1687 if (ip6_ins_rt(nrt))
1690 netevent.old = &rt->dst;
1691 netevent.new = &nrt->dst;
1692 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* the superseded cache entry is deleted (ip6_del_rt call elided) */
1694 if (rt->rt6i_flags & RTF_CACHE) {
1700 dst_release(&rt->dst);
1704 * Handle ICMP "packet too big" messages
1705 * i.e. Path MTU discovery
/*
 * rt6_do_pmtu_disc() - apply a Packet Too Big report to the route toward
 * @daddr: clamp the reported pmtu to IPV6_MIN_MTU, update an existing
 * cache route in place, or clone (COW for connected, clone for gatewayed)
 * a new cache route carrying the reduced MTU with a 10-minute expiry.
 * NOTE(review): 'goto out' lines, null checks on nrt and the out label
 * are elided in this extract.
 */
1708 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1709 struct net *net, u32 pmtu, int ifindex)
1711 struct rt6_info *rt, *nrt;
1714 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1718 if (rt6_check_expired(rt)) {
/* a larger-or-equal pmtu report can never shrink the path MTU */
1723 if (pmtu >= dst_mtu(&rt->dst))
1726 if (pmtu < IPV6_MIN_MTU) {
1728 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1729 * MTU (1280) and a fragment header should always be included
1730 * after a node receiving Too Big message reporting PMTU is
1731 * less than the IPv6 Minimum Link MTU.
1733 pmtu = IPV6_MIN_MTU;
1737 /* New mtu received -> path was valid.
1738 They are sent only in response to data packets,
1739 so that this nexthop apparently is reachable. --ANK
1741 dst_confirm(&rt->dst);
1743 /* Host route. If it is static, it would be better
1744 not to override it, but add new one, so that
1745 when cache entry will expire old pmtu
1746 would return automatically.
1748 if (rt->rt6i_flags & RTF_CACHE) {
1749 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
/* below-minimum reports (clamped above) force ALLFRAG: always add a
 * fragment header on this path */
1751 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1752 features |= RTAX_FEATURE_ALLFRAG;
1753 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1755 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1756 rt->rt6i_flags |= RTF_MODIFIED;
1761 Two cases are possible:
1762 1. It is connected route. Action: COW
1763 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1765 if (!dst_get_neighbour_noref_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1766 nrt = rt6_alloc_cow(rt, daddr, saddr);
1768 nrt = rt6_alloc_clone(rt, daddr);
1771 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1773 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1774 features |= RTAX_FEATURE_ALLFRAG;
1775 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1778 /* According to RFC 1981, detecting PMTU increase shouldn't be
1779 * happened within 5 mins, the recommended timer is 10 mins.
1780 * Here this route expiration time is set to ip6_rt_mtu_expires
1781 * which is 10 mins. After 10 mins the decreased pmtu is expired
1782 * and detecting PMTU increase will be automatically happened.
1784 rt6_update_expires(nrt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1785 nrt->rt6i_flags |= RTF_DYNAMIC;
1789 dst_release(&rt->dst);
/*
 * rt6_pmtu_discovery() - entry point for Packet Too Big handling: update
 * both the wildcard-interface route and the route bound to the receiving
 * interface (see the comment below for why both are needed).
 */
1792 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1793 struct net_device *dev, u32 pmtu)
1795 struct net *net = dev_net(dev);
1798 * RFC 1981 states that a node "MUST reduce the size of the packets it
1799 * is sending along the path" that caused the Packet Too Big message.
1800 * Since it's not possible in the general case to determine which
1801 * interface was used to send the original packet, we update the MTU
1802 * on the interface that will be used to send future packets. We also
1803 * update the MTU on the interface that received the Packet Too Big in
1804 * case the original packet was forced out that interface with
1805 * SO_BINDTODEVICE or similar. This is the next best thing to the
1806 * correct behaviour, which would be to update the MTU on all
1809 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1810 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1814 * Misc support functions
/*
 * ip6_rt_copy() - allocate a new rt6_info modelled on @ort but rehomed to
 * host route @dest (plen 128): copies handlers, metrics, flags, gateway
 * and table, and links RA-learned routes back to their origin via
 * rt6_set_from().  Returns the new route (NULL check elided in extract).
 */
1817 static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
1818 const struct in6_addr *dest)
1820 struct net *net = dev_net(ort->dst.dev);
1821 struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0);
1824 rt->dst.input = ort->dst.input;
1825 rt->dst.output = ort->dst.output;
1826 rt->dst.flags |= DST_HOST;
1828 rt->rt6i_dst.addr = *dest;
1829 rt->rt6i_dst.plen = 128;
1830 dst_copy_metrics(&rt->dst, &ort->dst);
1831 rt->dst.error = ort->dst.error;
1832 rt->rt6i_idev = ort->rt6i_idev;
1834 in6_dev_hold(rt->rt6i_idev);
1835 rt->dst.lastuse = jiffies;
1837 rt->rt6i_gateway = ort->rt6i_gateway;
1838 rt->rt6i_flags = ort->rt6i_flags;
/* router-advertised default routes track their parent's expiry */
1839 if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
1840 (RTF_DEFAULT | RTF_ADDRCONF))
1841 rt6_set_from(rt, ort);
1843 rt6_clean_expires(rt);
1844 rt->rt6i_metric = 0;
1846 #ifdef CONFIG_IPV6_SUBTREES
1847 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1849 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1850 rt->rt6i_table = ort->rt6i_table;
1855 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_get_route_info() - find an existing RA Route-Information route for
 * @prefix/@prefixlen learned via @gwaddr on @ifindex in RT6_TABLE_INFO.
 * NOTE(review): 'continue'/'break', dst_hold and out label elided.
 */
1856 static struct rt6_info *rt6_get_route_info(struct net *net,
1857 const struct in6_addr *prefix, int prefixlen,
1858 const struct in6_addr *gwaddr, int ifindex)
1860 struct fib6_node *fn;
1861 struct rt6_info *rt = NULL;
1862 struct fib6_table *table;
1864 table = fib6_get_table(net, RT6_TABLE_INFO);
1868 write_lock_bh(&table->tb6_lock);
1869 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1873 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1874 if (rt->dst.dev->ifindex != ifindex)
1876 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1878 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1884 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_route_info() - install a route learned from an RA Route
 * Information option into RT6_TABLE_INFO, then return it via
 * rt6_get_route_info() (ip6_route_add()'s return value is ignored on
 * purpose: lookup-after-add is the existing idiom here).
 */
1888 static struct rt6_info *rt6_add_route_info(struct net *net,
1889 const struct in6_addr *prefix, int prefixlen,
1890 const struct in6_addr *gwaddr, int ifindex,
1893 struct fib6_config cfg = {
1894 .fc_table = RT6_TABLE_INFO,
1895 .fc_metric = IP6_RT_PRIO_USER,
1896 .fc_ifindex = ifindex,
1897 .fc_dst_len = prefixlen,
1898 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1899 RTF_UP | RTF_PREF(pref),
1901 .fc_nlinfo.nlh = NULL,
1902 .fc_nlinfo.nl_net = net,
1905 cfg.fc_dst = *prefix;
1906 cfg.fc_gateway = *gwaddr;
1908 /* We should treat it as a default route if prefix length is 0. */
1910 cfg.fc_flags |= RTF_DEFAULT;
1912 ip6_route_add(&cfg);
1914 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router() - find the RA-learned default route via gateway
 * @addr on @dev in RT6_TABLE_DFLT.  NOTE(review): 'break', dst_hold and
 * the return of rt are elided in this extract.
 */
1918 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1920 struct rt6_info *rt;
1921 struct fib6_table *table;
1923 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1927 write_lock_bh(&table->tb6_lock);
1928 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1929 if (dev == rt->dst.dev &&
1930 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1931 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1936 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router() - install an RA-learned default router entry in
 * RT6_TABLE_DFLT and return it via rt6_get_dflt_router() (same
 * add-then-lookup idiom as rt6_add_route_info()).
 */
1940 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1941 struct net_device *dev,
1944 struct fib6_config cfg = {
1945 .fc_table = RT6_TABLE_DFLT,
1946 .fc_metric = IP6_RT_PRIO_USER,
1947 .fc_ifindex = dev->ifindex,
1948 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1949 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1951 .fc_nlinfo.nlh = NULL,
1952 .fc_nlinfo.nl_net = dev_net(dev),
1955 cfg.fc_gateway = *gwaddr;
1957 ip6_route_add(&cfg);
1959 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers() - delete every RA-learned default route from
 * RT6_TABLE_DFLT.  The read lock is dropped before each deletion, so the
 * upstream code restarts the scan afterwards (restart goto elided here).
 */
1962 void rt6_purge_dflt_routers(struct net *net)
1964 struct rt6_info *rt;
1965 struct fib6_table *table;
1967 /* NOTE: Keep consistent with rt6_get_dflt_router */
1968 table = fib6_get_table(net, RT6_TABLE_DFLT);
1973 read_lock_bh(&table->tb6_lock);
1974 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1975 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1977 read_unlock_bh(&table->tb6_lock);
1982 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config() - translate a legacy ioctl in6_rtmsg into the
 * fib6_config structure used by ip6_route_add()/ip6_route_del().
 * Legacy ioctl routes always target the main table.
 */
1985 static void rtmsg_to_fib6_config(struct net *net,
1986 struct in6_rtmsg *rtmsg,
1987 struct fib6_config *cfg)
1989 memset(cfg, 0, sizeof(*cfg));
1991 cfg->fc_table = RT6_TABLE_MAIN;
1992 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1993 cfg->fc_metric = rtmsg->rtmsg_metric;
1994 cfg->fc_expires = rtmsg->rtmsg_info;
1995 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1996 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1997 cfg->fc_flags = rtmsg->rtmsg_flags;
1999 cfg->fc_nlinfo.nl_net = net;
2001 cfg->fc_dst = rtmsg->rtmsg_dst;
2002 cfg->fc_src = rtmsg->rtmsg_src;
2003 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/*
 * ipv6_route_ioctl() - legacy SIOCADDRT/SIOCDELRT handler.  Requires
 * CAP_NET_ADMIN, copies the in6_rtmsg from userspace, converts it and
 * dispatches to ip6_route_add()/ip6_route_del().
 * NOTE(review): rtnl locking, -EPERM/-EFAULT returns and the default
 * -EINVAL case are elided in this extract.
 */
2006 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2008 struct fib6_config cfg;
2009 struct in6_rtmsg rtmsg;
2013 case SIOCADDRT: /* Add a route */
2014 case SIOCDELRT: /* Delete a route */
2015 if (!capable(CAP_NET_ADMIN))
2017 err = copy_from_user(&rtmsg, arg,
2018 sizeof(struct in6_rtmsg))
2022 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2027 err = ip6_route_add(&cfg);
2030 err = ip6_route_del(&cfg);
2044 * Drop the packet on the floor
/*
 * ip6_pkt_drop() - drop a packet with no route: bump the appropriate MIB
 * counter (INADDRERRORS for an unspecified destination on input) and send
 * a destination-unreachable ICMPv6 with @code, then free the skb
 * (kfree_skb and return elided in this extract).
 */
2047 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2050 struct dst_entry *dst = skb_dst(skb);
2051 switch (ipstats_mib_noroutes) {
2052 case IPSTATS_MIB_INNOROUTES:
2053 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2054 if (type == IPV6_ADDR_ANY) {
2055 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2056 IPSTATS_MIB_INADDRERRORS);
/* fall through to the generic no-route counter */
2060 case IPSTATS_MIB_OUTNOROUTES:
2061 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2062 ipstats_mib_noroutes);
2065 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* Input handler of the null route: count INNOROUTES and send "no route". */
2070 static int ip6_pkt_discard(struct sk_buff *skb)
2072 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* Output handler of the null route: bind skb->dev first so the ICMP error
 * and MIB accounting are attributed to the egress device. */
2075 static int ip6_pkt_discard_out(struct sk_buff *skb)
2077 skb->dev = skb_dst(skb)->dev;
2078 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2081 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Input handler of the prohibit route: administratively-prohibited ICMP. */
2083 static int ip6_pkt_prohibit(struct sk_buff *skb)
2085 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* Output handler of the prohibit route (see ip6_pkt_discard_out). */
2088 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2090 skb->dev = skb_dst(skb)->dev;
2091 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2097 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc() - build the host route for a local unicast or
 * anycast address, backed by the loopback device, with flags RTF_UP |
 * RTF_NONEXTHOP plus RTF_ANYCAST or RTF_LOCAL.  Returns the route or an
 * ERR_PTR on allocation / neighbour-binding failure.
 * NOTE(review): the anycast/local branch conditions and dst_free on the
 * error path are elided in this extract.
 */
2100 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2101 const struct in6_addr *addr,
2104 struct net *net = dev_net(idev->dev);
2105 struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0);
2109 net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
2110 return ERR_PTR(-ENOMEM);
2115 rt->dst.flags |= DST_HOST;
2116 rt->dst.input = ip6_input;
2117 rt->dst.output = ip6_output;
2118 rt->rt6i_idev = idev;
2119 rt->dst.obsolete = -1;
2121 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2123 rt->rt6i_flags |= RTF_ANYCAST;
2125 rt->rt6i_flags |= RTF_LOCAL;
2126 err = rt6_bind_neighbour(rt, rt->dst.dev);
2129 return ERR_PTR(err);
2132 rt->rt6i_dst.addr = *addr;
2133 rt->rt6i_dst.plen = 128;
2134 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2136 atomic_set(&rt->dst.__refcnt, 1);
/*
 * ip6_route_get_saddr() - choose a source address for @daddr: prefer the
 * route's configured preferred source, otherwise fall back to the normal
 * per-device source address selection.
 */
2141 int ip6_route_get_saddr(struct net *net,
2142 struct rt6_info *rt,
2143 const struct in6_addr *daddr,
2145 struct in6_addr *saddr)
2147 struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2149 if (rt->rt6i_prefsrc.plen)
2150 *saddr = rt->rt6i_prefsrc.addr;
2152 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2153 daddr, prefs, saddr);
2157 /* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(): the device whose address is
 * going away, its netns (member elided), and the address itself. */
2158 struct arg_dev_net_ip {
2159 struct net_device *dev;
2161 struct in6_addr *addr;
/*
 * fib6_remove_prefsrc() - fib6_clean_all callback: clear the preferred
 * source of any route (except the null entry) that references a deleted
 * address, optionally restricted to one device (NULL dev == all).
 */
2164 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2166 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2167 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2168 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2170 if (((void *)rt->dst.dev == dev || !dev) &&
2171 rt != net->ipv6.ip6_null_entry &&
2172 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2173 /* remove prefsrc entry */
2174 rt->rt6i_prefsrc.plen = 0;
/* rt6_remove_prefsrc() - scrub a deleted interface address from the
 * prefsrc field of every route in @ifp's netns. */
2179 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2181 struct net *net = dev_net(ifp->idev->dev);
2182 struct arg_dev_net_ip adni = {
2183 .dev = ifp->idev->dev,
2187 fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
/* Walker argument for fib6_ifdown(): device going down + its netns. */
2190 struct arg_dev_net {
2191 struct net_device *dev;
/*
 * fib6_ifdown() - clean-walk callback selecting routes for removal when a
 * device goes down: match the given device (or all devices when NULL),
 * but never the netns null entry.  (Non-zero return, elided here, tells
 * the walker to delete the route.)
 */
2195 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2197 const struct arg_dev_net *adn = arg;
2198 const struct net_device *dev = adn->dev;
2200 if ((rt->dst.dev == dev || !dev) &&
2201 rt != adn->net->ipv6.ip6_null_entry)
/* rt6_ifdown() - purge all routes (FIB and ICMP cache) using @dev. */
2207 void rt6_ifdown(struct net *net, struct net_device *dev)
2209 struct arg_dev_net adn = {
2214 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2215 icmp6_clean_all(fib6_ifdown, &adn);
/* Walker argument for rt6_mtu_change_route(): device + its new MTU. */
2218 struct rt6_mtu_change_arg {
2219 struct net_device *dev;
/*
 * rt6_mtu_change_route() - clean-walk callback propagating an
 * administrative device-MTU change into route PMTU metrics; see the
 * in-body comments for the RFC 1981 rationale.
 */
2223 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2225 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2226 struct inet6_dev *idev;
2228 /* In IPv6 pmtu discovery is not optional,
2229 so that RTAX_MTU lock cannot disable it.
2230 We still use this lock to block changes
2231 caused by addrconf/ndisc.
2234 idev = __in6_dev_get(arg->dev);
2238 /* For administrative MTU increase, there is no way to discover
2239 IPv6 PMTU increase, so PMTU increase should be updated here.
2240 Since RFC 1981 doesn't include administrative MTU increase
2241 update PMTU increase is a MUST. (i.e. jumbo frame)
2244 If new MTU is less than route PMTU, this new MTU will be the
2245 lowest MTU in the path, update the route PMTU to reflect PMTU
2246 decreases; if new MTU is greater than route PMTU, and the
2247 old MTU is the lowest MTU in the path, update the route PMTU
2248 to reflect the increase. In this case if the other nodes' MTU
2249 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2252 if (rt->dst.dev == arg->dev &&
2253 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2254 (dst_mtu(&rt->dst) >= arg->mtu ||
2255 (dst_mtu(&rt->dst) < arg->mtu &&
2256 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2257 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* rt6_mtu_change() - walk all routes in @dev's netns applying the new MTU
 * (struct initializer members elided in this extract). */
2262 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2264 struct rt6_mtu_change_arg arg = {
2269 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* route messages. */
2272 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2273 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2274 [RTA_OIF] = { .type = NLA_U32 },
2275 [RTA_IIF] = { .type = NLA_U32 },
2276 [RTA_PRIORITY] = { .type = NLA_U32 },
2277 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * rtm_to_fib6_config() - parse and validate an RTM_NEWROUTE/RTM_DELROUTE
 * netlink message into a fib6_config.  NOTE(review): several error-goto
 * lines and the RTA_DST/RTA_SRC 'if (tb[...])' guards are elided.
 */
2280 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2281 struct fib6_config *cfg)
2284 struct nlattr *tb[RTA_MAX+1];
2287 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2292 rtm = nlmsg_data(nlh);
2293 memset(cfg, 0, sizeof(*cfg));
2295 cfg->fc_table = rtm->rtm_table;
2296 cfg->fc_dst_len = rtm->rtm_dst_len;
2297 cfg->fc_src_len = rtm->rtm_src_len;
2298 cfg->fc_flags = RTF_UP;
2299 cfg->fc_protocol = rtm->rtm_protocol;
2301 if (rtm->rtm_type == RTN_UNREACHABLE)
2302 cfg->fc_flags |= RTF_REJECT;
2304 if (rtm->rtm_type == RTN_LOCAL)
2305 cfg->fc_flags |= RTF_LOCAL;
2307 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2308 cfg->fc_nlinfo.nlh = nlh;
2309 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2311 if (tb[RTA_GATEWAY]) {
2312 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2313 cfg->fc_flags |= RTF_GATEWAY;
/* destination prefix: only the significant bytes are copied */
2317 int plen = (rtm->rtm_dst_len + 7) >> 3;
2319 if (nla_len(tb[RTA_DST]) < plen)
2322 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
/* source prefix (only used with CONFIG_IPV6_SUBTREES routing) */
2326 int plen = (rtm->rtm_src_len + 7) >> 3;
2328 if (nla_len(tb[RTA_SRC]) < plen)
2331 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2334 if (tb[RTA_PREFSRC])
2335 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2338 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2340 if (tb[RTA_PRIORITY])
2341 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2343 if (tb[RTA_METRICS]) {
2344 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2345 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
/* an explicit RTA_TABLE attribute overrides rtm_table */
2349 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE doit handler: parse the message and delete the route
 * (error check on err elided in this extract). */
2356 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2358 struct fib6_config cfg;
2361 err = rtm_to_fib6_config(skb, nlh, &cfg);
2365 return ip6_route_del(&cfg);
/* RTM_NEWROUTE doit handler: parse the message and add the route
 * (error check on err elided in this extract). */
2368 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2370 struct fib6_config cfg;
2373 err = rtm_to_fib6_config(skb, nlh, &cfg);
2377 return ip6_route_add(&cfg);
2380 static inline size_t rt6_nlmsg_size(void)
2382 return NLMSG_ALIGN(sizeof(struct rtmsg))
2383 + nla_total_size(16) /* RTA_SRC */
2384 + nla_total_size(16) /* RTA_DST */
2385 + nla_total_size(16) /* RTA_GATEWAY */
2386 + nla_total_size(16) /* RTA_PREFSRC */
2387 + nla_total_size(4) /* RTA_TABLE */
2388 + nla_total_size(4) /* RTA_IIF */
2389 + nla_total_size(4) /* RTA_OIF */
2390 + nla_total_size(4) /* RTA_PRIORITY */
2391 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2392 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * rt6_fill_node() - serialise one rt6_info into an RTM_* netlink message:
 * rtmsg header, table/dst/src/iif/prefsrc/metrics/gateway/oif/priority
 * attributes and trailing cacheinfo.  @prefix limits output to prefix
 * routes (RTM_F_PREFIX dumps); @nowait is passed through to
 * ip6mr_get_route() for multicast destinations.
 * NOTE(review): many lines (locals 'rtm'/'table'/'expires'/'ts'/'tsage',
 * several if-guards, rcu read lock around the neighbour access, #endif
 * lines) are elided in this extract -- reconcile with upstream.
 */
2395 static int rt6_fill_node(struct net *net,
2396 struct sk_buff *skb, struct rt6_info *rt,
2397 struct in6_addr *dst, struct in6_addr *src,
2398 int iif, int type, u32 pid, u32 seq,
2399 int prefix, int nowait, unsigned int flags)
2401 const struct inet_peer *peer;
2403 struct nlmsghdr *nlh;
2406 struct neighbour *n;
2409 if (prefix) { /* user wants prefix routes only */
2410 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2411 /* success since this is not a prefix route */
2416 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2420 rtm = nlmsg_data(nlh);
2421 rtm->rtm_family = AF_INET6;
2422 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2423 rtm->rtm_src_len = rt->rt6i_src.plen;
2426 table = rt->rt6i_table->tb6_id;
2428 table = RT6_TABLE_UNSPEC;
2429 rtm->rtm_table = table;
2430 if (nla_put_u32(skb, RTA_TABLE, table))
2431 goto nla_put_failure;
/* classify the route type for userspace */
2432 if (rt->rt6i_flags & RTF_REJECT)
2433 rtm->rtm_type = RTN_UNREACHABLE;
2434 else if (rt->rt6i_flags & RTF_LOCAL)
2435 rtm->rtm_type = RTN_LOCAL;
2436 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
2437 rtm->rtm_type = RTN_LOCAL;
2439 rtm->rtm_type = RTN_UNICAST;
2441 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2442 rtm->rtm_protocol = rt->rt6i_protocol;
2443 if (rt->rt6i_flags & RTF_DYNAMIC)
2444 rtm->rtm_protocol = RTPROT_REDIRECT;
2445 else if (rt->rt6i_flags & RTF_ADDRCONF)
2446 rtm->rtm_protocol = RTPROT_KERNEL;
2447 else if (rt->rt6i_flags & RTF_DEFAULT)
2448 rtm->rtm_protocol = RTPROT_RA;
2450 if (rt->rt6i_flags & RTF_CACHE)
2451 rtm->rtm_flags |= RTM_F_CLONED;
/* an explicit @dst (RTM_GETROUTE reply) is reported as a /128 */
2454 if (nla_put(skb, RTA_DST, 16, dst))
2455 goto nla_put_failure;
2456 rtm->rtm_dst_len = 128;
2457 } else if (rtm->rtm_dst_len)
2458 if (nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr))
2459 goto nla_put_failure;
2460 #ifdef CONFIG_IPV6_SUBTREES
2462 if (nla_put(skb, RTA_SRC, 16, src))
2463 goto nla_put_failure;
2464 rtm->rtm_src_len = 128;
2465 } else if (rtm->rtm_src_len &&
2466 nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr))
2467 goto nla_put_failure;
2470 #ifdef CONFIG_IPV6_MROUTE
/* multicast destinations: let ip6mr fill the routing information */
2471 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2472 int err = ip6mr_get_route(net, skb, rtm, nowait);
2477 goto nla_put_failure;
2479 if (err == -EMSGSIZE)
2480 goto nla_put_failure;
2485 if (nla_put_u32(skb, RTA_IIF, iif))
2486 goto nla_put_failure;
2488 struct in6_addr saddr_buf;
2489 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
2490 nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2491 goto nla_put_failure;
2494 if (rt->rt6i_prefsrc.plen) {
2495 struct in6_addr saddr_buf;
2496 saddr_buf = rt->rt6i_prefsrc.addr;
2497 if (nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
2498 goto nla_put_failure;
2501 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2502 goto nla_put_failure;
2505 n = dst_get_neighbour_noref(&rt->dst);
2507 if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
2509 goto nla_put_failure;
2515 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2516 goto nla_put_failure;
2517 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2518 goto nla_put_failure;
2519 if (!(rt->rt6i_flags & RTF_EXPIRES))
2521 else if (rt->dst.expires - jiffies < INT_MAX)
2522 expires = rt->dst.expires - jiffies;
2527 if (rt6_has_peer(rt))
2528 peer = rt6_peer_ptr(rt);
2530 if (peer && peer->tcp_ts_stamp) {
2532 tsage = get_seconds() - peer->tcp_ts_stamp;
2535 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, ts, tsage,
2536 expires, rt->dst.error) < 0)
2537 goto nla_put_failure;
2539 return nlmsg_end(skb, nlh);
2542 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route() - fib walker callback for RTM_GETROUTE dumps; honours
 * the requester's RTM_F_PREFIX filter when the request carried an rtmsg.
 */
2546 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2548 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2551 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2552 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2553 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2557 return rt6_fill_node(arg->net,
2558 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2559 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2560 prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute() - RTM_GETROUTE doit handler: build a flowi6 from
 * RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF, perform the input- or output-path
 * lookup, serialise the result with rt6_fill_node() and unicast it back.
 * NOTE(review): error-goto lines, flowi6_mark handling and the errout
 * label are elided in this extract.
 */
2563 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2565 struct net *net = sock_net(in_skb->sk);
2566 struct nlattr *tb[RTA_MAX+1];
2567 struct rt6_info *rt;
2568 struct sk_buff *skb;
2571 int err, iif = 0, oif = 0;
2573 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2578 memset(&fl6, 0, sizeof(fl6));
2581 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2584 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2588 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2591 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2595 iif = nla_get_u32(tb[RTA_IIF]);
2598 oif = nla_get_u32(tb[RTA_OIF]);
/* an input interface selects the forwarding (input) lookup path */
2601 struct net_device *dev;
2604 dev = __dev_get_by_index(net, iif);
2610 fl6.flowi6_iif = iif;
2612 if (!ipv6_addr_any(&fl6.saddr))
2613 flags |= RT6_LOOKUP_F_HAS_SADDR;
2615 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
2618 fl6.flowi6_oif = oif;
2620 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
2623 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2625 dst_release(&rt->dst);
2630 /* Reserve room for dummy headers, this skb can pass
2631 through good chunk of routing engine.
2633 skb_reset_mac_header(skb);
2634 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
/* attach the found route so rtnl_put_cacheinfo sees live dst state */
2636 skb_dst_set(skb, &rt->dst);
2638 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2639 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2640 nlh->nlmsg_seq, 0, 0, 0);
2646 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * inet6_rt_notify() - broadcast an RTM_NEWROUTE/RTM_DELROUTE event to the
 * RTNLGRP_IPV6_ROUTE multicast group; on failure report via
 * rtnl_set_sk_err().  -EMSGSIZE here would mean rt6_nlmsg_size() is
 * too small, hence the WARN_ON.
 */
2651 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2653 struct sk_buff *skb;
2654 struct net *net = info->nl_net;
2659 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2661 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2665 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2666 event, info->pid, seq, 0, 0, 0);
2668 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2669 WARN_ON(err == -EMSGSIZE);
2673 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2674 info->nlh, gfp_any());
2678 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify() - netdevice notifier: when loopback registers in
 * a netns, attach it (device + inet6_dev) to the special null / prohibit
 * / blackhole template routes of that namespace.
 */
2681 static int ip6_route_dev_notify(struct notifier_block *this,
2682 unsigned long event, void *data)
2684 struct net_device *dev = (struct net_device *)data;
2685 struct net *net = dev_net(dev);
2687 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2688 net->ipv6.ip6_null_entry->dst.dev = dev;
2689 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2690 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2691 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2692 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2693 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2694 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2705 #ifdef CONFIG_PROC_FS
/*
 * rt6_info_route() - /proc/net/ipv6_route formatter: one line per route
 * (dst, src, gateway or zeros, metric, refcnt, use count, flags, device).
 */
2716 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2718 struct seq_file *m = p_arg;
2719 struct neighbour *n;
2721 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2723 #ifdef CONFIG_IPV6_SUBTREES
2724 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2726 seq_puts(m, "00000000000000000000000000000000 00 ");
2729 n = dst_get_neighbour_noref(&rt->dst);
2731 seq_printf(m, "%pi6", n->primary_key);
2733 seq_puts(m, "00000000000000000000000000000000");
2736 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2737 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2738 rt->dst.__use, rt->rt6i_flags,
2739 rt->dst.dev ? rt->dst.dev->name : "");
/* seq_file show: dump every route via rt6_info_route (read-only walk). */
2743 static int ipv6_route_show(struct seq_file *m, void *v)
2745 struct net *net = (struct net *)m->private;
2746 fib6_clean_all_ro(net, rt6_info_route, 0, m);
/* open handler for /proc/net/ipv6_route (netns-aware single_open). */
2750 static int ipv6_route_open(struct inode *inode, struct file *file)
2752 return single_open_net(inode, file, ipv6_route_show);
2755 static const struct file_operations ipv6_route_proc_fops = {
2756 .owner = THIS_MODULE,
2757 .open = ipv6_route_open,
2759 .llseek = seq_lseek,
2760 .release = single_release_net,
/* /proc/net/rt6_stats: seven hex counters from the per-netns rt6_stats. */
2763 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2765 struct net *net = (struct net *)seq->private;
2766 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2767 net->ipv6.rt6_stats->fib_nodes,
2768 net->ipv6.rt6_stats->fib_route_nodes,
2769 net->ipv6.rt6_stats->fib_rt_alloc,
2770 net->ipv6.rt6_stats->fib_rt_entries,
2771 net->ipv6.rt6_stats->fib_rt_cache,
2772 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2773 net->ipv6.rt6_stats->fib_discarded_routes);
/* open handler for /proc/net/rt6_stats. */
2778 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2780 return single_open_net(inode, file, rt6_stats_seq_show);
2783 static const struct file_operations rt6_stats_seq_fops = {
2784 .owner = THIS_MODULE,
2785 .open = rt6_stats_seq_open,
2787 .llseek = seq_lseek,
2788 .release = single_release_net,
2790 #endif /* CONFIG_PROC_FS */
2792 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush() - handler for net.ipv6.route.flush: writing
 * triggers a fib6 garbage-collection run using the previously configured
 * flush_delay (write-only guard and return elided in this extract).
 */
2795 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2796 void __user *buffer, size_t *lenp, loff_t *ppos)
2803 net = (struct net *)ctl->extra1;
2804 delay = net->ipv6.sysctl.flush_delay;
2805 proc_dointvec(ctl, write, buffer, lenp, ppos);
2806 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2810 ctl_table ipv6_route_table_template[] = {
2812 .procname = "flush",
2813 .data = &init_net.ipv6.sysctl.flush_delay,
2814 .maxlen = sizeof(int),
2816 .proc_handler = ipv6_sysctl_rtcache_flush
2819 .procname = "gc_thresh",
2820 .data = &ip6_dst_ops_template.gc_thresh,
2821 .maxlen = sizeof(int),
2823 .proc_handler = proc_dointvec,
2826 .procname = "max_size",
2827 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2828 .maxlen = sizeof(int),
2830 .proc_handler = proc_dointvec,
2833 .procname = "gc_min_interval",
2834 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2835 .maxlen = sizeof(int),
2837 .proc_handler = proc_dointvec_jiffies,
2840 .procname = "gc_timeout",
2841 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2842 .maxlen = sizeof(int),
2844 .proc_handler = proc_dointvec_jiffies,
2847 .procname = "gc_interval",
2848 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2849 .maxlen = sizeof(int),
2851 .proc_handler = proc_dointvec_jiffies,
2854 .procname = "gc_elasticity",
2855 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2856 .maxlen = sizeof(int),
2858 .proc_handler = proc_dointvec,
2861 .procname = "mtu_expires",
2862 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2863 .maxlen = sizeof(int),
2865 .proc_handler = proc_dointvec_jiffies,
2868 .procname = "min_adv_mss",
2869 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2870 .maxlen = sizeof(int),
2872 .proc_handler = proc_dointvec,
2875 .procname = "gc_min_interval_ms",
2876 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2877 .maxlen = sizeof(int),
2879 .proc_handler = proc_dointvec_ms_jiffies,
2884 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2886 struct ctl_table *table;
2888 table = kmemdup(ipv6_route_table_template,
2889 sizeof(ipv6_route_table_template),
2893 table[0].data = &net->ipv6.sysctl.flush_delay;
2894 table[0].extra1 = net;
2895 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2896 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2897 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2898 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2899 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2900 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2901 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2902 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2903 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * ip6_route_net_init - per-network-namespace setup of the IPv6 routing
 * layer: the namespace's dst_ops, the special null/prohibit/blackhole
 * route entries, the per-netns route sysctl defaults and the /proc files.
 * Returns 0 on success or a negative errno; error paths unwind already
 * allocated pieces via the goto labels at the bottom.
 * NOTE(review): several lines (braces, GFP flag arguments, some labels
 * and returns) are elided in this excerpt; comments describe only what
 * is visible here.
 */
2910 static int __net_init ip6_route_net_init(struct net *net)
/* Clone the template dst_ops so entry accounting and gc state are
 * independent per namespace. */
2914 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2915 sizeof(net->ipv6.ip6_dst_ops));
2917 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2918 goto out_ip6_dst_ops;
/* The "null" route (used for unreachable destinations) is duplicated
 * from a template, then pointed back at itself and at this namespace's
 * dst_ops. */
2920 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2921 sizeof(*net->ipv6.ip6_null_entry),
2923 if (!net->ipv6.ip6_null_entry)
2924 goto out_ip6_dst_entries;
2925 net->ipv6.ip6_null_entry->dst.path =
2926 (struct dst_entry *)net->ipv6.ip6_null_entry;
2927 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2928 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2929 ip6_template_metrics, true);
/* With multiple routing tables (policy routing), the prohibit and
 * blackhole routes are additional template-derived entries, wired up
 * the same way as the null entry. */
2931 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2932 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2933 sizeof(*net->ipv6.ip6_prohibit_entry),
2935 if (!net->ipv6.ip6_prohibit_entry)
2936 goto out_ip6_null_entry;
2937 net->ipv6.ip6_prohibit_entry->dst.path =
2938 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2939 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2940 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2941 ip6_template_metrics, true);
2943 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2944 sizeof(*net->ipv6.ip6_blk_hole_entry),
2946 if (!net->ipv6.ip6_blk_hole_entry)
2947 goto out_ip6_prohibit_entry;
2948 net->ipv6.ip6_blk_hole_entry->dst.path =
2949 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2950 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2951 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2952 ip6_template_metrics, true);
/* Default values for the per-namespace routing sysctls (garbage
 * collection intervals/thresholds, MTU expiry, minimum advmss). */
2955 net->ipv6.sysctl.flush_delay = 0;
2956 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2957 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2958 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2959 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2960 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2961 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
/* IPV6_MIN_MTU minus TCP (20) and IPv6 (40) header sizes. */
2962 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2964 #ifdef CONFIG_PROC_FS
2965 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2966 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2968 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwinding: free in reverse order of allocation. */
2974 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2975 out_ip6_prohibit_entry:
2976 kfree(net->ipv6.ip6_prohibit_entry);
2978 kfree(net->ipv6.ip6_null_entry);
2980 out_ip6_dst_entries:
2981 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/*
 * ip6_route_net_exit - tear down what ip6_route_net_init created for
 * this namespace: the /proc entries, the special route entries and the
 * per-namespace dst entry accounting.
 */
2986 static void __net_exit ip6_route_net_exit(struct net *net)
2988 #ifdef CONFIG_PROC_FS
2989 proc_net_remove(net, "ipv6_route");
2990 proc_net_remove(net, "rt6_stats");
/* These entries were kmemdup()ed in ip6_route_net_init, so a plain
 * kfree() is the matching teardown. */
2992 kfree(net->ipv6.ip6_null_entry);
2993 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2994 kfree(net->ipv6.ip6_prohibit_entry);
2995 kfree(net->ipv6.ip6_blk_hole_entry);
2997 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-namespace init/exit hooks for the IPv6 routing core. */
3000 static struct pernet_operations ip6_route_net_ops = {
3001 .init = ip6_route_net_init,
3002 .exit = ip6_route_net_exit,
/*
 * ipv6_inetpeer_init - allocate and initialise this namespace's
 * inet_peer base (per-destination peer state for IPv6).
 * NOTE(review): the NULL check / -ENOMEM return after kmalloc and the
 * final return are elided in this excerpt.
 */
3005 static int __net_init ipv6_inetpeer_init(struct net *net)
3007 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3011 inet_peer_base_init(bp);
3012 net->ipv6.peers = bp;
/*
 * ipv6_inetpeer_exit - detach and destroy the namespace's inet_peer
 * tree.  net->ipv6.peers is cleared before the tree is invalidated so
 * no new references to the dying base are handed out.
 */
3016 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3018 struct inet_peer_base *bp = net->ipv6.peers;
3020 net->ipv6.peers = NULL;
3021 inetpeer_invalidate_tree(bp);
/* Per-namespace init/exit hooks for the IPv6 inetpeer cache. */
3025 static struct pernet_operations ipv6_inetpeer_ops = {
3026 .init = ipv6_inetpeer_init,
3027 .exit = ipv6_inetpeer_exit,
/* Netdevice event notifier; ip6_route_dev_notify (declared elsewhere in
 * this file) reacts to device events on behalf of the routing code. */
3030 static struct notifier_block ip6_route_dev_notifier = {
3031 .notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - boot-time initialisation of the IPv6 routing
 * subsystem: the rt6_info slab cache, the pernet subsystems, the fib6
 * rules, the rtnetlink route handlers and the netdevice notifier.
 * Returns 0 on success or a negative errno; each failure unwinds all
 * previously registered pieces via the labels near the bottom.
 * NOTE(review): some statements (ret checks, labels, returns) are
 * elided in this excerpt; comments describe only visible lines.
 */
3035 int __init ip6_route_init(void)
/* Slab cache for rt6_info objects; stored in the template, so every
 * namespace's dst_ops (copied from it) shares this cache. */
3040 ip6_dst_ops_template.kmem_cachep =
3041 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3042 SLAB_HWCACHE_ALIGN, NULL);
3043 if (!ip6_dst_ops_template.kmem_cachep)
3046 ret = dst_entries_init(&ip6_dst_blackhole_ops);
3048 goto out_kmem_cache;
3050 ret = register_pernet_subsys(&ip6_route_net_ops);
3052 goto out_dst_entries;
3054 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3056 goto out_register_subsys;
/* Blackhole dst ops reuse the same slab cache as the regular dst ops. */
3058 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3060 /* Registering of the loopback is done before this portion of code,
3061 * the loopback reference in rt6_info will not be taken, do it
3062 * manually for init_net */
3063 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3064 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3065 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3066 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3067 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3068 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3069 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3073 goto out_register_inetpeer;
3079 ret = fib6_rules_init();
/* Register netlink handlers for route add/delete/query requests. */
3084 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3085 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3086 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3087 goto fib6_rules_init;
3089 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3091 goto fib6_rules_init;
/* Error unwinding: undo registrations in reverse order of setup. */
3097 fib6_rules_cleanup();
3102 out_register_inetpeer:
3103 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3104 out_register_subsys:
3105 unregister_pernet_subsys(&ip6_route_net_ops);
3107 dst_entries_destroy(&ip6_dst_blackhole_ops);
3109 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3113 void ip6_route_cleanup(void)
3115 unregister_netdevice_notifier(&ip6_route_dev_notifier);
3116 fib6_rules_cleanup();
3119 unregister_pernet_subsys(&ipv6_inetpeer_ops);
3120 unregister_pernet_subsys(&ip6_route_net_ops);
3121 dst_entries_destroy(&ip6_dst_blackhole_ops);
3122 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);