netfilter: add nf_ipv6_ops hook to fix xt_addrtype with IPv6
[linux-3.10.git] / net / ipv6 / route.c
index f568ac6..ad0aa6b 100644 (file)
@@ -57,6 +57,7 @@
 #include <net/xfrm.h>
 #include <net/netevent.h>
 #include <net/netlink.h>
+#include <net/nexthop.h>
 
 #include <asm/uaccess.h>
 
@@ -144,25 +145,12 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
        struct neighbour *n;
 
        daddr = choose_neigh_daddr(rt, skb, daddr);
-       n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
+       n = __ipv6_neigh_lookup(dst->dev, daddr);
        if (n)
                return n;
        return neigh_create(&nd_tbl, daddr, dst->dev);
 }
 
-static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
-{
-       struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
-       if (!n) {
-               n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
-               if (IS_ERR(n))
-                       return PTR_ERR(n);
-       }
-       rt->n = n;
-
-       return 0;
-}
-
 static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
@@ -219,14 +207,14 @@ static struct dst_ops ip6_dst_blackhole_ops = {
 };
 
 static const u32 ip6_template_metrics[RTAX_MAX] = {
-       [RTAX_HOPLIMIT - 1] = 255,
+       [RTAX_HOPLIMIT - 1] = 0,
 };
 
 static const struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
-               .obsolete       = -1,
+               .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
@@ -246,7 +234,7 @@ static const struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
-               .obsolete       = -1,
+               .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
@@ -261,7 +249,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
-               .obsolete       = -1,
+               .obsolete       = DST_OBSOLETE_FORCE_CHK,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard,
@@ -281,13 +269,16 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
                                             struct fib6_table *table)
 {
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
-                                       0, DST_OBSOLETE_NONE, flags);
+                                       0, DST_OBSOLETE_FORCE_CHK, flags);
 
        if (rt) {
                struct dst_entry *dst = &rt->dst;
 
                memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
                rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
+               rt->rt6i_genid = rt_genid(net);
+               INIT_LIST_HEAD(&rt->rt6i_siblings);
+               rt->rt6i_nsiblings = 0;
        }
        return rt;
 }
@@ -296,9 +287,7 @@ static void ip6_dst_destroy(struct dst_entry *dst)
 {
        struct rt6_info *rt = (struct rt6_info *)dst;
        struct inet6_dev *idev = rt->rt6i_idev;
-
-       if (rt->n)
-               neigh_release(rt->n);
+       struct dst_entry *from = dst->from;
 
        if (!(rt->dst.flags & DST_HOST))
                dst_destroy_metrics_generic(dst);
@@ -308,8 +297,8 @@ static void ip6_dst_destroy(struct dst_entry *dst)
                in6_dev_put(idev);
        }
 
-       if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
-               dst_release(dst->from);
+       dst->from = NULL;
+       dst_release(from);
 
        if (rt6_has_peer(rt)) {
                struct inet_peer *peer = rt6_peer_ptr(rt);
@@ -317,13 +306,6 @@ static void ip6_dst_destroy(struct dst_entry *dst)
        }
 }
 
-static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
-
-static u32 rt6_peer_genid(void)
-{
-       return atomic_read(&__rt6_peer_genid);
-}
-
 void rt6_bind_peer(struct rt6_info *rt, int create)
 {
        struct inet_peer_base *base;
@@ -337,8 +319,6 @@ void rt6_bind_peer(struct rt6_info *rt, int create)
        if (peer) {
                if (!rt6_set_peer(rt, peer))
                        inet_putpeer(peer);
-               else
-                       rt->rt6i_peer_genid = rt6_peer_genid();
        }
 }
 
@@ -359,25 +339,16 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                                in6_dev_put(idev);
                        }
                }
-               if (rt->n && rt->n->dev == dev) {
-                       rt->n->dev = loopback_dev;
-                       dev_hold(loopback_dev);
-                       dev_put(dev);
-               }
        }
 }
 
 static bool rt6_check_expired(const struct rt6_info *rt)
 {
-       struct rt6_info *ort = NULL;
-
        if (rt->rt6i_flags & RTF_EXPIRES) {
                if (time_after(jiffies, rt->dst.expires))
                        return true;
        } else if (rt->dst.from) {
-               ort = (struct rt6_info *) rt->dst.from;
-               return (ort->rt6i_flags & RTF_EXPIRES) &&
-                       time_after(jiffies, ort->dst.expires);
+               return rt6_check_expired((struct rt6_info *) rt->dst.from);
        }
        return false;
 }
@@ -388,6 +359,62 @@ static bool rt6_need_strict(const struct in6_addr *daddr)
                (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
+/* Multipath route selection:
+ *   Hash protocol, addresses, L4 ports or ICMPv6 type/code and flowlabel
+ *   into an index below candidate_count. Adapted from fib_info_hashfn().
+ */
+static int rt6_info_hash_nhsfn(unsigned int candidate_count,
+                              const struct flowi6 *fl6)
+{
+       unsigned int val = fl6->flowi6_proto;
+
+       val ^= ipv6_addr_hash(&fl6->daddr);
+       val ^= ipv6_addr_hash(&fl6->saddr);
+
+       /* Only works if this is not encapsulated */
+       switch (fl6->flowi6_proto) {
+       case IPPROTO_UDP:
+       case IPPROTO_TCP:
+       case IPPROTO_SCTP:
+               val ^= (__force u16)fl6->fl6_sport;
+               val ^= (__force u16)fl6->fl6_dport;
+               break;
+
+       case IPPROTO_ICMPV6:
+               val ^= (__force u16)fl6->fl6_icmp_type;
+               val ^= (__force u16)fl6->fl6_icmp_code;
+               break;
+       }
+       /* RFC6438 recommends using the flowlabel */
+       val ^= (__force u32)fl6->flowlabel;
+
+       /* Perhaps we need to tune this function? */
+       val = val ^ (val >> 7) ^ (val >> 12);
+       return val % candidate_count;
+}
+
+static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+                                            struct flowi6 *fl6)
+{
+       struct rt6_info *sibling, *next_sibling;
+       int route_choosen;
+
+       route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
+       /* Keep the matched route if route_choosen == 0 (the sibling list
+        * does not include the route itself); else take the Nth sibling.
+        */
+       if (route_choosen)
+               list_for_each_entry_safe(sibling, next_sibling,
+                               &match->rt6i_siblings, rt6i_siblings) {
+                       route_choosen--;
+                       if (route_choosen == 0) {
+                               match = sibling;
+                               break;
+                       }
+               }
+       return match;
+}
+
 /*
  *     Route lookup. Any table->tb6_lock is implied.
  */
@@ -451,24 +478,34 @@ static void rt6_probe(struct rt6_info *rt)
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
-       neigh = rt ? rt->n : NULL;
-       if (!neigh || (neigh->nud_state & NUD_VALID))
+       if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
                return;
-       read_lock_bh(&neigh->lock);
-       if (!(neigh->nud_state & NUD_VALID) &&
+       rcu_read_lock_bh();
+       neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+       if (neigh) {
+               write_lock(&neigh->lock);
+               if (neigh->nud_state & NUD_VALID)
+                       goto out;
+       }
+
+       if (!neigh ||
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;
 
-               neigh->updated = jiffies;
-               read_unlock_bh(&neigh->lock);
+               if (neigh) {
+                       neigh->updated = jiffies;
+                       write_unlock(&neigh->lock);
+               }
 
-               target = (struct in6_addr *)&neigh->primary_key;
+               target = (struct in6_addr *)&rt->rt6i_gateway;
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
        } else {
-               read_unlock_bh(&neigh->lock);
+out:
+               write_unlock(&neigh->lock);
        }
+       rcu_read_unlock_bh();
 }
 #else
 static inline void rt6_probe(struct rt6_info *rt)
@@ -490,35 +527,36 @@ static inline int rt6_check_dev(struct rt6_info *rt, int oif)
        return 0;
 }
 
-static inline int rt6_check_neigh(struct rt6_info *rt)
+static inline bool rt6_check_neigh(struct rt6_info *rt)
 {
        struct neighbour *neigh;
-       int m;
+       bool ret = false;
 
-       neigh = rt->n;
        if (rt->rt6i_flags & RTF_NONEXTHOP ||
            !(rt->rt6i_flags & RTF_GATEWAY))
-               m = 1;
-       else if (neigh) {
-               read_lock_bh(&neigh->lock);
+               return true;
+
+       rcu_read_lock_bh();
+       neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
+       if (neigh) {
+               read_lock(&neigh->lock);
                if (neigh->nud_state & NUD_VALID)
-                       m = 2;
+                       ret = true;
 #ifdef CONFIG_IPV6_ROUTER_PREF
-               else if (neigh->nud_state & NUD_FAILED)
-                       m = 0;
+               else if (!(neigh->nud_state & NUD_FAILED))
+                       ret = true;
 #endif
-               else
-                       m = 1;
-               read_unlock_bh(&neigh->lock);
-       } else
-               m = 0;
-       return m;
+               read_unlock(&neigh->lock);
+       }
+       rcu_read_unlock_bh();
+
+       return ret;
 }
 
 static int rt6_score_route(struct rt6_info *rt, int oif,
                           int strict)
 {
-       int m, n;
+       int m;
 
        m = rt6_check_dev(rt, oif);
        if (!m && (strict & RT6_LOOKUP_F_IFACE))
@@ -526,8 +564,7 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
 #ifdef CONFIG_IPV6_ROUTER_PREF
        m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
 #endif
-       n = rt6_check_neigh(rt);
-       if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
+       if (!rt6_check_neigh(rt) && (strict & RT6_LOOKUP_F_REACHABLE))
                return -1;
        return m;
 }
@@ -669,7 +706,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                else
                        rt6_set_expires(rt, jiffies + HZ * lifetime);
 
-               dst_release(&rt->dst);
+               ip6_rt_put(rt);
        }
        return 0;
 }
@@ -705,6 +742,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 restart:
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
+       if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
+               rt = rt6_multipath_select(rt, fl6);
        BACKTRACK(net, &fl6->saddr);
 out:
        dst_use(&rt->dst, jiffies);
@@ -786,8 +825,6 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
        rt = ip6_rt_copy(ort, daddr);
 
        if (rt) {
-               int attempts = !in_softirq();
-
                if (!(rt->rt6i_flags & RTF_GATEWAY)) {
                        if (ort->rt6i_dst.plen != 128 &&
                            ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
@@ -803,32 +840,6 @@ static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
                        rt->rt6i_src.plen = 128;
                }
 #endif
-
-       retry:
-               if (rt6_bind_neighbour(rt, rt->dst.dev)) {
-                       struct net *net = dev_net(rt->dst.dev);
-                       int saved_rt_min_interval =
-                               net->ipv6.sysctl.ip6_rt_gc_min_interval;
-                       int saved_rt_elasticity =
-                               net->ipv6.sysctl.ip6_rt_gc_elasticity;
-
-                       if (attempts-- > 0) {
-                               net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
-                               net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
-
-                               ip6_dst_gc(&net->ipv6.ip6_dst_ops);
-
-                               net->ipv6.sysctl.ip6_rt_gc_elasticity =
-                                       saved_rt_elasticity;
-                               net->ipv6.sysctl.ip6_rt_gc_min_interval =
-                                       saved_rt_min_interval;
-                               goto retry;
-                       }
-
-                       net_warn_ratelimited("Neighbour table overflow\n");
-                       dst_free(&rt->dst);
-                       return NULL;
-               }
        }
 
        return rt;
@@ -839,10 +850,8 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 {
        struct rt6_info *rt = ip6_rt_copy(ort, daddr);
 
-       if (rt) {
+       if (rt)
                rt->rt6i_flags |= RTF_CACHE;
-               rt->n = neigh_clone(ort->n);
-       }
        return rt;
 }
 
@@ -866,7 +875,8 @@ restart_2:
 
 restart:
        rt = rt6_select(fn, oif, strict | reachable);
-
+       if (rt->rt6i_nsiblings && oif == 0)
+               rt = rt6_multipath_select(rt, fl6);
        BACKTRACK(net, &fl6->saddr);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
@@ -875,14 +885,14 @@ restart:
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
 
-       if (!rt->n && !(rt->rt6i_flags & RTF_NONEXTHOP))
+       if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
        else if (!(rt->dst.flags & DST_HOST))
                nrt = rt6_alloc_clone(rt, &fl6->daddr);
        else
                goto out2;
 
-       dst_release(&rt->dst);
+       ip6_rt_put(rt);
        rt = nrt ? : net->ipv6.ip6_null_entry;
 
        dst_hold(&rt->dst);
@@ -899,7 +909,7 @@ restart:
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
-       dst_release(&rt->dst);
+       ip6_rt_put(rt);
        goto relookup;
 
 out:
@@ -941,7 +951,7 @@ void ip6_route_input(struct sk_buff *skb)
                .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
-               .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
+               .flowlabel = ip6_flowinfo(iph),
                .flowi6_mark = skb->mark,
                .flowi6_proto = iph->nexthdr,
        };
@@ -1001,7 +1011,6 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori
 
                rt->rt6i_gateway = ort->rt6i_gateway;
                rt->rt6i_flags = ort->rt6i_flags;
-               rt6_clean_expires(rt);
                rt->rt6i_metric = 0;
 
                memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
@@ -1026,14 +1035,16 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
 
        rt = (struct rt6_info *) dst;
 
-       if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
-               if (rt->rt6i_peer_genid != rt6_peer_genid()) {
-                       if (!rt6_has_peer(rt))
-                               rt6_bind_peer(rt, 0);
-                       rt->rt6i_peer_genid = rt6_peer_genid();
-               }
+       /* All IPV6 dsts are created with ->obsolete set to the value
+        * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+        * into this function always.
+        */
+       if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev)))
+               return NULL;
+
+       if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
                return dst;
-       }
+
        return NULL;
 }
 
@@ -1104,7 +1115,7 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
        fl6.flowi6_flags = 0;
        fl6.daddr = iph->daddr;
        fl6.saddr = iph->saddr;
-       fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
+       fl6.flowlabel = ip6_flowinfo(iph);
 
        dst = ip6_route_output(net, NULL, &fl6);
        if (!dst->error)
@@ -1132,7 +1143,7 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
        fl6.flowi6_flags = 0;
        fl6.daddr = iph->daddr;
        fl6.saddr = iph->saddr;
-       fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;
+       fl6.flowlabel = ip6_flowinfo(iph);
 
        dst = ip6_route_output(net, NULL, &fl6);
        if (!dst->error)
@@ -1192,7 +1203,6 @@ static struct dst_entry *icmp6_dst_gc_list;
 static DEFINE_SPINLOCK(icmp6_dst_lock);
 
 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
-                                 struct neighbour *neigh,
                                  struct flowi6 *fl6)
 {
        struct dst_entry *dst;
@@ -1210,25 +1220,13 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                goto out;
        }
 
-       if (neigh)
-               neigh_hold(neigh);
-       else {
-               neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
-               if (IS_ERR(neigh)) {
-                       in6_dev_put(idev);
-                       dst_free(&rt->dst);
-                       return ERR_CAST(neigh);
-               }
-       }
-
        rt->dst.flags |= DST_HOST;
        rt->dst.output  = ip6_output;
-       rt->n = neigh;
        atomic_set(&rt->dst.__refcnt, 1);
        rt->rt6i_dst.addr = fl6->daddr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev     = idev;
-       dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
+       dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
 
        spin_lock_bh(&icmp6_dst_lock);
        rt->dst.next = icmp6_dst_gc_list;
@@ -1312,12 +1310,6 @@ out:
        return entries > rt_max_size;
 }
 
-/* Clean host part of a prefix. Not necessary in radix tree,
-   but results in cleaner routing tables.
-
-   Remove it only when all the things will work!
- */
-
 int ip6_dst_hoplimit(struct dst_entry *dst)
 {
        int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
@@ -1392,8 +1384,6 @@ int ip6_route_add(struct fib6_config *cfg)
                goto out;
        }
 
-       rt->dst.obsolete = -1;
-
        if (cfg->fc_flags & RTF_EXPIRES)
                rt6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));
@@ -1505,7 +1495,7 @@ int ip6_route_add(struct fib6_config *cfg)
                                goto out;
                        if (dev) {
                                if (dev != grt->dst.dev) {
-                                       dst_release(&grt->dst);
+                                       ip6_rt_put(grt);
                                        goto out;
                                }
                        } else {
@@ -1516,7 +1506,7 @@ int ip6_route_add(struct fib6_config *cfg)
                        }
                        if (!(grt->rt6i_flags & RTF_GATEWAY))
                                err = 0;
-                       dst_release(&grt->dst);
+                       ip6_rt_put(grt);
 
                        if (err)
                                goto out;
@@ -1540,12 +1530,6 @@ int ip6_route_add(struct fib6_config *cfg)
        } else
                rt->rt6i_prefsrc.plen = 0;
 
-       if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
-               err = rt6_bind_neighbour(rt, dev);
-               if (err)
-                       goto out;
-       }
-
        rt->rt6i_flags = cfg->fc_flags;
 
 install_route:
@@ -1591,17 +1575,18 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
        struct fib6_table *table;
        struct net *net = dev_net(rt->dst.dev);
 
-       if (rt == net->ipv6.ip6_null_entry)
-               return -ENOENT;
+       if (rt == net->ipv6.ip6_null_entry) {
+               err = -ENOENT;
+               goto out;
+       }
 
        table = rt->rt6i_table;
        write_lock_bh(&table->tb6_lock);
-
        err = fib6_del(rt, info);
-       dst_release(&rt->dst);
-
        write_unlock_bh(&table->tb6_lock);
 
+out:
+       ip6_rt_put(rt);
        return err;
 }
 
@@ -1657,37 +1642,32 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
        struct net *net = dev_net(skb->dev);
        struct netevent_redirect netevent;
        struct rt6_info *rt, *nrt = NULL;
-       const struct in6_addr *target;
        struct ndisc_options ndopts;
-       const struct in6_addr *dest;
-       struct neighbour *old_neigh;
        struct inet6_dev *in6_dev;
        struct neighbour *neigh;
-       struct icmp6hdr *icmph;
+       struct rd_msg *msg;
        int optlen, on_link;
        u8 *lladdr;
 
        optlen = skb->tail - skb->transport_header;
-       optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);
+       optlen -= sizeof(*msg);
 
        if (optlen < 0) {
                net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
                return;
        }
 
-       icmph = icmp6_hdr(skb);
-       target = (const struct in6_addr *) (icmph + 1);
-       dest = target + 1;
+       msg = (struct rd_msg *)icmp6_hdr(skb);
 
-       if (ipv6_addr_is_multicast(dest)) {
+       if (ipv6_addr_is_multicast(&msg->dest)) {
                net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
                return;
        }
 
        on_link = 0;
-       if (ipv6_addr_equal(dest, target)) {
+       if (ipv6_addr_equal(&msg->dest, &msg->target)) {
                on_link = 1;
-       } else if (ipv6_addr_type(target) !=
+       } else if (ipv6_addr_type(&msg->target) !=
                   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
                net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
                return;
@@ -1704,7 +1684,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
         *      first-hop router for the specified ICMP Destination Address.
         */
 
-       if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) {
+       if (!ndisc_parse_options(msg->opt, optlen, &ndopts)) {
                net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
                return;
        }
@@ -1731,15 +1711,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
         */
        dst_confirm(&rt->dst);
 
-       neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
+       neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
        if (!neigh)
                return;
 
-       /* Duplicate redirect: silently ignore. */
-       old_neigh = rt->n;
-       if (neigh == old_neigh)
-               goto out;
-
        /*
         *      We have finally decided to accept it.
         */
@@ -1751,7 +1726,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
                                     NEIGH_UPDATE_F_ISROUTER))
                     );
 
-       nrt = ip6_rt_copy(rt, dest);
+       nrt = ip6_rt_copy(rt, &msg->dest);
        if (!nrt)
                goto out;
 
@@ -1760,16 +1735,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
                nrt->rt6i_flags &= ~RTF_GATEWAY;
 
        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
-       nrt->n = neigh_clone(neigh);
 
        if (ip6_ins_rt(nrt))
                goto out;
 
        netevent.old = &rt->dst;
-       netevent.old_neigh = old_neigh;
        netevent.new = &nrt->dst;
-       netevent.new_neigh = neigh;
-       netevent.daddr = dest;
+       netevent.daddr = &msg->dest;
+       netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
 
        if (rt->rt6i_flags & RTF_CACHE) {
@@ -1811,8 +1784,6 @@ static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
                if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
                    (RTF_DEFAULT | RTF_ADDRCONF))
                        rt6_set_from(rt, ort);
-               else
-                       rt6_clean_expires(rt);
                rt->rt6i_metric = 0;
 
 #ifdef CONFIG_IPV6_SUBTREES
@@ -1837,7 +1808,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
        if (!table)
                return NULL;
 
-       write_lock_bh(&table->tb6_lock);
+       read_lock_bh(&table->tb6_lock);
        fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
        if (!fn)
                goto out;
@@ -1853,7 +1824,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
                break;
        }
 out:
-       write_unlock_bh(&table->tb6_lock);
+       read_unlock_bh(&table->tb6_lock);
        return rt;
 }
 
@@ -1896,7 +1867,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
        if (!table)
                return NULL;
 
-       write_lock_bh(&table->tb6_lock);
+       read_lock_bh(&table->tb6_lock);
        for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
                if (dev == rt->dst.dev &&
                    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
@@ -1905,7 +1876,7 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev
        }
        if (rt)
                dst_hold(&rt->dst);
-       write_unlock_bh(&table->tb6_lock);
+       read_unlock_bh(&table->tb6_lock);
        return rt;
 }
 
@@ -1944,7 +1915,8 @@ void rt6_purge_dflt_routers(struct net *net)
 restart:
        read_lock_bh(&table->tb6_lock);
        for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
-               if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
+               if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+                   (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
                        dst_hold(&rt->dst);
                        read_unlock_bh(&table->tb6_lock);
                        ip6_del_rt(rt);
@@ -1984,7 +1956,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
        switch(cmd) {
        case SIOCADDRT:         /* Add a route */
        case SIOCDELRT:         /* Delete a route */
-               if (!capable(CAP_NET_ADMIN))
+               if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                err = copy_from_user(&rtmsg, arg,
                                     sizeof(struct in6_rtmsg));
@@ -2075,7 +2047,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
 {
        struct net *net = dev_net(idev->dev);
        struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
-       int err;
 
        if (!rt) {
                net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
@@ -2088,18 +2059,12 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
        rt->dst.input = ip6_input;
        rt->dst.output = ip6_output;
        rt->rt6i_idev = idev;
-       rt->dst.obsolete = -1;
 
        rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
        if (anycast)
                rt->rt6i_flags |= RTF_ANYCAST;
        else
                rt->rt6i_flags |= RTF_LOCAL;
-       err = rt6_bind_neighbour(rt, rt->dst.dev);
-       if (err) {
-               dst_free(&rt->dst);
-               return ERR_PTR(err);
-       }
 
        rt->rt6i_dst.addr = *addr;
        rt->rt6i_dst.plen = 128;
@@ -2247,6 +2212,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_IIF]               = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
+       [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2324,12 +2290,72 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
 
+       if (tb[RTA_MULTIPATH]) {
+               cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
+               cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+       }
+
        err = 0;
 errout:
        return err;
 }
 
-static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int ip6_route_multipath(struct fib6_config *cfg, int add)
+{
+       struct fib6_config r_cfg;
+       struct rtnexthop *rtnh;
+       int remaining;
+       int attrlen;
+       int err = 0, last_err = 0;
+
+beginning:
+       rtnh = (struct rtnexthop *)cfg->fc_mp;
+       remaining = cfg->fc_mp_len;
+
+       /* Walk the rtnexthop entries, one add/del operation per entry */
+       while (rtnh_ok(rtnh, remaining)) {
+               memcpy(&r_cfg, cfg, sizeof(*cfg));
+               if (rtnh->rtnh_ifindex)
+                       r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+               attrlen = rtnh_attrlen(rtnh);
+               if (attrlen > 0) {
+                       struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+                       nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+                       if (nla) {
+                               nla_memcpy(&r_cfg.fc_gateway, nla, 16);
+                               r_cfg.fc_flags |= RTF_GATEWAY;
+                       }
+               }
+               err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg);
+               if (err) {
+                       last_err = err;
+                       /* If we are trying to remove a route, do not stop the
+                        * loop when ip6_route_del() fails (e.g. the next hop is
+                        * already gone); we should try to remove all next hops.
+                        */
+                       if (add) {
+                               /* If an add fails, restart from the first entry
+                                * and delete the next hops already added.
+                                */
+                               add = 0;
+                               goto beginning;
+                       }
+               }
+               /* Because each route is added like a single route, we remove
+                * NLM_F_EXCL after the first nexthop (if there is a collision,
+                * we have already failed to add the first nexthop:
+                * fib6_add_rt2node() has rejected it).
+                */
+               cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
+               rtnh = rtnh_next(rtnh, &remaining);
+       }
+
+       return last_err;
+}
+
+static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh)
 {
        struct fib6_config cfg;
        int err;
@@ -2338,10 +2364,13 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
        if (err < 0)
                return err;
 
-       return ip6_route_del(&cfg);
+       if (cfg.fc_mp)
+               return ip6_route_multipath(&cfg, 0);
+       else
+               return ip6_route_del(&cfg);
 }
 
-static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh)
 {
        struct fib6_config cfg;
        int err;
@@ -2350,7 +2379,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *a
        if (err < 0)
                return err;
 
-       return ip6_route_add(&cfg);
+       if (cfg.fc_mp)
+               return ip6_route_multipath(&cfg, 1);
+       else
+               return ip6_route_add(&cfg);
 }
 
 static inline size_t rt6_nlmsg_size(void)
@@ -2378,7 +2410,6 @@ static int rt6_fill_node(struct net *net,
        struct nlmsghdr *nlh;
        long expires;
        u32 table;
-       struct neighbour *n;
 
        if (prefix) {   /* user wants prefix routes only */
                if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
@@ -2491,9 +2522,8 @@ static int rt6_fill_node(struct net *net,
        if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
                goto nla_put_failure;
 
-       n = rt->n;
-       if (n) {
-               if (nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0)
+       if (rt->rt6i_flags & RTF_GATEWAY) {
+               if (nla_put(skb, RTA_GATEWAY, 16, &rt->rt6i_gateway) < 0)
                        goto nla_put_failure;
        }
 
@@ -2532,7 +2562,7 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg)
                     prefix, 0, NLM_F_MULTI);
 }
 
-static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh)
 {
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
@@ -2594,7 +2624,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 
        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
-               dst_release(&rt->dst);
+               ip6_rt_put(rt);
                err = -ENOBUFS;
                goto errout;
        }
@@ -2688,7 +2718,6 @@ struct rt6_proc_arg
 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
 {
        struct seq_file *m = p_arg;
-       struct neighbour *n;
 
        seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
 
@@ -2697,9 +2726,8 @@ static int rt6_info_route(struct rt6_info *rt, void *p_arg)
 #else
        seq_puts(m, "00000000000000000000000000000000 00 ");
 #endif
-       n = rt->n;
-       if (n) {
-               seq_printf(m, "%pi6", n->primary_key);
+       if (rt->rt6i_flags & RTF_GATEWAY) {
+               seq_printf(m, "%pi6", &rt->rt6i_gateway);
        } else {
                seq_puts(m, "00000000000000000000000000000000");
        }
@@ -2871,6 +2899,10 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
                table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
                table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
                table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
+
+               /* Don't export sysctls to unprivileged users */
+               if (net->user_ns != &init_user_ns)
+                       table[0].procname = NULL;
        }
 
        return table;
@@ -2962,8 +2994,8 @@ static void __net_exit ip6_route_net_exit(struct net *net)
 static int __net_init ip6_route_net_init_late(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
-       proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
-       proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
+       proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
+       proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
 #endif
        return 0;
 }
@@ -2971,8 +3003,8 @@ static int __net_init ip6_route_net_init_late(struct net *net)
 static void __net_exit ip6_route_net_exit_late(struct net *net)
 {
 #ifdef CONFIG_PROC_FS
-       proc_net_remove(net, "ipv6_route");
-       proc_net_remove(net, "rt6_stats");
+       remove_proc_entry("ipv6_route", net->proc_net);
+       remove_proc_entry("rt6_stats", net->proc_net);
 #endif
 }