Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[linux-3.10.git] / net / ipv4 / route.c
index 55eb463..baa9b28 100644 (file)
@@ -70,7 +70,6 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
-#include <linux/bootmem.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -80,7 +79,6 @@
 #include <linux/netdevice.h>
 #include <linux/proc_fs.h>
 #include <linux/init.h>
-#include <linux/workqueue.h>
 #include <linux/skbuff.h>
 #include <linux/inetdevice.h>
 #include <linux/igmp.h>
 #include <linux/mroute.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/random.h>
-#include <linux/jhash.h>
 #include <linux/rcupdate.h>
 #include <linux/times.h>
 #include <linux/slab.h>
-#include <linux/prefetch.h>
 #include <net/dst.h>
 #include <net/net_namespace.h>
 #include <net/protocol.h>
@@ -141,13 +137,13 @@ static int ip_rt_min_advmss __read_mostly = 256;
 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 static unsigned int     ipv4_default_advmss(const struct dst_entry *dst);
 static unsigned int     ipv4_mtu(const struct dst_entry *dst);
-static void             ipv4_dst_destroy(struct dst_entry *dst);
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void             ipv4_link_failure(struct sk_buff *skb);
 static void             ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
 static void             ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
+static void            ipv4_dst_destroy(struct dst_entry *dst);
 
 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
@@ -206,11 +202,6 @@ EXPORT_SYMBOL(ip_tos2prio);
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline int rt_genid(struct net *net)
-{
-       return atomic_read(&net->ipv4.rt_genid);
-}
-
 #ifdef CONFIG_PROC_FS
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
@@ -446,32 +437,14 @@ static inline int ip_rt_proc_init(void)
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline int rt_is_expired(struct rtable *rth)
+static inline bool rt_is_expired(const struct rtable *rth)
 {
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
 }
 
-/*
- * Perturbation of rt_genid by a small quantity [1..256]
- * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
- * many times (2^24) without giving recent rt_genid.
- * Jenkins hash is strong enough that litle changes of rt_genid are OK.
- */
-static void rt_cache_invalidate(struct net *net)
-{
-       unsigned char shuffle;
-
-       get_random_bytes(&shuffle, sizeof(shuffle));
-       atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
-}
-
-/*
- * delay < 0  : invalidate cache (fast : entries will be deleted later)
- * delay >= 0 : invalidate & flush cache (can be long)
- */
-void rt_cache_flush(struct net *net, int delay)
+void rt_cache_flush(struct net *net)
 {
-       rt_cache_invalidate(net);
+       rt_genid_bump(net);
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -589,11 +562,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                build_sk_flow_key(fl4, sk);
 }
 
-static DEFINE_SEQLOCK(fnhe_seqlock);
+static inline void rt_free(struct rtable *rt)
+{
+       call_rcu(&rt->dst.rcu_head, dst_rcu_free);
+}
+
+static DEFINE_SPINLOCK(fnhe_lock);
 
 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 {
        struct fib_nh_exception *fnhe, *oldest;
+       struct rtable *orig;
 
        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
@@ -601,6 +580,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
+       orig = rcu_dereference(oldest->fnhe_rth);
+       if (orig) {
+               RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
+               rt_free(orig);
+       }
        return oldest;
 }
 
@@ -622,7 +606,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
        int depth;
        u32 hval = fnhe_hashfun(daddr);
 
-       write_seqlock_bh(&fnhe_seqlock);
+       spin_lock_bh(&fnhe_lock);
 
        hash = nh->nh_exceptions;
        if (!hash) {
@@ -669,11 +653,12 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
        fnhe->fnhe_stamp = jiffies;
 
 out_unlock:
-       write_sequnlock_bh(&fnhe_seqlock);
+       spin_unlock_bh(&fnhe_lock);
        return;
 }
 
-static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
+static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
+                            bool kill_route)
 {
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
@@ -728,8 +713,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
-                       rt->rt_gateway = new_gw;
-                       rt->rt_flags |= RTCF_REDIRECTED;
+                       if (kill_route)
+                               rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
@@ -760,7 +745,7 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
        rt = (struct rtable *) dst;
 
        ip_rt_build_flow_key(&fl4, sk, skb);
-       __ip_do_redirect(rt, skb, &fl4);
+       __ip_do_redirect(rt, skb, &fl4, true);
 }
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
@@ -817,7 +802,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
-               icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+               icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
+                         rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }
 
@@ -842,15 +828,17 @@ void ip_rt_send_redirect(struct sk_buff *skb)
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
-               icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+               __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
+
+               icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
-                                            &ip_hdr(skb)->saddr, rt->rt_iif,
-                                            &rt->rt_dst, &rt->rt_gateway);
+                                            &ip_hdr(skb)->saddr, inet_iif(skb),
+                                            &ip_hdr(skb)->daddr, &gw);
 #endif
        }
 out_put_peer:
@@ -921,19 +909,30 @@ out:      kfree_skb(skb);
 
 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
+       struct dst_entry *dst = &rt->dst;
        struct fib_result res;
 
+       if (dst->dev->mtu < mtu)
+               return;
+
        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;
 
-       if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
+       if (!rt->rt_pmtu) {
+               dst->obsolete = DST_OBSOLETE_KILL;
+       } else {
+               rt->rt_pmtu = mtu;
+               dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
+       }
+
+       rcu_read_lock();
+       if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);
 
                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
-       rt->rt_pmtu = mtu;
-       dst_set_expires(&rt->dst, ip_rt_mtu_expires);
+       rcu_read_unlock();
 }
 
 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -989,7 +988,7 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
-               __ip_do_redirect(rt, skb, &fl4);
+               __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
 }
@@ -1004,7 +1003,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
-               __ip_do_redirect(rt, skb, &fl4);
+               __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
 }
@@ -1014,22 +1013,19 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
        struct rtable *rt = (struct rtable *) dst;
 
-       if (rt_is_expired(rt))
+       /* All IPV4 dsts are created with ->obsolete set to the value
+        * DST_OBSOLETE_FORCE_CHK which forces validation calls down
+        * into this function always.
+        *
+        * When a PMTU/redirect information update invalidates a
+        * route, this is indicated by setting obsolete to
+        * DST_OBSOLETE_KILL.
+        */
+       if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
                return NULL;
        return dst;
 }
 
-static void ipv4_dst_destroy(struct dst_entry *dst)
-{
-       struct rtable *rt = (struct rtable *) dst;
-
-       if (rt->fi) {
-               fib_info_put(rt->fi);
-               rt->fi = NULL;
-       }
-}
-
-
 static void ipv4_link_failure(struct sk_buff *skb)
 {
        struct rtable *rt;
@@ -1085,8 +1081,9 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
-                       src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
-                                       RT_SCOPE_UNIVERSE);
+                       src = inet_select_addr(rt->dst.dev,
+                                              rt_nexthop(rt, iph->daddr),
+                                              RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
@@ -1120,10 +1117,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;
 
-       if (mtu && time_after_eq(jiffies, rt->dst.expires))
-               mtu = 0;
-
-       if (!mtu)
+       if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);
 
        if (mtu && rt_is_output_route(rt))
@@ -1132,8 +1126,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
        mtu = dst->dev->mtu;
 
        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
-
-               if (rt->rt_gateway != rt->rt_dst && mtu > 576)
+               if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }
 
@@ -1143,72 +1136,170 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
        return mtu;
 }
 
-static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
-                           struct fib_info *fi)
-{
-       if (fi->fib_metrics != (u32 *) dst_default_metrics) {
-               rt->fi = fi;
-               atomic_inc(&fi->fib_clntref);
-       }
-       dst_init_metrics(&rt->dst, fi->fib_metrics, true);
-}
-
-static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
+static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
 {
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;
 
+       if (!hash)
+               return NULL;
+
        hval = fnhe_hashfun(daddr);
 
-restart:
        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
-               __be32 fnhe_daddr, gw;
-               unsigned long expires;
-               unsigned int seq;
-               u32 pmtu;
-
-               seq = read_seqbegin(&fnhe_seqlock);
-               fnhe_daddr = fnhe->fnhe_daddr;
-               gw = fnhe->fnhe_gw;
-               pmtu = fnhe->fnhe_pmtu;
-               expires = fnhe->fnhe_expires;
-               if (read_seqretry(&fnhe_seqlock, seq))
-                       goto restart;
-               if (daddr != fnhe_daddr)
-                       continue;
-               if (pmtu) {
+               if (fnhe->fnhe_daddr == daddr)
+                       return fnhe;
+       }
+       return NULL;
+}
+
+static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
+                             __be32 daddr)
+{
+       bool ret = false;
+
+       spin_lock_bh(&fnhe_lock);
+
+       if (daddr == fnhe->fnhe_daddr) {
+               struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
+               if (orig && rt_is_expired(orig)) {
+                       fnhe->fnhe_gw = 0;
+                       fnhe->fnhe_pmtu = 0;
+                       fnhe->fnhe_expires = 0;
+               }
+               if (fnhe->fnhe_pmtu) {
+                       unsigned long expires = fnhe->fnhe_expires;
                        unsigned long diff = expires - jiffies;
 
                        if (time_before(jiffies, expires)) {
-                               rt->rt_pmtu = pmtu;
+                               rt->rt_pmtu = fnhe->fnhe_pmtu;
                                dst_set_expires(&rt->dst, diff);
                        }
                }
-               if (gw)
-                       rt->rt_gateway = gw;
+               if (fnhe->fnhe_gw) {
+                       rt->rt_flags |= RTCF_REDIRECTED;
+                       rt->rt_gateway = fnhe->fnhe_gw;
+                       rt->rt_uses_gateway = 1;
+               } else if (!rt->rt_gateway)
+                       rt->rt_gateway = daddr;
+
+               rcu_assign_pointer(fnhe->fnhe_rth, rt);
+               if (orig)
+                       rt_free(orig);
+
                fnhe->fnhe_stamp = jiffies;
-               break;
+               ret = true;
+       }
+       spin_unlock_bh(&fnhe_lock);
+
+       return ret;
+}
+
+static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+{
+       struct rtable *orig, *prev, **p;
+       bool ret = true;
+
+       if (rt_is_input_route(rt)) {
+               p = (struct rtable **)&nh->nh_rth_input;
+       } else {
+               p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
+       }
+       orig = *p;
+
+       prev = cmpxchg(p, orig, rt);
+       if (prev == orig) {
+               if (orig)
+                       rt_free(orig);
+       } else
+               ret = false;
+
+       return ret;
+}
+
+static DEFINE_SPINLOCK(rt_uncached_lock);
+static LIST_HEAD(rt_uncached_list);
+
+static void rt_add_uncached_list(struct rtable *rt)
+{
+       spin_lock_bh(&rt_uncached_lock);
+       list_add_tail(&rt->rt_uncached, &rt_uncached_list);
+       spin_unlock_bh(&rt_uncached_lock);
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+       struct rtable *rt = (struct rtable *) dst;
+
+       if (!list_empty(&rt->rt_uncached)) {
+               spin_lock_bh(&rt_uncached_lock);
+               list_del(&rt->rt_uncached);
+               spin_unlock_bh(&rt_uncached_lock);
+       }
+}
+
+void rt_flush_dev(struct net_device *dev)
+{
+       if (!list_empty(&rt_uncached_list)) {
+               struct net *net = dev_net(dev);
+               struct rtable *rt;
+
+               spin_lock_bh(&rt_uncached_lock);
+               list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
+                       if (rt->dst.dev != dev)
+                               continue;
+                       rt->dst.dev = net->loopback_dev;
+                       dev_hold(rt->dst.dev);
+                       dev_put(dev);
+               }
+               spin_unlock_bh(&rt_uncached_lock);
        }
 }
 
-static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
+static bool rt_cache_valid(const struct rtable *rt)
+{
+       return  rt &&
+               rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+               !rt_is_expired(rt);
+}
+
+static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
+                          struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
 {
+       bool cached = false;
+
        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);
 
-               if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+               if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
-               if (unlikely(nh->nh_exceptions))
-                       rt_bind_exception(rt, nh, fl4->daddr);
-               rt_init_metrics(rt, fl4, fi);
+                       rt->rt_uses_gateway = 1;
+               }
+               dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 #ifdef CONFIG_IP_ROUTE_CLASSID
-               rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+               rt->dst.tclassid = nh->nh_tclassid;
 #endif
-       }
+               if (unlikely(fnhe))
+                       cached = rt_bind_exception(rt, fnhe, daddr);
+               else if (!(rt->dst.flags & DST_NOCACHE))
+                       cached = rt_cache_route(nh, rt);
+               if (unlikely(!cached)) {
+                       /* Routes we intend to cache in nexthop exception or
+                        * FIB nexthop have the DST_NOCACHE bit clear.
+                        * However, if we are unsuccessful at storing this
+                        * route into the cache we really need to set it.
+                        */
+                       rt->dst.flags |= DST_NOCACHE;
+                       if (!rt->rt_gateway)
+                               rt->rt_gateway = daddr;
+                       rt_add_uncached_list(rt);
+               }
+       } else
+               rt_add_uncached_list(rt);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
 #ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1219,10 +1310,10 @@ static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
 }
 
 static struct rtable *rt_dst_alloc(struct net_device *dev,
-                                  bool nopolicy, bool noxfrm)
+                                  bool nopolicy, bool noxfrm, bool will_cache)
 {
-       return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
-                        DST_HOST | DST_NOCACHE |
+       return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
+                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
 }
@@ -1259,7 +1350,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
-                          IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+                          IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;
 
@@ -1268,21 +1359,15 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 #endif
        rth->dst.output = ip_rt_bug;
 
-       rth->rt_key_dst = daddr;
-       rth->rt_key_src = saddr;
        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
-       rth->rt_key_tos = tos;
-       rth->rt_dst     = daddr;
-       rth->rt_src     = saddr;
-       rth->rt_route_iif = dev->ifindex;
-       rth->rt_iif     = dev->ifindex;
-       rth->rt_oif     = 0;
-       rth->rt_mark    = skb->mark;
+       rth->rt_is_input= 1;
+       rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
-       rth->rt_gateway = daddr;
-       rth->fi = NULL;
+       rth->rt_gateway = 0;
+       rth->rt_uses_gateway = 0;
+       INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input= ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
@@ -1335,13 +1420,13 @@ static void ip_handle_martian_source(struct net_device *dev,
 static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
-                          __be32 daddr, __be32 saddr, u32 tos,
-                          struct rtable **result)
+                          __be32 daddr, __be32 saddr, u32 tos)
 {
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
+       bool do_cache;
        u32 itag;
 
        /* get a working reference to the output device */
@@ -1351,7 +1436,6 @@ static int __mkroute_input(struct sk_buff *skb,
                return -EINVAL;
        }
 
-
        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
@@ -1361,13 +1445,13 @@ static int __mkroute_input(struct sk_buff *skb,
                goto cleanup;
        }
 
-       if (err)
-               flags |= RTCF_DIRECTSRC;
-
-       if (out_dev == in_dev && err &&
+       do_cache = res->fi && !itag;
+       if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
-            inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+            inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
+               do_cache = false;
+       }
 
        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create route, if it is
@@ -1384,36 +1468,38 @@ static int __mkroute_input(struct sk_buff *skb,
                }
        }
 
+       if (do_cache) {
+               rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+               if (rt_cache_valid(rth)) {
+                       skb_dst_set_noref(skb, &rth->dst);
+                       goto out;
+               }
+       }
+
        rth = rt_dst_alloc(out_dev->dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
-                          IN_DEV_CONF_GET(out_dev, NOXFRM));
+                          IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }
 
-       rth->rt_key_dst = daddr;
-       rth->rt_key_src = saddr;
        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
        rth->rt_flags = flags;
        rth->rt_type = res->type;
-       rth->rt_key_tos = tos;
-       rth->rt_dst     = daddr;
-       rth->rt_src     = saddr;
-       rth->rt_route_iif = in_dev->dev->ifindex;
-       rth->rt_iif     = in_dev->dev->ifindex;
-       rth->rt_oif     = 0;
-       rth->rt_mark    = skb->mark;
+       rth->rt_is_input = 1;
+       rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
-       rth->rt_gateway = daddr;
-       rth->fi = NULL;
+       rth->rt_gateway = 0;
+       rth->rt_uses_gateway = 0;
+       INIT_LIST_HEAD(&rth->rt_uncached);
 
        rth->dst.input = ip_forward;
        rth->dst.output = ip_output;
 
-       rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);
-
-       *result = rth;
+       rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
+       skb_dst_set(skb, &rth->dst);
+out:
        err = 0;
  cleanup:
        return err;
@@ -1425,21 +1511,13 @@ static int ip_mkroute_input(struct sk_buff *skb,
                            struct in_device *in_dev,
                            __be32 daddr, __be32 saddr, u32 tos)
 {
-       struct rtable *rth = NULL;
-       int err;
-
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && res->fi->fib_nhs > 1)
                fib_select_multipath(res);
 #endif
 
        /* create a routing cache entry */
-       err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
-       if (err)
-               return err;
-
-       skb_dst_set(skb, &rth->dst);
-       return 0;
+       return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
 }
 
 /*
@@ -1464,6 +1542,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        struct rtable   *rth;
        int             err = -EINVAL;
        struct net    *net = dev_net(dev);
+       bool do_cache;
 
        /* IP on this device is disabled. */
 
@@ -1477,6 +1556,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                goto martian_source;
 
+       res.fi = NULL;
        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;
 
@@ -1489,11 +1569,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        if (ipv4_is_zeronet(daddr))
                goto martian_destination;
 
-       if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
-               if (ipv4_is_loopback(daddr))
+       /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
+        * and calls it only once if daddr and/or saddr are loopback addresses.
+        */
+       if (ipv4_is_loopback(daddr)) {
+               if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_destination;
-
-               if (ipv4_is_loopback(saddr))
+       } else if (ipv4_is_loopback(saddr)) {
+               if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_source;
        }
 
@@ -1518,12 +1601,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
        if (res.type == RTN_LOCAL) {
                err = fib_validate_source(skb, saddr, daddr, tos,
-                                         net->loopback_dev->ifindex,
+                                         LOOPBACK_IFINDEX,
                                          dev, in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
-               if (err)
-                       flags |= RTCF_DIRECTSRC;
                goto local_input;
        }
 
@@ -1544,16 +1625,27 @@ brd_input:
                                          in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
-               if (err)
-                       flags |= RTCF_DIRECTSRC;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);
 
 local_input:
+       do_cache = false;
+       if (res.fi) {
+               if (!itag) {
+                       rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
+                       if (rt_cache_valid(rth)) {
+                               skb_dst_set_noref(skb, &rth->dst);
+                               err = 0;
+                               goto out;
+                       }
+                       do_cache = true;
+               }
+       }
+
        rth = rt_dst_alloc(net->loopback_dev,
-                          IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
+                          IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
        if (!rth)
                goto e_nobufs;
 
@@ -1563,26 +1655,22 @@ local_input:
        rth->dst.tclassid = itag;
 #endif
 
-       rth->rt_key_dst = daddr;
-       rth->rt_key_src = saddr;
        rth->rt_genid = rt_genid(net);
        rth->rt_flags   = flags|RTCF_LOCAL;
        rth->rt_type    = res.type;
-       rth->rt_key_tos = tos;
-       rth->rt_dst     = daddr;
-       rth->rt_src     = saddr;
-       rth->rt_route_iif = dev->ifindex;
-       rth->rt_iif     = dev->ifindex;
-       rth->rt_oif     = 0;
-       rth->rt_mark    = skb->mark;
+       rth->rt_is_input = 1;
+       rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
-       rth->rt_gateway = daddr;
-       rth->fi = NULL;
+       rth->rt_gateway = 0;
+       rth->rt_uses_gateway = 0;
+       INIT_LIST_HEAD(&rth->rt_uncached);
        if (res.type == RTN_UNREACHABLE) {
                rth->dst.input= ip_error;
                rth->dst.error= -err;
                rth->rt_flags   &= ~RTCF_LOCAL;
        }
+       if (do_cache)
+               rt_cache_route(&FIB_RES_NH(res), rth);
        skb_dst_set(skb, &rth->dst);
        err = 0;
        goto out;
@@ -1620,8 +1708,8 @@ martian_source_keep_err:
        goto out;
 }
 
-int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
-                  u8 tos, struct net_device *dev)
+int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+                        u8 tos, struct net_device *dev)
 {
        int res;
 
@@ -1664,20 +1752,20 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        rcu_read_unlock();
        return res;
 }
-EXPORT_SYMBOL(ip_route_input);
+EXPORT_SYMBOL(ip_route_input_noref);
 
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
-                                      const struct flowi4 *fl4,
-                                      __be32 orig_daddr, __be32 orig_saddr,
-                                      int orig_oif, __u8 orig_rtos,
+                                      const struct flowi4 *fl4, int orig_oif,
                                       struct net_device *dev_out,
                                       unsigned int flags)
 {
        struct fib_info *fi = res->fi;
+       struct fib_nh_exception *fnhe;
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
+       bool do_cache;
 
        in_dev = __in_dev_get_rcu(dev_out);
        if (!in_dev)
@@ -1697,6 +1785,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;
 
+       do_cache = true;
        if (type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                fi = NULL;
@@ -1705,6 +1794,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
                                     fl4->flowi4_proto))
                        flags &= ~RTCF_LOCAL;
+               else
+                       do_cache = false;
                /* If multicast route do not exist use
                 * default one, but do not gateway in this case.
                 * Yes, it is hack.
@@ -1713,29 +1804,51 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
                        fi = NULL;
        }
 
+       fnhe = NULL;
+       do_cache &= fi != NULL;
+       if (do_cache) {
+               struct rtable __rcu **prth;
+               struct fib_nh *nh = &FIB_RES_NH(*res);
+
+               fnhe = find_exception(nh, fl4->daddr);
+               if (fnhe)
+                       prth = &fnhe->fnhe_rth;
+               else {
+                       if (unlikely(fl4->flowi4_flags &
+                                    FLOWI_FLAG_KNOWN_NH &&
+                                    !(nh->nh_gw &&
+                                      nh->nh_scope == RT_SCOPE_LINK))) {
+                               do_cache = false;
+                               goto add;
+                       }
+                       prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
+               }
+               rth = rcu_dereference(*prth);
+               if (rt_cache_valid(rth)) {
+                       dst_hold(&rth->dst);
+                       return rth;
+               }
+       }
+
+add:
        rth = rt_dst_alloc(dev_out,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
-                          IN_DEV_CONF_GET(in_dev, NOXFRM));
+                          IN_DEV_CONF_GET(in_dev, NOXFRM),
+                          do_cache);
        if (!rth)
                return ERR_PTR(-ENOBUFS);
 
        rth->dst.output = ip_output;
 
-       rth->rt_key_dst = orig_daddr;
-       rth->rt_key_src = orig_saddr;
        rth->rt_genid = rt_genid(dev_net(dev_out));
        rth->rt_flags   = flags;
        rth->rt_type    = type;
-       rth->rt_key_tos = orig_rtos;
-       rth->rt_dst     = fl4->daddr;
-       rth->rt_src     = fl4->saddr;
-       rth->rt_route_iif = 0;
-       rth->rt_iif     = orig_oif ? : dev_out->ifindex;
-       rth->rt_oif     = orig_oif;
-       rth->rt_mark    = fl4->flowi4_mark;
+       rth->rt_is_input = 0;
+       rth->rt_iif     = orig_oif ? : 0;
        rth->rt_pmtu    = 0;
-       rth->rt_gateway = fl4->daddr;
-       rth->fi = NULL;
+       rth->rt_gateway = 0;
+       rth->rt_uses_gateway = 0;
+       INIT_LIST_HEAD(&rth->rt_uncached);
 
        RT_CACHE_STAT_INC(out_slow_tot);
 
@@ -1758,10 +1871,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
 #endif
        }
 
-       rt_set_nexthop(rth, fl4, res, fi, type, 0);
-
-       if (fl4->flowi4_flags & FLOWI_FLAG_RT_NOCACHE)
-               rth->dst.flags |= DST_NOCACHE;
+       rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
 
        return rth;
 }
@@ -1777,19 +1887,15 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
        unsigned int flags = 0;
        struct fib_result res;
        struct rtable *rth;
-       __be32 orig_daddr;
-       __be32 orig_saddr;
        int orig_oif;
 
        res.tclassid    = 0;
        res.fi          = NULL;
        res.table       = NULL;
 
-       orig_daddr = fl4->daddr;
-       orig_saddr = fl4->saddr;
        orig_oif = fl4->flowi4_oif;
 
-       fl4->flowi4_iif = net->loopback_dev->ifindex;
+       fl4->flowi4_iif = LOOPBACK_IFINDEX;
        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
@@ -1878,7 +1984,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
                if (!fl4->daddr)
                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
                dev_out = net->loopback_dev;
-               fl4->flowi4_oif = net->loopback_dev->ifindex;
+               fl4->flowi4_oif = LOOPBACK_IFINDEX;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
@@ -1925,7 +2031,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
                }
                dev_out = net->loopback_dev;
                fl4->flowi4_oif = dev_out->ifindex;
-               res.fi = NULL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }
@@ -1948,8 +2053,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
 
 make_route:
-       rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
-                              tos, dev_out, flags);
+       rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
 
 out:
        rcu_read_unlock();
@@ -1988,7 +2092,6 @@ static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
 static struct dst_ops ipv4_dst_blackhole_ops = {
        .family                 =       AF_INET,
        .protocol               =       cpu_to_be16(ETH_P_IP),
-       .destroy                =       ipv4_dst_destroy,
        .check                  =       ipv4_blackhole_dst_check,
        .mtu                    =       ipv4_blackhole_mtu,
        .default_advmss         =       ipv4_default_advmss,
@@ -2000,9 +2103,10 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 
 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
 {
-       struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
        struct rtable *ort = (struct rtable *) dst_orig;
+       struct rtable *rt;
 
+       rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
        if (rt) {
                struct dst_entry *new = &rt->dst;
 
@@ -2014,24 +2118,17 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
                if (new->dev)
                        dev_hold(new->dev);
 
-               rt->rt_key_dst = ort->rt_key_dst;
-               rt->rt_key_src = ort->rt_key_src;
-               rt->rt_key_tos = ort->rt_key_tos;
-               rt->rt_route_iif = ort->rt_route_iif;
+               rt->rt_is_input = ort->rt_is_input;
                rt->rt_iif = ort->rt_iif;
-               rt->rt_oif = ort->rt_oif;
-               rt->rt_mark = ort->rt_mark;
                rt->rt_pmtu = ort->rt_pmtu;
 
                rt->rt_genid = rt_genid(net);
                rt->rt_flags = ort->rt_flags;
                rt->rt_type = ort->rt_type;
-               rt->rt_dst = ort->rt_dst;
-               rt->rt_src = ort->rt_src;
                rt->rt_gateway = ort->rt_gateway;
-               rt->fi = ort->fi;
-               if (rt->fi)
-                       atomic_inc(&rt->fi->fib_clntref);
+               rt->rt_uses_gateway = ort->rt_uses_gateway;
+
+               INIT_LIST_HEAD(&rt->rt_uncached);
 
                dst_free(new);
        }
@@ -2058,9 +2155,9 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 }
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-static int rt_fill_info(struct net *net,
-                       struct sk_buff *skb, u32 pid, u32 seq, int event,
-                       int nowait, unsigned int flags)
+static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
+                       struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
+                       u32 seq, int event, int nowait, unsigned int flags)
 {
        struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
@@ -2069,7 +2166,7 @@ static int rt_fill_info(struct net *net,
        u32 error;
        u32 metrics[RTAX_MAX];
 
-       nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
+       nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
        if (nlh == NULL)
                return -EMSGSIZE;
 
@@ -2077,7 +2174,7 @@ static int rt_fill_info(struct net *net,
        r->rtm_family    = AF_INET;
        r->rtm_dst_len  = 32;
        r->rtm_src_len  = 0;
-       r->rtm_tos      = rt->rt_key_tos;
+       r->rtm_tos      = fl4->flowi4_tos;
        r->rtm_table    = RT_TABLE_MAIN;
        if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
                goto nla_put_failure;
@@ -2088,11 +2185,11 @@ static int rt_fill_info(struct net *net,
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
 
-       if (nla_put_be32(skb, RTA_DST, rt->rt_dst))
+       if (nla_put_be32(skb, RTA_DST, dst))
                goto nla_put_failure;
-       if (rt->rt_key_src) {
+       if (src) {
                r->rtm_src_len = 32;
-               if (nla_put_be32(skb, RTA_SRC, rt->rt_key_src))
+               if (nla_put_be32(skb, RTA_SRC, src))
                        goto nla_put_failure;
        }
        if (rt->dst.dev &&
@@ -2104,57 +2201,39 @@ static int rt_fill_info(struct net *net,
                goto nla_put_failure;
 #endif
        if (!rt_is_input_route(rt) &&
-           rt->rt_src != rt->rt_key_src) {
-               if (nla_put_be32(skb, RTA_PREFSRC, rt->rt_src))
+           fl4->saddr != src) {
+               if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
-       if (rt->rt_dst != rt->rt_gateway &&
+       if (rt->rt_uses_gateway &&
            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
                goto nla_put_failure;
 
+       expires = rt->dst.expires;
+       if (expires) {
+               unsigned long now = jiffies;
+
+               if (time_before(now, expires))
+                       expires -= now;
+               else
+                       expires = 0;
+       }
+
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
-       if (rt->rt_pmtu)
+       if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;
 
-       if (rt->rt_mark &&
-           nla_put_be32(skb, RTA_MARK, rt->rt_mark))
+       if (fl4->flowi4_mark &&
+           nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                goto nla_put_failure;
 
        error = rt->dst.error;
-       expires = rt->dst.expires;
-       if (expires) {
-               if (time_before(jiffies, expires))
-                       expires -= jiffies;
-               else
-                       expires = 0;
-       }
 
        if (rt_is_input_route(rt)) {
-#ifdef CONFIG_IP_MROUTE
-               __be32 dst = rt->rt_dst;
-
-               if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
-                   IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
-                       int err = ipmr_get_route(net, skb,
-                                                rt->rt_src, rt->rt_dst,
-                                                r, nowait);
-                       if (err <= 0) {
-                               if (!nowait) {
-                                       if (err == 0)
-                                               return 0;
-                                       goto nla_put_failure;
-                               } else {
-                                       if (err == -EMSGSIZE)
-                                               goto nla_put_failure;
-                                       error = err;
-                               }
-                       }
-               } else
-#endif
-                       if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
-                               goto nla_put_failure;
+               if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
+                       goto nla_put_failure;
        }
 
        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
@@ -2173,6 +2252,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        struct rtable *rt = NULL;
+       struct flowi4 fl4;
        __be32 dst = 0;
        __be32 src = 0;
        u32 iif;
@@ -2207,6 +2287,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
 
+       memset(&fl4, 0, sizeof(fl4));
+       fl4.daddr = dst;
+       fl4.saddr = src;
+       fl4.flowi4_tos = rtm->rtm_tos;
+       fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+       fl4.flowi4_mark = mark;
+
        if (iif) {
                struct net_device *dev;
 
@@ -2227,13 +2314,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
-               struct flowi4 fl4 = {
-                       .daddr = dst,
-                       .saddr = src,
-                       .flowi4_tos = rtm->rtm_tos,
-                       .flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
-                       .flowi4_mark = mark,
-               };
                rt = ip_route_output_key(net, &fl4);
 
                err = 0;
@@ -2248,12 +2328,13 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;
 
-       err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+       err = rt_fill_info(net, dst, src, &fl4, skb,
+                          NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err <= 0)
                goto errout_free;
 
-       err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
+       err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
 errout:
        return err;
 
@@ -2269,7 +2350,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
 
 void ip_rt_multicast_event(struct in_device *in_dev)
 {
-       rt_cache_flush(dev_net(in_dev->dev), 0);
+       rt_cache_flush(dev_net(in_dev->dev));
 }
 
 #ifdef CONFIG_SYSCTL
@@ -2278,16 +2359,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
                                        size_t *lenp, loff_t *ppos)
 {
        if (write) {
-               int flush_delay;
-               ctl_table ctl;
-               struct net *net;
-
-               memcpy(&ctl, __ctl, sizeof(ctl));
-               ctl.data = &flush_delay;
-               proc_dointvec(&ctl, write, buffer, lenp, ppos);
-
-               net = (struct net *)__ctl->extra1;
-               rt_cache_flush(net, flush_delay);
+               rt_cache_flush((struct net *)__ctl->extra1);
                return 0;
        }
 
@@ -2424,6 +2496,10 @@ static __net_init int sysctl_route_net_init(struct net *net)
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (tbl == NULL)
                        goto err_dup;
+
+               /* Don't export sysctls to unprivileged users */
+               if (net->user_ns != &init_user_ns)
+                       tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;
 
@@ -2457,8 +2533,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
 
 static __net_init int rt_genid_init(struct net *net)
 {
-       get_random_bytes(&net->ipv4.rt_genid,
-                        sizeof(net->ipv4.rt_genid));
+       atomic_set(&net->rt_genid, 0);
        get_random_bytes(&net->ipv4.dev_addr_genid,
                         sizeof(net->ipv4.dev_addr_genid));
        return 0;
@@ -2529,7 +2604,7 @@ int __init ip_rt_init(void)
                pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
        xfrm_init();
-       xfrm4_init(ip_rt_max_size);
+       xfrm4_init();
 #endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);