Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[linux-3.10.git] / net / ipv4 / route.c
index be27cfa..baa9b28 100644 (file)
@@ -202,11 +202,6 @@ EXPORT_SYMBOL(ip_tos2prio);
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
 
-static inline int rt_genid(struct net *net)
-{
-       return atomic_read(&net->ipv4.rt_genid);
-}
-
 #ifdef CONFIG_PROC_FS
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
@@ -449,7 +444,7 @@ static inline bool rt_is_expired(const struct rtable *rth)
 
 void rt_cache_flush(struct net *net)
 {
-       atomic_inc(&net->ipv4.rt_genid);
+       rt_genid_bump(net);
 }
 
 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
@@ -807,7 +802,8 @@ void ip_rt_send_redirect(struct sk_buff *skb)
        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
-               icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+               icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
+                         rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }
 
@@ -832,7 +828,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
-               icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+               __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
+
+               icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
@@ -840,7 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
-                                            &ip_hdr(skb)->daddr, &rt->rt_gateway);
+                                            &ip_hdr(skb)->daddr, &gw);
 #endif
        }
 out_put_peer:
@@ -909,22 +907,32 @@ out:      kfree_skb(skb);
        return 0;
 }
 
-static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
+static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
+       struct dst_entry *dst = &rt->dst;
        struct fib_result res;
 
+       if (dst->dev->mtu < mtu)
+               return;
+
        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;
 
+       if (!rt->rt_pmtu) {
+               dst->obsolete = DST_OBSOLETE_KILL;
+       } else {
+               rt->rt_pmtu = mtu;
+               dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
+       }
+
        rcu_read_lock();
-       if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
+       if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);
 
                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
-       return mtu;
 }
 
 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
@@ -934,14 +942,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
        struct flowi4 fl4;
 
        ip_rt_build_flow_key(&fl4, sk, skb);
-       mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);
-
-       if (!rt->rt_pmtu) {
-               dst->obsolete = DST_OBSOLETE_KILL;
-       } else {
-               rt->rt_pmtu = mtu;
-               rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
-       }
+       __ip_rt_update_pmtu(rt, &fl4, mtu);
 }
 
 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
@@ -1116,10 +1117,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;
 
-       if (mtu && time_after_eq(jiffies, rt->dst.expires))
-               mtu = 0;
-
-       if (!mtu)
+       if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);
 
        if (mtu && rt_is_output_route(rt))
@@ -1128,7 +1126,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
        mtu = dst->dev->mtu;
 
        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
-               if (rt->rt_gateway && mtu > 576)
+               if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }
 
@@ -1165,8 +1163,12 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
        spin_lock_bh(&fnhe_lock);
 
        if (daddr == fnhe->fnhe_daddr) {
-               struct rtable *orig;
-
+               struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
+               if (orig && rt_is_expired(orig)) {
+                       fnhe->fnhe_gw = 0;
+                       fnhe->fnhe_pmtu = 0;
+                       fnhe->fnhe_expires = 0;
+               }
                if (fnhe->fnhe_pmtu) {
                        unsigned long expires = fnhe->fnhe_expires;
                        unsigned long diff = expires - jiffies;
@@ -1179,22 +1181,16 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                if (fnhe->fnhe_gw) {
                        rt->rt_flags |= RTCF_REDIRECTED;
                        rt->rt_gateway = fnhe->fnhe_gw;
-               }
+                       rt->rt_uses_gateway = 1;
+               } else if (!rt->rt_gateway)
+                       rt->rt_gateway = daddr;
 
-               orig = rcu_dereference(fnhe->fnhe_rth);
                rcu_assign_pointer(fnhe->fnhe_rth, rt);
                if (orig)
                        rt_free(orig);
 
                fnhe->fnhe_stamp = jiffies;
                ret = true;
-       } else {
-               /* Routes we intend to cache in nexthop exception have
-                * the DST_NOCACHE bit clear.  However, if we are
-                * unsuccessful at storing this route into the cache
-                * we really need to set it.
-                */
-               rt->dst.flags |= DST_NOCACHE;
        }
        spin_unlock_bh(&fnhe_lock);
 
@@ -1209,8 +1205,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
-               if (!nh->nh_pcpu_rth_output)
-                       goto nocache;
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;
@@ -1219,16 +1213,8 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
-       } else {
-               /* Routes we intend to cache in the FIB nexthop have
-                * the DST_NOCACHE bit clear.  However, if we are
-                * unsuccessful at storing this route into the cache
-                * we really need to set it.
-                */
-nocache:
-               rt->dst.flags |= DST_NOCACHE;
+       } else
                ret = false;
-       }
 
        return ret;
 }
@@ -1289,8 +1275,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);
 
-               if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
+               if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
+                       rt->rt_uses_gateway = 1;
+               }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
 #ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
@@ -1299,8 +1287,18 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
-       }
-       if (unlikely(!cached))
+               if (unlikely(!cached)) {
+                       /* Routes we intend to cache in nexthop exception or
+                        * FIB nexthop have the DST_NOCACHE bit clear.
+                        * However, if we are unsuccessful at storing this
+                        * route into the cache we really need to set it.
+                        */
+                       rt->dst.flags |= DST_NOCACHE;
+                       if (!rt->rt_gateway)
+                               rt->rt_gateway = daddr;
+                       rt_add_uncached_list(rt);
+               }
+       } else
                rt_add_uncached_list(rt);
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -1368,6 +1366,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
+       rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input= ip_local_deliver;
@@ -1437,7 +1436,6 @@ static int __mkroute_input(struct sk_buff *skb,
                return -EINVAL;
        }
 
-
        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
@@ -1447,10 +1445,13 @@ static int __mkroute_input(struct sk_buff *skb,
                goto cleanup;
        }
 
-       if (out_dev == in_dev && err &&
+       do_cache = res->fi && !itag;
+       if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
-            inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+            inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
+               do_cache = false;
+       }
 
        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create route, if it is
@@ -1467,15 +1468,11 @@ static int __mkroute_input(struct sk_buff *skb,
                }
        }
 
-       do_cache = false;
-       if (res->fi) {
-               if (!itag) {
-                       rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
-                       if (rt_cache_valid(rth)) {
-                               skb_dst_set_noref(skb, &rth->dst);
-                               goto out;
-                       }
-                       do_cache = true;
+       if (do_cache) {
+               rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+               if (rt_cache_valid(rth)) {
+                       skb_dst_set_noref(skb, &rth->dst);
+                       goto out;
                }
        }
 
@@ -1494,6 +1491,7 @@ static int __mkroute_input(struct sk_buff *skb,
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
+       rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
 
        rth->dst.input = ip_forward;
@@ -1571,11 +1569,14 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        if (ipv4_is_zeronet(daddr))
                goto martian_destination;
 
-       if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
-               if (ipv4_is_loopback(daddr))
+       /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
+        * and call it once if daddr or/and saddr are loopback addresses
+        */
+       if (ipv4_is_loopback(daddr)) {
+               if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_destination;
-
-               if (ipv4_is_loopback(saddr))
+       } else if (ipv4_is_loopback(saddr)) {
+               if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_source;
        }
 
@@ -1600,7 +1601,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
        if (res.type == RTN_LOCAL) {
                err = fib_validate_source(skb, saddr, daddr, tos,
-                                         net->loopback_dev->ifindex,
+                                         LOOPBACK_IFINDEX,
                                          dev, in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
@@ -1661,6 +1662,7 @@ local_input:
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
+       rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (res.type == RTN_UNREACHABLE) {
                rth->dst.input= ip_error;
@@ -1763,6 +1765,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
+       bool do_cache;
 
        in_dev = __in_dev_get_rcu(dev_out);
        if (!in_dev)
@@ -1782,6 +1785,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;
 
+       do_cache = true;
        if (type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                fi = NULL;
@@ -1790,6 +1794,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
                                     fl4->flowi4_proto))
                        flags &= ~RTCF_LOCAL;
+               else
+                       do_cache = false;
                /* If multicast route do not exist use
                 * default one, but do not gateway in this case.
                 * Yes, it is hack.
@@ -1799,24 +1805,36 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
        }
 
        fnhe = NULL;
-       if (fi) {
+       do_cache &= fi != NULL;
+       if (do_cache) {
                struct rtable __rcu **prth;
+               struct fib_nh *nh = &FIB_RES_NH(*res);
 
-               fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
+               fnhe = find_exception(nh, fl4->daddr);
                if (fnhe)
                        prth = &fnhe->fnhe_rth;
-               else
-                       prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
+               else {
+                       if (unlikely(fl4->flowi4_flags &
+                                    FLOWI_FLAG_KNOWN_NH &&
+                                    !(nh->nh_gw &&
+                                      nh->nh_scope == RT_SCOPE_LINK))) {
+                               do_cache = false;
+                               goto add;
+                       }
+                       prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
+               }
                rth = rcu_dereference(*prth);
                if (rt_cache_valid(rth)) {
                        dst_hold(&rth->dst);
                        return rth;
                }
        }
+
+add:
        rth = rt_dst_alloc(dev_out,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(in_dev, NOXFRM),
-                          fi);
+                          do_cache);
        if (!rth)
                return ERR_PTR(-ENOBUFS);
 
@@ -1829,6 +1847,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
        rth->rt_iif     = orig_oif ? : 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
+       rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
 
        RT_CACHE_STAT_INC(out_slow_tot);
@@ -1876,7 +1895,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
 
        orig_oif = fl4->flowi4_oif;
 
-       fl4->flowi4_iif = net->loopback_dev->ifindex;
+       fl4->flowi4_iif = LOOPBACK_IFINDEX;
        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
        fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
                         RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
@@ -1965,7 +1984,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
                if (!fl4->daddr)
                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
                dev_out = net->loopback_dev;
-               fl4->flowi4_oif = net->loopback_dev->ifindex;
+               fl4->flowi4_oif = LOOPBACK_IFINDEX;
                res.type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
@@ -2107,6 +2126,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
                rt->rt_flags = ort->rt_flags;
                rt->rt_type = ort->rt_type;
                rt->rt_gateway = ort->rt_gateway;
+               rt->rt_uses_gateway = ort->rt_uses_gateway;
 
                INIT_LIST_HEAD(&rt->rt_uncached);
 
@@ -2136,7 +2156,7 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
-                       struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
+                       struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
                        u32 seq, int event, int nowait, unsigned int flags)
 {
        struct rtable *rt = skb_rtable(skb);
@@ -2146,7 +2166,7 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
        u32 error;
        u32 metrics[RTAX_MAX];
 
-       nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
+       nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
        if (nlh == NULL)
                return -EMSGSIZE;
 
@@ -2185,28 +2205,31 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
                if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
-       if (rt->rt_gateway &&
+       if (rt->rt_uses_gateway &&
            nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
                goto nla_put_failure;
 
+       expires = rt->dst.expires;
+       if (expires) {
+               unsigned long now = jiffies;
+
+               if (time_before(now, expires))
+                       expires -= now;
+               else
+                       expires = 0;
+       }
+
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
-       if (rt->rt_pmtu)
+       if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;
 
        if (fl4->flowi4_mark &&
-           nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
+           nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                goto nla_put_failure;
 
        error = rt->dst.error;
-       expires = rt->dst.expires;
-       if (expires) {
-               if (time_before(jiffies, expires))
-                       expires -= jiffies;
-               else
-                       expires = 0;
-       }
 
        if (rt_is_input_route(rt)) {
                if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
@@ -2306,12 +2329,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void
                rt->rt_flags |= RTCF_NOTIFY;
 
        err = rt_fill_info(net, dst, src, &fl4, skb,
-                          NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+                          NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err <= 0)
                goto errout_free;
 
-       err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
+       err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
 errout:
        return err;
 
@@ -2473,6 +2496,10 @@ static __net_init int sysctl_route_net_init(struct net *net)
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (tbl == NULL)
                        goto err_dup;
+
+               /* Don't export sysctls to unprivileged users */
+               if (net->user_ns != &init_user_ns)
+                       tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;
 
@@ -2506,7 +2533,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
 
 static __net_init int rt_genid_init(struct net *net)
 {
-       atomic_set(&net->ipv4.rt_genid, 0);
+       atomic_set(&net->rt_genid, 0);
        get_random_bytes(&net->ipv4.dev_addr_genid,
                         sizeof(net->ipv4.dev_addr_genid));
        return 0;
@@ -2577,7 +2604,7 @@ int __init ip_rt_init(void)
                pr_err("Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
        xfrm_init();
-       xfrm4_init(ip_rt_max_size);
+       xfrm4_init();
 #endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);