net: listening_hash get a spinlock per bucket
Eric Dumazet [Thu, 20 Nov 2008 08:40:07 +0000 (00:40 -0800)]
This patch prepares RCU migration of listening_hash table for
TCP/DCCP protocols.

Since the listening_hash table is small (32 slots per protocol), we add
a spinlock for each slot instead of a single rwlock for the whole table.

This should reduce the lock hold time of readers and let writers that
touch different buckets run concurrently.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

include/net/inet_hashtables.h
net/dccp/proto.c
net/ipv4/inet_diag.c
net/ipv4/inet_hashtables.c
net/ipv4/tcp_ipv4.c
net/ipv6/inet6_hashtables.c

index 4818960..62d2dd0 100644 (file)
@@ -99,6 +99,11 @@ struct inet_bind_hashbucket {
        struct hlist_head       chain;
 };
 
+struct inet_listen_hashbucket {
+       spinlock_t              lock;
+       struct hlist_head       head;
+};
+
 /* This is for listening sockets, thus all sockets which possess wildcards. */
 #define INET_LHTABLE_SIZE      32      /* Yes, really, this is all you need. */
 
@@ -123,22 +128,21 @@ struct inet_hashinfo {
        unsigned int                    bhash_size;
        /* Note : 4 bytes padding on 64 bit arches */
 
-       /* All sockets in TCP_LISTEN state will be in here.  This is the only
-        * table where wildcard'd TCP sockets can exist.  Hash function here
-        * is just local port number.
-        */
-       struct hlist_head               listening_hash[INET_LHTABLE_SIZE];
+       struct kmem_cache               *bind_bucket_cachep;
 
        /* All the above members are written once at bootup and
         * never written again _or_ are predominantly read-access.
         *
         * Now align to a new cache line as all the following members
-        * are often dirty.
+        * might be often dirty.
+        */
+       /* All sockets in TCP_LISTEN state will be in here.  This is the only
+        * table where wildcard'd TCP sockets can exist.  Hash function here
+        * is just local port number.
         */
-       rwlock_t                        lhash_lock ____cacheline_aligned;
-       atomic_t                        lhash_users;
-       wait_queue_head_t               lhash_wait;
-       struct kmem_cache                       *bind_bucket_cachep;
+       struct inet_listen_hashbucket   listening_hash[INET_LHTABLE_SIZE]
+                                       ____cacheline_aligned_in_smp;
+
 };
 
 static inline struct inet_ehash_bucket *inet_ehash_bucket(
@@ -236,26 +240,7 @@ extern void __inet_inherit_port(struct sock *sk, struct sock *child);
 
 extern void inet_put_port(struct sock *sk);
 
-extern void inet_listen_wlock(struct inet_hashinfo *hashinfo);
-
-/*
- * - We may sleep inside this lock.
- * - If sleeping is not required (or called from BH),
- *   use plain read_(un)lock(&inet_hashinfo.lhash_lock).
- */
-static inline void inet_listen_lock(struct inet_hashinfo *hashinfo)
-{
-       /* read_lock synchronizes to candidates to writers */
-       read_lock(&hashinfo->lhash_lock);
-       atomic_inc(&hashinfo->lhash_users);
-       read_unlock(&hashinfo->lhash_lock);
-}
-
-static inline void inet_listen_unlock(struct inet_hashinfo *hashinfo)
-{
-       if (atomic_dec_and_test(&hashinfo->lhash_users))
-               wake_up(&hashinfo->lhash_wait);
-}
+void inet_hashinfo_init(struct inet_hashinfo *h);
 
 extern void __inet_hash_nolisten(struct sock *sk);
 extern void inet_hash(struct sock *sk);
index bdf784c..8b63394 100644 (file)
@@ -44,12 +44,7 @@ atomic_t dccp_orphan_count = ATOMIC_INIT(0);
 
 EXPORT_SYMBOL_GPL(dccp_orphan_count);
 
-struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
-       .lhash_lock     = RW_LOCK_UNLOCKED,
-       .lhash_users    = ATOMIC_INIT(0),
-       .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
-};
-
+struct inet_hashinfo dccp_hashinfo;
 EXPORT_SYMBOL_GPL(dccp_hashinfo);
 
 /* the maximum queue length for tx in packets. 0 is no limit */
@@ -1030,6 +1025,7 @@ static int __init dccp_init(void)
        BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
                     FIELD_SIZEOF(struct sk_buff, cb));
 
+       inet_hashinfo_init(&dccp_hashinfo);
        dccp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("dccp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
index 41b3672..1cb154e 100644 (file)
@@ -718,13 +718,15 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
                if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV)))
                        goto skip_listen_ht;
 
-               inet_listen_lock(hashinfo);
                for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
                        struct sock *sk;
                        struct hlist_node *node;
+                       struct inet_listen_hashbucket *ilb;
 
                        num = 0;
-                       sk_for_each(sk, node, &hashinfo->listening_hash[i]) {
+                       ilb = &hashinfo->listening_hash[i];
+                       spin_lock_bh(&ilb->lock);
+                       sk_for_each(sk, node, &ilb->head) {
                                struct inet_sock *inet = inet_sk(sk);
 
                                if (num < s_num) {
@@ -742,7 +744,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
                                        goto syn_recv;
 
                                if (inet_csk_diag_dump(sk, skb, cb) < 0) {
-                                       inet_listen_unlock(hashinfo);
+                                       spin_unlock_bh(&ilb->lock);
                                        goto done;
                                }
 
@@ -751,7 +753,7 @@ syn_recv:
                                        goto next_listen;
 
                                if (inet_diag_dump_reqs(skb, sk, cb) < 0) {
-                                       inet_listen_unlock(hashinfo);
+                                       spin_unlock_bh(&ilb->lock);
                                        goto done;
                                }
 
@@ -760,12 +762,12 @@ next_listen:
                                cb->args[4] = 0;
                                ++num;
                        }
+                       spin_unlock_bh(&ilb->lock);
 
                        s_num = 0;
                        cb->args[3] = 0;
                        cb->args[4] = 0;
                }
-               inet_listen_unlock(hashinfo);
 skip_listen_ht:
                cb->args[0] = 1;
                s_i = num = s_num = 0;
index fd269cf..377d004 100644 (file)
@@ -111,35 +111,6 @@ void __inet_inherit_port(struct sock *sk, struct sock *child)
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
 /*
- * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
- * Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines (wake up each
- * exclusive lock release). It should be ifdefed really.
- */
-void inet_listen_wlock(struct inet_hashinfo *hashinfo)
-       __acquires(hashinfo->lhash_lock)
-{
-       write_lock(&hashinfo->lhash_lock);
-
-       if (atomic_read(&hashinfo->lhash_users)) {
-               DEFINE_WAIT(wait);
-
-               for (;;) {
-                       prepare_to_wait_exclusive(&hashinfo->lhash_wait,
-                                                 &wait, TASK_UNINTERRUPTIBLE);
-                       if (!atomic_read(&hashinfo->lhash_users))
-                               break;
-                       write_unlock_bh(&hashinfo->lhash_lock);
-                       schedule();
-                       write_lock_bh(&hashinfo->lhash_lock);
-               }
-
-               finish_wait(&hashinfo->lhash_wait, &wait);
-       }
-}
-
-/*
  * Don't inline this cruft. Here are some nice properties to exploit here. The
  * BSD API does not allow a listening sock to specify the remote port nor the
  * remote address for the connection. So always assume those are both
@@ -191,25 +162,25 @@ struct sock *__inet_lookup_listener(struct net *net,
                                    const int dif)
 {
        struct sock *sk = NULL;
-       const struct hlist_head *head;
+       struct inet_listen_hashbucket *ilb;
 
-       read_lock(&hashinfo->lhash_lock);
-       head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
-       if (!hlist_empty(head)) {
-               const struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+       ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
+       spin_lock(&ilb->lock);
+       if (!hlist_empty(&ilb->head)) {
+               const struct inet_sock *inet = inet_sk((sk = __sk_head(&ilb->head)));
 
                if (inet->num == hnum && !sk->sk_node.next &&
                    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
                    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
                    !sk->sk_bound_dev_if && net_eq(sock_net(sk), net))
                        goto sherry_cache;
-               sk = inet_lookup_listener_slow(net, head, daddr, hnum, dif);
+               sk = inet_lookup_listener_slow(net, &ilb->head, daddr, hnum, dif);
        }
        if (sk) {
 sherry_cache:
                sock_hold(sk);
        }
-       read_unlock(&hashinfo->lhash_lock);
+       spin_unlock(&ilb->lock);
        return sk;
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_listener);
@@ -389,8 +360,7 @@ EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 static void __inet_hash(struct sock *sk)
 {
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-       struct hlist_head *list;
-       rwlock_t *lock;
+       struct inet_listen_hashbucket *ilb;
 
        if (sk->sk_state != TCP_LISTEN) {
                __inet_hash_nolisten(sk);
@@ -398,14 +368,12 @@ static void __inet_hash(struct sock *sk)
        }
 
        WARN_ON(!sk_unhashed(sk));
-       list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-       lock = &hashinfo->lhash_lock;
+       ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
-       inet_listen_wlock(hashinfo);
-       __sk_add_node(sk, list);
+       spin_lock(&ilb->lock);
+       __sk_add_node(sk, &ilb->head);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-       write_unlock(lock);
-       wake_up(&hashinfo->lhash_wait);
+       spin_unlock(&ilb->lock);
 }
 
 void inet_hash(struct sock *sk)
@@ -420,29 +388,27 @@ EXPORT_SYMBOL_GPL(inet_hash);
 
 void inet_unhash(struct sock *sk)
 {
-       rwlock_t *lock;
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 
        if (sk_unhashed(sk))
-               goto out;
+               return;
 
        if (sk->sk_state == TCP_LISTEN) {
-               local_bh_disable();
-               inet_listen_wlock(hashinfo);
-               lock = &hashinfo->lhash_lock;
+               struct inet_listen_hashbucket *ilb;
+
+               ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+               spin_lock_bh(&ilb->lock);
                if (__sk_del_node_init(sk))
                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+               spin_unlock_bh(&ilb->lock);
        } else {
-               lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+               rwlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
                write_lock_bh(lock);
                if (__sk_nulls_del_node_init_rcu(sk))
                        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+               write_unlock_bh(lock);
        }
-
-       write_unlock_bh(lock);
-out:
-       if (sk->sk_state == TCP_LISTEN)
-               wake_up(&hashinfo->lhash_wait);
 }
 EXPORT_SYMBOL_GPL(inet_unhash);
 
@@ -556,3 +522,13 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row,
 }
 
 EXPORT_SYMBOL_GPL(inet_hash_connect);
+
+void inet_hashinfo_init(struct inet_hashinfo *h)
+{
+       int i;
+
+       for (i = 0; i < INET_LHTABLE_SIZE; i++)
+               spin_lock_init(&h->listening_hash[i].lock);
+}
+
+EXPORT_SYMBOL_GPL(inet_hashinfo_init);
index 5559fea..330b08a 100644 (file)
@@ -97,11 +97,7 @@ struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 }
 #endif
 
-struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
-       .lhash_lock  = __RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
-       .lhash_users = ATOMIC_INIT(0),
-       .lhash_wait  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
-};
+struct inet_hashinfo tcp_hashinfo;
 
 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 {
@@ -1874,15 +1870,18 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
        struct inet_connection_sock *icsk;
        struct hlist_node *node;
        struct sock *sk = cur;
+       struct inet_listen_hashbucket *ilb;
        struct tcp_iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);
 
        if (!sk) {
                st->bucket = 0;
-               sk = sk_head(&tcp_hashinfo.listening_hash[0]);
+               ilb = &tcp_hashinfo.listening_hash[0];
+               spin_lock_bh(&ilb->lock);
+               sk = sk_head(&ilb->head);
                goto get_sk;
        }
-
+       ilb = &tcp_hashinfo.listening_hash[st->bucket];
        ++st->num;
 
        if (st->state == TCP_SEQ_STATE_OPENREQ) {
@@ -1932,8 +1931,11 @@ start_req:
                }
                read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
        }
+       spin_unlock_bh(&ilb->lock);
        if (++st->bucket < INET_LHTABLE_SIZE) {
-               sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
+               ilb = &tcp_hashinfo.listening_hash[st->bucket];
+               spin_lock_bh(&ilb->lock);
+               sk = sk_head(&ilb->head);
                goto get_sk;
        }
        cur = NULL;
@@ -2066,12 +2068,10 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
        void *rc;
        struct tcp_iter_state *st = seq->private;
 
-       inet_listen_lock(&tcp_hashinfo);
        st->state = TCP_SEQ_STATE_LISTENING;
        rc        = listening_get_idx(seq, &pos);
 
        if (!rc) {
-               inet_listen_unlock(&tcp_hashinfo);
                st->state = TCP_SEQ_STATE_ESTABLISHED;
                rc        = established_get_idx(seq, pos);
        }
@@ -2103,7 +2103,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
        case TCP_SEQ_STATE_LISTENING:
                rc = listening_get_next(seq, v);
                if (!rc) {
-                       inet_listen_unlock(&tcp_hashinfo);
                        st->state = TCP_SEQ_STATE_ESTABLISHED;
                        rc        = established_get_first(seq);
                }
@@ -2130,7 +2129,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
                }
        case TCP_SEQ_STATE_LISTENING:
                if (v != SEQ_START_TOKEN)
-                       inet_listen_unlock(&tcp_hashinfo);
+                       spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
                break;
        case TCP_SEQ_STATE_TIME_WAIT:
        case TCP_SEQ_STATE_ESTABLISHED:
@@ -2405,6 +2404,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
 
 void __init tcp_v4_init(void)
 {
+       inet_hashinfo_init(&tcp_hashinfo);
        if (register_pernet_device(&tcp_sk_ops))
                panic("Failed to create the TCP control socket.\n");
 }
index c1b4d40..21544b9 100644 (file)
 void __inet6_hash(struct sock *sk)
 {
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
-       rwlock_t *lock;
 
        WARN_ON(!sk_unhashed(sk));
 
        if (sk->sk_state == TCP_LISTEN) {
-               struct hlist_head *list;
+               struct inet_listen_hashbucket *ilb;
 
-               list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
-               lock = &hashinfo->lhash_lock;
-               inet_listen_wlock(hashinfo);
-               __sk_add_node(sk, list);
+               ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+               spin_lock(&ilb->lock);
+               __sk_add_node(sk, &ilb->head);
+               spin_unlock(&ilb->lock);
        } else {
                unsigned int hash;
                struct hlist_nulls_head *list;
+               rwlock_t *lock;
 
                sk->sk_hash = hash = inet6_sk_ehashfn(sk);
                list = &inet_ehash_bucket(hashinfo, hash)->chain;
                lock = inet_ehash_lockp(hashinfo, hash);
                write_lock(lock);
                __sk_nulls_add_node_rcu(sk, list);
+               write_unlock(lock);
        }
 
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
-       write_unlock(lock);
 }
 EXPORT_SYMBOL(__inet6_hash);
 
@@ -126,10 +126,11 @@ struct sock *inet6_lookup_listener(struct net *net,
        const struct hlist_node *node;
        struct sock *result = NULL;
        int score, hiscore = 0;
+       struct inet_listen_hashbucket *ilb;
 
-       read_lock(&hashinfo->lhash_lock);
-       sk_for_each(sk, node,
-                       &hashinfo->listening_hash[inet_lhashfn(net, hnum)]) {
+       ilb = &hashinfo->listening_hash[inet_lhashfn(net, hnum)];
+       spin_lock(&ilb->lock);
+       sk_for_each(sk, node, &ilb->head) {
                if (net_eq(sock_net(sk), net) && inet_sk(sk)->num == hnum &&
                                sk->sk_family == PF_INET6) {
                        const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -157,7 +158,7 @@ struct sock *inet6_lookup_listener(struct net *net,
        }
        if (result)
                sock_hold(result);
-       read_unlock(&hashinfo->lhash_lock);
+       spin_unlock(&ilb->lock);
        return result;
 }