RDS/IB: Use SLAB_HWCACHE_ALIGN flag for kmem_cache_create()
[linux-2.6.git] / net / rds / rds.h
index 1f82ec0..4510344 100644 (file)
@@ -80,6 +80,7 @@ enum {
 /* Bits for c_flags */
 #define RDS_LL_SEND_FULL       0
 #define RDS_RECONNECT_PENDING  1
+#define RDS_IN_XMIT            2
 
 struct rds_connection {
        struct hlist_node       c_hash_node;
@@ -91,12 +92,13 @@ struct rds_connection {
        struct rds_cong_map     *c_lcong;
        struct rds_cong_map     *c_fcong;
 
-       struct mutex            c_send_lock;    /* protect send ring */
        struct rds_message      *c_xmit_rm;
        unsigned long           c_xmit_sg;
        unsigned int            c_xmit_hdr_off;
        unsigned int            c_xmit_data_off;
+       unsigned int            c_xmit_atomic_sent;
        unsigned int            c_xmit_rdma_sent;
+       unsigned int            c_xmit_data_sent;
 
        spinlock_t              c_lock;         /* protect msg queues */
        u64                     c_next_tx_seq;
@@ -116,11 +118,10 @@ struct rds_connection {
        struct delayed_work     c_conn_w;
        struct work_struct      c_down_w;
        struct mutex            c_cm_lock;      /* protect conn state & cm */
+       wait_queue_head_t       c_waitq;
 
        struct list_head        c_map_item;
        unsigned long           c_map_queued;
-       unsigned long           c_map_offset;
-       unsigned long           c_map_bytes;
 
        unsigned int            c_unacked_packets;
        unsigned int            c_unacked_bytes;
@@ -206,6 +207,48 @@ struct rds_incoming {
        rds_rdma_cookie_t       i_rdma_cookie;
 };
 
+struct rds_mr {
+       struct rb_node          r_rb_node;
+       atomic_t                r_refcount;
+       u32                     r_key;
+
+       /* A copy of the creation flags */
+       unsigned int            r_use_once:1;
+       unsigned int            r_invalidate:1;
+       unsigned int            r_write:1;
+
+       /* This is for RDS_MR_DEAD.
+        * It would be nice & consistent to make this part of the above
+        * bit field here, but we need to use test_and_set_bit.
+        */
+       unsigned long           r_state;
+       struct rds_sock         *r_sock; /* back pointer to the socket that owns us */
+       struct rds_transport    *r_trans;
+       void                    *r_trans_private;
+};
+
+/* Flags for mr->r_state */
+#define RDS_MR_DEAD            0
+
+static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
+{
+       return r_key | (((u64) offset) << 32);
+}
+
+static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
+{
+       return cookie;
+}
+
+static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
+{
+       return cookie >> 32;
+}
+
+/* atomic operation types */
+#define RDS_ATOMIC_TYPE_CSWP           0
+#define RDS_ATOMIC_TYPE_FADD           1
+
 /*
  * m_sock_item and m_conn_item are on lists that are serialized under
  * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
@@ -258,13 +301,61 @@ struct rds_message {
         *   -> rs->rs_lock
         */
        spinlock_t              m_rs_lock;
+       wait_queue_head_t       m_flush_wait;
+
        struct rds_sock         *m_rs;
-       struct rds_rdma_op      *m_rdma_op;
+
+       /* cookie to send to remote, in rds header */
        rds_rdma_cookie_t       m_rdma_cookie;
-       struct rds_mr           *m_rdma_mr;
-       unsigned int            m_nents;
-       unsigned int            m_count;
-       struct scatterlist      m_sg[0];
+
+       unsigned int            m_used_sgs;
+       unsigned int            m_total_sgs;
+
+       void                    *m_final_op;
+
+       struct {
+               struct rm_atomic_op {
+                       int                     op_type;
+                       uint64_t                op_swap_add;
+                       uint64_t                op_compare;
+
+                       u32                     op_rkey;
+                       u64                     op_remote_addr;
+                       unsigned int            op_notify:1;
+                       unsigned int            op_recverr:1;
+                       unsigned int            op_mapped:1;
+                       unsigned int            op_silent:1;
+                       unsigned int            op_active:1;
+                       struct scatterlist      *op_sg;
+                       struct rds_notifier     *op_notifier;
+
+                       struct rds_mr           *op_rdma_mr;
+               } atomic;
+               struct rm_rdma_op {
+                       u32                     op_rkey;
+                       u64                     op_remote_addr;
+                       unsigned int            op_write:1;
+                       unsigned int            op_fence:1;
+                       unsigned int            op_notify:1;
+                       unsigned int            op_recverr:1;
+                       unsigned int            op_mapped:1;
+                       unsigned int            op_silent:1;
+                       unsigned int            op_active:1;
+                       unsigned int            op_bytes;
+                       unsigned int            op_nents;
+                       unsigned int            op_count;
+                       struct scatterlist      *op_sg;
+                       struct rds_notifier     *op_notifier;
+
+                       struct rds_mr           *op_rdma_mr;
+               } rdma;
+               struct rm_data_op {
+                       unsigned int            op_active:1;
+                       unsigned int            op_nents;
+                       unsigned int            op_count;
+                       struct scatterlist      *op_sg;
+               } data;
+       };
 };
 
 /*
@@ -305,17 +396,19 @@ struct rds_notifier {
  *                 transport is responsible for other serialization, including
  *                 rds_recv_incoming().  This is called in process context but
  *                 should try hard not to block.
- *
- * @xmit_cong_map: This asks the transport to send the local bitmap down the
- *                given connection.  XXX get a better story about the bitmap
- *                flag and header.
  */
 
+#define RDS_TRANS_IB   0
+#define RDS_TRANS_IWARP        1
+#define RDS_TRANS_TCP  2
+#define RDS_TRANS_COUNT        3
+
 struct rds_transport {
        char                    t_name[TRANSNAMSIZ];
        struct list_head        t_item;
        struct module           *t_owner;
        unsigned int            t_prefer_loopback:1;
+       unsigned int            t_type;
 
        int (*laddr_check)(__be32 addr);
        int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
@@ -326,13 +419,11 @@ struct rds_transport {
        void (*xmit_complete)(struct rds_connection *conn);
        int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
                    unsigned int hdr_off, unsigned int sg, unsigned int off);
-       int (*xmit_cong_map)(struct rds_connection *conn,
-                            struct rds_cong_map *map, unsigned long offset);
-       int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
+       int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+       int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
        int (*recv)(struct rds_connection *conn);
        int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
                                size_t size);
-       void (*inc_purge)(struct rds_incoming *inc);
        void (*inc_free)(struct rds_incoming *inc);
 
        int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
@@ -361,7 +452,7 @@ struct rds_sock {
         * bound_addr used for both incoming and outgoing, no INADDR_ANY
         * support.
         */
-       struct rb_node          rs_bound_node;
+       struct hlist_node       rs_bound_node;
        __be32                  rs_bound_addr;
        __be32                  rs_conn_addr;
        __be16                  rs_bound_port;
@@ -382,6 +473,8 @@ struct rds_sock {
 
        /* flag indicating we were congested or not */
        int                     rs_congested;
+       /* seen congestion (ENOBUFS) when sending? */
+       int                     rs_seen_congestion;
 
        /* rs_lock protects all these adjacent members before the newline */
        spinlock_t              rs_lock;
@@ -458,8 +551,8 @@ struct rds_statistics {
        uint64_t        s_recv_ping;
        uint64_t        s_send_queue_empty;
        uint64_t        s_send_queue_full;
-       uint64_t        s_send_sem_contention;
-       uint64_t        s_send_sem_queue_raced;
+       uint64_t        s_send_lock_contention;
+       uint64_t        s_send_lock_queue_raced;
        uint64_t        s_send_immediate_retry;
        uint64_t        s_send_delayed_retry;
        uint64_t        s_send_drop_acked;
@@ -484,7 +577,7 @@ void rds_sock_put(struct rds_sock *rs);
 void rds_wake_sk_sleep(struct rds_sock *rs);
 static inline void __rds_wake_sk_sleep(struct sock *sk)
 {
-       wait_queue_head_t *waitq = sk->sk_sleep;
+       wait_queue_head_t *waitq = sk_sleep(sk);
 
        if (!sock_flag(sk, SOCK_DEAD) && waitq)
                wake_up(waitq);
@@ -519,9 +612,11 @@ struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
                                       struct rds_transport *trans, gfp_t gfp);
 struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
                               struct rds_transport *trans, gfp_t gfp);
+void rds_conn_shutdown(struct rds_connection *conn);
 void rds_conn_destroy(struct rds_connection *conn);
 void rds_conn_reset(struct rds_connection *conn);
 void rds_conn_drop(struct rds_connection *conn);
+void rds_conn_connect_if_down(struct rds_connection *conn);
 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
                          struct rds_info_iterator *iter,
                          struct rds_info_lengths *lens,
@@ -558,7 +653,8 @@ rds_conn_connecting(struct rds_connection *conn)
 
 /* message.c */
 struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
-struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
+struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
+int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov,
                                               size_t total_len);
 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
@@ -572,7 +668,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers
 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
 int rds_message_inc_copy_to_user(struct rds_incoming *inc,
                                 struct iovec *first_iov, size_t size);
-void rds_message_inc_purge(struct rds_incoming *inc);
 void rds_message_inc_free(struct rds_incoming *inc);
 void rds_message_addref(struct rds_message *rm);
 void rds_message_put(struct rds_message *rm);
@@ -628,17 +723,42 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
                         is_acked_func is_acked);
-int rds_send_acked_before(struct rds_connection *conn, u64 seq);
 void rds_send_remove_from_sock(struct list_head *messages, int status);
 int rds_send_pong(struct rds_connection *conn, __be16 dport);
 struct rds_message *rds_send_get_message(struct rds_connection *,
-                                        struct rds_rdma_op *);
+                                        struct rm_rdma_op *);
 
 /* rdma.c */
 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
+int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen);
+int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
+void rds_rdma_drop_keys(struct rds_sock *rs);
+int rds_rdma_extra_size(struct rds_rdma_args *args);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
+                         struct cmsghdr *cmsg);
+void rds_rdma_free_op(struct rm_rdma_op *ro);
+void rds_atomic_free_op(struct rm_atomic_op *ao);
+void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
+void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
+int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
+                   struct cmsghdr *cmsg);
+
+extern void __rds_put_mr_final(struct rds_mr *mr);
+static inline void rds_mr_put(struct rds_mr *mr)
+{
+       if (atomic_dec_and_test(&mr->r_refcount))
+               __rds_put_mr_final(mr);
+}
 
 /* stats.c */
-DECLARE_PER_CPU(struct rds_statistics, rds_stats);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
 #define rds_stats_inc_which(which, member) do {                \
        per_cpu(which, get_cpu()).member++;             \
        put_cpu();                                      \
@@ -652,7 +772,8 @@ DECLARE_PER_CPU(struct rds_statistics, rds_stats);
 int __init rds_stats_init(void);
 void rds_stats_exit(void);
 void rds_stats_info_copy(struct rds_info_iterator *iter,
-                        uint64_t *values, char **names, size_t nr);
+                        uint64_t *values, const char *const *names,
+                        size_t nr);
 
 /* sysctl.c */
 int __init rds_sysctl_init(void);
@@ -672,6 +793,7 @@ extern unsigned int  rds_sysctl_trace_level;
 int __init rds_threads_init(void);
 void rds_threads_exit(void);
 extern struct workqueue_struct *rds_wq;
+void rds_queue_reconnect(struct rds_connection *conn);
 void rds_connect_worker(struct work_struct *);
 void rds_shutdown_worker(struct work_struct *);
 void rds_send_worker(struct work_struct *);