[TCP]: Allow choosing TCP congestion control via sockopt.
Stephen Hemminger [Fri, 24 Jun 2005 03:37:36 +0000 (20:37 -0700)]
Allow using setsockopt to select the TCP congestion control algorithm
on a per-socket basis.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

include/linux/tcp.h
include/net/tcp.h
net/ipv4/tcp.c
net/ipv4/tcp_cong.c
net/ipv4/tcp_ipv4.c
net/ipv6/tcp_ipv6.c

index 3ea75dd..dfd93d0 100644 (file)
@@ -127,6 +127,7 @@ enum {
 #define TCP_WINDOW_CLAMP       10      /* Bound advertised window */
 #define TCP_INFO               11      /* Information about this connection. */
 #define TCP_QUICKACK           12      /* Block/reenable quick acks */
+#define TCP_CONGESTION         13      /* Congestion control algorithm */
 
 #define TCPI_OPT_TIMESTAMPS    1
 #define TCPI_OPT_SACK          2
index e427cf3..d04b211 100644 (file)
@@ -1162,8 +1162,9 @@ extern void tcp_init_congestion_control(struct tcp_sock *tp);
 extern void tcp_cleanup_congestion_control(struct tcp_sock *tp);
 extern int tcp_set_default_congestion_control(const char *name);
 extern void tcp_get_default_congestion_control(char *name);
+extern int tcp_set_congestion_control(struct tcp_sock *tp, const char *name);
 
-extern struct tcp_congestion_ops tcp_reno;
+extern struct tcp_congestion_ops tcp_init_congestion_ops;
 extern u32 tcp_reno_ssthresh(struct tcp_sock *tp);
 extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack,
                                u32 rtt, u32 in_flight, int flag);
index f3dbc8d..882436d 100644 (file)
@@ -1927,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
                return tp->af_specific->setsockopt(sk, level, optname,
                                                   optval, optlen);
 
+       /* This option takes a string value; all the others are ints */
+       if (optname == TCP_CONGESTION) {
+               char name[TCP_CA_NAME_MAX];
+
+               if (optlen < 1)
+                       return -EINVAL;
+
+               val = strncpy_from_user(name, optval,
+                                       min(TCP_CA_NAME_MAX-1, optlen));
+               if (val < 0)
+                       return -EFAULT;
+               name[val] = 0;
+
+               lock_sock(sk);
+               err = tcp_set_congestion_control(tp, name);
+               release_sock(sk);
+               return err;
+       }
+
        if (optlen < sizeof(int))
                return -EINVAL;
 
@@ -2211,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
        case TCP_QUICKACK:
                val = !tp->ack.pingpong;
                break;
+
+       case TCP_CONGESTION:
+               if (get_user(len, optlen))
+                       return -EFAULT;
+               len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
+               if (put_user(len, optlen))
+                       return -EFAULT;
+               if (copy_to_user(optval, tp->ca_ops->name, len))
+                       return -EFAULT;
+               return 0;
        default:
                return -ENOPROTOOPT;
        };
@@ -2224,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 
 extern void __skb_cb_too_small_for_tcp(int, int);
-extern void tcpdiag_init(void);
+extern struct tcp_congestion_ops tcp_reno;
 
 static __initdata unsigned long thash_entries;
 static int __init set_thash_entries(char *str)
index 665394a..4970d10 100644 (file)
@@ -21,7 +21,7 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
 {
        struct tcp_congestion_ops *e;
 
-       list_for_each_entry(e, &tcp_cong_list, list) {
+       list_for_each_entry_rcu(e, &tcp_cong_list, list) {
                if (strcmp(e->name, name) == 0)
                        return e;
        }
@@ -77,6 +77,9 @@ void tcp_init_congestion_control(struct tcp_sock *tp)
 {
        struct tcp_congestion_ops *ca;
 
+       if (tp->ca_ops != &tcp_init_congestion_ops)
+               return;
+
        rcu_read_lock();
        list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                if (try_module_get(ca->owner)) {
@@ -139,6 +142,34 @@ void tcp_get_default_congestion_control(char *name)
        rcu_read_unlock();
 }
 
+/* Switch socket tp to the congestion control found by tcp_ca_find(name); returns 0, -ENOENT (no such name) or -EBUSY (try_module_get failed). */
+int tcp_set_congestion_control(struct tcp_sock *tp, const char *name)
+{
+       struct tcp_congestion_ops *ca;
+       int err = 0;
+
+       rcu_read_lock(); /* tcp_ca_find walks tcp_cong_list with list_for_each_entry_rcu */
+       ca = tcp_ca_find(name);
+       if (ca == tp->ca_ops) /* lookup result is what the socket already uses: nothing to do, err stays 0 */
+               goto out;
+
+       if (!ca) /* no registered algorithm has this name */
+               err = -ENOENT;
+
+       else if (!try_module_get(ca->owner)) /* take a reference on the owning module before installing it */
+               err = -EBUSY;
+
+       else {
+               tcp_cleanup_congestion_control(tp); /* NOTE(review): presumably tears down and releases the old algorithm - defined elsewhere, confirm */
+               tp->ca_ops = ca;
+               if (tp->ca_ops->init) /* init hook is optional */
+                       tp->ca_ops->init(tp);
+       }
+ out:
+       rcu_read_unlock();
+       return err;
+}
+
 /*
  * TCP Reno congestion control
  * This is special case used for fallback as well.
@@ -192,4 +223,15 @@ struct tcp_congestion_ops tcp_reno = {
        .min_cwnd       = tcp_reno_min_cwnd,
 };
 
-EXPORT_SYMBOL_GPL(tcp_reno);
+/* Initial congestion control installed by tcp_v4/v6_init_sock, used until a real one is chosen:
+ * really reno under another (empty) name so that tcp_init_congestion_control
+ * can tell whether an algorithm has already been selected for the socket
+ */
+struct tcp_congestion_ops tcp_init_congestion_ops  = {
+       .name           = "",
+       .owner          = THIS_MODULE,
+       .ssthresh       = tcp_reno_ssthresh,
+       .cong_avoid     = tcp_reno_cong_avoid,
+       .min_cwnd       = tcp_reno_min_cwnd,
+};
+EXPORT_SYMBOL_GPL(tcp_init_congestion_ops);
index 9122814..ebf1123 100644 (file)
@@ -2048,7 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk)
        tp->mss_cache_std = tp->mss_cache = 536;
 
        tp->reordering = sysctl_tcp_reordering;
-       tp->ca_ops = &tcp_reno;
+       tp->ca_ops = &tcp_init_congestion_ops;
 
        sk->sk_state = TCP_CLOSE;
 
index fce5603..9dac7fd 100644 (file)
@@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk)
        sk->sk_state = TCP_CLOSE;
 
        tp->af_specific = &ipv6_specific;
-       tp->ca_ops = &tcp_reno;
+       tp->ca_ops = &tcp_init_congestion_ops;
        sk->sk_write_space = sk_stream_write_space;
        sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);