Blame view
net/ipv4/tcp_cong.c
9.65 KB
317a76f9a
|
1 2 3 4 5 6 7 |
/* * Pluggable TCP congestion control support and newReno * congestion control. * Based on ideas from I/O scheduler support and Web100. * * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> */ |
317a76f9a
|
8 9 10 11 12 |
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/list.h>
#include <net/tcp.h>
886236c12
|
13 |
/*
 * Cap consulted by tcp_slow_start() for limited slow start; the cap is
 * only applied when this is greater than zero.  File-scope objects are
 * zero-initialized, so no explicit initializer is needed.
 */
int sysctl_tcp_max_ssthresh;
317a76f9a
|
14 15 16 17 18 19 20 |
static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ static struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; |
5f8ef48d2
|
21 |
list_for_each_entry_rcu(e, &tcp_cong_list, list) { |
317a76f9a
|
22 23 24 25 26 27 28 29 |
if (strcmp(e->name, name) == 0) return e; } return NULL; } /* |
d08df601a
|
30 |
* Attach new congestion control algorithm to the list |
317a76f9a
|
31 32 33 34 35 36 37 |
* of available options. */ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) { int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ |
72dc5b922
|
38 |
if (!ca->ssthresh || !ca->cong_avoid) { |
317a76f9a
|
39 40 41 42 43 44 45 46 47 48 49 50 |
printk(KERN_ERR "TCP %s does not implement required ops ", ca->name); return -EINVAL; } spin_lock(&tcp_cong_list_lock); if (tcp_ca_find(ca->name)) { printk(KERN_NOTICE "TCP %s already registered ", ca->name); ret = -EEXIST; } else { |
3d2573f7e
|
51 |
list_add_tail_rcu(&ca->list, &tcp_cong_list); |
317a76f9a
|
52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 |
printk(KERN_INFO "TCP %s registered ", ca->name); } spin_unlock(&tcp_cong_list_lock); return ret; } EXPORT_SYMBOL_GPL(tcp_register_congestion_control); /* * Remove congestion control algorithm, called from * the module's remove function. Module ref counts are used * to ensure that this can't be done till all sockets using * that method are closed. */ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) { spin_lock(&tcp_cong_list_lock); list_del_rcu(&ca->list); spin_unlock(&tcp_cong_list_lock); } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Assign choice of congestion control. */ |
6687e988d
|
76 |
void tcp_init_congestion_control(struct sock *sk) |
317a76f9a
|
77 |
{ |
6687e988d
|
78 |
struct inet_connection_sock *icsk = inet_csk(sk); |
317a76f9a
|
79 |
struct tcp_congestion_ops *ca; |
4d4d3d1e8
|
80 81 82 83 84 85 86 87 |
/* if no choice made yet assign the current value set as default */ if (icsk->icsk_ca_ops == &tcp_init_congestion_ops) { rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (try_module_get(ca->owner)) { icsk->icsk_ca_ops = ca; break; } |
5f8ef48d2
|
88 |
|
4d4d3d1e8
|
89 |
/* fallback to next available */ |
317a76f9a
|
90 |
} |
4d4d3d1e8
|
91 |
rcu_read_unlock(); |
317a76f9a
|
92 |
} |
317a76f9a
|
93 |
|
6687e988d
|
94 95 |
if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); |
317a76f9a
|
96 97 98 |
} /* Manage refcounts on socket close. */ |
6687e988d
|
99 |
void tcp_cleanup_congestion_control(struct sock *sk) |
317a76f9a
|
100 |
{ |
6687e988d
|
101 102 103 104 105 |
struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); module_put(icsk->icsk_ca_ops->owner); |
317a76f9a
|
106 107 108 109 110 111 112 113 114 115 116 |
} /* Used by sysctl to change default congestion control */ int tcp_set_default_congestion_control(const char *name) { struct tcp_congestion_ops *ca; int ret = -ENOENT; spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); #ifdef CONFIG_KMOD |
35bfbc940
|
117 |
if (!ca && capable(CAP_SYS_MODULE)) { |
317a76f9a
|
118 119 120 121 122 123 124 125 126 |
spin_unlock(&tcp_cong_list_lock); request_module("tcp_%s", name); spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find(name); } #endif if (ca) { |
164891aad
|
127 |
ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ |
317a76f9a
|
128 129 130 131 132 133 134 |
list_move(&ca->list, &tcp_cong_list); ret = 0; } spin_unlock(&tcp_cong_list_lock); return ret; } |
b1736a714
|
135 136 137 138 139 140 |
/*
 * Late initcall: make the algorithm named by CONFIG_DEFAULT_TCP_CONG
 * the default.  Returns whatever tcp_set_default_congestion_control()
 * returns.
 */
static int __init tcp_congestion_default(void)
{
	const char *boot_default = CONFIG_DEFAULT_TCP_CONG;

	return tcp_set_default_congestion_control(boot_default);
}
late_initcall(tcp_congestion_default);
3ff825b28
|
141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
/*
 * Build string with list of available congestion control values.
 *
 * @buf:    destination buffer
 * @maxlen: size of @buf in bytes
 *
 * Names are written in list order, separated by single spaces.  Names
 * that do not fit are dropped (the last one written may be truncated).
 */
void tcp_get_available_congestion_control(char *buf, size_t maxlen)
{
	struct tcp_congestion_ops *ca;
	size_t offs = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		/*
		 * snprintf() reports the length it would have written, so
		 * offs can reach or pass maxlen after a truncation.  Stop
		 * there: maxlen - offs is unsigned and would wrap around.
		 */
		if (offs >= maxlen)
			break;

		offs += snprintf(buf + offs, maxlen - offs,
				 "%s%s",
				 offs == 0 ? "" : " ", ca->name);
	}
	rcu_read_unlock();
}
317a76f9a
|
156 157 158 159 160 161 162 163 164 165 166 167 |
/*
 * Get current default congestion control.
 *
 * Copies the name of the first entry on the algorithm list into @name,
 * using strncpy() bounded by TCP_CA_NAME_MAX.
 */
void tcp_get_default_congestion_control(char *name)
{
	struct tcp_congestion_ops *first;

	/* We will always have reno... */
	BUG_ON(list_empty(&tcp_cong_list));

	rcu_read_lock();
	first = list_entry(tcp_cong_list.next,
			   struct tcp_congestion_ops, list);
	strncpy(name, first->name, TCP_CA_NAME_MAX);
	rcu_read_unlock();
}
ce7bc3bf1
|
168 169 170 171 172 173 174 175 176 |
/* Built list of non-restricted congestion control values */ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) { struct tcp_congestion_ops *ca; size_t offs = 0; *buf = '\0'; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { |
164891aad
|
177 |
if (!(ca->flags & TCP_CONG_NON_RESTRICTED)) |
ce7bc3bf1
|
178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
continue; offs += snprintf(buf + offs, maxlen - offs, "%s%s", offs == 0 ? "" : " ", ca->name); } rcu_read_unlock(); } /* Change list of non-restricted congestion control */ int tcp_set_allowed_congestion_control(char *val) { struct tcp_congestion_ops *ca; char *clone, *name; int ret = 0; clone = kstrdup(val, GFP_USER); if (!clone) return -ENOMEM; spin_lock(&tcp_cong_list_lock); /* pass 1 check for bad entries */ while ((name = strsep(&clone, " ")) && *name) { ca = tcp_ca_find(name); if (!ca) { ret = -ENOENT; goto out; } } |
164891aad
|
207 |
/* pass 2 clear old values */ |
ce7bc3bf1
|
208 |
list_for_each_entry_rcu(ca, &tcp_cong_list, list) |
164891aad
|
209 |
ca->flags &= ~TCP_CONG_NON_RESTRICTED; |
ce7bc3bf1
|
210 211 212 213 214 215 |
/* pass 3 mark as allowed */ while ((name = strsep(&val, " ")) && *name) { ca = tcp_ca_find(name); WARN_ON(!ca); if (ca) |
164891aad
|
216 |
ca->flags |= TCP_CONG_NON_RESTRICTED; |
ce7bc3bf1
|
217 218 219 220 221 222 |
} out: spin_unlock(&tcp_cong_list_lock); return ret; } |
5f8ef48d2
|
223 |
/* Change congestion control for socket */ |
6687e988d
|
224 |
int tcp_set_congestion_control(struct sock *sk, const char *name) |
5f8ef48d2
|
225 |
{ |
6687e988d
|
226 |
struct inet_connection_sock *icsk = inet_csk(sk); |
5f8ef48d2
|
227 228 229 230 231 |
struct tcp_congestion_ops *ca; int err = 0; rcu_read_lock(); ca = tcp_ca_find(name); |
4d4d3d1e8
|
232 |
|
35bfbc940
|
233 |
/* no change asking for existing value */ |
6687e988d
|
234 |
if (ca == icsk->icsk_ca_ops) |
5f8ef48d2
|
235 |
goto out; |
35bfbc940
|
236 237 238 239 240 241 242 243 244 |
#ifdef CONFIG_KMOD /* not found attempt to autoload module */ if (!ca && capable(CAP_SYS_MODULE)) { rcu_read_unlock(); request_module("tcp_%s", name); rcu_read_lock(); ca = tcp_ca_find(name); } #endif |
5f8ef48d2
|
245 246 |
if (!ca) err = -ENOENT; |
164891aad
|
247 |
else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) |
ce7bc3bf1
|
248 |
err = -EPERM; |
5f8ef48d2
|
249 250 251 252 |
else if (!try_module_get(ca->owner)) err = -EBUSY; else { |
6687e988d
|
253 254 |
tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; |
4d4d3d1e8
|
255 256 |
if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) |
6687e988d
|
257 |
icsk->icsk_ca_ops->init(sk); |
5f8ef48d2
|
258 259 260 261 262 |
} out: rcu_read_unlock(); return err; } |
40efc6fa1
|
263 264 |
/*
 * Slow start is used when congestion window is less than slow start
 * threshold. This version implements the basic RFC2581 version
 * and optionally supports:
 *	RFC3742 Limited Slow Start	  - growth limited to max_ssthresh
 *	RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
 */
void tcp_slow_start(struct tcp_sock *tp)
{
	int delta;	/* increase in packets */

	/* RFC3465: ABC Slow start
	 * Increase only after a full MSS of bytes is acked
	 *
	 * TCP sender SHOULD increase cwnd by the number of
	 * previously unacknowledged bytes ACKed by each incoming
	 * acknowledgment, provided the increase is not more than L
	 */
	if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
		return;

	/* limited slow start above the cap, exponential increase otherwise */
	delta = (sysctl_tcp_max_ssthresh > 0 &&
		 tp->snd_cwnd > sysctl_tcp_max_ssthresh) ?
			sysctl_tcp_max_ssthresh >> 1 : tp->snd_cwnd;

	/* RFC3465: ABC
	 * We MAY increase by 2 if discovered delayed ack
	 */
	if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache)
		delta <<= 1;
	tp->bytes_acked = 0;

	/*
	 * Turn the accumulated count into window growth: each full
	 * window's worth of count buys one segment, up to the clamp.
	 */
	tp->snd_cwnd_cnt += delta;
	while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		tp->snd_cwnd_cnt -= tp->snd_cwnd;
		if (tp->snd_cwnd >= tp->snd_cwnd_clamp)
			continue;
		tp->snd_cwnd++;
	}
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
317a76f9a
|
304 305 306 307 308 309 310 |
/* * TCP Reno congestion control * This is special case used for fallback as well. */ /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ |
16751347a
|
311 |
void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int flag) |
317a76f9a
|
312 |
{ |
6687e988d
|
313 |
struct tcp_sock *tp = tcp_sk(sk); |
f4805eded
|
314 |
if (!tcp_is_cwnd_limited(sk, in_flight)) |
317a76f9a
|
315 |
return; |
7faffa1c7
|
316 |
/* In "safe" area, increase. */ |
e905a9eda
|
317 |
if (tp->snd_cwnd <= tp->snd_ssthresh) |
7faffa1c7
|
318 |
tcp_slow_start(tp); |
9772efb97
|
319 |
|
e905a9eda
|
320 |
/* In dangerous area, increase slowly. */ |
9772efb97
|
321 |
else if (sysctl_tcp_abc) { |
e905a9eda
|
322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 |
/* RFC3465: Appropriate Byte Count * increase once for each full cwnd acked */ if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; } } else { /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } else tp->snd_cwnd_cnt++; } |
317a76f9a
|
339 340 341 342 |
} EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ |
6687e988d
|
343 |
u32 tcp_reno_ssthresh(struct sock *sk) |
317a76f9a
|
344 |
{ |
6687e988d
|
345 |
const struct tcp_sock *tp = tcp_sk(sk); |
317a76f9a
|
346 347 348 |
return max(tp->snd_cwnd >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); |
72dc5b922
|
349 350 |
/* Lower bound on congestion window with halving. */ u32 tcp_reno_min_cwnd(const struct sock *sk) |
317a76f9a
|
351 |
{ |
6687e988d
|
352 |
const struct tcp_sock *tp = tcp_sk(sk); |
317a76f9a
|
353 354 355 356 357 |
return tp->snd_ssthresh/2; } EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); struct tcp_congestion_ops tcp_reno = { |
164891aad
|
358 |
.flags = TCP_CONG_NON_RESTRICTED, |
317a76f9a
|
359 360 361 362 363 364 |
.name = "reno", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, }; |
5f8ef48d2
|
365 366 367 368 369 370 371 372 373 374 375 376 |
/* Initial congestion control used (until SYN) * really reno under another name so we can tell difference * during tcp_set_default_congestion_control */ struct tcp_congestion_ops tcp_init_congestion_ops = { .name = "", .owner = THIS_MODULE, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, .min_cwnd = tcp_reno_min_cwnd, }; EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); |