Commit 8d3a564da34e5844aca4f991b73f8ca512246b23

Authored by Doug Leith
Committed by David S. Miller
1 parent 8c83f80b2d

tcp: tcp_vegas cong avoid fix

This patch addresses a book-keeping issue in tcp_vegas.c.  At present
tcp_vegas does separate book-keeping of cwnd based on packet sequence
numbers.  A mismatch can develop between this book-keeping and
tp->snd_cwnd due, for example, to delayed acks acking multiple
packets.  When Vegas transitions to Reno operation (e.g. following
loss), this mismatch leads to incorrect behaviour (akin to a cwnd
backoff).  This seems mostly to affect operation at low cwnds, where
delayed acking can lead to a significant fraction of cwnd being
covered by a single ack, leading to the book-keeping mismatch.  This
patch modifies the congestion avoidance update to avoid the need for
separate book-keeping while leaving Vegas congestion avoidance
functionally unchanged.  A secondary advantage of this modification is
that the use of fixed-point (via V_PARAM_SHIFT) and 64-bit arithmetic
is no longer necessary, simplifying the code.

Some example test measurements with the patched code (confirming no functional
change in the congestion avoidance algorithm) can be seen at:

http://www.hamilton.ie/doug/vegaspatch/

Signed-off-by: Doug Leith <doug.leith@nuim.ie>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 1 changed file with 10 additions and 70 deletions Side-by-side Diff

net/ipv4/tcp_vegas.c
... ... @@ -40,18 +40,14 @@
40 40  
41 41 #include "tcp_vegas.h"
42 42  
43   -/* Default values of the Vegas variables, in fixed-point representation
44   - * with V_PARAM_SHIFT bits to the right of the binary point.
45   - */
46   -#define V_PARAM_SHIFT 1
47   -static int alpha = 2<<V_PARAM_SHIFT;
48   -static int beta = 4<<V_PARAM_SHIFT;
49   -static int gamma = 1<<V_PARAM_SHIFT;
  43 +static int alpha = 2;
  44 +static int beta = 4;
  45 +static int gamma = 1;
50 46  
51 47 module_param(alpha, int, 0644);
52   -MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)");
  48 +MODULE_PARM_DESC(alpha, "lower bound of packets in network");
53 49 module_param(beta, int, 0644);
54   -MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)");
  50 +MODULE_PARM_DESC(beta, "upper bound of packets in network");
55 51 module_param(gamma, int, 0644);
56 52 MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)");
57 53  
58 54  
59 55  
60 56  
61 57  
... ... @@ -172,49 +168,13 @@
172 168 return;
173 169 }
174 170  
175   - /* The key players are v_beg_snd_una and v_beg_snd_nxt.
176   - *
177   - * These are so named because they represent the approximate values
178   - * of snd_una and snd_nxt at the beginning of the current RTT. More
179   - * precisely, they represent the amount of data sent during the RTT.
180   - * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
181   - * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
182   - * bytes of data have been ACKed during the course of the RTT, giving
183   - * an "actual" rate of:
184   - *
185   - * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
186   - *
187   - * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
188   - * because delayed ACKs can cover more than one segment, so they
189   - * don't line up nicely with the boundaries of RTTs.
190   - *
191   - * Another unfortunate fact of life is that delayed ACKs delay the
192   - * advance of the left edge of our send window, so that the number
193   - * of bytes we send in an RTT is often less than our cwnd will allow.
194   - * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
195   - */
196   -
197 171 if (after(ack, vegas->beg_snd_nxt)) {
198 172 /* Do the Vegas once-per-RTT cwnd adjustment. */
199   - u32 old_wnd, old_snd_cwnd;
200 173  
201   -
202   - /* Here old_wnd is essentially the window of data that was
203   - * sent during the previous RTT, and has all
204   - * been acknowledged in the course of the RTT that ended
205   - * with the ACK we just received. Likewise, old_snd_cwnd
206   - * is the cwnd during the previous RTT.
207   - */
208   - old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) /
209   - tp->mss_cache;
210   - old_snd_cwnd = vegas->beg_snd_cwnd;
211   -
212 174 /* Save the extent of the current window so we can use this
213 175 * at the end of the next RTT.
214 176 */
215   - vegas->beg_snd_una = vegas->beg_snd_nxt;
216 177 vegas->beg_snd_nxt = tp->snd_nxt;
217   - vegas->beg_snd_cwnd = tp->snd_cwnd;
218 178  
219 179 /* We do the Vegas calculations only if we got enough RTT
220 180 * samples that we can be reasonably sure that we got
221 181  
222 182  
223 183  
... ... @@ -252,22 +212,14 @@
252 212 *
253 213 * This is:
254 214 * (actual rate in segments) * baseRTT
255   - * We keep it as a fixed point number with
256   - * V_PARAM_SHIFT bits to the right of the binary point.
257 215 */
258   - target_cwnd = ((u64)old_wnd * vegas->baseRTT);
259   - target_cwnd <<= V_PARAM_SHIFT;
260   - do_div(target_cwnd, rtt);
  216 + target_cwnd = tp->snd_cwnd * vegas->baseRTT / rtt;
261 217  
262 218 /* Calculate the difference between the window we had,
263 219 * and the window we would like to have. This quantity
264 220 * is the "Diff" from the Arizona Vegas papers.
265   - *
266   - * Again, this is a fixed point number with
267   - * V_PARAM_SHIFT bits to the right of the binary
268   - * point.
269 221 */
270   - diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
  222 + diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
271 223  
272 224 if (diff > gamma && tp->snd_ssthresh > 2 ) {
273 225 /* Going too fast. Time to slow down
274 226  
... ... @@ -282,16 +234,13 @@
282 234 * truncation robs us of full link
283 235 * utilization.
284 236 */
285   - tp->snd_cwnd = min(tp->snd_cwnd,
286   - ((u32)target_cwnd >>
287   - V_PARAM_SHIFT)+1);
  237 + tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
288 238  
289 239 } else if (tp->snd_cwnd <= tp->snd_ssthresh) {
290 240 /* Slow start. */
291 241 tcp_slow_start(tp);
292 242 } else {
293 243 /* Congestion avoidance. */
294   - u32 next_snd_cwnd;
295 244  
296 245 /* Figure out where we would like cwnd
297 246 * to be.
298 247  
299 248  
300 249  
... ... @@ -300,26 +249,17 @@
300 249 /* The old window was too fast, so
301 250 * we slow down.
302 251 */
303   - next_snd_cwnd = old_snd_cwnd - 1;
  252 + tp->snd_cwnd--;
304 253 } else if (diff < alpha) {
305 254 /* We don't have enough extra packets
306 255 * in the network, so speed up.
307 256 */
308   - next_snd_cwnd = old_snd_cwnd + 1;
  257 + tp->snd_cwnd++;
309 258 } else {
310 259 /* Sending just as fast as we
311 260 * should be.
312 261 */
313   - next_snd_cwnd = old_snd_cwnd;
314 262 }
315   -
316   - /* Adjust cwnd upward or downward, toward the
317   - * desired value.
318   - */
319   - if (next_snd_cwnd > tp->snd_cwnd)
320   - tp->snd_cwnd++;
321   - else if (next_snd_cwnd < tp->snd_cwnd)
322   - tp->snd_cwnd--;
323 263 }
324 264  
325 265 if (tp->snd_cwnd < 2)