net/ipv4/tcp_bbr.c
  /* Bottleneck Bandwidth and RTT (BBR) congestion control
   *
   * BBR congestion control computes the sending rate based on the delivery
   * rate (throughput) estimated from ACKs. In a nutshell:
   *
   *   On each ACK, update our model of the network path:
   *      bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips)
   *      min_rtt = windowed_min(rtt, 10 seconds)
   *   pacing_rate = pacing_gain * bottleneck_bandwidth
   *   cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4)
   *
   * The core algorithm does not react directly to packet losses or delays,
   * although BBR may adjust the size of next send per ACK when loss is
   * observed, or adjust the sending rate if it estimates there is a
   * traffic policer, in order to keep the drop rate reasonable.
   *
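 * As a purely illustrative example of the model above (numbers not taken
 * from this file): with 1500-byte packets, a bottleneck delivering
 * ~8333 pkts/sec (~100 Mbit/sec) and a min_rtt of 10 ms give a
 * bandwidth-delay product of ~83 packets, so a pacing_gain of 1.0 paces
 * at ~8333 pkts/sec and the steady-state cwnd_gain of 2 yields a cwnd of
 * ~167 packets.
 *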
   * Here is a state transition diagram for BBR:
   *
   *             |
   *             V
   *    +---> STARTUP  ----+
   *    |        |         |
   *    |        V         |
   *    |      DRAIN   ----+
   *    |        |         |
   *    |        V         |
   *    +---> PROBE_BW ----+
   *    |      ^    |      |
   *    |      |    |      |
   *    |      +----+      |
   *    |                  |
   *    +---- PROBE_RTT <--+
   *
   * A BBR flow starts in STARTUP, and ramps up its sending rate quickly.
   * When it estimates the pipe is full, it enters DRAIN to drain the queue.
   * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT.
   * A long-lived BBR flow spends the vast majority of its time remaining
   * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth
   * in a fair manner, with a small, bounded queue. *If* a flow has been
   * continuously sending for the entire min_rtt window, and hasn't seen an RTT
   * sample that matches or decreases its min_rtt estimate for 10 seconds, then
   * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe
   * the path's two-way propagation delay (min_rtt). When exiting PROBE_RTT, if
   * we estimated that we reached the full bw of the pipe then we enter PROBE_BW;
   * otherwise we enter STARTUP to try to fill the pipe.
   *
   * BBR is described in detail in:
   *   "BBR: Congestion-Based Congestion Control",
   *   Neal Cardwell, Yuchung Cheng, C. Stephen Gunn, Soheil Hassas Yeganeh,
   *   Van Jacobson. ACM Queue, Vol. 14 No. 5, September-October 2016.
   *
   * There is a public e-mail list for discussing BBR development and testing:
   *   https://groups.google.com/forum/#!forum/bbr-dev
   *
   * NOTE: BBR might be used with the fq qdisc ("man tc-fq") with pacing enabled,
   * otherwise TCP stack falls back to an internal pacing using one high
   * resolution timer per TCP socket and may use more resources.
   */
  #include <linux/module.h>
  #include <net/tcp.h>
  #include <linux/inet_diag.h>
  #include <linux/inet.h>
  #include <linux/random.h>
  #include <linux/win_minmax.h>
  
  /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth
   * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps.
   * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32.
   * Since the minimum window is >=4 packets, the lower bound isn't
   * an issue. The upper bound isn't an issue with existing technologies.
   */
  #define BW_SCALE 24
  #define BW_UNIT (1 << BW_SCALE)
  
  #define BBR_SCALE 8	/* scaling factor for fractions in BBR (e.g. gains) */
  #define BBR_UNIT (1 << BBR_SCALE)
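/* For a rough feel for these fixed-point units (illustrative numbers only):
 * a delivery rate of ~1 Gbit/sec with 1500-byte packets is ~0.083 pkts/usec,
 * i.e. ~0.083 * 2^24 ~= 1.4M in BW_UNIT units; a gain of 1.25 is represented
 * as BBR_UNIT * 5 / 4 = 320, and a gain of 0.75 as BBR_UNIT * 3 / 4 = 192.
 */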
  
  /* BBR has the following modes for deciding how fast to send: */
  enum bbr_mode {
  	BBR_STARTUP,	/* ramp up sending rate rapidly to fill pipe */
  	BBR_DRAIN,	/* drain any queue created during startup */
  	BBR_PROBE_BW,	/* discover, share bw: pace around estimated bw */
  	BBR_PROBE_RTT,	/* cut inflight to min to probe min_rtt */
  };
  
  /* BBR congestion control block */
  struct bbr {
  	u32	min_rtt_us;	        /* min RTT in min_rtt_win_sec window */
  	u32	min_rtt_stamp;	        /* timestamp of min_rtt_us */
  	u32	probe_rtt_done_stamp;   /* end time for BBR_PROBE_RTT mode */
  	struct minmax bw;	/* Max recent delivery rate in pkts/uS << 24 */
  	u32	rtt_cnt;	    /* count of packet-timed rounds elapsed */
  	u32     next_rtt_delivered; /* scb->tx.delivered at end of round */
  	u64	cycle_mstamp;	     /* time of this cycle phase start */
  	u32     mode:3,		     /* current bbr_mode in state machine */
  		prev_ca_state:3,     /* CA state on previous ACK */
  		packet_conservation:1,  /* use packet conservation? */
  		round_start:1,	     /* start of packet-timed tx->ack round? */
  		idle_restart:1,	     /* restarting after idle? */
  		probe_rtt_round_done:1,  /* a BBR_PROBE_RTT round at 4 pkts? */
  		unused:13,
  		lt_is_sampling:1,    /* taking long-term ("LT") samples now? */
  		lt_rtt_cnt:7,	     /* round trips in long-term interval */
  		lt_use_bw:1;	     /* use lt_bw as our bw estimate? */
  	u32	lt_bw;		     /* LT est delivery rate in pkts/uS << 24 */
  	u32	lt_last_delivered;   /* LT intvl start: tp->delivered */
  	u32	lt_last_stamp;	     /* LT intvl start: tp->delivered_mstamp */
  	u32	lt_last_lost;	     /* LT intvl start: tp->lost */
  	u32	pacing_gain:10,	/* current gain for setting pacing rate */
  		cwnd_gain:10,	/* current gain for setting cwnd */
  		full_bw_reached:1,   /* reached full bw in Startup? */
  		full_bw_cnt:2,	/* number of rounds without large bw gains */
  		cycle_idx:3,	/* current index in pacing_gain cycle array */
  		has_seen_rtt:1, /* have we seen an RTT sample yet? */
  		unused_b:5;
  	u32	prior_cwnd;	/* prior cwnd upon entering loss recovery */
  	u32	full_bw;	/* recent bw, to estimate if pipe is full */
  
  	/* For tracking ACK aggregation: */
  	u64	ack_epoch_mstamp;	/* start of ACK sampling epoch */
  	u16	extra_acked[2];		/* max excess data ACKed in epoch */
  	u32	ack_epoch_acked:20,	/* packets (S)ACKed in sampling epoch */
  		extra_acked_win_rtts:5,	/* age of extra_acked, in round trips */
  		extra_acked_win_idx:1,	/* current index in extra_acked array */
  		unused_c:6;
  };
  
  #define CYCLE_LEN	8	/* number of phases in a pacing gain cycle */
  
  /* Window length of bw filter (in rounds): */
  static const int bbr_bw_rtts = CYCLE_LEN + 2;
  /* Window length of min_rtt filter (in sec): */
  static const u32 bbr_min_rtt_win_sec = 10;
  /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */
  static const u32 bbr_probe_rtt_mode_ms = 200;
  /* Skip TSO below the following bandwidth (bits/sec): */
  static const int bbr_min_tso_rate = 1200000;
  /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
   * In order to help drive the network toward lower queues and low latency while
   * maintaining high utilization, the average pacing rate aims to be slightly
   * lower than the estimated bandwidth. This is an important aspect of the
   * design.
   */
  static const int bbr_pacing_margin_percent = 1;

  /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
   * that will allow a smoothly increasing pacing rate that will double each RTT
   * and send the same number of packets per RTT that an un-paced, slow-starting
   * Reno or CUBIC flow would:
   */
  static const int bbr_high_gain  = BBR_UNIT * 2885 / 1000 + 1;
  /* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain
   * the queue created in BBR_STARTUP in a single round:
   */
  static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885;
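/* For reference: 2/ln(2) ~= 2.885, so bbr_high_gain works out to
 * 256 * 2885 / 1000 + 1 = 739 ~= 2.89 in BBR_UNIT units, and bbr_drain_gain
 * to 256 * 1000 / 2885 = 88 ~= 0.35, roughly the reciprocal of the startup
 * gain.
 */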
  /* The gain for deriving steady-state cwnd tolerates delayed/stretched ACKs: */
  static const int bbr_cwnd_gain  = BBR_UNIT * 2;
  /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */
  static const int bbr_pacing_gain[] = {
  	BBR_UNIT * 5 / 4,	/* probe for more available bw */
  	BBR_UNIT * 3 / 4,	/* drain queue and/or yield bw to other flows */
  	BBR_UNIT, BBR_UNIT, BBR_UNIT,	/* cruise at 1.0*bw to utilize pipe, */
  	BBR_UNIT, BBR_UNIT, BBR_UNIT	/* without creating excess queue... */
  };
  /* Randomize the starting gain cycling phase over N phases: */
  static const u32 bbr_cycle_rand = 7;
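/* Note that the gain cycle above averages out to 1.0 over its 8 phases:
 * (5/4 + 3/4 + 6 * 1) / 8 = 1, so PROBE_BW paces at the estimated bw on
 * average while periodically probing above it and then draining the
 * resulting queue.
 */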
  
  /* Try to keep at least this many packets in flight, if things go smoothly. For
   * smooth functioning, a sliding window protocol ACKing every other packet
   * needs at least 4 packets in flight:
   */
  static const u32 bbr_cwnd_min_target = 4;
  
  /* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */
  /* If bw has increased significantly (1.25x), there may be more bw available: */
  static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4;
  /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */
  static const u32 bbr_full_bw_cnt = 3;
  
  /* "long-term" ("LT") bandwidth estimator parameters... */
  /* The minimum number of rounds in an LT bw sampling interval: */
  static const u32 bbr_lt_intvl_min_rtts = 4;
  /* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */
  static const u32 bbr_lt_loss_thresh = 50;
  /* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */
  static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8;
  /* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */
  static const u32 bbr_lt_bw_diff = 4000 / 8;
  /* If we estimate we're policed, use lt_bw for this many round trips: */
  static const u32 bbr_lt_bw_max_rtts = 48;
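/* A concrete (illustrative) reading of the thresholds above: a loss
 * threshold of 50/256 is ~20% of packets delivered; two intervals whose
 * rates differ by no more than 1/8 (12.5%), or by no more than 4000/8 = 500
 * bytes/sec (4 Kbit/sec), are treated as having consistent bw; two
 * consecutive lossy, consistent intervals are taken as a sign of a
 * token-bucket policer enforcing a fixed rate.
 */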
  /* Gain factor for adding extra_acked to target cwnd: */
  static const int bbr_extra_acked_gain = BBR_UNIT;
  /* Window length of extra_acked window. */
  static const u32 bbr_extra_acked_win_rtts = 5;
  /* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
  static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
  /* Time period for clamping cwnd increment due to ack aggregation */
  static const u32 bbr_extra_acked_max_us = 100 * 1000;
  static void bbr_check_probe_rtt_done(struct sock *sk);
  /* Do we estimate that STARTUP filled the pipe? */
  static bool bbr_full_bw_reached(const struct sock *sk)
  {
  	const struct bbr *bbr = inet_csk_ca(sk);
  	return bbr->full_bw_reached;
  }
  
  /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
  static u32 bbr_max_bw(const struct sock *sk)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	return minmax_get(&bbr->bw);
  }
  
  /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */
  static u32 bbr_bw(const struct sock *sk)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
  }
  /* Return maximum extra acked in past k-2k round trips,
   * where k = bbr_extra_acked_win_rtts.
   */
  static u16 bbr_extra_acked(const struct sock *sk)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	return max(bbr->extra_acked[0], bbr->extra_acked[1]);
  }
  /* Return rate in bytes per second, optionally with a gain.
   * The order here is chosen carefully to avoid overflow of u64. This should
   * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
   */
  static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
  {
  	unsigned int mss = tcp_sk(sk)->mss_cache;
  	rate *= mss;
  	rate *= gain;
  	rate >>= BBR_SCALE;
  	rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
  	return rate >> BW_SCALE;
  }
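
/* Worked example of the conversion above (illustrative numbers): for
 * bw ~= 0.083 pkts/usec in BW_SCALE units (~1.4M, i.e. ~1 Gbit/sec with a
 * 1500-byte MSS) and gain == BBR_UNIT, this computes roughly
 * 1.4M * 1500 * 990000 >> 24 ~= 123.75 MB/sec ~= 990 Mbit/sec, i.e. 99% of
 * the estimated bandwidth, reflecting bbr_pacing_margin_percent.
 */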
  /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
  static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
  {
  	u64 rate = bw;
  
  	rate = bbr_rate_bytes_per_sec(sk, rate, gain);
  	rate = min_t(u64, rate, sk->sk_max_pacing_rate);
  	return rate;
  }
  /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */
  static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	u64 bw;
  	u32 rtt_us;
  
  	if (tp->srtt_us) {		/* any RTT sample yet? */
  		rtt_us = max(tp->srtt_us >> 3, 1U);
  		bbr->has_seen_rtt = 1;
  	} else {			 /* no RTT sample yet */
  		rtt_us = USEC_PER_MSEC;	 /* use nominal default RTT */
  	}
  	bw = (u64)tp->snd_cwnd * BW_UNIT;
  	do_div(bw, rtt_us);
  	sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
  }
  /* Pace using current bw estimate and a gain factor. */
  static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);

  	if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
  		bbr_init_pacing_rate_from_rtt(sk);
  	if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
  		sk->sk_pacing_rate = rate;
  }
  /* override sysctl_tcp_min_tso_segs */
  static u32 bbr_min_tso_segs(struct sock *sk)
  {
  	return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
  }
  static u32 bbr_tso_segs_goal(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	u32 segs, bytes;
  
  	/* Sort of tcp_tso_autosize() but ignoring
  	 * driver provided sk_gso_max_size.
  	 */
  	bytes = min_t(unsigned long,
  		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
  		      GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
  	segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));

  	return min(segs, 0x7FU);
  }
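
/* Rough example (assuming the default sk_pacing_shift of 10, i.e. ~1 ms of
 * data): at a pacing rate of ~125 MB/sec (~1 Gbit/sec), bytes is ~122 KB, so
 * with a 1448-byte MSS the goal is ~84 segments, well under the 0x7F cap;
 * at rates below ~1.2 Mbit/sec, bbr_min_tso_segs() drops the floor to 1.
 */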
  
  /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */
  static void bbr_save_cwnd(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
  		bbr->prior_cwnd = tp->snd_cwnd;  /* this cwnd is good enough */
  	else  /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
  		bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
  }
  
  static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	if (event == CA_EVENT_TX_START && tp->app_limited) {
  		bbr->idle_restart = 1;
  		bbr->ack_epoch_mstamp = tp->tcp_mstamp;
  		bbr->ack_epoch_acked = 0;
  		/* Avoid pointless buffer overflows: pace at est. bw if we don't
  		 * need more speed (we're restarting from idle and app-limited).
  		 */
  		if (bbr->mode == BBR_PROBE_BW)
  			bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT);
  		else if (bbr->mode == BBR_PROBE_RTT)
  			bbr_check_probe_rtt_done(sk);
  	}
  }
  /* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
   *
   * bdp = ceil(bw * min_rtt * gain)
   *
   * The key factor, gain, controls the amount of queue. While a small gain
   * builds a smaller queue, it becomes more vulnerable to noise in RTT
   * measurements (e.g., delayed ACKs or other ACK compression effects). This
   * noise may cause BBR to under-estimate the rate.
   */
  static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  	u32 bdp;
  	u64 w;
  
  	/* If we've never had a valid RTT sample, cap cwnd at the initial
  	 * default. This should only happen when the connection is not using TCP
  	 * timestamps and has retransmitted all of the SYN/SYNACK/data packets
  	 * ACKed so far. In this case, an RTO can cut cwnd to 1, in which
  	 * case we need to slow-start up toward something safe: TCP_INIT_CWND.
  	 */
  	if (unlikely(bbr->min_rtt_us == ~0U))	 /* no valid RTT samples yet? */
  		return TCP_INIT_CWND;  /* be safe: cap at default initial cwnd*/
  
  	w = (u64)bw * bbr->min_rtt_us;
  	/* Apply a gain to the given value, remove the BW_SCALE shift, and
  	 * round the value up to avoid a negative feedback loop.
  	 */
  	bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
  
  	return bdp;
  }
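
/* Worked example (illustrative numbers): with bw ~= 0.083 pkts/usec in
 * BW_SCALE units (~1 Gbit/sec at a 1500-byte MSS), min_rtt_us = 10000 and
 * gain = 2 * BBR_UNIT, the path's bandwidth-delay product is ~833 packets
 * and bdp works out to ~1667 packets (2x the BDP).
 */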
  
  /* To achieve full performance in high-speed paths, we budget enough cwnd to
   * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
   *   - one skb in sending host Qdisc,
   *   - one skb in sending host TSO/GSO engine
   *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
   * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
   * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
   * which allows 2 outstanding 2-packet sequences, to try to keep pipe
   * full even with ACK-every-other-packet delayed ACKs.
   */
  static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	/* Allow enough full-sized skbs in flight to utilize end systems. */
  	cwnd += 3 * bbr_tso_segs_goal(sk);
  
  	/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
  	cwnd = (cwnd + 1) & ~1U;
  	/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
  	if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
  		cwnd += 2;
  	return cwnd;
  }
  /* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
  static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
  {
  	u32 inflight;
  
  	inflight = bbr_bdp(sk, bw, gain);
  	inflight = bbr_quantization_budget(sk, inflight);
  
  	return inflight;
  }
  /* With pacing at lower layers, there's often less data "in the network" than
   * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
   * we often have several skbs queued in the pacing layer with a pre-scheduled
   * earliest departure time (EDT). BBR adapts its pacing rate based on the
   * inflight level that it estimates has already been "baked in" by previous
   * departure time decisions. We calculate a rough estimate of the number of our
   * packets that might be in the network at the earliest departure time for the
   * next skb scheduled:
   *   in_network_at_edt = inflight_at_edt - (EDT - now) * bw
   * If we're increasing inflight, then we want to know if the transmit of the
   * EDT skb will push inflight above the target, so inflight_at_edt includes
   * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
   * then estimate if inflight will sink too low just before the EDT transmit.
   */
  static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	u64 now_ns, edt_ns, interval_us;
  	u32 interval_delivered, inflight_at_edt;
  
  	now_ns = tp->tcp_clock_cache;
  	edt_ns = max(tp->tcp_wstamp_ns, now_ns);
  	interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
  	interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
  	inflight_at_edt = inflight_now;
  	if (bbr->pacing_gain > BBR_UNIT)              /* increasing inflight */
  		inflight_at_edt += bbr_tso_segs_goal(sk);  /* include EDT skb */
  	if (interval_delivered >= inflight_at_edt)
  		return 0;
  	return inflight_at_edt - interval_delivered;
  }
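
/* Rough example (illustrative numbers): if bw is ~10 pkts/msec and the next
 * skb's earliest departure time is 3 msec in the future, ~30 packets should
 * be delivered by then; with 100 packets in flight now and pacing_gain > 1
 * (so the departing skb's tso_segs_goal, say 2, is included), the estimate
 * is ~102 - 30 = 72 packets in the network at that departure time.
 */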
  /* Find the cwnd increment based on estimate of ack aggregation */
  static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
  {
  	u32 max_aggr_cwnd, aggr_cwnd = 0;
  
  	if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
  		max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
  				/ BW_UNIT;
  		aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
  			     >> BBR_SCALE;
  		aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
  	}
  
  	return aggr_cwnd;
  }
  /* An optimization in BBR to reduce losses: On the first round of recovery, we
   * follow the packet conservation principle: send P packets per P packets acked.
   * After that, we slow-start and send at most 2*P packets per P packets acked.
   * After recovery finishes, or upon undo, we restore the cwnd we had when
   * recovery started (capped by the target cwnd based on estimated BDP).
   *
   * TODO(ycheng/ncardwell): implement a rate-based approach.
   */
  static bool bbr_set_cwnd_to_recover_or_restore(
  	struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
  	u32 cwnd = tp->snd_cwnd;
  
  	/* An ACK for P pkts should release at most 2*P packets. We do this
  	 * in two steps. First, here we deduct the number of lost packets.
  	 * Then, in bbr_set_cwnd() we slow start up toward the target cwnd.
  	 */
  	if (rs->losses > 0)
  		cwnd = max_t(s32, cwnd - rs->losses, 1);
  
  	if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) {
  		/* Starting 1st round of Recovery, so do packet conservation. */
  		bbr->packet_conservation = 1;
  		bbr->next_rtt_delivered = tp->delivered;  /* start round now */
  		/* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */
  		cwnd = tcp_packets_in_flight(tp) + acked;
  	} else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) {
  		/* Exiting loss recovery; restore cwnd saved before recovery. */
  		cwnd = max(cwnd, bbr->prior_cwnd);
  		bbr->packet_conservation = 0;
  	}
  	bbr->prev_ca_state = state;
  	if (bbr->packet_conservation) {
  		*new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked);
  		return true;	/* yes, using packet conservation */
  	}
  	*new_cwnd = cwnd;
  	return false;
  }
  
  /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss
   * has drawn us down below target), or snap down to target if we're above it.
   */
  static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
  			 u32 acked, u32 bw, int gain)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	u32 cwnd = tp->snd_cwnd, target_cwnd = 0;
  
  	if (!acked)
  		goto done;  /* no packet fully ACKed; just apply caps */
  
  	if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
  		goto done;
  	target_cwnd = bbr_bdp(sk, bw, gain);
  
  	/* Increment the cwnd to account for excess ACKed data that seems
  	 * due to aggregation (of data and/or ACKs) visible in the ACK stream.
  	 */
  	target_cwnd += bbr_ack_aggregation_cwnd(sk);
  	target_cwnd = bbr_quantization_budget(sk, target_cwnd);
  
  	/* If we're below target cwnd, slow start cwnd toward target cwnd. */
  	if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
  		cwnd = min(cwnd + acked, target_cwnd);
  	else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
  		cwnd = cwnd + acked;
  	cwnd = max(cwnd, bbr_cwnd_min_target);
  
  done:
  	tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);	/* apply global cap */
  	if (bbr->mode == BBR_PROBE_RTT)  /* drain queue, refresh min_rtt */
  		tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
  }
  
  /* End cycle phase if it's time and/or we hit the phase's in-flight target. */
  static bool bbr_is_next_cycle_phase(struct sock *sk,
  				    const struct rate_sample *rs)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	bool is_full_length =
  		tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) >
  		bbr->min_rtt_us;
  	u32 inflight, bw;
  
  	/* The pacing_gain of 1.0 paces at the estimated bw to try to fully
  	 * use the pipe without increasing the queue.
  	 */
  	if (bbr->pacing_gain == BBR_UNIT)
  		return is_full_length;		/* just use wall clock time */
  	inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
  	bw = bbr_max_bw(sk);
  
  	/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
  	 * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is
  	 * small (e.g. on a LAN). We do not persist if packets are lost, since
  	 * a path with small buffers may not hold that much.
  	 */
  	if (bbr->pacing_gain > BBR_UNIT)
  		return is_full_length &&
  			(rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
  			 inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
  
  	/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
  	 * probing didn't find more bw. If inflight falls to match BDP then we
  	 * estimate queue is drained; persisting would underutilize the pipe.
  	 */
  	return is_full_length ||
  		inflight <= bbr_inflight(sk, bw, BBR_UNIT);
  }
  
  static void bbr_advance_cycle_phase(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
  	bbr->cycle_mstamp = tp->delivered_mstamp;
  }
  
  /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
  static void bbr_update_cycle_phase(struct sock *sk,
  				   const struct rate_sample *rs)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  	if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs))
  		bbr_advance_cycle_phase(sk);
  }
  
  static void bbr_reset_startup_mode(struct sock *sk)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	bbr->mode = BBR_STARTUP;
  }
  
  static void bbr_reset_probe_bw_mode(struct sock *sk)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	bbr->mode = BBR_PROBE_BW;
  	bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
  	bbr_advance_cycle_phase(sk);	/* flip to next phase of gain cycle */
  }
  
  static void bbr_reset_mode(struct sock *sk)
  {
  	if (!bbr_full_bw_reached(sk))
  		bbr_reset_startup_mode(sk);
  	else
  		bbr_reset_probe_bw_mode(sk);
  }
  
  /* Start a new long-term sampling interval. */
  static void bbr_reset_lt_bw_sampling_interval(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC);
  	bbr->lt_last_delivered = tp->delivered;
  	bbr->lt_last_lost = tp->lost;
  	bbr->lt_rtt_cnt = 0;
  }
  
  /* Completely reset long-term bandwidth sampling. */
  static void bbr_reset_lt_bw_sampling(struct sock *sk)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	bbr->lt_bw = 0;
  	bbr->lt_use_bw = 0;
  	bbr->lt_is_sampling = false;
  	bbr_reset_lt_bw_sampling_interval(sk);
  }
  
  /* Long-term bw sampling interval is done. Estimate whether we're policed. */
  static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  	u32 diff;
  
  	if (bbr->lt_bw) {  /* do we have bw from a previous interval? */
  		/* Is new bw close to the lt_bw from the previous interval? */
  		diff = abs(bw - bbr->lt_bw);
  		if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) ||
  		    (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <=
  		     bbr_lt_bw_diff)) {
  			/* All criteria are met; estimate we're policed. */
  			bbr->lt_bw = (bw + bbr->lt_bw) >> 1;  /* avg 2 intvls */
  			bbr->lt_use_bw = 1;
  			bbr->pacing_gain = BBR_UNIT;  /* try to avoid drops */
  			bbr->lt_rtt_cnt = 0;
  			return;
  		}
  	}
  	bbr->lt_bw = bw;
  	bbr_reset_lt_bw_sampling_interval(sk);
  }
  
  /* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of
   * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and
   * explicitly models their policed rate, to reduce unnecessary losses. We
   * estimate that we're policed if we see 2 consecutive sampling intervals with
   * consistent throughput and high packet loss. If we think we're being policed,
   * set lt_bw to the "long-term" average delivery rate from those 2 intervals.
   */
  static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	u32 lost, delivered;
  	u64 bw;
  	u32 t;
  
  	if (bbr->lt_use_bw) {	/* already using long-term rate, lt_bw? */
  		if (bbr->mode == BBR_PROBE_BW && bbr->round_start &&
  		    ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) {
  			bbr_reset_lt_bw_sampling(sk);    /* stop using lt_bw */
  			bbr_reset_probe_bw_mode(sk);  /* restart gain cycling */
  		}
  		return;
  	}
  
  	/* Wait for the first loss before sampling, to let the policer exhaust
  	 * its tokens and estimate the steady-state rate allowed by the policer.
  	 * Starting samples earlier includes bursts that over-estimate the bw.
  	 */
  	if (!bbr->lt_is_sampling) {
  		if (!rs->losses)
  			return;
  		bbr_reset_lt_bw_sampling_interval(sk);
  		bbr->lt_is_sampling = true;
  	}
  
  	/* To avoid underestimates, reset sampling if we run out of data. */
  	if (rs->is_app_limited) {
  		bbr_reset_lt_bw_sampling(sk);
  		return;
  	}
  
  	if (bbr->round_start)
  		bbr->lt_rtt_cnt++;	/* count round trips in this interval */
  	if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts)
  		return;		/* sampling interval needs to be longer */
  	if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) {
  		bbr_reset_lt_bw_sampling(sk);  /* interval is too long */
  		return;
  	}
  
  	/* End sampling interval when a packet is lost, so we estimate the
  	 * policer tokens were exhausted. Stopping the sampling before the
  	 * tokens are exhausted under-estimates the policed rate.
  	 */
  	if (!rs->losses)
  		return;
  
  	/* Calculate packets lost and delivered in sampling interval. */
  	lost = tp->lost - bbr->lt_last_lost;
  	delivered = tp->delivered - bbr->lt_last_delivered;
  	/* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */
  	if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered)
  		return;
  
  	/* Find average delivery rate in this sampling interval. */
  	t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp;
  	if ((s32)t < 1)
  		return;		/* interval is less than one ms, so wait */
  	/* Check if can multiply without overflow */
  	if (t >= ~0U / USEC_PER_MSEC) {
  		bbr_reset_lt_bw_sampling(sk);  /* interval too long; reset */
  		return;
  	}
  	t *= USEC_PER_MSEC;
  	bw = (u64)delivered * BW_UNIT;
  	do_div(bw, t);
  	bbr_lt_bw_interval_done(sk, bw);
  }
  
  /* Estimate the bandwidth based on how fast packets are delivered */
  static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	u64 bw;
  
  	bbr->round_start = 0;
  	if (rs->delivered < 0 || rs->interval_us <= 0)
  		return; /* Not a valid observation */
  
  	/* See if we've reached the next RTT */
  	if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) {
  		bbr->next_rtt_delivered = tp->delivered;
  		bbr->rtt_cnt++;
  		bbr->round_start = 1;
  		bbr->packet_conservation = 0;
  	}
  
  	bbr_lt_bw_sampling(sk, rs);
  
  	/* Divide delivered by the interval to find a (lower bound) bottleneck
  	 * bandwidth sample. Delivered is in packets and interval_us in uS and
  	 * ratio will be <<1 for most connections. So delivered is first scaled.
  	 */
  	bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
  
  	/* If this sample is application-limited, it is likely to have a very
  	 * low delivered count that represents application behavior rather than
  	 * the available network rate. Such a sample could drag down estimated
  	 * bw, causing needless slow-down. Thus, to continue to send at the
  	 * last measured network rate, we filter out app-limited samples unless
  	 * they describe the path bw at least as well as our bw model.
  	 *
  	 * So the goal during app-limited phase is to proceed with the best
  	 * network rate no matter how long. We automatically leave this
  	 * phase when app writes faster than the network can deliver :)
  	 */
  	if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) {
  		/* Incorporate new sample into our max bw filter. */
  		minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw);
  	}
  }
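
/* Example of the scaled division above (illustrative numbers): 100 packets
 * delivered over a 12000 usec interval gives bw = 100 << 24 / 12000
 * ~= 139810, i.e. ~0.0083 pkts/usec or ~8333 pkts/sec (~100 Mbit/sec with
 * 1500-byte packets).
 */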
  /* Estimates the windowed max degree of ack aggregation.
   * This is used to provision extra in-flight data to keep sending during
   * inter-ACK silences.
   *
   * Degree of ack aggregation is estimated as extra data acked beyond expected.
   *
   * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
   * cwnd += max_extra_acked
   *
   * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
   * Max filter is an approximate sliding window of 5-10 (packet timed) round
   * trips.
   */
  static void bbr_update_ack_aggregation(struct sock *sk,
  				       const struct rate_sample *rs)
  {
  	u32 epoch_us, expected_acked, extra_acked;
  	struct bbr *bbr = inet_csk_ca(sk);
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
  	    rs->delivered < 0 || rs->interval_us <= 0)
  		return;
  
  	if (bbr->round_start) {
  		bbr->extra_acked_win_rtts = min(0x1F,
  						bbr->extra_acked_win_rtts + 1);
  		if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
  			bbr->extra_acked_win_rtts = 0;
  			bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
  						   0 : 1;
  			bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
  		}
  	}
  
  	/* Compute how many packets we expected to be delivered over epoch. */
  	epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
  				      bbr->ack_epoch_mstamp);
  	expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
  
	/* Reset the aggregation epoch if the ACK rate is below the expected
	 * rate, or if a very large number of ACKs has arrived since the epoch
	 * began (the epoch is potentially quite old).
  	 */
  	if (bbr->ack_epoch_acked <= expected_acked ||
  	    (bbr->ack_epoch_acked + rs->acked_sacked >=
  	     bbr_ack_epoch_acked_reset_thresh)) {
  		bbr->ack_epoch_acked = 0;
  		bbr->ack_epoch_mstamp = tp->delivered_mstamp;
  		expected_acked = 0;
  	}
  
  	/* Compute excess data delivered, beyond what was expected. */
  	bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
  				     bbr->ack_epoch_acked + rs->acked_sacked);
  	extra_acked = bbr->ack_epoch_acked - expected_acked;
  	extra_acked = min(extra_acked, tp->snd_cwnd);
  	if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
  		bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
  }
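
/* Illustrative example: if bbr_bw() is ~10 pkts/msec and 5 msec have elapsed
 * in the current epoch, ~50 packets were expected to be delivered; if ~70
 * were actually (S)ACKed, extra_acked is ~20 (capped at cwnd), and the max
 * such value over the last 5-10 round trips is what bbr_ack_aggregation_cwnd()
 * later adds to the target cwnd (further capped by ~100 msec worth of bw).
 */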
  /* Estimate when the pipe is full, using the change in delivery rate: BBR
   * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
   * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
   * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the
   * higher rwin, 3: we get higher delivery rate samples. Or transient
   * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar
   * design goal, but uses delay and inter-ACK spacing instead of bandwidth.
   */
  static void bbr_check_full_bw_reached(struct sock *sk,
  				      const struct rate_sample *rs)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  	u32 bw_thresh;
  
  	if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited)
  		return;
  
  	bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE;
  	if (bbr_max_bw(sk) >= bw_thresh) {
  		bbr->full_bw = bbr_max_bw(sk);
  		bbr->full_bw_cnt = 0;
  		return;
  	}
  	++bbr->full_bw_cnt;
  	bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
  }
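
/* Illustrative example: if full_bw corresponds to ~100 Mbit/sec, the 1.25x
 * threshold is ~125 Mbit/sec; any non-app-limited round that reaches it
 * records a new full_bw baseline and restarts the count, while three
 * consecutive rounds below it set full_bw_reached, which lets
 * bbr_check_drain() move the flow out of STARTUP.
 */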
  
  /* If pipe is probably full, drain the queue and then enter steady-state. */
  static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
  		bbr->mode = BBR_DRAIN;	/* drain queue we created */
  		tcp_sk(sk)->snd_ssthresh =
  				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
  	}	/* fall through to check if in-flight is already small: */
  	if (bbr->mode == BBR_DRAIN &&
  	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
  	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
  		bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
  }
  static void bbr_check_probe_rtt_done(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	if (!(bbr->probe_rtt_done_stamp &&
  	      after(tcp_jiffies32, bbr->probe_rtt_done_stamp)))
  		return;
  
  	bbr->min_rtt_stamp = tcp_jiffies32;  /* wait a while until PROBE_RTT */
  	tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
  	bbr_reset_mode(sk);
  }
  /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and
   * periodically drain the bottleneck queue, to converge to measure the true
   * min_rtt (unloaded propagation delay). This allows the flows to keep queues
   * small (reducing queuing delay and packet loss) and achieve fairness among
   * BBR flows.
   *
   * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires,
   * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets.
   * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed
   * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and
   * re-enter the previous mode. BBR uses 200ms to approximately bound the
   * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s).
   *
   * Note that flows need only pay 2% if they are busy sending over the last 10
   * seconds. Interactive applications (e.g., Web, RPCs, video chunks) often have
   * natural silences or low-rate periods within 10 seconds where the rate is low
   * enough for long enough to drain its queue in the bottleneck. We pick up
   * these min RTT measurements opportunistically with our min_rtt filter. :-)
   */
  static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  	bool filter_expired;
  
  	/* Track min RTT seen in the min_rtt_win_sec filter window: */
  	filter_expired = after(tcp_jiffies32,
  			       bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
  	if (rs->rtt_us >= 0 &&
  	    (rs->rtt_us < bbr->min_rtt_us ||
  	     (filter_expired && !rs->is_ack_delayed))) {
  		bbr->min_rtt_us = rs->rtt_us;
  		bbr->min_rtt_stamp = tcp_jiffies32;
  	}
  
  	if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
  	    !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
  		bbr->mode = BBR_PROBE_RTT;  /* dip, drain queue */
  		bbr_save_cwnd(sk);  /* note cwnd so we can restore it */
  		bbr->probe_rtt_done_stamp = 0;
  	}
  
  	if (bbr->mode == BBR_PROBE_RTT) {
  		/* Ignore low rate samples during this mode. */
  		tp->app_limited =
  			(tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
  		/* Maintain min packets in flight for max(200 ms, 1 round). */
  		if (!bbr->probe_rtt_done_stamp &&
  		    tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) {
  			bbr->probe_rtt_done_stamp = tcp_jiffies32 +
  				msecs_to_jiffies(bbr_probe_rtt_mode_ms);
  			bbr->probe_rtt_round_done = 0;
  			bbr->next_rtt_delivered = tp->delivered;
  		} else if (bbr->probe_rtt_done_stamp) {
  			if (bbr->round_start)
  				bbr->probe_rtt_round_done = 1;
  			if (bbr->probe_rtt_round_done)
  				bbr_check_probe_rtt_done(sk);
  		}
  	}
  	/* Restart after idle ends only once we process a new S/ACK for data */
  	if (rs->delivered > 0)
  		bbr->idle_restart = 0;
  }
  static void bbr_update_gains(struct sock *sk)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	switch (bbr->mode) {
  	case BBR_STARTUP:
  		bbr->pacing_gain = bbr_high_gain;
  		bbr->cwnd_gain	 = bbr_high_gain;
  		break;
  	case BBR_DRAIN:
  		bbr->pacing_gain = bbr_drain_gain;	/* slow, to drain */
  		bbr->cwnd_gain	 = bbr_high_gain;	/* keep cwnd */
  		break;
  	case BBR_PROBE_BW:
  		bbr->pacing_gain = (bbr->lt_use_bw ?
  				    BBR_UNIT :
  				    bbr_pacing_gain[bbr->cycle_idx]);
  		bbr->cwnd_gain	 = bbr_cwnd_gain;
  		break;
  	case BBR_PROBE_RTT:
  		bbr->pacing_gain = BBR_UNIT;
  		bbr->cwnd_gain	 = BBR_UNIT;
  		break;
  	default:
		WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
  		break;
  	}
  }
  static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
  {
  	bbr_update_bw(sk, rs);
  	bbr_update_ack_aggregation(sk, rs);
  	bbr_update_cycle_phase(sk, rs);
  	bbr_check_full_bw_reached(sk, rs);
  	bbr_check_drain(sk, rs);
  	bbr_update_min_rtt(sk, rs);
  	bbr_update_gains(sk);
  }
  
  static void bbr_main(struct sock *sk, const struct rate_sample *rs)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  	u32 bw;
  
  	bbr_update_model(sk, rs);
  
  	bw = bbr_bw(sk);
  	bbr_set_pacing_rate(sk, bw, bbr->pacing_gain);
  	bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
  }
  
  static void bbr_init(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	bbr->prior_cwnd = 0;
  	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
  	bbr->rtt_cnt = 0;
  	bbr->next_rtt_delivered = 0;
  	bbr->prev_ca_state = TCP_CA_Open;
  	bbr->packet_conservation = 0;
  
  	bbr->probe_rtt_done_stamp = 0;
  	bbr->probe_rtt_round_done = 0;
  	bbr->min_rtt_us = tcp_min_rtt(tp);
  	bbr->min_rtt_stamp = tcp_jiffies32;
  
  	minmax_reset(&bbr->bw, bbr->rtt_cnt, 0);  /* init max bw to 0 */
  	bbr->has_seen_rtt = 0;
  	bbr_init_pacing_rate_from_rtt(sk);

  	bbr->round_start = 0;
  	bbr->idle_restart = 0;
  	bbr->full_bw_reached = 0;
  	bbr->full_bw = 0;
  	bbr->full_bw_cnt = 0;
  	bbr->cycle_mstamp = 0;
  	bbr->cycle_idx = 0;
  	bbr_reset_lt_bw_sampling(sk);
  	bbr_reset_startup_mode(sk);

  	bbr->ack_epoch_mstamp = tp->tcp_mstamp;
  	bbr->ack_epoch_acked = 0;
  	bbr->extra_acked_win_rtts = 0;
  	bbr->extra_acked_win_idx = 0;
  	bbr->extra_acked[0] = 0;
  	bbr->extra_acked[1] = 0;
  	cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
  }
  
  static u32 bbr_sndbuf_expand(struct sock *sk)
  {
  	/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
  	return 3;
  }
  
  /* In theory BBR does not need to undo the cwnd since it does not
   * always reduce cwnd on losses (see bbr_main()). Keep it for now.
   */
  static u32 bbr_undo_cwnd(struct sock *sk)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
  	bbr->full_bw_cnt = 0;
  	bbr_reset_lt_bw_sampling(sk);
  	return tcp_sk(sk)->snd_cwnd;
  }
  
  /* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
  static u32 bbr_ssthresh(struct sock *sk)
  {
  	bbr_save_cwnd(sk);
  	return tcp_sk(sk)->snd_ssthresh;
  }
  
  static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
  			   union tcp_cc_info *info)
  {
  	if (ext & (1 << (INET_DIAG_BBRINFO - 1)) ||
  	    ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
  		struct tcp_sock *tp = tcp_sk(sk);
  		struct bbr *bbr = inet_csk_ca(sk);
  		u64 bw = bbr_bw(sk);
  
  		bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE;
  		memset(&info->bbr, 0, sizeof(info->bbr));
  		info->bbr.bbr_bw_lo		= (u32)bw;
  		info->bbr.bbr_bw_hi		= (u32)(bw >> 32);
  		info->bbr.bbr_min_rtt		= bbr->min_rtt_us;
  		info->bbr.bbr_pacing_gain	= bbr->pacing_gain;
  		info->bbr.bbr_cwnd_gain		= bbr->cwnd_gain;
  		*attr = INET_DIAG_BBRINFO;
  		return sizeof(info->bbr);
  	}
  	return 0;
  }
  
  static void bbr_set_state(struct sock *sk, u8 new_state)
  {
  	struct bbr *bbr = inet_csk_ca(sk);
  
  	if (new_state == TCP_CA_Loss) {
  		struct rate_sample rs = { .losses = 1 };
  
  		bbr->prev_ca_state = TCP_CA_Loss;
  		bbr->full_bw = 0;
  		bbr->round_start = 1;	/* treat RTO like end of a round */
  		bbr_lt_bw_sampling(sk, &rs);
  	}
  }
  
  static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
  	.flags		= TCP_CONG_NON_RESTRICTED,
  	.name		= "bbr",
  	.owner		= THIS_MODULE,
  	.init		= bbr_init,
  	.cong_control	= bbr_main,
  	.sndbuf_expand	= bbr_sndbuf_expand,
  	.undo_cwnd	= bbr_undo_cwnd,
  	.cwnd_event	= bbr_cwnd_event,
  	.ssthresh	= bbr_ssthresh,
  	.min_tso_segs	= bbr_min_tso_segs,
  	.get_info	= bbr_get_info,
  	.set_state	= bbr_set_state,
  };
  
  static int __init bbr_register(void)
  {
  	BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
  	return tcp_register_congestion_control(&tcp_bbr_cong_ops);
  }
  
  static void __exit bbr_unregister(void)
  {
  	tcp_unregister_congestion_control(&tcp_bbr_cong_ops);
  }
  
  module_init(bbr_register);
  module_exit(bbr_unregister);
  
  MODULE_AUTHOR("Van Jacobson <vanj@google.com>");
  MODULE_AUTHOR("Neal Cardwell <ncardwell@google.com>");
  MODULE_AUTHOR("Yuchung Cheng <ycheng@google.com>");
  MODULE_AUTHOR("Soheil Hassas Yeganeh <soheil@google.com>");
  MODULE_LICENSE("Dual BSD/GPL");
  MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)");