net/ipv4/tcp_recovery.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/tcp.h>
#include <net/tcp.h>
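
/* Returns true if the first packet (sent at t1, ending at seq1) was sent
 * after the second (t2, seq2): a later send timestamp wins, and the higher
 * end sequence number breaks ties between identical timestamps.
 */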
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}

static u32 tcp_rack_reo_wnd(const struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tp->reord_seen) {
		/* If reordering has not been observed, be aggressive
		 * during recovery, or when starting recovery via the
		 * DUPACK threshold.
		 */
		if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
			return 0;

		if (tp->sacked_out >= tp->reordering &&
		    !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
			return 0;
	}

	/* To be more reordering resilient, allow min_rtt/4 settling delay.
	 * Use min_rtt instead of the smoothed RTT because reordering is
	 * often a path property and less related to queuing or delayed ACKs.
	 * Upon receiving DSACKs, linearly increase the window up to the
	 * smoothed RTT.
	 */
	return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
		   tp->srtt_us >> 3);
}
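
/* Remaining time, in microseconds, before @skb may be declared lost by
 * RACK: the most recent RACK RTT plus the reordering window, minus the
 * time elapsed since @skb was last (re)sent. A result <= 0 means the
 * packet is already overdue.
 *
 * Illustrative numbers: with min_rtt = 40 ms, reo_wnd_steps = 1 and
 * srtt = 100 ms (tp->srtt_us stores 8 * srtt), tcp_rack_reo_wnd() above
 * returns min(40 ms / 4 * 1, 100 ms) = 10 ms of settling delay.
 */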
s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
{
	return tp->rack.rtt_us + reo_wnd -
	       tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
}

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is that it applies to both original and
 * retransmitted packets and is therefore robust against tail losses.
 * Another advantage is being more resilient to reordering by simply
 * allowing some "settling delay", instead of tweaking the dupthresh.
 *
 * When tcp_rack_detect_loss() detects some packets are lost and we
 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
 * make us enter the CA_Recovery state.
 */
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb, *n;
	u32 reo_wnd;

	*reo_timeout = 0;
	reo_wnd = tcp_rack_reo_wnd(sk);
	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
				 tcp_tsorted_anchor) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
		s32 remaining;

		/* Skip ones marked lost but not yet retransmitted */
		if ((scb->sacked & TCPCB_LOST) &&
		    !(scb->sacked & TCPCB_SACKED_RETRANS))
			continue;

		/* tsorted_sent_queue is ordered by (re)transmission time, so
		 * once we reach a packet that was not sent before the most
		 * recently (s)acked one, no later entry can be lost either.
		 */
		if (!tcp_rack_sent_after(tp->rack.mstamp,
					 tcp_skb_timestamp_us(skb),
					 tp->rack.end_seq, scb->end_seq))
			break;

		/* A packet is lost if it has not been s/acked and the recent
		 * RTT plus the reordering window have elapsed since it was
		 * (re)sent.
		 */
		remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
		if (remaining <= 0) {
			tcp_mark_skb_lost(sk, skb);
			list_del_init(&skb->tcp_tsorted_anchor);
		} else {
			/* Record maximum wait time */
			*reo_timeout = max_t(u32, *reo_timeout, remaining);
		}
	}
}
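
/* Run RACK loss detection if the RACK state advanced on this ACK. Packets
 * whose rtt + reo_wnd deadline has not expired yet arm the reordering
 * timer (ICSK_TIME_REO_TIMEOUT) instead of being marked lost right away.
 */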
void tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout;

	if (!tp->rack.advanced)
		return;

	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;
	tcp_rack_detect_loss(sk, &timeout);
	if (timeout) {
		timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
					  timeout, inet_csk(sk)->icsk_rto);
	}
}

/* Record the most recently (re)sent time among the (s)acked packets.
 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
 * draft-cheng-tcpm-rack-00.txt
 */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
		      u64 xmit_time)
{
	u32 rtt_us;

	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
	if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or a prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * The aRTT term is bounded by the fast recovery or timeout,
		 * so it's at least one RTT (i.e., the retransmission is at
		 * least an RTT later).
		 */
		return;
	}
	tp->rack.advanced = 1;
	tp->rack.rtt_us = rtt_us;
	if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
				end_seq, tp->rack.end_seq)) {
		tp->rack.mstamp = xmit_time;
		tp->rack.end_seq = end_seq;
	}
}
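
/* Fires when the reordering timer (ICSK_TIME_REO_TIMEOUT) armed by
 * tcp_rack_mark_lost() expires.
 */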

/* We have waited long enough to accommodate reordering. Mark the expired
 * packets lost and retransmit them.
 */
void tcp_rack_reo_timeout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, prior_inflight;

	prior_inflight = tcp_packets_in_flight(tp);
	tcp_rack_detect_loss(sk, &timeout);
	if (prior_inflight != tcp_packets_in_flight(tp)) {
		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
			tcp_enter_recovery(sk, false);
			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				tcp_cwnd_reduction(sk, 1, 0);
		}
		tcp_xmit_retransmit_queue(sk);
	}
	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
		tcp_rearm_rto(sk);
}

/* Updates the RACK reo_wnd based on DSACK and the number of recoveries.
 *
 * If a DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
 * by srtt), since the spurious retransmission may have been due to a
 * reordering delay longer than reo_wnd.
 *
 * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
 * successful recoveries (this accounts for a full DSACK-based loss
 * recovery undo). After that, reset it to the default (min_rtt/4).
 *
 * reo_wnd is incremented at most once per RTT, so that the DSACK we are
 * reacting to was (approximately) caused by a spurious retransmission
 * sent after the last reo_wnd update.
 *
 * reo_wnd is tracked in steps of min_rtt/4 rather than as an absolute
 * value, to account for changes in the RTT.
 */
void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND ||
	    !rs->prior_delivered)
		return;

	/* Disregard the DSACK if an RTT has not passed since we adjusted reo_wnd */
	if (before(rs->prior_delivered, tp->rack.last_delivered))
		tp->rack.dsack_seen = 0;

	/* Adjust the reo_wnd if an update is pending */
	if (tp->rack.dsack_seen) {
		tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
					       tp->rack.reo_wnd_steps + 1);
		tp->rack.dsack_seen = 0;
		tp->rack.last_delivered = tp->delivered;
		tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
	} else if (!tp->rack.reo_wnd_persist) {
		tp->rack.reo_wnd_steps = 1;
	}
}
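
/* For non-SACK flows, only the packet at the head of the retransmit queue
 * is marked lost, one MSS at a time (larger TSO skbs are fragmented
 * first), in the spirit of NewReno's roughly one-segment-per-RTT recovery.
 */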

/* RFC6582 NewReno recovery for non-SACK connections. It simply retransmits
 * the next unacked packet upon receiving
 * a) three or more DUPACKs to start the fast recovery
 * b) an ACK acknowledging new data during the fast recovery.
 */
void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
{
	const u8 state = inet_csk(sk)->icsk_ca_state;
	struct tcp_sock *tp = tcp_sk(sk);

	if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
	    (state == TCP_CA_Recovery && snd_una_advanced)) {
		struct sk_buff *skb = tcp_rtx_queue_head(sk);
		u32 mss;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
			return;

		mss = tcp_skb_mss(skb);
		if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
			tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
				     mss, mss, GFP_ATOMIC);
		tcp_mark_skb_lost(sk, skb);
	}
}