net/ipv4/tcp_recovery.c

// SPDX-License-Identifier: GPL-2.0
#include <linux/tcp.h>
#include <net/tcp.h>

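/* RACK loss detection (TCP_RACK_LOSS_DETECTION) is enabled by default. */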
int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOSS_DETECTION;

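/* Mark a packet lost and, if it was a retransmission that is now lost
 * again, drop its SACKED_RETRANS state and account for the lost
 * retransmit in retrans_out and the TCPLOSTRETRANSMIT counter.
 */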
static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_skb_mark_lost_uncond_verify(tp, skb);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		/* Account for retransmits that are lost again */
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
			      tcp_skb_pcount(skb));
	}
}

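/* Return true if the packet (t1, seq1) was sent after the packet (t2, seq2),
 * comparing send timestamps and breaking ties with sequence numbers.
 */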
static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
{
	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
}

/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
 *
 * Marks a packet lost if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is it applies to both original and retransmitted
 * packets and therefore is robust against tail losses. Another advantage
 * is being more resilient to reordering by simply allowing some
 * "settling delay", instead of tweaking the dupthresh.
 *
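 * For example, if P1, P2 and P3 are sent in that order and only P3 is
 * SACKed, P1 and P2 are loss candidates because they were sent earlier;
 * they are marked lost once they have been outstanding longer than the
 * RACK RTT plus the reordering window.
 *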
 * When tcp_rack_detect_loss() detects some packets are lost and we
 * are not already in the CA_Recovery state, either tcp_rack_reo_timeout()
 * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will
 * make us enter the CA_Recovery state.
 */
static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	u32 reo_wnd;

	*reo_timeout = 0;
	/* To be more reordering resilient, allow min_rtt/4 settling delay
	 * (lower-bounded to 1000us). We use min_rtt instead of the smoothed
	 * RTT because reordering is often a path property and less related
	 * to queuing or delayed ACKs.
	 */
	reo_wnd = 1000;
	if ((tp->rack.reord || !tp->lost_out) && tcp_min_rtt(tp) != ~0U)
		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

	tcp_for_write_queue(skb, sk) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);

		if (skb == tcp_send_head(sk))
			break;

		/* Skip ones already (s)acked */
		if (!after(scb->end_seq, tp->snd_una) ||
		    scb->sacked & TCPCB_SACKED_ACKED)
			continue;
		if (tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
					tp->rack.end_seq, scb->end_seq)) {
			/* Step 3 in draft-cheng-tcpm-rack-00.txt:
			 * A packet is lost if its elapsed time is beyond
			 * the recent RTT plus the reordering window.
			 */
			u32 elapsed = tcp_stamp_us_delta(tp->tcp_mstamp,
							 skb->skb_mstamp);
			s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed;

			if (remaining < 0) {
				tcp_rack_mark_skb_lost(sk, skb);
				continue;
			}

			/* Skip ones marked lost but not yet retransmitted */
			if ((scb->sacked & TCPCB_LOST) &&
			    !(scb->sacked & TCPCB_SACKED_RETRANS))
				continue;

			/* Record maximum wait time (+1 to avoid 0) */
			*reo_timeout = max_t(u32, *reo_timeout, 1 + remaining);
		} else if (!(scb->sacked & TCPCB_RETRANS)) {
			/* Original data are sent sequentially so stop early
			 * because the rest are all sent after rack_sent
			 */
			break;
		}
	}
}

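/* Run RACK loss detection when the RACK state has advanced, i.e. a more
 * recently sent packet has been (s)acked. Arm the reordering timer if some
 * candidates still need to wait out the reordering window.
 */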
void tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout;

	if (!tp->rack.advanced)
		return;

	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;
	tcp_rack_detect_loss(sk, &timeout);
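	/* If some packets still need to wait out the reordering window, arm
	 * the RACK reordering timer to re-check them when it expires.
	 */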
	if (timeout) {
		timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
					  timeout, inet_csk(sk)->icsk_rto);
	}
}

/* Record the most recently (re)sent time among the (s)acked packets.
 * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
 * draft-cheng-tcpm-rack-00.txt
 */
void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
		      u64 xmit_time)
{
	u32 rtt_us;

	if (tp->rack.mstamp &&
	    !tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
				 end_seq, tp->rack.end_seq))
		return;

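	/* RTT of the (s)acked packet, measured from its latest (re)transmit time */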
	rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
	if (sacked & TCPCB_RETRANS) {
		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or the prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * The aRTT term is bounded by the fast recovery or timeout,
		 * so it's at least one RTT (i.e., retransmission is at least
		 * an RTT later).
		 */
		if (rtt_us < tcp_min_rtt(tp))
			return;
	}
	tp->rack.rtt_us = rtt_us;
	tp->rack.mstamp = xmit_time;
	tp->rack.end_seq = end_seq;
	tp->rack.advanced = 1;
}

/* We have waited long enough to accommodate reordering. Mark the expired
 * packets lost and retransmit them.
 */
void tcp_rack_reo_timeout(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, prior_inflight;

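	/* Snapshot the in-flight count so we can tell whether loss detection
	 * marks any packets lost below.
	 */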
	prior_inflight = tcp_packets_in_flight(tp);
	tcp_rack_detect_loss(sk, &timeout);
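	/* If new packets were marked lost (in-flight dropped), enter recovery
	 * unless we are already in it, and retransmit the lost packets.
	 */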
	if (prior_inflight != tcp_packets_in_flight(tp)) {
		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
			tcp_enter_recovery(sk, false);
			if (!inet_csk(sk)->icsk_ca_ops->cong_control)
				tcp_cwnd_reduction(sk, 1, 0);
		}
		tcp_xmit_retransmit_queue(sk);
	}
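	/* Re-arm the RTO unless the retransmission timer is already pending */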
	if (inet_csk(sk)->icsk_pending != ICSK_TIME_RETRANS)
		tcp_rearm_rto(sk);
}