Blame view

net/ipv4/tcp_input.c 171 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
  /*
   * INET		An implementation of the TCP/IP protocol suite for the LINUX
   *		operating system.  INET is implemented using the  BSD Socket
   *		interface as the means of communication with the user level.
   *
   *		Implementation of the Transmission Control Protocol(TCP).
   *
02c30a84e   Jesper Juhl   [PATCH] update Ro...
8
   * Authors:	Ross Biro
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
   *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   *		Mark Evans, <evansmp@uhura.aston.ac.uk>
   *		Corey Minyard <wf-rch!minyard@relay.EU.net>
   *		Florian La Roche, <flla@stud.uni-sb.de>
   *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
   *		Linus Torvalds, <torvalds@cs.helsinki.fi>
   *		Alan Cox, <gw4pts@gw4pts.ampr.org>
   *		Matthew Dillon, <dillon@apollo.west.oic.com>
   *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
   *		Jorge Cwik, <jorge@laser.satlink.net>
   */
  
  /*
   * Changes:
   *		Pedro Roque	:	Fast Retransmit/Recovery.
   *					Two receive queues.
   *					Retransmit queue handled by TCP.
   *					Better retransmit timer handling.
   *					New congestion avoidance.
   *					Header prediction.
   *					Variable renaming.
   *
   *		Eric		:	Fast Retransmit.
   *		Randy Scott	:	MSS option defines.
   *		Eric Schenk	:	Fixes to slow start algorithm.
   *		Eric Schenk	:	Yet another double ACK bug.
   *		Eric Schenk	:	Delayed ACK bug fixes.
   *		Eric Schenk	:	Floyd style fast retrans war avoidance.
   *		David S. Miller	:	Don't allow zero congestion window.
   *		Eric Schenk	:	Fix retransmitter so that it sends
   *					next packet on ack of previous packet.
   *		Andi Kleen	:	Moved open_request checking here
   *					and process RSTs for open_requests.
   *		Andi Kleen	:	Better prune_queue, and other fixes.
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
43
   *		Andrey Savochkin:	Fix RTT measurements in the presence of
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
44
45
46
47
48
49
50
   *					timestamps.
   *		Andrey Savochkin:	Check sequence numbers correctly when
   *					removing SACKs due to in sequence incoming
   *					data segments.
   *		Andi Kleen:		Make sure we never ack data there is not
   *					enough room for. Also make this condition
   *					a fatal error if it might still happen.
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
51
   *		Andi Kleen:		Add tcp_measure_rcv_mss to make
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
52
   *					connections with MSS<min(MTU,ann. MSS)
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
53
   *					work without delayed acks.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
54
55
56
57
58
59
60
61
   *		Andi Kleen:		Process packets with PSH set in the
   *					fast path.
   *		J Hadi Salim:		ECN support
   *	 	Andrei Gurtov,
   *		Pasi Sarolahti,
   *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
   *					engine. Lots of bugs are found.
   *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
62
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
63
  #include <linux/mm.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
64
  #include <linux/slab.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
65
66
  #include <linux/module.h>
  #include <linux/sysctl.h>
a0bffffc1   Ilpo Järvinen   net/*: use linux/...
67
  #include <linux/kernel.h>
5ffc02a15   Satoru SATOH   ip: Use inline fu...
68
  #include <net/dst.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
69
70
71
72
  #include <net/tcp.h>
  #include <net/inet_common.h>
  #include <linux/ipsec.h>
  #include <asm/unaligned.h>
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
73
  #include <net/netdma.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
74

ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
75
76
77
78
79
  /* File-scope TCP tunables; the initializers below are their defaults.
   * NOTE(review): the sysctl_ prefix suggests these are registered as
   * sysctl entries elsewhere - the registration is not visible here.
   */
  int sysctl_tcp_timestamps __read_mostly = 1;
  int sysctl_tcp_window_scaling __read_mostly = 1;
  int sysctl_tcp_sack __read_mostly = 1;
  int sysctl_tcp_fack __read_mostly = 1;
  int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
80
  EXPORT_SYMBOL(sysctl_tcp_reordering);
255cac91c   Ilpo Järvinen   tcp: extend ECN s...
81
  int sysctl_tcp_ecn __read_mostly = 2;
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
82
  EXPORT_SYMBOL(sysctl_tcp_ecn);
ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
83
84
85
  int sysctl_tcp_dsack __read_mostly = 1;
  int sysctl_tcp_app_win __read_mostly = 31;
  int sysctl_tcp_adv_win_scale __read_mostly = 2;
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
86
  EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
87

ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
88
89
90
  int sysctl_tcp_stdurg __read_mostly;
  int sysctl_tcp_rfc1337 __read_mostly;
  int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
c96fd3d46   Ilpo Järvinen   [TCP]: Enable SAC...
91
  int sysctl_tcp_frto __read_mostly = 2;
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
92
  int sysctl_tcp_frto_response __read_mostly;
ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
93
  int sysctl_tcp_nometrics_save __read_mostly;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
94

7e3801755   Andreas Petlund   net: TCP thin dupack
95
  int sysctl_tcp_thin_dupack __read_mostly;
ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
96
97
  int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
  int sysctl_tcp_abc __read_mostly;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
98

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
99
100
101
102
103
104
105
106
107
  #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
  #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
  #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
  #define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
  #define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
  #define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
  #define FLAG_ECE		0x40 /* ECE in this ACK				*/
  #define FLAG_DATA_LOST		0x80 /* SACK detected data lossage.		*/
  #define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
108
  #define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
2e6052941   Ilpo Järvinen   [TCP]: Also handl...
109
  #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
564262c1f   Ryousei Takano   [TCP]: Fix incons...
110
  #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained D-SACK info */
009a2e3e4   Ilpo Järvinen   [TCP] FRTO: Impro...
111
  #define FLAG_NONHEAD_RETRANS_ACKED	0x1000 /* Non-head rexmitted data was ACKed */
cadbd0313   Ilpo Järvinen   [TCP]: Dropped un...
112
  #define FLAG_SACK_RENEGING	0x2000 /* snd_una advanced to a sacked seq */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
113
114
115
116
117
  
  /* Composite masks built from the single-bit FLAG_* values. */
  /* Either new data or the SYN was acknowledged. */
  #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
  /* Carries data, a window update, or acknowledges something. */
  #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
  /* New SACK information or an ECE mark. */
  #define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
  /* Something was newly ACKed or newly SACKed. */
  #define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
2e6052941   Ilpo Järvinen   [TCP]: Also handl...
118
  #define FLAG_ANY_PROGRESS	(FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
119

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
120
  #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
121
  #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
122

e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
123
  /* Adapt the MSS value used to make delayed ack decision to the
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
124
   * real world.
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
125
   */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
126
  static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
127
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
128
  	struct inet_connection_sock *icsk = inet_csk(sk);
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
129
  	const unsigned int lss = icsk->icsk_ack.last_seg_size;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
130
  	unsigned int len;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
131

e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
132
  	icsk->icsk_ack.last_seg_size = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
133
134
135
136
  
  	/* skb->len may jitter because of SACKs, even if peer
  	 * sends good full-sized frames.
  	 */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
137
  	len = skb_shinfo(skb)->gso_size ? : skb->len;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
138
139
  	if (len >= icsk->icsk_ack.rcv_mss) {
  		icsk->icsk_ack.rcv_mss = len;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
140
141
142
143
144
145
  	} else {
  		/* Otherwise, we make more careful check taking into account,
  		 * that SACKs block is variable.
  		 *
  		 * "len" is invariant segment length, including TCP header.
  		 */
9c70220b7   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
146
  		len += skb->data - skb_transport_header(skb);
bee7ca9ec   William Allen Simpson   net: TCP_MSS_DEFA...
147
  		if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
148
149
150
151
152
153
  		    /* If PSH is not set, packet should be
  		     * full sized, provided peer TCP is not badly broken.
  		     * This observation (if it is correct 8)) allows
  		     * to handle super-low mtu links fairly.
  		     */
  		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
aa8223c7b   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
154
  		     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
155
156
157
158
  			/* Subtract also invariant (if peer is RFC compliant),
  			 * tcp header plus fixed timestamp option length.
  			 * Resulting "len" is MSS free of SACK jitter.
  			 */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
159
160
  			len -= tcp_sk(sk)->tcp_header_len;
  			icsk->icsk_ack.last_seg_size = len;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
161
  			if (len == lss) {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
162
  				icsk->icsk_ack.rcv_mss = len;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
163
164
165
  				return;
  			}
  		}
1ef9696c9   Alexey Kuznetsov   [TCP]: Send ACKs ...
166
167
  		if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
  			icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
168
  		icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
169
170
  	}
  }
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
171
  static void tcp_incr_quickack(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
172
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
173
174
  	struct inet_connection_sock *icsk = inet_csk(sk);
  	unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
175

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
176
177
  	if (quickacks == 0)
  		quickacks = 2;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
178
179
  	if (quickacks > icsk->icsk_ack.quick)
  		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
180
  }
1b9f40929   stephen hemminger   tcp: tcp_enter_qu...
181
  static void tcp_enter_quickack_mode(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
182
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
183
184
185
186
  	struct inet_connection_sock *icsk = inet_csk(sk);
  	tcp_incr_quickack(sk);
  	icsk->icsk_ack.pingpong = 0;
  	icsk->icsk_ack.ato = TCP_ATO_MIN;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
187
188
189
190
191
  }
  
  /* Send ACKs quickly, if "quick" count is not exhausted
   * and the session is not interactive.
   */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
192
  static inline int tcp_in_quickack_mode(const struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
193
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
194
195
  	const struct inet_connection_sock *icsk = inet_csk(sk);
  	return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
196
  }
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
197
198
  static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
  {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
199
  	if (tp->ecn_flags & TCP_ECN_OK)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
200
201
  		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
202
  static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
203
204
205
206
207
208
209
210
211
  {
  	if (tcp_hdr(skb)->cwr)
  		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
  }
  
  /* Unconditionally clear TCP_ECN_DEMAND_CWR. */
  static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
  {
  	tp->ecn_flags = tp->ecn_flags & ~TCP_ECN_DEMAND_CWR;
  }
7a269ffad   Eric Dumazet   tcp: ECN blackhol...
212
  static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
213
  {
7a269ffad   Eric Dumazet   tcp: ECN blackhol...
214
215
  	if (!(tp->ecn_flags & TCP_ECN_OK))
  		return;
b82d1bb4f   Eric Dumazet   tcp: unalias tcp_...
216
  	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
7a269ffad   Eric Dumazet   tcp: ECN blackhol...
217
  	case INET_ECN_NOT_ECT:
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
218
  		/* Funny extension: if ECT is not set on a segment,
7a269ffad   Eric Dumazet   tcp: ECN blackhol...
219
220
221
222
  		 * and we already seen ECT on a previous segment,
  		 * it is probably a retransmit.
  		 */
  		if (tp->ecn_flags & TCP_ECN_SEEN)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
223
  			tcp_enter_quickack_mode((struct sock *)tp);
7a269ffad   Eric Dumazet   tcp: ECN blackhol...
224
225
226
227
228
229
  		break;
  	case INET_ECN_CE:
  		tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
  		/* fallinto */
  	default:
  		tp->ecn_flags |= TCP_ECN_SEEN;
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
230
231
  	}
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
232
  static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
233
  {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
234
  	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
235
236
  		tp->ecn_flags &= ~TCP_ECN_OK;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
237
  static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
238
  {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
239
  	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
240
241
  		tp->ecn_flags &= ~TCP_ECN_OK;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
242
  static inline int TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
243
  {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
244
  	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
245
246
247
  		return 1;
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
248
249
250
251
252
253
254
  /* Buffer size and advertised window tuning.
   *
   * 1. Tuning sk->sk_sndbuf, when connection enters established state.
   */
  
  static void tcp_fixup_sndbuf(struct sock *sk)
  {
87fb4b7b5   Eric Dumazet   net: more accurat...
255
  	int sndmem = SKB_TRUESIZE(tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
256

06a59ecb9   Eric Dumazet   tcp: use TCP_INIT...
257
258
259
  	sndmem *= TCP_INIT_CWND;
  	if (sk->sk_sndbuf < sndmem)
  		sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
  }
  
  /* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
   *
   * All tcp_full_space() is split to two parts: "network" buffer, allocated
   * forward and advertised in receiver window (tp->rcv_wnd) and
   * "application buffer", required to isolate scheduling/application
   * latencies from network.
   * window_clamp is maximal advertised window. It can be less than
   * tcp_full_space(), in this case tcp_full_space() - window_clamp
   * is reserved for "application" buffer. The less window_clamp is
   * the smoother our behaviour from viewpoint of network, but the lower
   * throughput and the higher sensitivity of the connection to losses. 8)
   *
   * rcv_ssthresh is more strict window_clamp used at "slow start"
   * phase to predict further behaviour of this connection.
   * It is used for two goals:
   * - to enforce header prediction at sender, even when application
   *   requires some significant "application buffer". It is check #1.
   * - to prevent pruning of receive queue because of misprediction
   *   of receiver window. Check #2.
   *
   * The scheme does not work when sender sends good segments opening
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
283
   * window and then starts to feed us spaghetti. But it should work
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
284
285
286
287
   * in common situations. Otherwise, we have to rely on queue collapsing.
   */
  
  /* Slow part of check#2. */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
288
  static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
289
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
290
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
291
  	/* Optimize this! */
dfd4f0ae2   Eric Dumazet   [TCP]: Avoid two ...
292
293
  	int truesize = tcp_win_from_space(skb->truesize) >> 1;
  	int window = tcp_win_from_space(sysctl_tcp_rmem[2]) >> 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
294
295
296
  
  	while (tp->rcv_ssthresh <= window) {
  		if (truesize <= skb->len)
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
297
  			return 2 * inet_csk(sk)->icsk_ack.rcv_mss;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
298
299
300
301
302
303
  
  		truesize >>= 1;
  		window >>= 1;
  	}
  	return 0;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
304
  static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
305
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
306
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
307
308
309
  	/* Check #1 */
  	if (tp->rcv_ssthresh < tp->window_clamp &&
  	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
180d8cd94   Glauber Costa   foundations of pe...
310
  	    !sk_under_memory_pressure(sk)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
311
312
313
314
315
316
  		int incr;
  
  		/* Check #2. Increase window, if skb with such overhead
  		 * will fit to rcvbuf in future.
  		 */
  		if (tcp_win_from_space(skb->truesize) <= skb->len)
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
317
  			incr = 2 * tp->advmss;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
318
  		else
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
319
  			incr = __tcp_grow_window(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
320
321
  
  		if (incr) {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
322
323
  			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
  					       tp->window_clamp);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
324
  			inet_csk(sk)->icsk_ack.quick |= 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
325
326
327
328
329
330
331
332
  		}
  	}
  }
  
  /* 3. Tuning rcvbuf, when connection enters established state. */
  
  static void tcp_fixup_rcvbuf(struct sock *sk)
  {
e9266a02b   Eric Dumazet   tcp: use TCP_DEFA...
333
334
335
  	u32 mss = tcp_sk(sk)->advmss;
  	u32 icwnd = TCP_DEFAULT_INIT_RCVWND;
  	int rcvmem;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
336

e9266a02b   Eric Dumazet   tcp: use TCP_DEFA...
337
338
  	/* Limit to 10 segments if mss <= 1460,
  	 * or 14600/mss segments, with a minimum of two segments.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
339
  	 */
e9266a02b   Eric Dumazet   tcp: use TCP_DEFA...
340
341
342
343
344
  	if (mss > 1460)
  		icwnd = max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
  
  	rcvmem = SKB_TRUESIZE(mss + MAX_TCP_HEADER);
  	while (tcp_win_from_space(rcvmem) < mss)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
345
  		rcvmem += 128;
e9266a02b   Eric Dumazet   tcp: use TCP_DEFA...
346
347
348
349
350
  
  	rcvmem *= icwnd;
  
  	if (sk->sk_rcvbuf < rcvmem)
  		sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
351
  }
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
352
  /* 4. Try to fixup all. It is made immediately after connection enters
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
   *    established state.
   */
  /* Size the socket buffers and derive the receive-window limits.
   *
   * Calls tcp_fixup_rcvbuf() / tcp_fixup_sndbuf() unless the matching
   * SOCK_RCVBUF_LOCK / SOCK_SNDBUF_LOCK bit is set in sk_userlocks, seeds
   * rcvq_space.space from rcv_wnd, bounds window_clamp by
   * tcp_full_space(), bounds rcv_ssthresh by window_clamp and stamps
   * snd_cwnd_stamp with the current tcp_time_stamp.
   */
  static void tcp_init_buffer_space(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	int maxwin;
  
  	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
  		tcp_fixup_rcvbuf(sk);
  	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
  		tcp_fixup_sndbuf(sk);
  
  	tp->rcvq_space.space = tp->rcv_wnd;
  
  	maxwin = tcp_full_space(sk);
  
  	/* The clamp may not exceed the full buffer space. */
  	if (tp->window_clamp >= maxwin) {
  		tp->window_clamp = maxwin;
  
  		/* Hold back maxwin >> sysctl_tcp_app_win of the space, but
  		 * keep the clamp at 4 * advmss or more.
  		 * NOTE(review): the shift is undefined if sysctl_tcp_app_win
  		 * is negative or >= the bit width of int; nothing in this
  		 * function bounds it - confirm the sysctl handler does.
  		 */
  		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
  			tp->window_clamp = max(maxwin -
  					       (maxwin >> sysctl_tcp_app_win),
  					       4 * tp->advmss);
  	}
  
  	/* Force reservation of one segment. */
  	if (sysctl_tcp_app_win &&
  	    tp->window_clamp > 2 * tp->advmss &&
  	    tp->window_clamp + tp->advmss > maxwin)
  		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
  
  	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
  	tp->snd_cwnd_stamp = tcp_time_stamp;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
387
  /* 5. Recalculate window clamp after socket hit its memory bounds. */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
388
  static void tcp_clamp_window(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
389
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
390
  	struct tcp_sock *tp = tcp_sk(sk);
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
391
  	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
392

6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
393
  	icsk->icsk_ack.quick = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
394

326f36e9e   John Heffner   [TCP]: receive bu...
395
396
  	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
  	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
180d8cd94   Glauber Costa   foundations of pe...
397
398
  	    !sk_under_memory_pressure(sk) &&
  	    sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
326f36e9e   John Heffner   [TCP]: receive bu...
399
400
  		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
  				    sysctl_tcp_rmem[2]);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
401
  	}
326f36e9e   John Heffner   [TCP]: receive bu...
402
  	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
403
  		tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
404
  }
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
405
406
407
408
409
410
411
412
413
  /* Initialize RCV_MSS value.
   * RCV_MSS is our guess about the MSS used by the peer.
   * We haven't any direct information about the MSS.
   * It's better to underestimate the RCV_MSS rather than overestimate.
   * Overestimations make us ACKing less frequently than needed.
   * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss().
   */
  void tcp_initialize_rcv_mss(struct sock *sk)
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
414
  	const struct tcp_sock *tp = tcp_sk(sk);
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
415
  	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
416
  	hint = min(hint, tp->rcv_wnd / 2);
bee7ca9ec   William Allen Simpson   net: TCP_MSS_DEFA...
417
  	hint = min(hint, TCP_MSS_DEFAULT);
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
418
419
420
421
  	hint = max(hint, TCP_MIN_MSS);
  
  	inet_csk(sk)->icsk_ack.rcv_mss = hint;
  }
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
422
  EXPORT_SYMBOL(tcp_initialize_rcv_mss);
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
423

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
424
425
426
427
  /* Receiver "autotuning" code.
   *
   * The algorithm for RTT estimation w/o timestamps is based on
   * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
631dd1a88   Justin P. Mattock   Update broken web...
428
   * <http://public.lanl.gov/radiant/pubs.html#DRS>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
429
430
   *
   * More detail on this code can be found at
631dd1a88   Justin P. Mattock   Update broken web...
431
   * <http://staff.psc.edu/jheffner/>,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
   * though this reference is out of date.  A new paper
   * is pending.
   */
  /* Fold one RTT sample into the receiver-side estimate.
   *
   * @tp:      connection whose rcv_rtt_est.rtt is updated
   * @sample:  measured RTT; a value of 0 is treated as 1
   * @win_dep: non-zero for window-based (non-timestamp) samples, which
   *           only ever lower the estimate; zero for timestamp samples,
   *           which are smoothed with a gain of 1/8
   *
   * rcv_rtt_est.rtt is kept scaled by 8 (<< 3).
   */
  static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
  {
  	u32 new_sample = tp->rcv_rtt_est.rtt;
  	long m = sample;

  	if (m == 0)
  		m = 1;

  	if (new_sample != 0) {
  		/* If we sample in larger samples in the non-timestamp
  		 * case, we could grossly overestimate the RTT especially
  		 * with chatty applications or bulk transfer apps which
  		 * are stalled on filesystem I/O.
  		 *
  		 * Also, since we are only going for a minimum in the
  		 * non-timestamp case, we do not smooth things out
  		 * else with timestamps disabled convergence takes too
  		 * long.
  		 */
  		if (!win_dep) {
  			m -= (new_sample >> 3);
  			new_sample += m;
  		} else {
  			/* Bring the sample to the same 8x scale as
  			 * new_sample before testing for a new minimum;
  			 * comparing the raw sample against the scaled
  			 * estimate would accept samples up to 8x too
  			 * large.
  			 */
  			m <<= 3;
  			if (m < new_sample)
  				new_sample = m;
  		}
  	} else {
  		/* No previous measure. */
  		new_sample = m << 3;
  	}

  	/* Avoid writing the field when nothing changed. */
  	if (tp->rcv_rtt_est.rtt != new_sample)
  		tp->rcv_rtt_est.rtt = new_sample;
  }
  
  /* Window-based receiver RTT measurement (no timestamps needed).
   *
   * A measurement round ends once rcv_nxt reaches the sequence recorded
   * in rcv_rtt_est.seq; the time elapsed since rcv_rtt_est.time is then
   * fed to tcp_rcv_rtt_update() as a window-dependent sample and a new
   * round is started one receive window ahead.
   */
  static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
  {
  	if (tp->rcv_rtt_est.time != 0) {
  		/* Round still in progress. */
  		if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
  			return;

  		/* rcv_rtt_est.time is stamped from tcp_time_stamp below,
  		 * so take the difference against the same clock.
  		 */
  		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1);
  	}

  	/* Start a new round. */
  	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
  	tp->rcv_rtt_est.time = tcp_time_stamp;
  }
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
480
481
  static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
  					  const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
482
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
483
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
484
485
  	if (tp->rx_opt.rcv_tsecr &&
  	    (TCP_SKB_CB(skb)->end_seq -
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
486
  	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
487
488
489
490
491
492
493
494
495
496
497
498
  		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
  }
  
  /*
   * This function should be called every time data is copied to user space.
   * It calculates the appropriate TCP receive buffer space.
   */
  void tcp_rcv_space_adjust(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	int time;
  	int space;
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
499

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
500
501
  	if (tp->rcvq_space.time == 0)
  		goto new_measure;
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
502

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
503
  	time = tcp_time_stamp - tp->rcvq_space.time;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
504
  	if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
505
  		return;
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
506

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
507
508
509
510
511
512
513
514
  	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
  
  	space = max(tp->rcvq_space.space, space);
  
  	if (tp->rcvq_space.space != space) {
  		int rcvmem;
  
  		tp->rcvq_space.space = space;
6fcf9412d   John Heffner   [TCP]: rcvbuf loc...
515
516
  		if (sysctl_tcp_moderate_rcvbuf &&
  		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
517
518
519
520
521
522
523
524
525
  			int new_clamp = space;
  
  			/* Receive space grows, normalize in order to
  			 * take into account packet headers and sk_buff
  			 * structure overhead.
  			 */
  			space /= tp->advmss;
  			if (!space)
  				space = 1;
87fb4b7b5   Eric Dumazet   net: more accurat...
526
  			rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
527
528
529
530
531
532
533
534
535
536
537
538
  			while (tcp_win_from_space(rcvmem) < tp->advmss)
  				rcvmem += 128;
  			space *= rcvmem;
  			space = min(space, sysctl_tcp_rmem[2]);
  			if (space > sk->sk_rcvbuf) {
  				sk->sk_rcvbuf = space;
  
  				/* Make the window clamp follow along.  */
  				tp->window_clamp = new_clamp;
  			}
  		}
  	}
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
539

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
  new_measure:
  	tp->rcvq_space.seq = tp->copied_seq;
  	tp->rcvq_space.time = tcp_time_stamp;
  }
  
  /* There is something which you must keep in mind when you analyze the
   * behavior of the tp->ato delayed ack timeout interval.  When a
   * connection starts up, we want to ack as quickly as possible.  The
   * problem is that "good" TCP's do slow start at the beginning of data
   * transmission.  This means that until we send the first few ACK's the
   * sender will sit on his end and only queue most of his data, because
   * he can only send snd_cwnd unacked packets at any given time.  For
   * each ACK we send, he increments snd_cwnd and transmits more of his
   * queue.  -DaveM
   */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
555
  static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
556
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
557
  	struct tcp_sock *tp = tcp_sk(sk);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
558
  	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
559
  	u32 now;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
560
  	inet_csk_schedule_ack(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
561

463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
562
  	tcp_measure_rcv_mss(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
563
564
  
  	tcp_rcv_rtt_measure(tp);
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
565

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
566
  	now = tcp_time_stamp;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
567
  	if (!icsk->icsk_ack.ato) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
568
569
570
  		/* The _first_ data packet received, initialize
  		 * delayed ACK engine.
  		 */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
571
572
  		tcp_incr_quickack(sk);
  		icsk->icsk_ack.ato = TCP_ATO_MIN;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
573
  	} else {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
574
  		int m = now - icsk->icsk_ack.lrcvtime;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
575

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
576
  		if (m <= TCP_ATO_MIN / 2) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
577
  			/* The fastest case is the first. */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
578
579
580
581
582
583
  			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;
  		} else if (m < icsk->icsk_ack.ato) {
  			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;
  			if (icsk->icsk_ack.ato > icsk->icsk_rto)
  				icsk->icsk_ack.ato = icsk->icsk_rto;
  		} else if (m > icsk->icsk_rto) {
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
584
  			/* Too long gap. Apparently sender failed to
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
585
586
  			 * restart window, so that we send ACKs quickly.
  			 */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
587
  			tcp_incr_quickack(sk);
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
588
  			sk_mem_reclaim(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
589
590
  		}
  	}
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
591
  	icsk->icsk_ack.lrcvtime = now;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
592
593
594
595
  
  	TCP_ECN_check_ce(tp, skb);
  
  	if (skb->len >= 128)
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
596
  		tcp_grow_window(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
597
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
598
599
600
601
602
603
604
605
606
  /* Called to compute a smoothed rtt estimate. The data fed to this
   * routine either comes from timestamps, or from segments that were
   * known _not_ to have been retransmitted [see Karn/Partridge
   * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
   * piece by Van Jacobson.
   * NOTE: the next three routines used to be one big routine.
   * To save cycles in the RFC 1323 implementation it was better to break
   * it up into three procedures. -- erics
   */
2d2abbab6   Stephen Hemminger   [TCP]: simplify m...
607
  static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
608
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
609
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
610
  	long m = mrtt; /* RTT */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
611
612
613
  	/*	The following amusing code comes from Jacobson's
  	 *	article in SIGCOMM '88.  Note that rtt and mdev
  	 *	are scaled versions of rtt and mean deviation.
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
614
  	 *	This is designed to be as fast as possible
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
615
616
617
618
619
620
621
  	 *	m stands for "measurement".
  	 *
  	 *	On a 1990 paper the rto value is changed to:
  	 *	RTO = rtt + 4 * mdev
  	 *
  	 * Funny. This algorithm seems to be very broken.
  	 * These formulae increase RTO, when it should be decreased, increase
31f342690   Stephen Hemminger   [TCP]: More spell...
622
  	 * too slowly, when it should be increased quickly, decrease too quickly
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
623
624
625
626
  	 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
  	 * does not matter how to _calculate_ it. Seems, it was trap
  	 * that VJ failed to avoid. 8)
  	 */
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
627
  	if (m == 0)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
  		m = 1;
  	if (tp->srtt != 0) {
  		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
  		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
  		if (m < 0) {
  			m = -m;		/* m is now abs(error) */
  			m -= (tp->mdev >> 2);   /* similar update on mdev */
  			/* This is similar to one of Eifel findings.
  			 * Eifel blocks mdev updates when rtt decreases.
  			 * This solution is a bit different: we use finer gain
  			 * for mdev in this case (alpha*beta).
  			 * Like Eifel it also prevents growth of rto,
  			 * but also it limits too fast rto decreases,
  			 * happening in pure Eifel.
  			 */
  			if (m > 0)
  				m >>= 3;
  		} else {
  			m -= (tp->mdev >> 2);   /* similar update on mdev */
  		}
  		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
  		if (tp->mdev > tp->mdev_max) {
  			tp->mdev_max = tp->mdev;
  			if (tp->mdev_max > tp->rttvar)
  				tp->rttvar = tp->mdev_max;
  		}
  		if (after(tp->snd_una, tp->rtt_seq)) {
  			if (tp->mdev_max < tp->rttvar)
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
656
  				tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
657
  			tp->rtt_seq = tp->snd_nxt;
05bb1fad1   David S. Miller   [TCP]: Allow mini...
658
  			tp->mdev_max = tcp_rto_min(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
659
660
661
  		}
  	} else {
  		/* no previous measure. */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
662
663
  		tp->srtt = m << 3;	/* take the measured time to be rtt */
  		tp->mdev = m << 1;	/* make sure rto = 3*rtt */
05bb1fad1   David S. Miller   [TCP]: Allow mini...
664
  		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
665
666
  		tp->rtt_seq = tp->snd_nxt;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
667
668
669
670
671
  }
  
  /* Calculate rto without backoff.  This is the second half of Van Jacobson's
   * routine referred to above.
   */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
672
  static inline void tcp_set_rto(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
673
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
674
  	const struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
675
676
677
678
679
680
681
682
  	/* Old crap is replaced with new one. 8)
  	 *
  	 * More seriously:
  	 * 1. If rtt variance happened to be less 50msec, it is hallucination.
  	 *    It cannot be less due to utterly erratic ACK generation made
  	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
  	 *    to do with delayed acks, because at cwnd>2 true delack timeout
  	 *    is invisible. Actually, Linux-2.4 also generates erratic
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
683
  	 *    ACKs in some circumstances.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
684
  	 */
f1ecd5d9e   Damian Lukowski   Revert Backoff [v...
685
  	inet_csk(sk)->icsk_rto = __tcp_set_rto(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
686
687
688
689
  
  	/* 2. Fixups made earlier cannot be right.
  	 *    If we do not estimate RTO correctly without them,
  	 *    all the algo is pure shit and should be replaced
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
690
  	 *    with correct one. It is exactly, which we pretend to do.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
691
  	 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
692

ee6aac595   Ilpo Järvinen   tcp: drop tcp_bou...
693
694
695
  	/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
  	 * guarantees that rto is higher.
  	 */
f1ecd5d9e   Damian Lukowski   Revert Backoff [v...
696
  	tcp_bound_rto(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
  }
  
  /* Save metrics learned by this TCP session.
     This function is called only, when TCP finishes successfully
     i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
   */
  void tcp_update_metrics(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct dst_entry *dst = __sk_dst_get(sk);
  
  	if (sysctl_tcp_nometrics_save)
  		return;
  
  	dst_confirm(dst);
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
712
  	if (dst && (dst->flags & DST_HOST)) {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
713
  		const struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
714
  		int m;
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
715
  		unsigned long rtt;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
716

6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
717
  		if (icsk->icsk_backoff || !tp->srtt) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
718
719
720
721
722
  			/* This session failed to estimate rtt. Why?
  			 * Probably, no packets returned in time.
  			 * Reset our results.
  			 */
  			if (!(dst_metric_locked(dst, RTAX_RTT)))
defb3519a   David S. Miller   net: Abstract awa...
723
  				dst_metric_set(dst, RTAX_RTT, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
724
725
  			return;
  		}
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
726
727
  		rtt = dst_metric_rtt(dst, RTAX_RTT);
  		m = rtt - tp->srtt;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
728
729
730
731
732
733
734
  
  		/* If newly calculated rtt larger than stored one,
  		 * store new one. Otherwise, use EWMA. Remember,
  		 * rtt overestimation is always better than underestimation.
  		 */
  		if (!(dst_metric_locked(dst, RTAX_RTT))) {
  			if (m <= 0)
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
735
  				set_dst_metric_rtt(dst, RTAX_RTT, tp->srtt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
736
  			else
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
737
  				set_dst_metric_rtt(dst, RTAX_RTT, rtt - (m >> 3));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
738
739
740
  		}
  
  		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
741
  			unsigned long var;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
742
743
744
745
746
747
748
  			if (m < 0)
  				m = -m;
  
  			/* Scale deviation to rttvar fixed point */
  			m >>= 1;
  			if (m < tp->mdev)
  				m = tp->mdev;
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
749
750
751
  			var = dst_metric_rtt(dst, RTAX_RTTVAR);
  			if (m >= var)
  				var = m;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
752
  			else
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
753
754
755
  				var -= (var - m) >> 2;
  
  			set_dst_metric_rtt(dst, RTAX_RTTVAR, var);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
756
  		}
0b6a05c1d   Ilpo Järvinen   tcp: fix ssthresh...
757
  		if (tcp_in_initial_slowstart(tp)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
758
759
760
761
  			/* Slow start still did not finish. */
  			if (dst_metric(dst, RTAX_SSTHRESH) &&
  			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
  			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
defb3519a   David S. Miller   net: Abstract awa...
762
  				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_cwnd >> 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
763
764
  			if (!dst_metric_locked(dst, RTAX_CWND) &&
  			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
defb3519a   David S. Miller   net: Abstract awa...
765
  				dst_metric_set(dst, RTAX_CWND, tp->snd_cwnd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
766
  		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
767
  			   icsk->icsk_ca_state == TCP_CA_Open) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
768
769
  			/* Cong. avoidance phase, cwnd is reliable. */
  			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
defb3519a   David S. Miller   net: Abstract awa...
770
771
  				dst_metric_set(dst, RTAX_SSTHRESH,
  					       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
772
  			if (!dst_metric_locked(dst, RTAX_CWND))
defb3519a   David S. Miller   net: Abstract awa...
773
774
775
  				dst_metric_set(dst, RTAX_CWND,
  					       (dst_metric(dst, RTAX_CWND) +
  						tp->snd_cwnd) >> 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
776
777
778
779
780
  		} else {
  			/* Else slow start did not finish, cwnd is non-sense,
  			   ssthresh may be also invalid.
  			 */
  			if (!dst_metric_locked(dst, RTAX_CWND))
defb3519a   David S. Miller   net: Abstract awa...
781
782
783
  				dst_metric_set(dst, RTAX_CWND,
  					       (dst_metric(dst, RTAX_CWND) +
  						tp->snd_ssthresh) >> 1);
5ffc02a15   Satoru SATOH   ip: Use inline fu...
784
  			if (dst_metric(dst, RTAX_SSTHRESH) &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
785
  			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
5ffc02a15   Satoru SATOH   ip: Use inline fu...
786
  			    tp->snd_ssthresh > dst_metric(dst, RTAX_SSTHRESH))
defb3519a   David S. Miller   net: Abstract awa...
787
  				dst_metric_set(dst, RTAX_SSTHRESH, tp->snd_ssthresh);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
788
789
790
  		}
  
  		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
5ffc02a15   Satoru SATOH   ip: Use inline fu...
791
  			if (dst_metric(dst, RTAX_REORDERING) < tp->reordering &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
792
  			    tp->reordering != sysctl_tcp_reordering)
defb3519a   David S. Miller   net: Abstract awa...
793
  				dst_metric_set(dst, RTAX_REORDERING, tp->reordering);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
794
795
796
  		}
  	}
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
797
  __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
798
799
  {
  	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
22b71c8f4   Gerrit Renker   tcp/dccp: Consoli...
800
  	if (!cwnd)
442b9635c   David S. Miller   tcp: Increase the...
801
  		cwnd = TCP_INIT_CWND;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
802
803
  	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
  }
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
804
  /* Set slow start threshold and cwnd not falling to slow start */
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
805
  void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
806
807
  {
  	struct tcp_sock *tp = tcp_sk(sk);
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
808
  	const struct inet_connection_sock *icsk = inet_csk(sk);
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
809
810
811
  
  	tp->prior_ssthresh = 0;
  	tp->bytes_acked = 0;
e01f9d779   Ilpo Järvinen   [TCP]: Complete i...
812
  	if (icsk->icsk_ca_state < TCP_CA_CWR) {
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
813
  		tp->undo_marker = 0;
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
814
815
  		if (set_ssthresh)
  			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
816
817
818
819
820
821
822
823
824
825
  		tp->snd_cwnd = min(tp->snd_cwnd,
  				   tcp_packets_in_flight(tp) + 1U);
  		tp->snd_cwnd_cnt = 0;
  		tp->high_seq = tp->snd_nxt;
  		tp->snd_cwnd_stamp = tcp_time_stamp;
  		TCP_ECN_queue_cwr(tp);
  
  		tcp_set_ca_state(sk, TCP_CA_CWR);
  	}
  }
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
826
827
828
829
830
831
  /*
   * Packet counting of FACK is based on in-order assumptions, therefore TCP
   * disables it when reordering is detected
   */
  static void tcp_disable_fack(struct tcp_sock *tp)
  {
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
832
833
834
  	/* RFC3517 uses different metric in lost marker => reset on change */
  	if (tcp_is_fack(tp))
  		tp->lost_skb_hint = NULL;
ab56222a3   Vijay Subramanian   tcp: Replace cons...
835
  	tp->rx_opt.sack_ok &= ~TCP_FACK_ENABLED;
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
836
  }
564262c1f   Ryousei Takano   [TCP]: Fix incons...
837
  /* Take a notice that peer is sending D-SACKs */
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
838
839
  static void tcp_dsack_seen(struct tcp_sock *tp)
  {
ab56222a3   Vijay Subramanian   tcp: Replace cons...
840
  	tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
841
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
  /* Initialize metrics on socket. */
  
  static void tcp_init_metrics(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct dst_entry *dst = __sk_dst_get(sk);
  
  	if (dst == NULL)
  		goto reset;
  
  	dst_confirm(dst);
  
  	if (dst_metric_locked(dst, RTAX_CWND))
  		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
  	if (dst_metric(dst, RTAX_SSTHRESH)) {
  		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
  		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
  			tp->snd_ssthresh = tp->snd_cwnd_clamp;
9ad7c049f   Jerry Chu   tcp: RFC2988bis +...
860
861
862
863
864
  	} else {
  		/* ssthresh may have been reduced unnecessarily during.
  		 * 3WHS. Restore it back to its initial default.
  		 */
  		tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
865
866
867
  	}
  	if (dst_metric(dst, RTAX_REORDERING) &&
  	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
868
  		tcp_disable_fack(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
869
870
  		tp->reordering = dst_metric(dst, RTAX_REORDERING);
  	}
9ad7c049f   Jerry Chu   tcp: RFC2988bis +...
871
  	if (dst_metric(dst, RTAX_RTT) == 0 || tp->srtt == 0)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
872
873
874
875
876
877
878
879
  		goto reset;
  
  	/* Initial rtt is determined from SYN,SYN-ACK.
  	 * The segment is small and rtt may appear much
  	 * less than real one. Use per-dst memory
  	 * to make it more realistic.
  	 *
  	 * A bit of theory. RTT is time passed after "normal" sized packet
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
880
  	 * is sent until it is ACKed. In normal circumstances sending small
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
881
882
883
884
885
886
887
  	 * packets force peer to delay ACKs and calculation is correct too.
  	 * The algorithm is adaptive and, provided we follow specs, it
  	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
  	 * tricks sort of "quick acks" for time long enough to decrease RTT
  	 * to low value, and then abruptly stops to do it and starts to delay
  	 * ACKs, wait for troubles.
  	 */
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
888
889
  	if (dst_metric_rtt(dst, RTAX_RTT) > tp->srtt) {
  		tp->srtt = dst_metric_rtt(dst, RTAX_RTT);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
890
891
  		tp->rtt_seq = tp->snd_nxt;
  	}
c1e20f7c8   Stephen Hemminger   tcp: RTT metrics ...
892
893
  	if (dst_metric_rtt(dst, RTAX_RTTVAR) > tp->mdev) {
  		tp->mdev = dst_metric_rtt(dst, RTAX_RTTVAR);
488faa2ae   Satoru SATOH   [IPV4]: Make tcp_...
894
  		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
895
  	}
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
896
  	tcp_set_rto(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
897
  reset:
9ad7c049f   Jerry Chu   tcp: RFC2988bis +...
898
899
900
901
902
903
  	if (tp->srtt == 0) {
  		/* RFC2988bis: We've failed to get a valid RTT sample from
  		 * 3WHS. This is most likely due to retransmission,
  		 * including spurious one. Reset the RTO back to 3secs
  		 * from the more aggressive 1sec to avoid more spurious
  		 * retransmission.
d9f4fbaf7   Jiri Kosina   tcp: cleanup of c...
904
  		 */
9ad7c049f   Jerry Chu   tcp: RFC2988bis +...
905
906
  		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
  		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
907
  	}
9ad7c049f   Jerry Chu   tcp: RFC2988bis +...
908
909
910
911
912
913
914
915
916
  	/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
  	 * retransmitted. In light of RFC2988bis' more aggressive 1sec
  	 * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
  	 * retransmission has occurred.
  	 */
  	if (tp->total_retrans > 1)
  		tp->snd_cwnd = 1;
  	else
  		tp->snd_cwnd = tcp_init_cwnd(tp, dst);
d9f4fbaf7   Jiri Kosina   tcp: cleanup of c...
917
  	tp->snd_cwnd_stamp = tcp_time_stamp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
918
  }
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
919
920
  static void tcp_update_reordering(struct sock *sk, const int metric,
  				  const int ts)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
921
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
922
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
923
  	if (metric > tp->reordering) {
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
924
  		int mib_idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
925
926
927
928
  		tp->reordering = min(TCP_MAX_REORDERING, metric);
  
  		/* This exciting event is worth to be remembered. 8) */
  		if (ts)
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
929
  			mib_idx = LINUX_MIB_TCPTSREORDER;
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
930
  		else if (tcp_is_reno(tp))
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
931
  			mib_idx = LINUX_MIB_TCPRENOREORDER;
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
932
  		else if (tcp_is_fack(tp))
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
933
  			mib_idx = LINUX_MIB_TCPFACKREORDER;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
934
  		else
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
935
  			mib_idx = LINUX_MIB_TCPSACKREORDER;
de0744af1   Pavel Emelyanov   mib: add net to N...
936
  		NET_INC_STATS_BH(sock_net(sk), mib_idx);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
937
938
939
  #if FASTRETRANS_DEBUG > 1
  		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d
  ",
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
940
  		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
941
942
943
944
945
  		       tp->reordering,
  		       tp->fackets_out,
  		       tp->sacked_out,
  		       tp->undo_marker ? tp->undo_retrans : 0);
  #endif
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
946
  		tcp_disable_fack(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
947
948
  	}
  }
006f582c7   Ilpo Järvinen   tcp: convert retr...
949
  /* This must be called before lost_out is incremented */
c8c213f20   Ilpo Järvinen   tcp: move tcp_ver...
950
951
  static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
  {
006f582c7   Ilpo Järvinen   tcp: convert retr...
952
  	if ((tp->retransmit_skb_hint == NULL) ||
c8c213f20   Ilpo Järvinen   tcp: move tcp_ver...
953
954
  	    before(TCP_SKB_CB(skb)->seq,
  		   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
006f582c7   Ilpo Järvinen   tcp: convert retr...
955
956
957
958
959
  		tp->retransmit_skb_hint = skb;
  
  	if (!tp->lost_out ||
  	    after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high))
  		tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
c8c213f20   Ilpo Järvinen   tcp: move tcp_ver...
960
  }
41ea36e35   Ilpo Järvinen   tcp: add helper f...
961
962
963
964
965
966
967
968
969
  /* Tag @skb as lost unless it is already tagged LOST or SACKED_ACKED.
   * The retransmit hints are refreshed before lost_out is increased.
   */
  static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
  {
  	if (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))
  		return;
  
  	tcp_verify_retransmit_hint(tp, skb);
  
  	tp->lost_out += tcp_skb_pcount(skb);
  	TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
  }
e1aa680fa   Ilpo Järvinen   tcp: move tcp_sim...
970
971
  static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
  					    struct sk_buff *skb)
006f582c7   Ilpo Järvinen   tcp: convert retr...
972
973
974
975
976
977
978
979
  {
  	tcp_verify_retransmit_hint(tp, skb);
  
  	if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
  		tp->lost_out += tcp_skb_pcount(skb);
  		TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
  /* This procedure tags the retransmission queue when SACKs arrive.
   *
   * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
   * Packets in queue with these bits set are counted in variables
   * sacked_out, retrans_out and lost_out, correspondingly.
   *
   * Valid combinations are:
   * Tag  InFlight	Description
   * 0	1		- orig segment is in flight.
   * S	0		- nothing flies, orig reached receiver.
   * L	0		- nothing flies, orig lost by net.
   * R	2		- both orig and retransmit are in flight.
   * L|R	1		- orig is lost, retransmit is in flight.
   * S|R  1		- orig reached receiver, retrans is still in flight.
   * (L|S|R is logically valid, it could occur when L|R is sacked,
   *  but it is equivalent to plain S and code short-circuits it to S.
   *  L|S is logically invalid, it would mean -1 packet in flight 8))
   *
   * These 6 states form finite state machine, controlled by the following events:
   * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
   * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
   * 3. Loss detection event of one of three flavors:
   *	A. Scoreboard estimator decided the packet is lost.
   *	   A'. Reno "three dupacks" marks head of queue lost.
   *	   A''. Its FACK modification, head until snd.fack is lost.
   *	B. SACK arrives sacking data transmitted after never retransmitted
   *	   hole was sent out.
   *	C. SACK arrives sacking SND.NXT at the moment, when the
   *	   segment was retransmitted.
   * 4. D-SACK added new rule: D-SACK changes any tag to S.
   *
   * It is pleasant to note, that state diagram turns out to be commutative,
   * so that we are allowed not to be bothered by order of our actions,
   * when multiple events arrive simultaneously. (see the function below).
   *
   * Reordering detection.
   * --------------------
   * Reordering metric is maximal distance, which a packet can be displaced
   * in packet stream. With SACKs we can estimate it:
   *
   * 1. SACK fills old hole and the corresponding segment was not
   *    ever retransmitted -> reordering. Alas, we cannot use it
   *    when segment was retransmitted.
   * 2. The last flaw is solved with D-SACK. D-SACK arrives
   *    for retransmitted and already SACKed segment -> reordering..
   * Both of these heuristics are not used in Loss state, when we cannot
   * account for retransmits accurately.
5b3c98821   Ilpo Järvinen   [TCP]: Discard fu...
1027
1028
1029
1030
1031
1032
1033
   *
   * SACK block validation.
   * ----------------------
   *
   * SACK block range validation checks that the received SACK block fits to
   * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
   * Note that SND.UNA is not included to the range though being valid because
0e835331e   Ilpo Järvinen   [TCP]: Update com...
1034
1035
1036
1037
1038
1039
1040
1041
1042
   * it means that the receiver is rather inconsistent with itself reporting
   * SACK reneging when it should advance SND.UNA. Such a SACK block is
   * perfectly valid, however, in light of RFC2018 which explicitly states
   * that "SACK block MUST reflect the newest segment.  Even if the newest
   * segment is going to be discarded ...", not that it looks very clever
   * in case of head skb. Due to potential receiver driven attacks, we
   * choose to avoid immediate execution of a walk in write queue due to
   * reneging and defer head skb's loss recovery to standard loss recovery
   * procedure that will eventually trigger (nothing forbids us doing this).
5b3c98821   Ilpo Järvinen   [TCP]: Discard fu...
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
   *
   * Implements also blockage to start_seq wrap-around. Problem lies in the
   * fact that though start_seq (s) is before end_seq (i.e., not reversed),
   * there's no guarantee that it will be before snd_nxt (n). The problem
   * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
   * wrap (s_w):
   *
   *         <- outs wnd ->                          <- wrapzone ->
   *         u     e      n                         u_w   e_w  s n_w
   *         |     |      |                          |     |   |  |
   * |<------------+------+----- TCP seqno space --------------+---------->|
   * ...-- <2^31 ->|                                           |<--------...
   * ...---- >2^31 ------>|                                    |<--------...
   *
   * Current code wouldn't be vulnerable but it's better still to discard such
   * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
   * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
   * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
   * equal to the ideal case (infinite seqno space without wrap caused issues).
   *
   * With D-SACK the lower bound is extended to cover sequence space below
   * SND.UNA down to undo_marker, which is the last point of interest. Yet
564262c1f   Ryousei Takano   [TCP]: Fix incons...
1065
   * again, D-SACK block must not go across snd_una (for the same reason as
5b3c98821   Ilpo Järvinen   [TCP]: Discard fu...
1066
1067
1068
1069
1070
1071
1072
1073
1074
   * for the normal SACK blocks, explained above). But there all simplicity
   * ends, TCP might receive valid D-SACKs below that. As long as they reside
   * fully below undo_marker they do not affect behavior in any way and can
   * therefore be safely ignored. In rare cases (which are more or less
   * theoretical ones), the D-SACK will nicely cross that boundary due to skb
   * fragmentation and packet reordering past skb's retransmission. To consider
   * them correctly, the acceptable range must be extended even more though
   * the exact amount is rather hard to quantify. However, tp->max_window can
   * be used as an exaggerated estimate.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1075
   */
5b3c98821   Ilpo Järvinen   [TCP]: Discard fu...
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
  /* Decide whether the SACK block start_seq..end_seq may be processed.
   * Returns 1 when the block is acceptable and 0 when it must be discarded;
   * the reasoning behind each check is given in the "SACK block validation"
   * notes of the comment preceding this function.
   */
  static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
  				  u32 start_seq, u32 end_seq)
  {
  	/* Reversed or empty block (interpretation is ambiguous) */
  	if (!before(start_seq, end_seq))
  		return 0;
  
  	/* Too far in future */
  	if (after(end_seq, tp->snd_nxt))
  		return 0;
  
  	/* Nasty start_seq wrap-around check (see comments above) */
  	if (!before(start_seq, tp->snd_nxt))
  		return 0;
  
  	/* In outstanding window? ...This is valid exit for D-SACKs too.
  	 * start_seq == snd_una is non-sensical (see comments above)
  	 */
  	if (after(start_seq, tp->snd_una))
  		return 1;
  
  	/* From here on only a D-SACK with an active undo_marker can pass. */
  	if (!(is_dsack && tp->undo_marker))
  		return 0;
  
  	/* ...Then it's D-SACK, and must reside below snd_una completely */
  	if (after(end_seq, tp->snd_una))
  		return 0;
  
  	/* Starts at or after undo_marker: fully inside the undo range. */
  	if (!before(start_seq, tp->undo_marker))
  		return 1;
  
  	/* Too old */
  	if (!after(end_seq, tp->undo_marker))
  		return 0;
  
  	/* Undo_marker boundary crossing (overestimates a lot). Known already:
  	 *   start_seq < undo_marker and end_seq >= undo_marker.
  	 */
  	if (before(start_seq, end_seq - tp->max_window))
  		return 0;
  
  	return 1;
  }
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1111
1112
1113
  /* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
   * Event "C". Later note: FACK people cheated me again 8), we have to account
   * for reordering! Ugly, but should help.
f785a8e28   Ilpo Järvinen   [TCP]: Fix lost_r...
1114
1115
1116
   *
   * Search retransmitted skbs from write_queue that were sent when snd_nxt was
   * less than what is now known to be received by the other end (derived from
9f58f3b72   Ilpo Järvinen   [TCP]: Make lost ...
1117
1118
   * highest SACK block). Also calculate the lowest snd_nxt among the remaining
   * retransmitted skbs to avoid some costly processing per ACKs.
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1119
   */
407ef1de0   Ilpo Järvinen   [TCP]: Remove sup...
1120
  static void tcp_mark_lost_retrans(struct sock *sk)
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1121
  {
9f58f3b72   Ilpo Järvinen   [TCP]: Make lost ...
1122
  	const struct inet_connection_sock *icsk = inet_csk(sk);
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1123
1124
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *skb;
f785a8e28   Ilpo Järvinen   [TCP]: Fix lost_r...
1125
  	int cnt = 0;
df2e014bf   Ilpo Järvinen   [TCP]: Remove los...
1126
  	u32 new_low_seq = tp->snd_nxt;
6859d4947   Ilpo Järvinen   [TCP]: Abstract t...
1127
  	u32 received_upto = tcp_highest_sack_seq(tp);
9f58f3b72   Ilpo Järvinen   [TCP]: Make lost ...
1128
1129
1130
1131
  
  	if (!tcp_is_fack(tp) || !tp->retrans_out ||
  	    !after(received_upto, tp->lost_retrans_low) ||
  	    icsk->icsk_ca_state != TCP_CA_Recovery)
407ef1de0   Ilpo Järvinen   [TCP]: Remove sup...
1132
  		return;
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1133
1134
1135
1136
1137
1138
  
  	tcp_for_write_queue(skb, sk) {
  		u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;
  
  		if (skb == tcp_send_head(sk))
  			break;
f785a8e28   Ilpo Järvinen   [TCP]: Fix lost_r...
1139
  		if (cnt == tp->retrans_out)
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1140
1141
1142
  			break;
  		if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
  			continue;
f785a8e28   Ilpo Järvinen   [TCP]: Fix lost_r...
1143
1144
  		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
  			continue;
d0af4160d   Ilpo Järvinen   tcp: remove redun...
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
  		/* TODO: We would like to get rid of tcp_is_fack(tp) only
  		 * constraint here (see above) but figuring out that at
  		 * least tp->reordering SACK blocks reside between ack_seq
  		 * and received_upto is not easy task to do cheaply with
  		 * the available datastructures.
  		 *
  		 * Whether FACK should check here for tp->reordering segs
  		 * in-between one could argue for either way (it would be
  		 * rather simple to implement as we could count fack_count
  		 * during the walk and do tp->fackets_out - fack_count).
  		 */
  		if (after(received_upto, ack_seq)) {
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1157
1158
  			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
  			tp->retrans_out -= tcp_skb_pcount(skb);
006f582c7   Ilpo Järvinen   tcp: convert retr...
1159
  			tcp_skb_mark_lost_uncond_verify(tp, skb);
de0744af1   Pavel Emelyanov   mib: add net to N...
1160
  			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
f785a8e28   Ilpo Järvinen   [TCP]: Fix lost_r...
1161
  		} else {
df2e014bf   Ilpo Järvinen   [TCP]: Remove los...
1162
  			if (before(ack_seq, new_low_seq))
b08d6cb22   Ilpo Järvinen   [TCP]: Limit proc...
1163
  				new_low_seq = ack_seq;
f785a8e28   Ilpo Järvinen   [TCP]: Fix lost_r...
1164
  			cnt += tcp_skb_pcount(skb);
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1165
1166
  		}
  	}
b08d6cb22   Ilpo Järvinen   [TCP]: Limit proc...
1167
1168
1169
  
  	if (tp->retrans_out)
  		tp->lost_retrans_low = new_low_seq;
1c1e87edb   Ilpo Järvinen   [TCP]: Separate l...
1170
  }
5b3c98821   Ilpo Järvinen   [TCP]: Discard fu...
1171

cf533ea53   Eric Dumazet   tcp: add const qu...
1172
  /* Detect whether the first SACK block of an incoming ACK is a D-SACK.
   *
   * @sk:            socket the ACK belongs to
   * @ack_skb:       the ACK segment; its ack_seq is the cumulative ACK
   * @sp:            SACK blocks in wire (big-endian, unaligned) format
   * @num_sacks:     number of blocks available in @sp
   * @prior_snd_una: sequence compared against the end of the first block
   *                 for the undo_retrans accounting below
   *
   * The first block is a D-SACK when it starts below the cumulative ACK, or
   * when it lies entirely within the second block. On detection the D-SACK
   * is recorded via tcp_dsack_seen() and the matching MIB counter is bumped.
   *
   * Returns 1 if a D-SACK was found, 0 otherwise.
   */
  static int tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
  			   struct tcp_sack_block_wire *sp, int num_sacks,
  			   u32 prior_snd_una)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	u32 first_start = get_unaligned_be32(&sp[0].start_seq);
  	u32 first_end = get_unaligned_be32(&sp[0].end_seq);
  	int mib_idx;
  
  	if (before(first_start, TCP_SKB_CB(ack_skb)->ack_seq)) {
  		/* Block begins below the cumulative ACK point */
  		mib_idx = LINUX_MIB_TCPDSACKRECV;
  	} else if (num_sacks > 1) {
  		u32 second_end = get_unaligned_be32(&sp[1].end_seq);
  		u32 second_start = get_unaligned_be32(&sp[1].start_seq);
  
  		/* Otherwise the first block must sit inside the second */
  		if (after(first_end, second_end) ||
  		    before(first_start, second_start))
  			return 0;
  
  		mib_idx = LINUX_MIB_TCPDSACKOFORECV;
  	} else {
  		return 0;
  	}
  
  	tcp_dsack_seen(tp);
  	NET_INC_STATS_BH(sock_net(sk), mib_idx);
  
  	/* D-SACK for already forgotten data... Do dumb counting. */
  	if (tp->undo_marker && tp->undo_retrans &&
  	    !after(first_end, prior_snd_una) &&
  	    after(first_end, tp->undo_marker))
  		tp->undo_retrans--;
  
  	return 1;
  }
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1206
1207
1208
1209
1210
  /* Scratch state shared by the SACK tagging helpers while one incoming
   * ACK's SACK blocks are applied to the write queue.
   */
  struct tcp_sacktag_state {
  	int reord;	/* starts at tp->packets_out, lowered to fack_count on reordering */
  	int fack_count;	/* running sum of tcp_skb_pcount() over the skbs walked so far */
  	int flag;	/* accumulated FLAG_* bits (e.g. FLAG_DATA_SACKED) */
  };
d19359429   Ilpo Järvinen   [TCP]: Extract tc...
1211
1212
1213
1214
1215
  /* Check if skb is fully within the SACK block. In presence of GSO skbs,
   * the incoming SACK may not exactly match but we can find smaller MSS
   * aligned portion of it that matches. Therefore we might need to fragment
   * which may fail and creates some hassle (caller must handle error case
   * returns).
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1216
1217
   *
   * FIXME: this could be merged to shift decision code
d19359429   Ilpo Järvinen   [TCP]: Extract tc...
1218
   */
0f79efdc2   Adrian Bunk   [TCP]: Make tcp_m...
1219
1220
  static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
  				 u32 start_seq, u32 end_seq)
d19359429   Ilpo Järvinen   [TCP]: Extract tc...
1221
1222
1223
  {
  	int in_sack, err;
  	unsigned int pkt_len;
adb92db85   Ilpo Järvinen   tcp: Make SACK co...
1224
  	unsigned int mss;
d19359429   Ilpo Järvinen   [TCP]: Extract tc...
1225
1226
1227
1228
1229
1230
  
  	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
  		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);
  
  	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
  	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {
adb92db85   Ilpo Järvinen   tcp: Make SACK co...
1231
  		mss = tcp_skb_mss(skb);
d19359429   Ilpo Järvinen   [TCP]: Extract tc...
1232
  		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);
adb92db85   Ilpo Järvinen   tcp: Make SACK co...
1233
  		if (!in_sack) {
d19359429   Ilpo Järvinen   [TCP]: Extract tc...
1234
  			pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
adb92db85   Ilpo Järvinen   tcp: Make SACK co...
1235
1236
1237
  			if (pkt_len < mss)
  				pkt_len = mss;
  		} else {
d19359429   Ilpo Järvinen   [TCP]: Extract tc...
1238
  			pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
adb92db85   Ilpo Järvinen   tcp: Make SACK co...
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
  			if (pkt_len < mss)
  				return -EINVAL;
  		}
  
  		/* Round if necessary so that SACKs cover only full MSSes
  		 * and/or the remaining small portion (if present)
  		 */
  		if (pkt_len > mss) {
  			unsigned int new_len = (pkt_len / mss) * mss;
  			if (!in_sack && new_len < pkt_len) {
  				new_len += mss;
  				if (new_len > skb->len)
  					return 0;
  			}
  			pkt_len = new_len;
  		}
  		err = tcp_fragment(sk, skb, pkt_len, mss);
d19359429   Ilpo Järvinen   [TCP]: Extract tc...
1256
1257
1258
1259
1260
1261
  		if (err < 0)
  			return err;
  	}
  
  	return in_sack;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
1262
  static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk,
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1263
1264
  			  struct tcp_sacktag_state *state,
  			  int dup_sack, int pcount)
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1265
  {
6859d4947   Ilpo Järvinen   [TCP]: Abstract t...
1266
  	struct tcp_sock *tp = tcp_sk(sk);
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1267
  	u8 sacked = TCP_SKB_CB(skb)->sacked;
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1268
  	int fack_count = state->fack_count;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1269
1270
1271
  
  	/* Account D-SACK for retransmitted packet. */
  	if (dup_sack && (sacked & TCPCB_RETRANS)) {
c24f691b5   Yuchung Cheng   tcp: undo_retrans...
1272
1273
  		if (tp->undo_marker && tp->undo_retrans &&
  		    after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1274
  			tp->undo_retrans--;
ede9f3b18   Ilpo Järvinen   [TCP]: Unite iden...
1275
  		if (sacked & TCPCB_SACKED_ACKED)
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1276
  			state->reord = min(fack_count, state->reord);
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1277
1278
1279
1280
  	}
  
  	/* Nothing to do; acked frame is about to be dropped (was ACKed). */
  	if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1281
  		return sacked;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1282
1283
1284
1285
1286
1287
1288
1289
  
  	if (!(sacked & TCPCB_SACKED_ACKED)) {
  		if (sacked & TCPCB_SACKED_RETRANS) {
  			/* If the segment is not tagged as lost,
  			 * we do not clear RETRANS, believing
  			 * that retransmission is still in flight.
  			 */
  			if (sacked & TCPCB_LOST) {
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1290
  				sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
f58b22fd3   Ilpo Järvinen   tcp: make tcp_sac...
1291
1292
  				tp->lost_out -= pcount;
  				tp->retrans_out -= pcount;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1293
1294
1295
1296
1297
1298
1299
1300
  			}
  		} else {
  			if (!(sacked & TCPCB_RETRANS)) {
  				/* New sack for not retransmitted frame,
  				 * which was in hole. It is reordering.
  				 */
  				if (before(TCP_SKB_CB(skb)->seq,
  					   tcp_highest_sack_seq(tp)))
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1301
1302
  					state->reord = min(fack_count,
  							   state->reord);
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1303
1304
1305
  
  				/* SACK enhanced F-RTO (RFC4138; Appendix B) */
  				if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark))
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1306
  					state->flag |= FLAG_ONLY_ORIG_SACKED;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1307
1308
1309
  			}
  
  			if (sacked & TCPCB_LOST) {
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1310
  				sacked &= ~TCPCB_LOST;
f58b22fd3   Ilpo Järvinen   tcp: make tcp_sac...
1311
  				tp->lost_out -= pcount;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1312
1313
  			}
  		}
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1314
1315
  		sacked |= TCPCB_SACKED_ACKED;
  		state->flag |= FLAG_DATA_SACKED;
f58b22fd3   Ilpo Järvinen   tcp: make tcp_sac...
1316
  		tp->sacked_out += pcount;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1317

f58b22fd3   Ilpo Järvinen   tcp: make tcp_sac...
1318
  		fack_count += pcount;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1319
1320
1321
1322
1323
  
  		/* Lost marker hint past SACKed? Tweak RFC3517 cnt */
  		if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) &&
  		    before(TCP_SKB_CB(skb)->seq,
  			   TCP_SKB_CB(tp->lost_skb_hint)->seq))
f58b22fd3   Ilpo Järvinen   tcp: make tcp_sac...
1324
  			tp->lost_cnt_hint += pcount;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1325
1326
1327
  
  		if (fack_count > tp->fackets_out)
  			tp->fackets_out = fack_count;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1328
1329
1330
1331
1332
1333
  	}
  
  	/* D-SACK. We can detect redundant retransmission in S|R and plain R
  	 * frames and clear it. undo_retrans is decreased above, L|R frames
  	 * are accounted above as well.
  	 */
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1334
1335
  	if (dup_sack && (sacked & TCPCB_SACKED_RETRANS)) {
  		sacked &= ~TCPCB_SACKED_RETRANS;
f58b22fd3   Ilpo Järvinen   tcp: make tcp_sac...
1336
  		tp->retrans_out -= pcount;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1337
  	}
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1338
  	return sacked;
9e10c47cb   Ilpo Järvinen   [TCP]: Create tcp...
1339
  }
50133161a   Ilpo Järvinen   tcp: no need to p...
1340
  static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1341
  			   struct tcp_sacktag_state *state,
9ec06ff57   Ilpo Järvinen   tcp: fix retrans_...
1342
1343
  			   unsigned int pcount, int shifted, int mss,
  			   int dup_sack)
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1344
1345
  {
  	struct tcp_sock *tp = tcp_sk(sk);
50133161a   Ilpo Järvinen   tcp: no need to p...
1346
  	struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1347
1348
  
  	BUG_ON(!pcount);
1e5289e12   Yan, Zheng   tcp: properly upd...
1349
  	if (skb == tp->lost_skb_hint)
92ee76b6d   Ilpo Järvinen   tcp: Make shiftin...
1350
  		tp->lost_cnt_hint += pcount;
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
  	TCP_SKB_CB(prev)->end_seq += shifted;
  	TCP_SKB_CB(skb)->seq += shifted;
  
  	skb_shinfo(prev)->gso_segs += pcount;
  	BUG_ON(skb_shinfo(skb)->gso_segs < pcount);
  	skb_shinfo(skb)->gso_segs -= pcount;
  
  	/* When we're adding to gso_segs == 1, gso_size will be zero,
  	 * in theory this shouldn't be necessary but as long as DSACK
  	 * code can come after this skb later on it's better to keep
  	 * setting gso_size to something.
  	 */
  	if (!skb_shinfo(prev)->gso_size) {
  		skb_shinfo(prev)->gso_size = mss;
  		skb_shinfo(prev)->gso_type = sk->sk_gso_type;
  	}
  
  	/* CHECKME: To clear or not to clear? Mimics normal skb currently */
  	if (skb_shinfo(skb)->gso_segs <= 1) {
  		skb_shinfo(skb)->gso_size = 0;
  		skb_shinfo(skb)->gso_type = 0;
  	}
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1373
  	/* We discard results */
9ec06ff57   Ilpo Järvinen   tcp: fix retrans_...
1374
  	tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1375
1376
1377
  
  	/* Difference in this won't matter, both ACKed by the same cumul. ACK */
  	TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1378
1379
  	if (skb->len > 0) {
  		BUG_ON(!tcp_skb_pcount(skb));
111cc8b91   Ilpo Järvinen   tcp: add some mib...
1380
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTED);
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1381
1382
1383
1384
  		return 0;
  	}
  
  	/* Whole SKB was eaten :-) */
92ee76b6d   Ilpo Järvinen   tcp: Make shiftin...
1385
1386
1387
1388
1389
1390
1391
1392
  	if (skb == tp->retransmit_skb_hint)
  		tp->retransmit_skb_hint = prev;
  	if (skb == tp->scoreboard_skb_hint)
  		tp->scoreboard_skb_hint = prev;
  	if (skb == tp->lost_skb_hint) {
  		tp->lost_skb_hint = prev;
  		tp->lost_cnt_hint -= tcp_skb_pcount(prev);
  	}
4de075e04   Eric Dumazet   tcp: rename tcp_s...
1393
  	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(prev)->tcp_flags;
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1394
1395
1396
1397
1398
  	if (skb == tcp_highest_sack(sk))
  		tcp_advance_highest_sack(sk, skb);
  
  	tcp_unlink_write_queue(skb, sk);
  	sk_wmem_free_skb(sk, skb);
111cc8b91   Ilpo Järvinen   tcp: add some mib...
1399
  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKMERGED);
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1400
1401
1402
1403
1404
1405
  	return 1;
  }
  
  /* I wish gso_size would have a bit more sane initialization than
   * something-or-zero which complicates things
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
1406
  static int tcp_skb_seglen(const struct sk_buff *skb)
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1407
  {
775ffabf7   Ilpo Järvinen   tcp: make mtu pro...
1408
  	return tcp_skb_pcount(skb) == 1 ? skb->len : tcp_skb_mss(skb);
832d11c5c   Ilpo Järvinen   tcp: Try to resto...
1409
1410
1411
  }
  
  /* Shifting pages past head area doesn't work, so an skb qualifies only
   * when it has no linear head data and is non-linear.
   *
   * Returns 1 when @skb can take part in shifting, 0 otherwise.
   */
  static int skb_can_shift(const struct sk_buff *skb)
  {
  	if (skb_headlen(skb))
  		return 0;
  
  	return skb_is_nonlinear(skb) ? 1 : 0;
  }
  
  /* Try collapsing SACK blocks spanning across multiple skbs to a single
   * skb.
   *
   * The SACKed part of @skb (all of it, or an MSS-aligned head) is moved to
   * the tail of the previous skb, provided that skb is already plainly
   * SACKed and uses the same segment length. If that empties @skb, the
   * following skb is merged as well when it qualifies.
   *
   * Returns:
   *   NULL  - shifting is not possible here; caller must fall back to
   *           regular tagging of @skb
   *   @skb  - nothing to do for this skb
   *   other - the previous skb, which received the data; state->fack_count
   *           has been advanced by the number of segments moved
   */
  static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
  					  struct tcp_sacktag_state *state,
  					  u32 start_seq, u32 end_seq,
  					  int dup_sack)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *prev;
  	int mss;
  	int pcount = 0;
  	int len;
  
  	if (!sk_can_gso(sk))
  		goto fallback;
  
  	/* Normally R but no L won't result in plain S */
  	if (!dup_sack &&
  	    (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_RETRANS)) == TCPCB_SACKED_RETRANS)
  		goto fallback;
  
  	/* Unshiftable layout, or frame about to be dropped (was ACKed) */
  	if (!skb_can_shift(skb) ||
  	    !after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
  		goto fallback;
  
  	/* Can only happen with delayed DSACK + discard craziness */
  	if (unlikely(skb == tcp_write_queue_head(sk)))
  		goto fallback;
  
  	/* The receiving skb must be tagged S and nothing else */
  	prev = tcp_write_queue_prev(sk, skb);
  	if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
  		goto fallback;
  
  	if (!after(start_seq, TCP_SKB_CB(skb)->seq) &&
  	    !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
  		/* The block covers the whole skb: move all of it */
  		len = skb->len;
  		pcount = tcp_skb_pcount(skb);
  		mss = tcp_skb_seglen(skb);
  
  		/* TODO: Fix DSACKs to not fragment already SACKed and we can
  		 * drop this restriction as unnecessary
  		 */
  		if (mss != tcp_skb_seglen(prev))
  			goto fallback;
  	} else {
  		/* Block does not reach this skb at all */
  		if (!after(TCP_SKB_CB(skb)->end_seq, start_seq))
  			return skb;
  
  		/* CHECKME: This is non-MSS split case only?, this will
  		 * cause skipped skbs due to advancing loop btw, original
  		 * has that feature too
  		 */
  		if (tcp_skb_pcount(skb) <= 1)
  			return skb;
  
  		/* TODO: head merge to next could be attempted here
  		 * if (!after(TCP_SKB_CB(skb)->end_seq, end_seq)),
  		 * though it might not be worth of the additional hassle
  		 *
  		 * ...we can probably just fallback to what was done
  		 * previously. We could try merging non-SACKed ones
  		 * as well but it probably isn't going to buy off
  		 * because later SACKs might again split them, and
  		 * it would make skb timestamp tracking considerably
  		 * harder problem.
  		 */
  		if (after(start_seq, TCP_SKB_CB(skb)->seq))
  			goto fallback;
  
  		/* Only the head up to end_seq is covered */
  		len = end_seq - TCP_SKB_CB(skb)->seq;
  		BUG_ON(len < 0);
  		BUG_ON(len > skb->len);
  
  		/* MSS boundaries should be honoured or else pcount will
  		 * severely break even though it makes things bit trickier.
  		 * Optimize common case to avoid most of the divides
  		 */
  		mss = tcp_skb_mss(skb);
  
  		/* TODO: Fix DSACKs to not fragment already SACKed and we can
  		 * drop this restriction as unnecessary
  		 */
  		if (mss != tcp_skb_seglen(prev))
  			goto fallback;
  
  		if (len == mss) {
  			pcount = 1;
  		} else if (len < mss) {
  			return skb;
  		} else {
  			pcount = len / mss;
  			len = pcount * mss;
  		}
  	}
  
  	if (!skb_shift(prev, skb, len))
  		goto fallback;
  
  	/* A zero return means part of skb is still queued: done */
  	if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
  		goto out;
  
  	/* Hole filled allows collapsing with the next as well, this is very
  	 * useful when hole on every nth skb pattern happens
  	 */
  	if (prev == tcp_write_queue_tail(sk))
  		goto out;
  
  	skb = tcp_write_queue_next(sk, prev);
  	if (!skb_can_shift(skb) ||
  	    (skb == tcp_send_head(sk)) ||
  	    ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) ||
  	    (mss != tcp_skb_seglen(skb)))
  		goto out;
  
  	len = skb->len;
  	if (skb_shift(prev, skb, len)) {
  		int next_pcount = tcp_skb_pcount(skb);
  
  		pcount += next_pcount;
  		tcp_shifted_skb(sk, skb, state, next_pcount, len, mss, 0);
  	}
  
  out:
  	state->fack_count += pcount;
  	return prev;
  
  fallback:
  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SACKSHIFTFALLBACK);
  	return NULL;
  }
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1552
1553
  /* Walk the write queue from @skb and tag every skb covered by the SACK
   * block [@start_seq, @end_seq).
   *
   * @skb:         first skb to look at
   * @sk:          socket owning the write queue
   * @next_dup:    optional D-SACK block to match skbs against as well
   *               (NULL if there is none)
   * @state:       tagging pass state; fack_count is advanced per skb
   * @start_seq:   left edge of the SACK block
   * @end_seq:     right edge of the SACK block
   * @dup_sack_in: non-zero when the block itself is a D-SACK
   *
   * Returns the skb at which the walk stopped.
   */
  static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
  					struct tcp_sack_block *next_dup,
  					struct tcp_sacktag_state *state,
  					u32 start_seq, u32 end_seq,
  					int dup_sack_in)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	tcp_for_write_queue_from(skb, sk) {
  		struct sk_buff *shifted;
  		int dup_sack = dup_sack_in;
  		int in_sack = 0;
  
  		/* Stop at unsent data. The queue is in-order, so the walk can
  		 * also end at the first skb starting at or past end_seq.
  		 */
  		if (skb == tcp_send_head(sk) ||
  		    !before(TCP_SKB_CB(skb)->seq, end_seq))
  			break;
  
  		/* Give the pending D-SACK block the first chance to match */
  		if ((next_dup != NULL) &&
  		    before(TCP_SKB_CB(skb)->seq, next_dup->end_seq)) {
  			in_sack = tcp_match_skb_to_sack(sk, skb,
  							next_dup->start_seq,
  							next_dup->end_seq);
  			if (in_sack > 0)
  				dup_sack = 1;
  		}
  
  		/* skb reference here is a bit tricky to get right, since
  		 * shifting can eat and free both this skb and the next,
  		 * so not even _safe variant of the loop is enough.
  		 */
  		if (in_sack <= 0) {
  			shifted = tcp_shift_skb_data(sk, skb, state,
  						     start_seq, end_seq,
  						     dup_sack);
  			if (shifted == NULL) {
  				/* No shifting possible: match the slow way */
  				in_sack = tcp_match_skb_to_sack(sk, skb,
  								start_seq,
  								end_seq);
  			} else if (shifted != skb) {
  				/* Data went into the previous skb; resume
  				 * the walk from there.
  				 */
  				skb = shifted;
  				continue;
  			} else {
  				in_sack = 0;
  			}
  		}
  
  		/* Matching failed with an error: give up on this block */
  		if (unlikely(in_sack < 0))
  			break;
  
  		if (in_sack > 0) {
  			TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk,
  								  state,
  								  dup_sack,
  								  tcp_skb_pcount(skb));
  
  			if (!before(TCP_SKB_CB(skb)->seq,
  				    tcp_highest_sack_seq(tp)))
  				tcp_advance_highest_sack(sk, skb);
  		}
  
  		state->fack_count += tcp_skb_pcount(skb);
  	}
  
  	return skb;
  }
  
  /* Avoid all extra work that is being done by sacktag while walking in
   * a normal way
   *
   * Advances from @skb over every skb whose end_seq does not lie after
   * @skip_to_seq, adding each skipped skb's segment count to
   * state->fack_count. Returns the skb at which the walk stopped.
   */
  static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
  					struct tcp_sacktag_state *state,
  					u32 skip_to_seq)
  {
  	tcp_for_write_queue_from(skb, sk) {
  		if (skb == tcp_send_head(sk) ||
  		    after(TCP_SKB_CB(skb)->end_seq, skip_to_seq))
  			break;
  
  		state->fack_count += tcp_skb_pcount(skb);
  	}
  
  	return skb;
  }
  
  /* Process a pending D-SACK block that would otherwise be jumped over.
   *
   * When @next_dup is set and starts before @skip_to_seq, skip to its start
   * and tag its range as a D-SACK. Otherwise @skb is returned untouched.
   *
   * Returns the skb from which the caller should continue.
   */
  static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
  						struct sock *sk,
  						struct tcp_sack_block *next_dup,
  						struct tcp_sacktag_state *state,
  						u32 skip_to_seq)
  {
  	if (next_dup == NULL || !before(next_dup->start_seq, skip_to_seq))
  		return skb;
  
  	skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
  
  	return tcp_sacktag_walk(skb, sk, NULL, state,
  				next_dup->start_seq, next_dup->end_seq,
  				1);
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
1652
  static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_block *cache)
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1653
1654
1655
  {
  	return cache < tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1656
  static int
cf533ea53   Eric Dumazet   tcp: add const qu...
1657
  tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1658
  			u32 prior_snd_una)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1659
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
1660
  	const struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1661
  	struct tcp_sock *tp = tcp_sk(sk);
cf533ea53   Eric Dumazet   tcp: add const qu...
1662
1663
  	const unsigned char *ptr = (skb_transport_header(ack_skb) +
  				    TCP_SKB_CB(ack_skb)->sacked);
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1664
  	struct tcp_sack_block_wire *sp_wire = (struct tcp_sack_block_wire *)(ptr+2);
4389dded7   Adam Langley   tcp: Remove redun...
1665
  	struct tcp_sack_block sp[TCP_NUM_SACKS];
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1666
  	struct tcp_sack_block *cache;
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1667
  	struct tcp_sacktag_state state;
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1668
  	struct sk_buff *skb;
4389dded7   Adam Langley   tcp: Remove redun...
1669
  	int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1670
  	int used_sacks;
7769f4064   Ilpo Järvinen   [TCP]: Fix logic ...
1671
  	int found_dup_sack = 0;
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1672
  	int i, j;
fda03fbb5   Baruch Even   [TCP]: Advance fa...
1673
  	int first_sack_index;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1674

a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1675
1676
  	state.flag = 0;
  	state.reord = tp->packets_out;
d738cd8fc   Ilpo Järvinen   [TCP]: Add highes...
1677
  	if (!tp->sacked_out) {
de83c058a   Ilpo Järvinen   [TCP]: "Annotate"...
1678
1679
  		if (WARN_ON(tp->fackets_out))
  			tp->fackets_out = 0;
6859d4947   Ilpo Järvinen   [TCP]: Abstract t...
1680
  		tcp_highest_sack_reset(sk);
d738cd8fc   Ilpo Järvinen   [TCP]: Add highes...
1681
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1682

1ed834655   Pavel Emelyanov   tcp: replace tcp_...
1683
  	found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
d06e021d7   David S. Miller   [TCP]: Extract DS...
1684
1685
  					 num_sacks, prior_snd_una);
  	if (found_dup_sack)
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1686
  		state.flag |= FLAG_DSACKING_ACK;
6f74651ae   Baruch Even   [TCP]: Seperate D...
1687
1688
1689
1690
1691
1692
1693
  
  	/* Eliminate too old ACKs, but take into
  	 * account more or less fresh ones, they can
  	 * contain valid SACK info.
  	 */
  	if (before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una - tp->max_window))
  		return 0;
96a2d41a3   Ilpo Järvinen   [TCP]: Make sure ...
1694
1695
  	if (!tp->packets_out)
  		goto out;
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1696
1697
1698
1699
  	used_sacks = 0;
  	first_sack_index = 0;
  	for (i = 0; i < num_sacks; i++) {
  		int dup_sack = !i && found_dup_sack;
d3e2ce3bc   Harvey Harrison   net: use get/put_...
1700
1701
  		sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
  		sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1702
1703
1704
1705
  
  		if (!tcp_is_sackblock_valid(tp, dup_sack,
  					    sp[used_sacks].start_seq,
  					    sp[used_sacks].end_seq)) {
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
1706
  			int mib_idx;
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1707
1708
  			if (dup_sack) {
  				if (!tp->undo_marker)
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
1709
  					mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO;
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1710
  				else
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
1711
  					mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD;
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1712
1713
1714
1715
1716
  			} else {
  				/* Don't count olds caused by ACK reordering */
  				if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) &&
  				    !after(sp[used_sacks].end_seq, tp->snd_una))
  					continue;
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
1717
  				mib_idx = LINUX_MIB_TCPSACKDISCARD;
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1718
  			}
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
1719

de0744af1   Pavel Emelyanov   mib: add net to N...
1720
  			NET_INC_STATS_BH(sock_net(sk), mib_idx);
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
  			if (i == 0)
  				first_sack_index = -1;
  			continue;
  		}
  
  		/* Ignore very old stuff early */
  		if (!after(sp[used_sacks].end_seq, prior_snd_una))
  			continue;
  
  		used_sacks++;
  	}
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1732
1733
  	/* order SACK blocks to allow in order walk of the retrans queue */
  	for (i = used_sacks - 1; i > 0; i--) {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1734
1735
  		for (j = 0; j < i; j++) {
  			if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
a0bffffc1   Ilpo Järvinen   net/*: use linux/...
1736
  				swap(sp[j], sp[j + 1]);
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
1737

68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1738
1739
  				/* Track where the first SACK block goes to */
  				if (j == first_sack_index)
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1740
  					first_sack_index = j + 1;
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
1741
1742
1743
  			}
  		}
  	}
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1744
  	skb = tcp_write_queue_head(sk);
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1745
  	state.fack_count = 0;
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
  	i = 0;
  
  	if (!tp->sacked_out) {
  		/* It's already past, so skip checking against it */
  		cache = tp->recv_sack_cache + ARRAY_SIZE(tp->recv_sack_cache);
  	} else {
  		cache = tp->recv_sack_cache;
  		/* Skip empty blocks at the head of the cache */
  		while (tcp_sack_cache_ok(tp, cache) && !cache->start_seq &&
  		       !cache->end_seq)
  			cache++;
fda03fbb5   Baruch Even   [TCP]: Advance fa...
1757
  	}
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1758
  	while (i < used_sacks) {
fd6dad616   Ilpo Järvinen   [TCP]: Earlier SA...
1759
1760
  		u32 start_seq = sp[i].start_seq;
  		u32 end_seq = sp[i].end_seq;
7769f4064   Ilpo Järvinen   [TCP]: Fix logic ...
1761
  		int dup_sack = (found_dup_sack && (i == first_sack_index));
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1762
  		struct tcp_sack_block *next_dup = NULL;
e56d6cd60   Ilpo Järvinen   [TCP]: Process DS...
1763

68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1764
1765
  		if (found_dup_sack && ((i + 1) == first_sack_index))
  			next_dup = &sp[i + 1];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1766
1767
1768
  
  		/* Event "B" in the comment above. */
  		if (after(end_seq, tp->high_seq))
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1769
  			state.flag |= FLAG_DATA_LOST;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1770

68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
  		/* Skip too early cached blocks */
  		while (tcp_sack_cache_ok(tp, cache) &&
  		       !before(start_seq, cache->end_seq))
  			cache++;
  
  		/* Can skip some work by looking recv_sack_cache? */
  		if (tcp_sack_cache_ok(tp, cache) && !dup_sack &&
  		    after(end_seq, cache->start_seq)) {
  
  			/* Head todo? */
  			if (before(start_seq, cache->start_seq)) {
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1782
1783
  				skb = tcp_sacktag_skip(skb, sk, &state,
  						       start_seq);
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1784
  				skb = tcp_sacktag_walk(skb, sk, next_dup,
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1785
  						       &state,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1786
1787
  						       start_seq,
  						       cache->start_seq,
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1788
  						       dup_sack);
fda03fbb5   Baruch Even   [TCP]: Advance fa...
1789
  			}
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
1790

68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1791
  			/* Rest of the block already fully processed? */
20de20beb   Ilpo Järvinen   [TCP]: Correct DS...
1792
  			if (!after(end_seq, cache->end_seq))
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1793
  				goto advance_sp;
20de20beb   Ilpo Järvinen   [TCP]: Correct DS...
1794

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1795
  			skb = tcp_maybe_skipping_dsack(skb, sk, next_dup,
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1796
1797
  						       &state,
  						       cache->end_seq);
e56d6cd60   Ilpo Järvinen   [TCP]: Process DS...
1798

68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1799
  			/* ...tail remains todo... */
6859d4947   Ilpo Järvinen   [TCP]: Abstract t...
1800
  			if (tcp_highest_sack_seq(tp) == cache->end_seq) {
20de20beb   Ilpo Järvinen   [TCP]: Correct DS...
1801
  				/* ...but better entrypoint exists! */
6859d4947   Ilpo Järvinen   [TCP]: Abstract t...
1802
1803
1804
  				skb = tcp_highest_sack(sk);
  				if (skb == NULL)
  					break;
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1805
  				state.fack_count = tp->fackets_out;
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1806
1807
  				cache++;
  				goto walk;
e56d6cd60   Ilpo Järvinen   [TCP]: Process DS...
1808
  			}
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1809
  			skb = tcp_sacktag_skip(skb, sk, &state, cache->end_seq);
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1810
1811
1812
1813
  			/* Check overlap against next cached too (past this one already) */
  			cache++;
  			continue;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1814

6859d4947   Ilpo Järvinen   [TCP]: Abstract t...
1815
1816
1817
1818
  		if (!before(start_seq, tcp_highest_sack_seq(tp))) {
  			skb = tcp_highest_sack(sk);
  			if (skb == NULL)
  				break;
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1819
  			state.fack_count = tp->fackets_out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1820
  		}
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1821
  		skb = tcp_sacktag_skip(skb, sk, &state, start_seq);
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1822
1823
  
  walk:
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1824
1825
  		skb = tcp_sacktag_walk(skb, sk, next_dup, &state,
  				       start_seq, end_seq, dup_sack);
fbd52eb2b   Ilpo Järvinen   [TCP]: Split SACK...
1826

68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1827
  advance_sp:
fbd52eb2b   Ilpo Järvinen   [TCP]: Split SACK...
1828
1829
1830
1831
  		/* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct
  		 * due to in-order walk
  		 */
  		if (after(end_seq, tp->frto_highmark))
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1832
  			state.flag &= ~FLAG_ONLY_ORIG_SACKED;
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1833
1834
  
  		i++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1835
  	}
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
1836
1837
1838
1839
1840
1841
1842
  	/* Clear the head of the cache sack blocks so we can skip it next time */
  	for (i = 0; i < ARRAY_SIZE(tp->recv_sack_cache) - used_sacks; i++) {
  		tp->recv_sack_cache[i].start_seq = 0;
  		tp->recv_sack_cache[i].end_seq = 0;
  	}
  	for (j = 0; j < used_sacks; j++)
  		tp->recv_sack_cache[i++] = sp[j];
407ef1de0   Ilpo Järvinen   [TCP]: Remove sup...
1843
  	tcp_mark_lost_retrans(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1844

86426c22d   Ilpo Järvinen   [TCP]: Restore ov...
1845
  	tcp_verify_left_out(tp);
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1846
  	if ((state.reord < tp->fackets_out) &&
f57711130   Ilpo Järvinen   [TCP]: Extend reo...
1847
  	    ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) &&
c5e7af0df   Ilpo Järvinen   [TCP]: Correct re...
1848
  	    (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1849
  		tcp_update_reordering(sk, tp->fackets_out - state.reord, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1850

96a2d41a3   Ilpo Järvinen   [TCP]: Make sure ...
1851
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1852
  #if FASTRETRANS_DEBUG > 0
547b792ca   Ilpo Järvinen   net: convert BUG_...
1853
1854
1855
1856
  	WARN_ON((int)tp->sacked_out < 0);
  	WARN_ON((int)tp->lost_out < 0);
  	WARN_ON((int)tp->retrans_out < 0);
  	WARN_ON((int)tcp_packets_in_flight(tp) < 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1857
  #endif
a1197f5a6   Ilpo Järvinen   tcp: introduce st...
1858
  	return state.flag;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1859
  }
882bebaac   Ilpo Järvinen   [TCP]: tcp_simple...
1860
1861
  /* Limits sacked_out so that sum with lost_out isn't ever larger than
   * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
30935cf4f   Ilpo Järvinen   [TCP] FRTO: Comme...
1862
   */
8eecaba90   Ilpo Järvinen   tcp: tcp_limit_re...
1863
  static int tcp_limit_reno_sacked(struct tcp_sock *tp)
4ddf66769   Ilpo Järvinen   [TCP]: Move Reno ...
1864
  {
4ddf66769   Ilpo Järvinen   [TCP]: Move Reno ...
1865
1866
1867
1868
1869
1870
1871
  	u32 holes;
  
  	holes = max(tp->lost_out, 1U);
  	holes = min(holes, tp->packets_out);
  
  	if ((tp->sacked_out + holes) > tp->packets_out) {
  		tp->sacked_out = tp->packets_out - holes;
882bebaac   Ilpo Järvinen   [TCP]: tcp_simple...
1872
  		return 1;
4ddf66769   Ilpo Järvinen   [TCP]: Move Reno ...
1873
  	}
882bebaac   Ilpo Järvinen   [TCP]: tcp_simple...
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
  	return 0;
  }
  
  /* More dupacks arrived than the segment count allows for when no
   * reordering is assumed, so treat the excess as reordering. The only
   * other explanation would be a bug in the receiver's TCP.
   *
   * @addend is added to packets_out to form the metric handed to
   * tcp_update_reordering().
   */
  static void tcp_check_reno_reordering(struct sock *sk, const int addend)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	/* sacked_out did not need clamping: no sign of reordering. */
  	if (!tcp_limit_reno_sacked(tp))
  		return;
  
  	tcp_update_reordering(sk, tp->packets_out + addend, 0);
  }
  
  /* SACK emulation for a connection without SACK: record one more dupack
   * by bumping sacked_out, then re-check the reordering bound and the
   * left_out accounting.
   */
  static void tcp_add_reno_sack(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	tp->sacked_out += 1;
  	tcp_check_reno_reordering(sk, 0);
  	tcp_verify_left_out(tp);
  }
  
  /* Account for ACK, ACKing some data in Reno Recovery phase. */
  
  static void tcp_remove_reno_sacks(struct sock *sk, int acked)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	if (acked > 0) {
  		/* One ACK acked hole. The rest eat duplicate ACKs. */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1906
  		if (acked - 1 >= tp->sacked_out)
4ddf66769   Ilpo Järvinen   [TCP]: Move Reno ...
1907
1908
  			tp->sacked_out = 0;
  		else
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1909
  			tp->sacked_out -= acked - 1;
4ddf66769   Ilpo Järvinen   [TCP]: Move Reno ...
1910
1911
  	}
  	tcp_check_reno_reordering(sk, acked);
005903bc3   Ilpo Järvinen   [TCP]: Left out s...
1912
  	tcp_verify_left_out(tp);
4ddf66769   Ilpo Järvinen   [TCP]: Move Reno ...
1913
1914
1915
1916
1917
  }
  
  /* Forget all emulated SACKs by zeroing sacked_out. */
  static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
  {
  	tp->sacked_out = 0;
  }
62ab22278   Ilpo Järvinen   tcp FRTO: SACK va...
1919
1920
1921
1922
  /* Returns non-zero when sysctl_tcp_frto is set to 0x2 and the connection
   * is not a Reno (SACK-less) one.
   */
  static int tcp_is_sackfrto(const struct tcp_sock *tp)
  {
  	if (sysctl_tcp_frto != 0x2)
  		return 0;
  
  	return !tcp_is_reno(tp);
  }
95eacd27e   Ilpo Järvinen   [TCP]: fix commen...
1923
1924
1925
  /* F-RTO can only be used if TCP has never retransmitted anything other than
   * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
   */
46d0de4ed   Ilpo Järvinen   [TCP] FRTO: Entry...
1926
  int tcp_use_frto(struct sock *sk)
bdaae17da   Ilpo Järvinen   [TCP] FRTO: Moved...
1927
1928
  {
  	const struct tcp_sock *tp = tcp_sk(sk);
6adb4f733   Ilpo Järvinen   [TCP]: Don't allo...
1929
  	const struct inet_connection_sock *icsk = inet_csk(sk);
46d0de4ed   Ilpo Järvinen   [TCP] FRTO: Entry...
1930
  	struct sk_buff *skb;
575ee7140   Ilpo Järvinen   [TCP] FRTO: Delay...
1931
  	if (!sysctl_tcp_frto)
46d0de4ed   Ilpo Järvinen   [TCP] FRTO: Entry...
1932
  		return 0;
bdaae17da   Ilpo Järvinen   [TCP] FRTO: Moved...
1933

6adb4f733   Ilpo Järvinen   [TCP]: Don't allo...
1934
1935
1936
  	/* MTU probe and F-RTO won't really play nicely along currently */
  	if (icsk->icsk_mtup.probe_size)
  		return 0;
62ab22278   Ilpo Järvinen   tcp FRTO: SACK va...
1937
  	if (tcp_is_sackfrto(tp))
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
1938
  		return 1;
46d0de4ed   Ilpo Järvinen   [TCP] FRTO: Entry...
1939
1940
1941
  	/* Avoid expensive walking of rexmit queue if possible */
  	if (tp->retrans_out > 1)
  		return 0;
fe067e8ab   David S. Miller   [TCP]: Abstract o...
1942
  	skb = tcp_write_queue_head(sk);
28e3487b7   David S. Miller   tcp: Fix queue tr...
1943
1944
  	if (tcp_skb_is_last(sk, skb))
  		return 1;
fe067e8ab   David S. Miller   [TCP]: Abstract o...
1945
1946
1947
1948
  	skb = tcp_write_queue_next(sk, skb);	/* Skips head */
  	tcp_for_write_queue_from(skb, sk) {
  		if (skb == tcp_send_head(sk))
  			break;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1949
  		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
46d0de4ed   Ilpo Järvinen   [TCP] FRTO: Entry...
1950
1951
  			return 0;
  		/* Short-circuit when first non-SACKed skb has been checked */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1952
  		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
46d0de4ed   Ilpo Järvinen   [TCP] FRTO: Entry...
1953
1954
1955
  			break;
  	}
  	return 1;
bdaae17da   Ilpo Järvinen   [TCP] FRTO: Moved...
1956
  }
30935cf4f   Ilpo Järvinen   [TCP] FRTO: Comme...
1957
1958
  /* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
   * recovery a bit and use heuristics in tcp_process_frto() to detect if
d1a54c6a0   Ilpo Järvinen   [TCP] FRTO: Rever...
1959
1960
1961
1962
1963
   * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
   * keep retrans_out counting accurate (with SACK F-RTO, other than head
   * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS
   * bits are handled if the Loss state is really to be entered (in
   * tcp_enter_frto_loss).
7487c48c4   Ilpo Järvinen   [TCP] FRTO: Conse...
1964
1965
1966
1967
   *
   * Do like tcp_enter_loss() would; when RTO expires the second time it
   * does:
   *  "Reduce ssthresh if it has not yet been made inside this window."
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1968
1969
1970
   */
  void tcp_enter_frto(struct sock *sk)
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
1971
  	const struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1972
1973
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *skb;
7487c48c4   Ilpo Järvinen   [TCP] FRTO: Conse...
1974
  	if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) ||
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
1975
  	    tp->snd_una == tp->high_seq ||
7487c48c4   Ilpo Järvinen   [TCP] FRTO: Conse...
1976
1977
  	    ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
  	     !icsk->icsk_retransmits)) {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
1978
  		tp->prior_ssthresh = tcp_current_ssthresh(sk);
66e93e45c   Ilpo Järvinen   [TCP] FRTO: Fake ...
1979
  		/* Our state is too optimistic in ssthresh() call because cwnd
564262c1f   Ryousei Takano   [TCP]: Fix incons...
1980
  		 * is not reduced until tcp_enter_frto_loss() when previous F-RTO
66e93e45c   Ilpo Järvinen   [TCP] FRTO: Fake ...
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
  		 * recovery has not yet completed. Pattern would be this: RTO,
  		 * Cumulative ACK, RTO (2xRTO for the same segment does not end
  		 * up here twice).
  		 * RFC4138 should be more specific on what to do, even though
  		 * RTO is quite unlikely to occur after the first Cumulative ACK
  		 * due to back-off and complexity of triggering events ...
  		 */
  		if (tp->frto_counter) {
  			u32 stored_cwnd;
  			stored_cwnd = tp->snd_cwnd;
  			tp->snd_cwnd = 2;
  			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
  			tp->snd_cwnd = stored_cwnd;
  		} else {
  			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
  		}
  		/* ... in theory, cong.control module could do "any tricks" in
  		 * ssthresh(), which means that ca_state, lost bits and lost_out
  		 * counter would have to be faked before the call occurs. We
  		 * consider that too expensive, unlikely and hacky, so modules
  		 * using these in ssthresh() must deal these incompatibility
  		 * issues if they receives CA_EVENT_FRTO and frto_counter != 0
  		 */
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2004
  		tcp_ca_event(sk, CA_EVENT_FRTO);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2005
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2006
2007
  	tp->undo_marker = tp->snd_una;
  	tp->undo_retrans = 0;
fe067e8ab   David S. Miller   [TCP]: Abstract o...
2008
  	skb = tcp_write_queue_head(sk);
009a2e3e4   Ilpo Järvinen   [TCP] FRTO: Impro...
2009
2010
  	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
  		tp->undo_marker = 0;
d1a54c6a0   Ilpo Järvinen   [TCP] FRTO: Rever...
2011
  	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
522e7548a   Ilpo Järvinen   [TCP] FRTO: Incor...
2012
  		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
d1a54c6a0   Ilpo Järvinen   [TCP] FRTO: Rever...
2013
  		tp->retrans_out -= tcp_skb_pcount(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2014
  	}
005903bc3   Ilpo Järvinen   [TCP]: Left out s...
2015
  	tcp_verify_left_out(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2016

746aa32d2   Ilpo Järvinen   [TCP] FRTO: Limit...
2017
2018
  	/* Too bad if TCP was application limited */
  	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
2019
2020
2021
  	/* Earlier loss recovery underway (see RFC4138; Appendix B).
  	 * The last condition is necessary at least in tp->frto_counter case.
  	 */
62ab22278   Ilpo Järvinen   tcp FRTO: SACK va...
2022
  	if (tcp_is_sackfrto(tp) && (tp->frto_counter ||
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
2023
2024
2025
2026
2027
2028
  	    ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
  	    after(tp->high_seq, tp->snd_una)) {
  		tp->frto_highmark = tp->high_seq;
  	} else {
  		tp->frto_highmark = tp->snd_nxt;
  	}
7b0eb22b1   Ilpo Järvinen   [TCP] FRTO: Use D...
2029
2030
  	tcp_set_ca_state(sk, TCP_CA_Disorder);
  	tp->high_seq = tp->snd_nxt;
7487c48c4   Ilpo Järvinen   [TCP] FRTO: Conse...
2031
  	tp->frto_counter = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2032
2033
2034
2035
2036
2037
  }
  
  /* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
   * which indicates that we should follow the traditional RTO recovery,
   * i.e. mark everything lost and do go-back-N retransmission.
   */
d1a54c6a0   Ilpo Järvinen   [TCP] FRTO: Rever...
2038
  static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2039
2040
2041
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *skb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2042

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2043
  	tp->lost_out = 0;
d1a54c6a0   Ilpo Järvinen   [TCP] FRTO: Rever...
2044
  	tp->retrans_out = 0;
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
2045
  	if (tcp_is_reno(tp))
9bff40fda   Ilpo Järvinen   [TCP] FRTO: remov...
2046
  		tcp_reset_reno_sack(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2047

fe067e8ab   David S. Miller   [TCP]: Abstract o...
2048
2049
2050
  	tcp_for_write_queue(skb, sk) {
  		if (skb == tcp_send_head(sk))
  			break;
23aeeec36   Ilpo Järvinen   [TCP] FRTO: Plug ...
2051
2052
  
  		TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
d1a54c6a0   Ilpo Järvinen   [TCP] FRTO: Rever...
2053
2054
2055
2056
  		/*
  		 * Count the retransmission made on RTO correctly (only when
  		 * waiting for the first ACK and did not get it)...
  		 */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2057
  		if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) {
0a9f2a467   Ilpo Järvinen   [TCP]: Verify the...
2058
2059
2060
  			/* For some reason this R-bit might get cleared? */
  			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
  				tp->retrans_out += tcp_skb_pcount(skb);
d1a54c6a0   Ilpo Järvinen   [TCP] FRTO: Rever...
2061
2062
2063
  			/* ...enter this if branch just for the first segment */
  			flag |= FLAG_DATA_ACKED;
  		} else {
009a2e3e4   Ilpo Järvinen   [TCP] FRTO: Impro...
2064
2065
  			if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
  				tp->undo_marker = 0;
23aeeec36   Ilpo Järvinen   [TCP] FRTO: Plug ...
2066
  			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
d1a54c6a0   Ilpo Järvinen   [TCP] FRTO: Rever...
2067
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2068

79d44516b   Ilpo Järvinen   tcp FRTO: work-ar...
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
  		/* Marking forward transmissions that were made after RTO lost
  		 * can cause unnecessary retransmissions in some scenarios,
  		 * SACK blocks will mitigate that in some but not in all cases.
  		 * We used to not mark them but it was causing break-ups with
  		 * receivers that do only in-order receival.
  		 *
  		 * TODO: we could detect presence of such receiver and select
  		 * different behavior per flow.
  		 */
  		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
9bff40fda   Ilpo Järvinen   [TCP] FRTO: remov...
2079
2080
  			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
  			tp->lost_out += tcp_skb_pcount(skb);
006f582c7   Ilpo Järvinen   tcp: convert retr...
2081
  			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2082
2083
  		}
  	}
005903bc3   Ilpo Järvinen   [TCP]: Left out s...
2084
  	tcp_verify_left_out(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2085

95c4922bf   Ilpo Järvinen   [TCP] FRTO: fixes...
2086
  	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2087
2088
  	tp->snd_cwnd_cnt = 0;
  	tp->snd_cwnd_stamp = tcp_time_stamp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2089
  	tp->frto_counter = 0;
16e906812   Ilpo Järvinen   [TCP]: Add bytes_...
2090
  	tp->bytes_acked = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2091
2092
  
  	tp->reordering = min_t(unsigned int, tp->reordering,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2093
  			       sysctl_tcp_reordering);
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2094
  	tcp_set_ca_state(sk, TCP_CA_Loss);
79d44516b   Ilpo Järvinen   tcp FRTO: work-ar...
2095
  	tp->high_seq = tp->snd_nxt;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2096
  	TCP_ECN_queue_cwr(tp);
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
2097

64edc2736   Ilpo Järvinen   tcp: Partial hint...
2098
  	tcp_clear_all_retrans_hints(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2099
  }
4cd829995   Ilpo Järvinen   [TCP]: No need to...
2100
  static void tcp_clear_retrans_partial(struct tcp_sock *tp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2101
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2102
  	tp->retrans_out = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2103
2104
2105
2106
2107
  	tp->lost_out = 0;
  
  	tp->undo_marker = 0;
  	tp->undo_retrans = 0;
  }
4cd829995   Ilpo Järvinen   [TCP]: No need to...
2108
2109
2110
2111
2112
2113
2114
  /* Zero everything tcp_clear_retrans_partial() zeroes and, in addition,
   * sacked_out and fackets_out.
   */
  void tcp_clear_retrans(struct tcp_sock *tp)
  {
  	tcp_clear_retrans_partial(tp);
  
  	tp->sacked_out = 0;
  	tp->fackets_out = 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2115
2116
2117
2118
2119
2120
  /* Enter Loss state. If "how" is not zero, forget all SACK information
   * and reset tags completely, otherwise preserve SACKs. If receiver
   * dropped its ofo queue, we will know this due to reneging detection.
   */
  void tcp_enter_loss(struct sock *sk, int how)
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2121
  	const struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2122
2123
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *skb;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2124
2125
  
  	/* Reduce ssthresh if it has not yet been made inside this window. */
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2126
2127
2128
2129
2130
  	if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
  	    (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
  		tp->prior_ssthresh = tcp_current_ssthresh(sk);
  		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
  		tcp_ca_event(sk, CA_EVENT_LOSS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2131
2132
2133
2134
  	}
  	tp->snd_cwnd	   = 1;
  	tp->snd_cwnd_cnt   = 0;
  	tp->snd_cwnd_stamp = tcp_time_stamp;
9772efb97   Stephen Hemminger   [TCP]: Appropriat...
2135
  	tp->bytes_acked = 0;
4cd829995   Ilpo Järvinen   [TCP]: No need to...
2136
2137
2138
2139
  	tcp_clear_retrans_partial(tp);
  
  	if (tcp_is_reno(tp))
  		tcp_reset_reno_sack(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2140

b76892051   Ilpo Järvinen   [TCP]: Avoid clea...
2141
2142
2143
  	if (!how) {
  		/* Push undo marker, if it was plain RTO and nothing
  		 * was retransmitted. */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2144
  		tp->undo_marker = tp->snd_una;
b76892051   Ilpo Järvinen   [TCP]: Avoid clea...
2145
  	} else {
4cd829995   Ilpo Järvinen   [TCP]: No need to...
2146
2147
  		tp->sacked_out = 0;
  		tp->fackets_out = 0;
b76892051   Ilpo Järvinen   [TCP]: Avoid clea...
2148
  	}
64edc2736   Ilpo Järvinen   tcp: Partial hint...
2149
  	tcp_clear_all_retrans_hints(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2150

fe067e8ab   David S. Miller   [TCP]: Abstract o...
2151
2152
2153
  	tcp_for_write_queue(skb, sk) {
  		if (skb == tcp_send_head(sk))
  			break;
4cd829995   Ilpo Järvinen   [TCP]: No need to...
2154

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2155
  		if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2156
2157
2158
2159
2160
2161
  			tp->undo_marker = 0;
  		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
  		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
  			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
  			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
  			tp->lost_out += tcp_skb_pcount(skb);
006f582c7   Ilpo Järvinen   tcp: convert retr...
2162
  			tp->retransmit_high = TCP_SKB_CB(skb)->end_seq;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2163
2164
  		}
  	}
005903bc3   Ilpo Järvinen   [TCP]: Left out s...
2165
  	tcp_verify_left_out(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2166
2167
  
  	tp->reordering = min_t(unsigned int, tp->reordering,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2168
  			       sysctl_tcp_reordering);
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2169
  	tcp_set_ca_state(sk, TCP_CA_Loss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2170
2171
  	tp->high_seq = tp->snd_nxt;
  	TCP_ECN_queue_cwr(tp);
564262c1f   Ryousei Takano   [TCP]: Fix incons...
2172
  	/* Abort F-RTO algorithm if one is in progress */
580e572a4   Ilpo Järvinen   [TCP] FRTO: Preve...
2173
  	tp->frto_counter = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2174
  }
cadbd0313   Ilpo Järvinen   [TCP]: Dropped un...
2175
2176
2177
2178
2179
2180
2181
  /* If ACK arrived pointing to a remembered SACK, it means that our
   * remembered SACKs do not reflect real state of receiver i.e.
   * receiver _host_ is heavily congested (or buggy).
   *
   * Do processing similar to RTO timeout.
   */
  static int tcp_check_sack_reneging(struct sock *sk, int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2182
  {
cadbd0313   Ilpo Järvinen   [TCP]: Dropped un...
2183
  	if (flag & FLAG_SACK_RENEGING) {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2184
  		struct inet_connection_sock *icsk = inet_csk(sk);
de0744af1   Pavel Emelyanov   mib: add net to N...
2185
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2186
2187
  
  		tcp_enter_loss(sk, 1);
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2188
  		icsk->icsk_retransmits++;
fe067e8ab   David S. Miller   [TCP]: Abstract o...
2189
  		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
2190
  		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2191
  					  icsk->icsk_rto, TCP_RTO_MAX);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2192
2193
2194
2195
  		return 1;
  	}
  	return 0;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
2196
  static inline int tcp_fackets_out(const struct tcp_sock *tp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2197
  {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2198
  	return tcp_is_reno(tp) ? tp->sacked_out + 1 : tp->fackets_out;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2199
  }
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
  /* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
   * counter when SACK is enabled (without SACK, sacked_out is used for
   * that purpose).
   *
   * Instead, with FACK TCP uses fackets_out that includes both SACKed
   * segments up to the highest received SACK block so far and holes in
   * between them.
   *
   * With reordering, holes may still be in flight, so RFC3517 recovery
   * uses pure sacked_out (total number of SACKed segments) even though
   * it violates the RFC that uses duplicate ACKs, often these are equal
   * but when e.g. out-of-window ACKs or packet duplication occurs,
   * they differ. Since neither occurs due to loss, TCP should really
   * ignore them.
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
2215
  static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2216
2217
2218
  {
  	return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
2219
2220
  static inline int tcp_skb_timedout(const struct sock *sk,
  				   const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2221
  {
a02cec215   Eric Dumazet   net: return opera...
2222
  	return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2223
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
2224
  static inline int tcp_head_timedout(const struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2225
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
2226
  	const struct tcp_sock *tp = tcp_sk(sk);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2227

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2228
  	return tp->packets_out &&
fe067e8ab   David S. Miller   [TCP]: Abstract o...
2229
  	       tcp_skb_timedout(sk, tcp_write_queue_head(sk));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
  }
  
  /* Linux NewReno/SACK/FACK/ECN state machine.
   * --------------------------------------
   *
   * "Open"	Normal state, no dubious events, fast path.
   * "Disorder"   In all the respects it is "Open",
   *		but requires a bit more attention. It is entered when
   *		we see some SACKs or dupacks. It is split off from "Open"
   *		mainly to move some processing from fast path to slow one.
   * "CWR"	CWND was reduced due to some Congestion Notification event.
   *		It can be ECN, ICMP source quench, local device congestion.
   * "Recovery"	CWND was reduced, we are fast-retransmitting.
   * "Loss"	CWND was reduced due to RTO timeout or SACK reneging.
   *
   * tcp_fastretrans_alert() is entered:
   * - each incoming ACK, if state is not "Open"
   * - when arrived ACK is unusual, namely:
   *	* SACK
   *	* Duplicate ACK.
   *	* ECN ECE.
   *
   * Counting packets in flight is pretty simple.
   *
   *	in_flight = packets_out - left_out + retrans_out
   *
   *	packets_out is SND.NXT-SND.UNA counted in packets.
   *
   *	retrans_out is number of retransmitted segments.
   *
   *	left_out is number of segments left network, but not ACKed yet.
   *
   *		left_out = sacked_out + lost_out
   *
   *     sacked_out: Packets, which arrived to receiver out of order
   *		   and hence not ACKed. With SACKs this number is simply
   *		   amount of SACKed data. Even without SACKs
   *		   it is easy to give pretty reliable estimate of this number,
   *		   counting duplicate ACKs.
   *
   *       lost_out: Packets lost by network. TCP has no explicit
   *		   "loss notification" feedback from network (for now).
   *		   It means that this number can be only _guessed_.
   *		   Actually, it is the heuristics to predict lossage that
   *		   distinguishes different algorithms.
   *
   *	F.e. after RTO, when all the queue is considered as lost,
   *	lost_out = packets_out and in_flight = retrans_out.
   *
   *		Essentially, we have now two algorithms counting
   *		lost packets.
   *
   *		FACK: It is the simplest heuristics. As soon as we decided
   *		that something is lost, we decide that _all_ not SACKed
   *		packets until the most forward SACK are lost. I.e.
   *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
   *		It is absolutely correct estimate, if network does not reorder
   *		packets. And it loses any connection to reality when reordering
   *		takes place. We use FACK by default until reordering
   *		is suspected on the path to this destination.
   *
   *		NewReno: when Recovery is entered, we assume that one segment
   *		is lost (classic Reno). While we are in Recovery and
   *		a partial ACK arrives, we assume that one more packet
   *		is lost (NewReno). These heuristics are the same in NewReno
   *		and SACK.
   *
   *  Imagine, that's all! Forget about all this shamanism about CWND inflation
   *  deflation etc. CWND is real congestion window, never inflated, changes
   *  only according to classic VJ rules.
   *
   * Really tricky (and requiring careful tuning) part of algorithm
   * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
   * The first determines the moment _when_ we should reduce CWND and,
   * hence, slow down forward transmission. In fact, it determines the moment
   * when we decide that hole is caused by loss, rather than by a reorder.
   *
   * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
   * holes, caused by lost packets.
   *
   * And the most logically complicated part of algorithm is undo
   * heuristics. We detect false retransmits due to both too early
   * fast retransmit (reordering) and underestimated RTO, analyzing
   * timestamps and D-SACKs. When we detect that some segments were
   * retransmitted by mistake and CWND reduction was wrong, we undo
   * window reduction and abort recovery phase. This logic is hidden
   * inside several functions named tcp_try_undo_<something>.
   */
  
  /* This function decides, when we should leave Disordered state
   * and enter Recovery phase, reducing congestion window.
   *
   * Main question: may we further continue forward transmission
   * with the same cwnd?
   */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2325
  static int tcp_time_to_recover(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2326
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2327
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2328
  	__u32 packets_out;
564262c1f   Ryousei Takano   [TCP]: Fix incons...
2329
  	/* Do not perform any recovery during F-RTO algorithm */
52c63f1e8   Ilpo Järvinen   [TCP]: Don't ente...
2330
2331
  	if (tp->frto_counter)
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2332
2333
2334
2335
2336
  	/* Trick#1: The loss is proven. */
  	if (tp->lost_out)
  		return 1;
  
  	/* Not-A-Trick#2 : Classic rule... */
ea84e5555   Andreas Petlund   net: Corrected sp...
2337
  	if (tcp_dupack_heuristics(tp) > tp->reordering)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2338
2339
2340
2341
2342
  		return 1;
  
  	/* Trick#3 : when we use RFC2988 timer restart, fast
  	 * retransmit can be triggered by timeout of queue head.
  	 */
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2343
  	if (tcp_is_fack(tp) && tcp_head_timedout(sk))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2344
2345
2346
2347
2348
2349
2350
2351
  		return 1;
  
  	/* Trick#4: It is still not OK... But will it be useful to delay
  	 * recovery more?
  	 */
  	packets_out = tp->packets_out;
  	if (packets_out <= tp->reordering &&
  	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2352
  	    !tcp_may_send_now(sk)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2353
2354
2355
2356
2357
  		/* We have nothing to send. This connection is limited
  		 * either by receiver window or by application.
  		 */
  		return 1;
  	}
7e3801755   Andreas Petlund   net: TCP thin dupack
2358
2359
2360
2361
2362
2363
2364
2365
2366
  	/* If a thin stream is detected, retransmit after first
  	 * received dupack. Employ only if SACK is supported in order
  	 * to avoid possible corner-case series of spurious retransmissions
  	 * Use only if there are no unsent data.
  	 */
  	if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
  	    tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
  	    tcp_is_sack(tp) && !tcp_send_head(sk))
  		return 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2367
2368
  	return 0;
  }
7363a5b23   Ilpo Järvinen   tcp: separate tim...
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
  /* New heuristics: it is possible only after we switched to restart timer
   * each time when something is ACKed. Hence, we can detect timed out packets
   * during fast retransmit without falling to slow start.
   *
   * Usefulness of this as is very questionable, since we should know which of
   * the segments is the next to timeout which is relatively expensive to find
   * in general case unless we add some data structure just for that. The
   * current approach certainly won't find the right one too often and when it
   * finally does find _something_ it usually marks large part of the window
   * right away (because a retransmission with a larger timestamp blocks the
   * loop from advancing). -ij
   */
  static void tcp_timeout_skbs(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *skb;

  	/* Only with FACK, and only once the queue head has timed out. */
  	if (!(tcp_is_fack(tp) && tcp_head_timedout(sk)))
  		return;

  	/* Resume from the cached scoreboard position when there is one. */
  	skb = tp->scoreboard_skb_hint;
  	if (!skb)
  		skb = tcp_write_queue_head(sk);

  	/* Mark skbs lost until the send head or the first skb that has
  	 * not timed out yet.
  	 */
  	tcp_for_write_queue_from(skb, sk) {
  		if (skb == tcp_send_head(sk) || !tcp_skb_timedout(sk, skb))
  			break;

  		tcp_skb_mark_lost(tp, skb);
  	}

  	/* Remember where the walk stopped for the next call. */
  	tp->scoreboard_skb_hint = skb;

  	tcp_verify_left_out(tp);
  }
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2406
2407
2408
  /* Mark head of queue up as lost. With RFC3517 SACK, "packets" is
   * compared against the sacked "cnt", otherwise against the facked "cnt"
   */
1fdb93610   Ilpo Järvinen   tcp: sack lost ma...
2409
  /* Walk the write queue from its head (or from the cached lost_skb_hint)
   * and mark skbs as lost until the running count "cnt" exceeds "packets".
   *
   * @sk:        socket whose write queue is scanned
   * @packets:   count threshold; a warning fires if it exceeds packets_out
   * @mark_head: when nonzero at most one skb is marked per call, and the
   *             call is a no-op if the cached hint is not the queue head
   */
  static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2410
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2411
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2412
  	struct sk_buff *skb;
c137f3dda   Ilpo Järvinen   [TCP]: Fix NewRen...
2413
2414
2415
  	int cnt, oldcnt;
  	int err;
  	unsigned int mss;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2416

547b792ca   Ilpo Järvinen   net: convert BUG_...
2417
  	WARN_ON(packets > tp->packets_out);
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
2418
2419
2420
  	/* Resume from the position and count cached by a previous call. */
  	if (tp->lost_skb_hint) {
  		skb = tp->lost_skb_hint;
  		cnt = tp->lost_cnt_hint;
1fdb93610   Ilpo Järvinen   tcp: sack lost ma...
2421
2422
2423
  		/* Head already handled? */
  		if (mark_head && skb != tcp_write_queue_head(sk))
  			return;
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
2424
  	} else {
fe067e8ab   David S. Miller   [TCP]: Abstract o...
2425
  		skb = tcp_write_queue_head(sk);
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
2426
2427
  		cnt = 0;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2428

fe067e8ab   David S. Miller   [TCP]: Abstract o...
2429
2430
2431
  	tcp_for_write_queue_from(skb, sk) {
  		/* Never walk into data that has not been sent yet. */
  		if (skb == tcp_send_head(sk))
  			break;
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
2432
2433
2434
2435
  		/* TODO: do this better */
  		/* this is not the most efficient way to do this... */
  		tp->lost_skb_hint = skb;
  		tp->lost_cnt_hint = cnt;
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2436

c137f3dda   Ilpo Järvinen   [TCP]: Fix NewRen...
2437
2438
2439
2440
  		/* Stop at the first skb that ends beyond high_seq. */
  		if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
  			break;
  
  		/* With FACK or Reno every skb is counted; otherwise only
  		 * skbs that are already SACKed add to the count.
  		 */
  		oldcnt = cnt;
ad1984e84   Ilpo Järvinen   [TCP]: NewReno mu...
2441
  		if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2442
2443
  		    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
  			cnt += tcp_skb_pcount(skb);
c137f3dda   Ilpo Järvinen   [TCP]: Fix NewRen...
2444
  		/* This skb pushed the count past the threshold. For SACK
  		 * without FACK, or when the threshold was already reached
  		 * before this skb, stop without marking it. Otherwise split
  		 * the skb so only its first (packets - oldcnt) segments are
  		 * marked, and give up if the split fails.
  		 */
  		if (cnt > packets) {
b3de7559a   Yuchung Cheng   tcp: fix TSO FACK...
2445
2446
  			if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) ||
  			    (oldcnt >= packets))
c137f3dda   Ilpo Järvinen   [TCP]: Fix NewRen...
2447
2448
2449
2450
2451
2452
2453
2454
  				break;
  
  			mss = skb_shinfo(skb)->gso_size;
  			err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
  			if (err < 0)
  				break;
  			cnt = packets;
  		}
41ea36e35   Ilpo Järvinen   tcp: add helper f...
2455
  		tcp_skb_mark_lost(tp, skb);
1fdb93610   Ilpo Järvinen   tcp: sack lost ma...
2456
2457
2458
  
  		/* In mark_head mode a single marked skb is enough. */
  		if (mark_head)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2459
  	}
005903bc3   Ilpo Järvinen   [TCP]: Left out s...
2460
  	tcp_verify_left_out(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2461
2462
2463
  }
  
  /* Account newly detected lost packet(s) */
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2464
  static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2465
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2466
  	struct tcp_sock *tp = tcp_sk(sk);
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2467
  	if (tcp_is_reno(tp)) {
1fdb93610   Ilpo Järvinen   tcp: sack lost ma...
2468
  		tcp_mark_head_lost(sk, 1, 1);
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2469
  	} else if (tcp_is_fack(tp)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2470
2471
2472
  		int lost = tp->fackets_out - tp->reordering;
  		if (lost <= 0)
  			lost = 1;
1fdb93610   Ilpo Järvinen   tcp: sack lost ma...
2473
  		tcp_mark_head_lost(sk, lost, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2474
  	} else {
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2475
  		int sacked_upto = tp->sacked_out - tp->reordering;
1fdb93610   Ilpo Järvinen   tcp: sack lost ma...
2476
2477
2478
2479
  		if (sacked_upto >= 0)
  			tcp_mark_head_lost(sk, sacked_upto, 0);
  		else if (fast_rexmit)
  			tcp_mark_head_lost(sk, 1, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2480
  	}
7363a5b23   Ilpo Järvinen   tcp: separate tim...
2481
  	tcp_timeout_skbs(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2482
2483
2484
2485
2486
2487
2488
2489
  }
  
  /* CWND moderation, preventing bursts due to too big ACKs
   * in dubious situations.
   */
  static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
  {
  	tp->snd_cwnd = min(tp->snd_cwnd,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2490
  			   tcp_packets_in_flight(tp) + tcp_max_burst(tp));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2491
2492
  	tp->snd_cwnd_stamp = tcp_time_stamp;
  }
72dc5b922   Stephen Hemminger   [TCP]: Minimum co...
2493
2494
2495
2496
2497
2498
2499
2500
2501
  /* Lower bound on congestion window is slow start threshold
   * unless congestion avoidance choice decides to override it.
   */
  static inline u32 tcp_cwnd_min(const struct sock *sk)
  {
  	const struct tcp_congestion_ops *ops = inet_csk(sk)->icsk_ca_ops;

  	/* The congestion control module's min_cwnd hook wins if present. */
  	if (ops->min_cwnd)
  		return ops->min_cwnd(sk);

  	return tcp_sk(sk)->snd_ssthresh;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2502
  /* Decrease cwnd each second ack. */
1e757f999   Ilpo Järvinen   [TCP]: Fix rateha...
2503
  static void tcp_cwnd_down(struct sock *sk, int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2504
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2505
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2506
  	int decr = tp->snd_cwnd_cnt + 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2507

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2508
2509
2510
  	if ((flag & (FLAG_ANY_PROGRESS | FLAG_DSACKING_ACK)) ||
  	    (tcp_is_reno(tp) && !(flag & FLAG_NOT_DUP))) {
  		tp->snd_cwnd_cnt = decr & 1;
1e757f999   Ilpo Järvinen   [TCP]: Fix rateha...
2511
  		decr >>= 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2512

1e757f999   Ilpo Järvinen   [TCP]: Fix rateha...
2513
2514
  		if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
  			tp->snd_cwnd -= decr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2515

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2516
  		tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1);
1e757f999   Ilpo Järvinen   [TCP]: Fix rateha...
2517
2518
  		tp->snd_cwnd_stamp = tcp_time_stamp;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2519
2520
2521
2522
2523
  }
  
  /* Nothing was retransmitted or returned timestamp is less
   * than timestamp of the first retransmission.
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
2524
  static inline int tcp_packet_delayed(const struct tcp_sock *tp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2525
2526
2527
  {
  	return !tp->retrans_stamp ||
  		(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
d7ee147d4   Arnd Hannemann   tcp: Make use of ...
2528
  		 before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2529
2530
2531
2532
2533
  }
  
  /* Undo procedures. */
  
  #if FASTRETRANS_DEBUG > 1
  /* Debug trace for the undo paths; compiled to a no-op macro unless
   * FASTRETRANS_DEBUG > 1.
   *
   * Logs @msg with the peer address and port plus snd_cwnd, left_out,
   * snd_ssthresh/prior_ssthresh and packets_out of @sk. Only AF_INET and
   * (when IPv6 is enabled) AF_INET6 sockets produce output.
   */
  static void DBGUNDO(struct sock *sk, const char *msg)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct inet_sock *inet = inet_sk(sk);

  	if (sk->sk_family == AF_INET) {
  		/* The format string must stay on one line and end in "\n". */
  		printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
  		       msg,
  		       &inet->inet_daddr, ntohs(inet->inet_dport),
  		       tp->snd_cwnd, tcp_left_out(tp),
  		       tp->snd_ssthresh, tp->prior_ssthresh,
  		       tp->packets_out);
  	}
  #if IS_ENABLED(CONFIG_IPV6)
  	else if (sk->sk_family == AF_INET6) {
  		struct ipv6_pinfo *np = inet6_sk(sk);

  		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
  		       msg,
  		       &np->daddr, ntohs(inet->inet_dport),
  		       tp->snd_cwnd, tcp_left_out(tp),
  		       tp->snd_ssthresh, tp->prior_ssthresh,
  		       tp->packets_out);
  	}
  #endif
  }
  #else
  #define DBGUNDO(x...) do { } while (0)
  #endif
f6152737a   David S. Miller   tcp: Make undo_ss...
2564
  static void tcp_undo_cwr(struct sock *sk, const bool undo_ssthresh)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2565
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2566
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2567
  	if (tp->prior_ssthresh) {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2568
2569
2570
2571
  		const struct inet_connection_sock *icsk = inet_csk(sk);
  
  		if (icsk->icsk_ca_ops->undo_cwnd)
  			tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2572
  		else
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2573
  			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh << 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2574

67d4120a1   Yuchung Cheng   tcp: avoid cwnd m...
2575
  		if (undo_ssthresh && tp->prior_ssthresh > tp->snd_ssthresh) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2576
2577
2578
2579
2580
2581
  			tp->snd_ssthresh = tp->prior_ssthresh;
  			TCP_ECN_withdraw_cwr(tp);
  		}
  	} else {
  		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2582
2583
  	tp->snd_cwnd_stamp = tcp_time_stamp;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
2584
  static inline int tcp_may_undo(const struct tcp_sock *tp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2585
  {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2586
  	return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2587
2588
2589
  }
  
  /* People celebrate: "We love our President!" */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2590
  static int tcp_try_undo_recovery(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2591
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2592
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2593
  	if (tcp_may_undo(tp)) {
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
2594
  		int mib_idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2595
2596
2597
  		/* Happy end! We did not retransmit anything
  		 * or our original transmission succeeded.
  		 */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2598
  		DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans");
f6152737a   David S. Miller   tcp: Make undo_ss...
2599
  		tcp_undo_cwr(sk, true);
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2600
  		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss)
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
2601
  			mib_idx = LINUX_MIB_TCPLOSSUNDO;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2602
  		else
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
2603
  			mib_idx = LINUX_MIB_TCPFULLUNDO;
de0744af1   Pavel Emelyanov   mib: add net to N...
2604
  		NET_INC_STATS_BH(sock_net(sk), mib_idx);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2605
2606
  		tp->undo_marker = 0;
  	}
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
2607
  	if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2608
2609
2610
2611
2612
2613
  		/* Hold old state until something *above* high_seq
  		 * is ACKed. For Reno it is MUST to prevent false
  		 * fast retransmits (RFC2582). SACK TCP is safe. */
  		tcp_moderate_cwnd(tp);
  		return 1;
  	}
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2614
  	tcp_set_ca_state(sk, TCP_CA_Open);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2615
2616
2617
2618
  	return 0;
  }
  
  /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2619
  static void tcp_try_undo_dsack(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2620
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2621
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2622
  	if (tp->undo_marker && !tp->undo_retrans) {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2623
  		DBGUNDO(sk, "D-SACK");
f6152737a   David S. Miller   tcp: Make undo_ss...
2624
  		tcp_undo_cwr(sk, true);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2625
  		tp->undo_marker = 0;
de0744af1   Pavel Emelyanov   mib: add net to N...
2626
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2627
2628
  	}
  }
77722b177   Ilpo Järvinen   tcp: fix retrans_...
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
  /* We can clear retrans_stamp when there are no retransmissions in the
   * window. It would seem that it is trivially available for us in
   * tp->retrans_out, however, that kind of assumptions doesn't consider
   * what will happen if errors occur when sending retransmission for the
   * second time. ...It could be that such segment has only
   * TCPCB_EVER_RETRANS set at the present time. It seems that checking
   * the head skb is enough except for some reneging corner cases that
   * are not worth the effort.
   *
   * Main reason for all this complexity is the fact that connection dying
   * time now depends on the validity of the retrans_stamp, in particular,
   * that successive retransmissions of a segment must not advance
   * retrans_stamp under any conditions.
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
2643
  static int tcp_any_retrans_done(const struct sock *sk)
77722b177   Ilpo Järvinen   tcp: fix retrans_...
2644
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
2645
  	const struct tcp_sock *tp = tcp_sk(sk);
77722b177   Ilpo Järvinen   tcp: fix retrans_...
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
  	struct sk_buff *skb;
  
  	if (tp->retrans_out)
  		return 1;
  
  	skb = tcp_write_queue_head(sk);
  	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
  		return 1;
  
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2657
  /* Undo during fast recovery after partial ACK. */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2658
  static int tcp_try_undo_partial(struct sock *sk, int acked)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2659
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2660
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2661
  	/* Partial ACK arrived. Force Hoe's retransmit. */
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2662
  	int failed = tcp_is_reno(tp) || (tcp_fackets_out(tp) > tp->reordering);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2663
2664
2665
2666
2667
  
  	if (tcp_may_undo(tp)) {
  		/* Plain luck! Hole if filled with delayed
  		 * packet, rather than with a retransmit.
  		 */
77722b177   Ilpo Järvinen   tcp: fix retrans_...
2668
  		if (!tcp_any_retrans_done(sk))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2669
  			tp->retrans_stamp = 0;
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2670
  		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2671

9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2672
  		DBGUNDO(sk, "Hoe");
f6152737a   David S. Miller   tcp: Make undo_ss...
2673
  		tcp_undo_cwr(sk, false);
de0744af1   Pavel Emelyanov   mib: add net to N...
2674
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
  
  		/* So... Do not make Hoe's retransmit yet.
  		 * If the first packet was delayed, the rest
  		 * ones are most probably delayed as well.
  		 */
  		failed = 0;
  	}
  	return failed;
  }
  
  /* Undo during loss recovery after partial ACK. */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2686
  static int tcp_try_undo_loss(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2687
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2688
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2689
2690
  	if (tcp_may_undo(tp)) {
  		struct sk_buff *skb;
fe067e8ab   David S. Miller   [TCP]: Abstract o...
2691
2692
2693
  		tcp_for_write_queue(skb, sk) {
  			if (skb == tcp_send_head(sk))
  				break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2694
2695
  			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
  		}
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
2696

5af4ec236   Ilpo Järvinen   [TCP]: clear_all_...
2697
  		tcp_clear_all_retrans_hints(tp);
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
2698

9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2699
  		DBGUNDO(sk, "partial loss");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2700
  		tp->lost_out = 0;
f6152737a   David S. Miller   tcp: Make undo_ss...
2701
  		tcp_undo_cwr(sk, true);
de0744af1   Pavel Emelyanov   mib: add net to N...
2702
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
2703
  		inet_csk(sk)->icsk_retransmits = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2704
  		tp->undo_marker = 0;
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
2705
  		if (tcp_is_sack(tp))
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2706
  			tcp_set_ca_state(sk, TCP_CA_Open);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2707
2708
2709
2710
  		return 1;
  	}
  	return 0;
  }
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2711
  static inline void tcp_complete_cwr(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2712
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2713
  	struct tcp_sock *tp = tcp_sk(sk);
a262f0cdf   Nandita Dukkipati   Proportional Rate...
2714
2715
2716
2717
2718
2719
2720
  
  	/* Do not moderate cwnd if it's already undone in cwr or recovery. */
  	if (tp->undo_marker) {
  		if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR)
  			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
  		else /* PRR */
  			tp->snd_cwnd = tp->snd_ssthresh;
67d4120a1   Yuchung Cheng   tcp: avoid cwnd m...
2721
2722
  		tp->snd_cwnd_stamp = tcp_time_stamp;
  	}
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2723
  	tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2724
  }
8aca6cb11   Ilpo Järvinen   tcp: Fix inconsis...
2725
2726
2727
2728
  static void tcp_try_keep_open(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	int state = TCP_CA_Open;
f698204bd   Neal Cardwell   tcp: allow undo f...
2729
  	if (tcp_left_out(tp) || tcp_any_retrans_done(sk))
8aca6cb11   Ilpo Järvinen   tcp: Fix inconsis...
2730
2731
2732
2733
2734
2735
2736
  		state = TCP_CA_Disorder;
  
  	if (inet_csk(sk)->icsk_ca_state != state) {
  		tcp_set_ca_state(sk, state);
  		tp->high_seq = tp->snd_nxt;
  	}
  }
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2737
  static void tcp_try_to_open(struct sock *sk, int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2738
  {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2739
  	struct tcp_sock *tp = tcp_sk(sk);
86426c22d   Ilpo Järvinen   [TCP]: Restore ov...
2740
  	tcp_verify_left_out(tp);
77722b177   Ilpo Järvinen   tcp: fix retrans_...
2741
  	if (!tp->frto_counter && !tcp_any_retrans_done(sk))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2742
  		tp->retrans_stamp = 0;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2743
  	if (flag & FLAG_ECE)
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
2744
  		tcp_enter_cwr(sk, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2745

6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2746
  	if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
8aca6cb11   Ilpo Järvinen   tcp: Fix inconsis...
2747
  		tcp_try_keep_open(sk);
8cd6d6162   Neal Cardwell   tcp: skip cwnd mo...
2748
2749
  		if (inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
  			tcp_moderate_cwnd(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2750
  	} else {
1e757f999   Ilpo Järvinen   [TCP]: Fix rateha...
2751
  		tcp_cwnd_down(sk, flag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2752
2753
  	}
  }
5d424d5a6   John Heffner   [TCP]: MTU probing
2754
2755
2756
2757
2758
2759
2760
  /* Record a failed MTU probe: the upper search bound drops to just below
   * the probed size and probe_size is reset to zero.
   */
  static void tcp_mtup_probe_failed(struct sock *sk)
  {
  	struct inet_connection_sock *conn = inet_csk(sk);

  	conn->icsk_mtup.search_high = conn->icsk_mtup.probe_size - 1;
  	conn->icsk_mtup.probe_size = 0;
  }
72211e905   Ilpo Järvinen   tcp: don't check ...
2761
  /* Record a successful MTU probe: rescale snd_cwnd by the ratio of the
   * MTU for the cached MSS to the probed size, raise the lower search bound
   * to the probed size, reset probe_size and resync the MSS.
   */
  static void tcp_mtup_probe_success(struct sock *sk)
5d424d5a6   John Heffner   [TCP]: MTU probing
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct inet_connection_sock *icsk = inet_csk(sk);
  
  	/* FIXME: breaks with very large cwnd */
  	/* prior_ssthresh is sampled before cwnd is rescaled below. */
  	tp->prior_ssthresh = tcp_current_ssthresh(sk);
  	/* NOTE(review): the product is formed before the division, which is
  	 * presumably the overflow the FIXME above refers to - confirm.
  	 */
  	tp->snd_cwnd = tp->snd_cwnd *
  		       tcp_mss_to_mtu(sk, tp->mss_cache) /
  		       icsk->icsk_mtup.probe_size;
  	tp->snd_cwnd_cnt = 0;
  	tp->snd_cwnd_stamp = tcp_time_stamp;
9c6d5e553   John Heffner   TCP: Fix setting ...
2773
  	/* ssthresh is re-derived after cwnd has been rescaled. */
  	tp->snd_ssthresh = tcp_current_ssthresh(sk);
5d424d5a6   John Heffner   [TCP]: MTU probing
2774
2775
2776
2777
2778
  
  	/* The probed size becomes the new lower search bound. */
  	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
  	icsk->icsk_mtup.probe_size = 0;
  	tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
  }
e1aa680fa   Ilpo Järvinen   tcp: move tcp_sim...
2779
2780
2781
2782
2783
2784
2785
2786
2787
  /* Do a simple retransmit without using the backoff mechanisms in
   * tcp_timer. This is used for path mtu discovery.
   * The socket is already locked here.
   */
  void tcp_simple_retransmit(struct sock *sk)
  {
  	const struct inet_connection_sock *icsk = inet_csk(sk);
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *skb;
0c54b85f2   Ilpo Järvinen   tcp: simplify tcp...
2788
  	unsigned int mss = tcp_current_mss(sk);
e1aa680fa   Ilpo Järvinen   tcp: move tcp_sim...
2789
2790
2791
2792
2793
  	/* Snapshot lost_out to detect whether the walk marked anything. */
  	u32 prior_lost = tp->lost_out;
  
  	/* Every sent, not-SACKed skb whose segment length exceeds the
  	 * current MSS is marked lost; a pending retransmit mark on it is
  	 * cleared and retrans_out is reduced by its packet count.
  	 */
  	tcp_for_write_queue(skb, sk) {
  		if (skb == tcp_send_head(sk))
  			break;
775ffabf7   Ilpo Järvinen   tcp: make mtu pro...
2794
  		if (tcp_skb_seglen(skb) > mss &&
e1aa680fa   Ilpo Järvinen   tcp: move tcp_sim...
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
  		    !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
  			if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
  				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
  				tp->retrans_out -= tcp_skb_pcount(skb);
  			}
  			tcp_skb_mark_lost_uncond_verify(tp, skb);
  		}
  	}
  
  	tcp_clear_retrans_hints_partial(tp);
  
  	/* Nothing new was marked lost: nothing to retransmit. */
  	if (prior_lost == tp->lost_out)
  		return;
  
  	if (tcp_is_reno(tp))
  		tcp_limit_reno_sacked(tp);
  
  	tcp_verify_left_out(tp);
  
  	/* Don't muck with the congestion window here.
  	 * Reason is that we do not increase amount of _data_
  	 * in network, but units changed and effective
  	 * cwnd/ssthresh really reduced now.
  	 */
  	/* Entering Loss here zeroes prior_ssthresh and undo_marker. */
  	if (icsk->icsk_ca_state != TCP_CA_Loss) {
  		tp->high_seq = tp->snd_nxt;
  		tp->snd_ssthresh = tcp_current_ssthresh(sk);
  		tp->prior_ssthresh = 0;
  		tp->undo_marker = 0;
  		tcp_set_ca_state(sk, TCP_CA_Loss);
  	}
  	tcp_xmit_retransmit_queue(sk);
  }
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
2828
  EXPORT_SYMBOL(tcp_simple_retransmit);
e1aa680fa   Ilpo Järvinen   tcp: move tcp_sim...
2829

a262f0cdf   Nandita Dukkipati   Proportional Rate...
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
  /* This function implements the PRR algorithm, specifcally the PRR-SSRB
   * (proportional rate reduction with slow start reduction bound) as described in
   * http://www.ietf.org/id/draft-mathis-tcpm-proportional-rate-reduction-01.txt.
   * It computes the number of packets to send (sndcnt) based on packets newly
   * delivered:
   *   1) If the packets in flight is larger than ssthresh, PRR spreads the
   *	cwnd reductions across a full RTT.
   *   2) If packets in flight is lower than ssthresh (such as due to excess
   *	losses and/or application stalls), do not perform any further cwnd
   *	reductions, but instead slow start up to ssthresh.
   */
  static void tcp_update_cwnd_in_recovery(struct sock *sk, int newly_acked_sacked,
  					int fast_rexmit, int flag)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	int sndcnt = 0;
  	int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
  
  	if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) {
  		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
  			       tp->prior_cwnd - 1;
  		sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
  	} else {
  		sndcnt = min_t(int, delta,
  			       max_t(int, tp->prr_delivered - tp->prr_out,
  				     newly_acked_sacked) + 1);
  	}
  
  	sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
  	tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
  /* Process an event, which can update packets-in-flight not trivially.
   * Main goal of this function is to calculate new estimate for left_out,
   * taking into account both packets sitting in receiver's buffer and
   * packets lost by network.
   *
   * Besides that it does CWND reduction, when packet loss is detected
   * and changes state of machine.
   *
   * It does _not_ decide what to send, it is made in function
   * tcp_xmit_retransmit_queue().
   */
a262f0cdf   Nandita Dukkipati   Proportional Rate...
2872
  static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
7d2b55f80   Neal Cardwell   tcp: make is_dupa...
2873
2874
  				  int newly_acked_sacked, bool is_dupack,
  				  int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2875
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2876
  	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2877
  	struct tcp_sock *tp = tcp_sk(sk);
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2878
  	int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2879
  				    (tcp_fackets_out(tp) > tp->reordering));
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
2880
  	int fast_rexmit = 0, mib_idx;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2881

3ccd3130b   Ilpo Järvinen   [TCP]: Make invar...
2882
  	if (WARN_ON(!tp->packets_out && tp->sacked_out))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2883
  		tp->sacked_out = 0;
91fed7a15   Ilpo Järvinen   [TCP]: Make facke...
2884
  	if (WARN_ON(!tp->sacked_out && tp->fackets_out))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2885
  		tp->fackets_out = 0;
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
2886
  	/* Now state machine starts.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2887
  	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2888
  	if (flag & FLAG_ECE)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2889
2890
2891
  		tp->prior_ssthresh = 0;
  
  	/* B. In all the states check for reneging SACKs. */
cadbd0313   Ilpo Järvinen   [TCP]: Dropped un...
2892
  	if (tcp_check_sack_reneging(sk, flag))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2893
2894
2895
  		return;
  
  	/* C. Process data loss notification, provided it is valid. */
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
2896
  	if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2897
  	    before(tp->snd_una, tp->high_seq) &&
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2898
  	    icsk->icsk_ca_state != TCP_CA_Open &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2899
  	    tp->fackets_out > tp->reordering) {
1fdb93610   Ilpo Järvinen   tcp: sack lost ma...
2900
  		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
de0744af1   Pavel Emelyanov   mib: add net to N...
2901
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2902
  	}
005903bc3   Ilpo Järvinen   [TCP]: Left out s...
2903
2904
  	/* D. Check consistency of the current state. */
  	tcp_verify_left_out(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2905
2906
2907
  
  	/* E. Check state exit conditions. State can be terminated
  	 *    when high_seq is ACKed. */
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2908
  	if (icsk->icsk_ca_state == TCP_CA_Open) {
547b792ca   Ilpo Järvinen   net: convert BUG_...
2909
  		WARN_ON(tp->retrans_out != 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2910
2911
  		tp->retrans_stamp = 0;
  	} else if (!before(tp->snd_una, tp->high_seq)) {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2912
  		switch (icsk->icsk_ca_state) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2913
  		case TCP_CA_Loss:
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2914
  			icsk->icsk_retransmits = 0;
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2915
  			if (tcp_try_undo_recovery(sk))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2916
2917
2918
2919
2920
2921
2922
  				return;
  			break;
  
  		case TCP_CA_CWR:
  			/* CWR is to be held something *above* high_seq
  			 * is ACKed for CWR bit to reach receiver. */
  			if (tp->snd_una != tp->high_seq) {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2923
2924
  				tcp_complete_cwr(sk);
  				tcp_set_ca_state(sk, TCP_CA_Open);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2925
2926
  			}
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2927
  		case TCP_CA_Recovery:
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
2928
  			if (tcp_is_reno(tp))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2929
  				tcp_reset_reno_sack(tp);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2930
  			if (tcp_try_undo_recovery(sk))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2931
  				return;
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2932
  			tcp_complete_cwr(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2933
2934
2935
2936
2937
  			break;
  		}
  	}
  
  	/* F. Process state. */
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2938
  	switch (icsk->icsk_ca_state) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2939
  	case TCP_CA_Recovery:
2e6052941   Ilpo Järvinen   [TCP]: Also handl...
2940
  		if (!(flag & FLAG_SND_UNA_ADVANCED)) {
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
2941
  			if (tcp_is_reno(tp) && is_dupack)
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2942
  				tcp_add_reno_sack(sk);
1b6d427bb   Ilpo Järvinen   [TCP]: Reduce sac...
2943
2944
  		} else
  			do_lost = tcp_try_undo_partial(sk, pkts_acked);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2945
2946
  		break;
  	case TCP_CA_Loss:
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2947
  		if (flag & FLAG_DATA_ACKED)
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2948
  			icsk->icsk_retransmits = 0;
882bebaac   Ilpo Järvinen   [TCP]: tcp_simple...
2949
2950
  		if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
  			tcp_reset_reno_sack(tp);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2951
  		if (!tcp_try_undo_loss(sk)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2952
2953
2954
2955
  			tcp_moderate_cwnd(tp);
  			tcp_xmit_retransmit_queue(sk);
  			return;
  		}
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2956
  		if (icsk->icsk_ca_state != TCP_CA_Open)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2957
2958
2959
  			return;
  		/* Loss is undone; fall through to processing in Open state. */
  	default:
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
2960
  		if (tcp_is_reno(tp)) {
2e6052941   Ilpo Järvinen   [TCP]: Also handl...
2961
  			if (flag & FLAG_SND_UNA_ADVANCED)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2962
2963
  				tcp_reset_reno_sack(tp);
  			if (is_dupack)
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2964
  				tcp_add_reno_sack(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2965
  		}
f698204bd   Neal Cardwell   tcp: allow undo f...
2966
  		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2967
  			tcp_try_undo_dsack(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2968

9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2969
2970
  		if (!tcp_time_to_recover(sk)) {
  			tcp_try_to_open(sk, flag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2971
2972
  			return;
  		}
5d424d5a6   John Heffner   [TCP]: MTU probing
2973
2974
2975
  		/* MTU probe failure: don't reduce cwnd */
  		if (icsk->icsk_ca_state < TCP_CA_CWR &&
  		    icsk->icsk_mtup.probe_size &&
0e7b13685   John Heffner   [TCP] mtu probing...
2976
  		    tp->snd_una == tp->mtu_probe.probe_seq_start) {
5d424d5a6   John Heffner   [TCP]: MTU probing
2977
2978
2979
2980
2981
2982
  			tcp_mtup_probe_failed(sk);
  			/* Restores the reduction we did in tcp_mtup_probe() */
  			tp->snd_cwnd++;
  			tcp_simple_retransmit(sk);
  			return;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2983
  		/* Otherwise enter Recovery state */
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
2984
  		if (tcp_is_reno(tp))
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
2985
  			mib_idx = LINUX_MIB_TCPRENORECOVERY;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2986
  		else
40b215e59   Pavel Emelyanov   tcp: de-bloat a b...
2987
  			mib_idx = LINUX_MIB_TCPSACKRECOVERY;
de0744af1   Pavel Emelyanov   mib: add net to N...
2988
  		NET_INC_STATS_BH(sock_net(sk), mib_idx);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2989
2990
2991
2992
2993
  
  		tp->high_seq = tp->snd_nxt;
  		tp->prior_ssthresh = 0;
  		tp->undo_marker = tp->snd_una;
  		tp->undo_retrans = tp->retrans_out;
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2994
  		if (icsk->icsk_ca_state < TCP_CA_CWR) {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2995
  			if (!(flag & FLAG_ECE))
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
2996
2997
  				tp->prior_ssthresh = tcp_current_ssthresh(sk);
  			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2998
2999
  			TCP_ECN_queue_cwr(tp);
  		}
9772efb97   Stephen Hemminger   [TCP]: Appropriat...
3000
  		tp->bytes_acked = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3001
  		tp->snd_cwnd_cnt = 0;
a262f0cdf   Nandita Dukkipati   Proportional Rate...
3002
3003
3004
  		tp->prior_cwnd = tp->snd_cwnd;
  		tp->prr_delivered = 0;
  		tp->prr_out = 0;
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3005
  		tcp_set_ca_state(sk, TCP_CA_Recovery);
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
3006
  		fast_rexmit = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3007
  	}
85cc391c0   Ilpo Järvinen   [TCP]: non-FACK S...
3008
3009
  	if (do_lost || (tcp_is_fack(tp) && tcp_head_timedout(sk)))
  		tcp_update_scoreboard(sk, fast_rexmit);
a262f0cdf   Nandita Dukkipati   Proportional Rate...
3010
3011
  	tp->prr_delivered += newly_acked_sacked;
  	tcp_update_cwnd_in_recovery(sk, newly_acked_sacked, fast_rexmit, flag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3012
3013
  	tcp_xmit_retransmit_queue(sk);
  }
9ad7c049f   Jerry Chu   tcp: RFC2988bis +...
3014
  /* Consume one valid RTT sample: feed it to the RTT estimator, recompute
   * the RTO from the updated estimate, and clear the connection's backoff
   * counter.
   */
  void tcp_valid_rtt_meas(struct sock *sk, u32 seq_rtt)
  {
  	struct inet_connection_sock *icsk = inet_csk(sk);

  	tcp_rtt_estimator(sk, seq_rtt);
  	tcp_set_rto(sk);
  	icsk->icsk_backoff = 0;
  }
  EXPORT_SYMBOL(tcp_valid_rtt_meas);
41834b733   Ilpo Järvinen   tcp: share code t...
3021

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3022
  /* Read draft-ietf-tcplw-high-performance before mucking
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
3023
   * with this code. (Supersedes RFC1323)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3024
   */
2d2abbab6   Stephen Hemminger   [TCP]: simplify m...
3025
  static void tcp_ack_saw_tstamp(struct sock *sk, int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3026
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3027
3028
3029
3030
3031
3032
3033
3034
3035
  	/* RTTM Rule: A TSecr value received in a segment is used to
  	 * update the averaged RTT measurement only if the segment
  	 * acknowledges some new data, i.e., only if it advances the
  	 * left edge of the send window.
  	 *
  	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
  	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
  	 *
  	 * Changed: reset backoff as soon as we see the first valid sample.
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
3036
  	 * If we do not, we get strongly overestimated rto. With timestamps
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3037
3038
3039
3040
3041
  	 * samples are accepted even from very old segments: f.e., when rtt=1
  	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
  	 * answer arrives rto becomes 120 seconds! If at least one of segments
  	 * in window is lost... Voila.	 			--ANK (010210)
  	 */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3042
  	struct tcp_sock *tp = tcp_sk(sk);
41834b733   Ilpo Järvinen   tcp: share code t...
3043
3044
  
  	tcp_valid_rtt_meas(sk, tcp_time_stamp - tp->rx_opt.rcv_tsecr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3045
  }
2d2abbab6   Stephen Hemminger   [TCP]: simplify m...
3046
  /* RTT sampling when no timestamp is available (Karn's algorithm,
   * SIGCOMM '87, p5).
   *
   * Only packets that were not retransmitted may be used for RTT estimates,
   * and the rto backoff must not be reset until a non-retransmitted packet
   * is acknowledged.  This lets us cope with a sudden increase in network
   * delay.
   */
  static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag)
  {
  	/* Sample is usable only if no retransmitted data was ACKed. */
  	if (!(flag & FLAG_RETRANS_DATA_ACKED))
  		tcp_valid_rtt_meas(sk, seq_rtt);
  }
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3061
  static inline void tcp_ack_update_rtt(struct sock *sk, const int flag,
2d2abbab6   Stephen Hemminger   [TCP]: simplify m...
3062
  				      const s32 seq_rtt)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3063
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3064
  	const struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3065
3066
  	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
  	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
2d2abbab6   Stephen Hemminger   [TCP]: simplify m...
3067
  		tcp_ack_saw_tstamp(sk, flag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3068
  	else if (seq_rtt >= 0)
2d2abbab6   Stephen Hemminger   [TCP]: simplify m...
3069
  		tcp_ack_no_tstamp(sk, seq_rtt, flag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3070
  }
c3a05c605   Ilpo Järvinen   [TCP]: Cong.ctrl ...
3071
  /* Invoke the cong_avoid hook of the socket's congestion-control ops, then
   * stamp snd_cwnd_stamp with the current TCP timestamp.
   */
  static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
  {
  	struct tcp_sock *tp = tcp_sk(sk);

  	inet_csk(sk)->icsk_ca_ops->cong_avoid(sk, ack, in_flight);
  	tp->snd_cwnd_stamp = tcp_time_stamp;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3077
3078
3079
  /* Restart timer after forward progress on connection.
   * RFC2988 recommends to restart timer to now+rto.
   */
6728e7dc3   Ilpo Järvinen   [TCP]: Rename tcp...
3080
  static void tcp_rearm_rto(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3081
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
3082
  	const struct tcp_sock *tp = tcp_sk(sk);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
3083

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3084
  	if (!tp->packets_out) {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3085
  		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3086
  	} else {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3087
3088
  		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
  					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3089
3090
  	}
  }
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3091
  /* If we get here, the whole TSO packet has not been acked. */
13fcf850c   Ilpo Järvinen   [TCP]: Move accou...
3092
  static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3093
3094
  {
  	struct tcp_sock *tp = tcp_sk(sk);
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3095
  	u32 packets_acked;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3096

7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3097
  	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3098
3099
  
  	packets_acked = tcp_skb_pcount(skb);
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3100
  	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3101
3102
3103
3104
  		return 0;
  	packets_acked -= tcp_skb_pcount(skb);
  
  	if (packets_acked) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3105
  		BUG_ON(tcp_skb_pcount(skb) == 0);
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3106
  		BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3107
  	}
13fcf850c   Ilpo Järvinen   [TCP]: Move accou...
3108
  	return packets_acked;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3109
  }
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3110
3111
3112
3113
  /* Remove acknowledged frames from the retransmission queue. If our packet
   * is before the ack sequence we can discard it as it's confirmed to have
   * arrived at the other end.
   */
33f5f57ee   Ilpo Järvinen   tcp: kill pointle...
3114
3115
  static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
  			       u32 prior_snd_una)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3116
3117
  {
  	struct tcp_sock *tp = tcp_sk(sk);
2d2abbab6   Stephen Hemminger   [TCP]: simplify m...
3118
  	const struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3119
  	struct sk_buff *skb;
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3120
  	u32 now = tcp_time_stamp;
13fcf850c   Ilpo Järvinen   [TCP]: Move accou...
3121
  	int fully_acked = 1;
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3122
  	int flag = 0;
720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3123
  	u32 pkts_acked = 0;
c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3124
  	u32 reord = tp->packets_out;
90638a04a   Ilpo Järvinen   tcp: don't clear ...
3125
  	u32 prior_sacked = tp->sacked_out;
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3126
  	s32 seq_rtt = -1;
2072c228c   Gavin McCullagh   [TCP]: use non-de...
3127
  	s32 ca_seq_rtt = -1;
b9ce204f0   Ilpo Järvinen   [TCP]: Congestion...
3128
  	ktime_t last_ackt = net_invalid_timestamp();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3129

7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3130
  	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
3131
  		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3132
  		u32 acked_pcount;
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3133
  		u8 sacked = scb->sacked;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3134

2072c228c   Gavin McCullagh   [TCP]: use non-de...
3135
  		/* Determine how many packets and what bytes were acked, tso and else */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3136
  		if (after(scb->end_seq, tp->snd_una)) {
13fcf850c   Ilpo Järvinen   [TCP]: Move accou...
3137
3138
3139
  			if (tcp_skb_pcount(skb) == 1 ||
  			    !after(tp->snd_una, scb->seq))
  				break;
720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3140
3141
  			acked_pcount = tcp_tso_acked(sk, skb);
  			if (!acked_pcount)
13fcf850c   Ilpo Järvinen   [TCP]: Move accou...
3142
3143
3144
  				break;
  
  			fully_acked = 0;
13fcf850c   Ilpo Järvinen   [TCP]: Move accou...
3145
  		} else {
720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3146
  			acked_pcount = tcp_skb_pcount(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3147
  		}
89d478f7f   Ilpo Järvinen   [TCP]: Remove dup...
3148
3149
  		if (sacked & TCPCB_RETRANS) {
  			if (sacked & TCPCB_SACKED_RETRANS)
720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3150
  				tp->retrans_out -= acked_pcount;
89d478f7f   Ilpo Järvinen   [TCP]: Remove dup...
3151
3152
3153
  			flag |= FLAG_RETRANS_DATA_ACKED;
  			ca_seq_rtt = -1;
  			seq_rtt = -1;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3154
  			if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
89d478f7f   Ilpo Järvinen   [TCP]: Remove dup...
3155
  				flag |= FLAG_NONHEAD_RETRANS_ACKED;
c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3156
  		} else {
2072c228c   Gavin McCullagh   [TCP]: use non-de...
3157
3158
  			ca_seq_rtt = now - scb->when;
  			last_ackt = skb->tstamp;
c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3159
  			if (seq_rtt < 0) {
2072c228c   Gavin McCullagh   [TCP]: use non-de...
3160
  				seq_rtt = ca_seq_rtt;
c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3161
  			}
89d478f7f   Ilpo Järvinen   [TCP]: Remove dup...
3162
  			if (!(sacked & TCPCB_SACKED_ACKED))
720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3163
  				reord = min(pkts_acked, reord);
2d2abbab6   Stephen Hemminger   [TCP]: simplify m...
3164
  		}
89d478f7f   Ilpo Järvinen   [TCP]: Remove dup...
3165
3166
  
  		if (sacked & TCPCB_SACKED_ACKED)
720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3167
  			tp->sacked_out -= acked_pcount;
89d478f7f   Ilpo Järvinen   [TCP]: Remove dup...
3168
  		if (sacked & TCPCB_LOST)
720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3169
  			tp->lost_out -= acked_pcount;
89d478f7f   Ilpo Järvinen   [TCP]: Remove dup...
3170

720188359   Ilpo Järvinen   [TCP]: Cleanup lo...
3171
3172
  		tp->packets_out -= acked_pcount;
  		pkts_acked += acked_pcount;
13fcf850c   Ilpo Järvinen   [TCP]: Move accou...
3173

009a2e3e4   Ilpo Järvinen   [TCP] FRTO: Impro...
3174
3175
3176
3177
3178
3179
3180
  		/* Initial outgoing SYN's get put onto the write_queue
  		 * just like anything else we transmit.  It is not
  		 * true data, and if we misinform our callers that
  		 * this ACK acks real data, we will erroneously exit
  		 * connection startup slow start one packet too
  		 * quickly.  This is severely frowned upon behavior.
  		 */
4de075e04   Eric Dumazet   tcp: rename tcp_s...
3181
  		if (!(scb->tcp_flags & TCPHDR_SYN)) {
009a2e3e4   Ilpo Järvinen   [TCP] FRTO: Impro...
3182
3183
3184
3185
3186
  			flag |= FLAG_DATA_ACKED;
  		} else {
  			flag |= FLAG_SYN_ACKED;
  			tp->retrans_stamp = 0;
  		}
13fcf850c   Ilpo Järvinen   [TCP]: Move accou...
3187
3188
  		if (!fully_acked)
  			break;
fe067e8ab   David S. Miller   [TCP]: Abstract o...
3189
  		tcp_unlink_write_queue(skb, sk);
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
3190
  		sk_wmem_free_skb(sk, skb);
90638a04a   Ilpo Järvinen   tcp: don't clear ...
3191
  		tp->scoreboard_skb_hint = NULL;
ef9da47c7   Ilpo Järvinen   tcp: don't clear ...
3192
3193
  		if (skb == tp->retransmit_skb_hint)
  			tp->retransmit_skb_hint = NULL;
90638a04a   Ilpo Järvinen   tcp: don't clear ...
3194
3195
  		if (skb == tp->lost_skb_hint)
  			tp->lost_skb_hint = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3196
  	}
33f5f57ee   Ilpo Järvinen   tcp: kill pointle...
3197
3198
  	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
  		tp->snd_up = tp->snd_una;
cadbd0313   Ilpo Järvinen   [TCP]: Dropped un...
3199
3200
  	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
  		flag |= FLAG_SACK_RENEGING;
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3201
  	if (flag & FLAG_ACKED) {
164891aad   Stephen Hemminger   [TCP]: Congestion...
3202
3203
  		const struct tcp_congestion_ops *ca_ops
  			= inet_csk(sk)->icsk_ca_ops;
72211e905   Ilpo Järvinen   tcp: don't check ...
3204
3205
3206
3207
  		if (unlikely(icsk->icsk_mtup.probe_size &&
  			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
  			tcp_mtup_probe_success(sk);
  		}
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3208
  		tcp_ack_update_rtt(sk, flag, seq_rtt);
6728e7dc3   Ilpo Järvinen   [TCP]: Rename tcp...
3209
  		tcp_rearm_rto(sk);
317a76f9a   Stephen Hemminger   [TCP]: Add plugga...
3210

c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3211
3212
3213
  		if (tcp_is_reno(tp)) {
  			tcp_remove_reno_sacks(sk, pkts_acked);
  		} else {
59a08cba6   Ilpo Järvinen   tcp: fix lost_cnt...
3214
  			int delta;
c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3215
3216
3217
  			/* Non-retransmitted hole got filled? That's reordering */
  			if (reord < prior_fackets)
  				tcp_update_reordering(sk, tp->fackets_out - reord, 0);
90638a04a   Ilpo Järvinen   tcp: don't clear ...
3218

59a08cba6   Ilpo Järvinen   tcp: fix lost_cnt...
3219
3220
3221
  			delta = tcp_is_fack(tp) ? pkts_acked :
  						  prior_sacked - tp->sacked_out;
  			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3222
  		}
91fed7a15   Ilpo Järvinen   [TCP]: Make facke...
3223
  		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
68f8353b4   Ilpo Järvinen   [TCP]: Rewrite SA...
3224

30cfd0baf   Stephen Hemminger   [TCP]: congestion...
3225
3226
3227
3228
  		if (ca_ops->pkts_acked) {
  			s32 rtt_us = -1;
  
  			/* Is the ACK triggering packet unambiguous? */
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3229
  			if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
30cfd0baf   Stephen Hemminger   [TCP]: congestion...
3230
3231
3232
3233
3234
3235
  				/* High resolution needed and available? */
  				if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
  				    !ktime_equal(last_ackt,
  						 net_invalid_timestamp()))
  					rtt_us = ktime_us_delta(ktime_get_real(),
  								last_ackt);
febf08198   stephen hemminger   tcp: fix RTT for ...
3236
  				else if (ca_seq_rtt >= 0)
2072c228c   Gavin McCullagh   [TCP]: use non-de...
3237
  					rtt_us = jiffies_to_usecs(ca_seq_rtt);
30cfd0baf   Stephen Hemminger   [TCP]: congestion...
3238
  			}
b9ce204f0   Ilpo Järvinen   [TCP]: Congestion...
3239

30cfd0baf   Stephen Hemminger   [TCP]: congestion...
3240
3241
  			ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3242
3243
3244
  	}
  
  #if FASTRETRANS_DEBUG > 0
547b792ca   Ilpo Järvinen   net: convert BUG_...
3245
3246
3247
  	WARN_ON((int)tp->sacked_out < 0);
  	WARN_ON((int)tp->lost_out < 0);
  	WARN_ON((int)tp->retrans_out < 0);
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
3248
  	if (!tp->packets_out && tcp_is_sack(tp)) {
cfcabdcc2   Stephen Hemminger   [NET]: sparse war...
3249
  		icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3250
3251
3252
  		if (tp->lost_out) {
  			printk(KERN_DEBUG "Leak l=%u %d
  ",
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3253
  			       tp->lost_out, icsk->icsk_ca_state);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3254
3255
3256
3257
3258
  			tp->lost_out = 0;
  		}
  		if (tp->sacked_out) {
  			printk(KERN_DEBUG "Leak s=%u %d
  ",
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3259
  			       tp->sacked_out, icsk->icsk_ca_state);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3260
3261
3262
3263
3264
  			tp->sacked_out = 0;
  		}
  		if (tp->retrans_out) {
  			printk(KERN_DEBUG "Leak r=%u %d
  ",
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3265
  			       tp->retrans_out, icsk->icsk_ca_state);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3266
3267
3268
3269
  			tp->retrans_out = 0;
  		}
  	}
  #endif
7c46a03e6   Ilpo Järvinen   [TCP]: Cleanup tc...
3270
  	return flag;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3271
3272
3273
3274
  }
  
  static void tcp_ack_probe(struct sock *sk)
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3275
3276
  	const struct tcp_sock *tp = tcp_sk(sk);
  	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3277
3278
  
  	/* Was it a usable window open? */
90840defa   Ilpo Järvinen   [TCP]: Introduce ...
3279
  	if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3280
3281
  		icsk->icsk_backoff = 0;
  		inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3282
3283
3284
3285
  		/* Socket must be waked up by subsequent tcp_data_snd_check().
  		 * This function is not for random using!
  		 */
  	} else {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3286
  		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
3f421baa4   Arnaldo Carvalho de Melo   [NET]: Just move ...
3287
3288
  					  min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
  					  TCP_RTO_MAX);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3289
3290
  	}
  }
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3291
  static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3292
  {
a02cec215   Eric Dumazet   net: return opera...
3293
3294
  	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
  		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3295
  }
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3296
  static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3297
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3298
  	const struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3299
  	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3300
  		!((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3301
3302
3303
3304
3305
  }
  
  /* Check that window update is acceptable.
   * The function assumes that snd_una <= ack <= snd_nxt.
   */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3306
3307
3308
  static inline int tcp_may_update_window(const struct tcp_sock *tp,
  					const u32 ack, const u32 ack_seq,
  					const u32 nwin)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3309
  {
a02cec215   Eric Dumazet   net: return opera...
3310
  	return	after(ack, tp->snd_una) ||
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3311
  		after(ack_seq, tp->snd_wl1) ||
a02cec215   Eric Dumazet   net: return opera...
3312
  		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3313
3314
3315
3316
3317
3318
3319
  }
  
  /* Update our send window.
   *
   * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
   * and in FreeBSD. NetBSD's one is even worse.) is wrong.
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
3320
  static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 ack,
  				 u32 ack_seq)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	u32 nwin = ntohs(tcp_hdr(skb)->window);
  	int flag = 0;

  	/* The advertised window is scaled by snd_wscale unless the
  	 * segment carries SYN.
  	 */
  	if (likely(!tcp_hdr(skb)->syn))
  		nwin <<= tp->rx_opt.snd_wscale;

  	if (!tcp_may_update_window(tp, ack, ack_seq, nwin))
  		goto advance_una;

  	flag = FLAG_WIN_UPDATE;
  	tcp_update_wl(tp, ack_seq);

  	if (tp->snd_wnd == nwin)
  		goto advance_una;

  	tp->snd_wnd = nwin;

  	/* Note: this is the only place where the fast path is
  	 * recovered for sending TCP.
  	 */
  	tp->pred_flags = 0;
  	tcp_fast_path_check(sk);

  	/* A new maximum window triggers an MSS re-sync. */
  	if (nwin > tp->max_window) {
  		tp->max_window = nwin;
  		tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie);
  	}

  advance_una:
  	/* snd_una is advanced whether or not the window was updated. */
  	tp->snd_una = ack;

  	return flag;
  }
9ead9a1d3   Ilpo Järvinen   [TCP] FRTO: Separ...
3354
3355
3356
3357
3358
3359
  /* A very conservative spurious RTO response algorithm: reduce cwnd and
   * continue in congestion avoidance.
   */
  static void tcp_conservative_spur_to_response(struct tcp_sock *tp)
  {
  	/* Never leave cwnd above ssthresh. */
  	if (tp->snd_ssthresh < tp->snd_cwnd)
  		tp->snd_cwnd = tp->snd_ssthresh;

  	/* Restart the window-growth accounting from scratch. */
  	tp->bytes_acked = 0;
  	tp->snd_cwnd_cnt = 0;

  	TCP_ECN_queue_cwr(tp);
  	tcp_moderate_cwnd(tp);
  }
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
3365
3366
3367
3368
3369
  /* A conservative spurious RTO response algorithm: reduce cwnd using
   * rate halving and continue in congestion avoidance.
   */
  static void tcp_ratehalving_spur_to_response(struct sock *sk)
  {
  	/* Thin wrapper: the whole response is delegated to tcp_enter_cwr(),
  	 * called with 0 as its second argument.
  	 */
  	tcp_enter_cwr(sk, 0);
  }
e317f6f69   Ilpo Järvinen   [TCP]: FRTO undo ...
3372
  static void tcp_undo_spur_to_response(struct sock *sk, int flag)
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
3373
  {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3374
  	if (flag & FLAG_ECE)
e317f6f69   Ilpo Järvinen   [TCP]: FRTO undo ...
3375
3376
  		tcp_ratehalving_spur_to_response(sk);
  	else
f6152737a   David S. Miller   tcp: Make undo_ss...
3377
  		tcp_undo_cwr(sk, true);
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
3378
  }
30935cf4f   Ilpo Järvinen   [TCP] FRTO: Comme...
3379
3380
  /* F-RTO spurious RTO detection algorithm (RFC4138)
   *
6408d206c   Ilpo Järvinen   [TCP] FRTO: Ignor...
3381
3382
3383
   * F-RTO affects during two new ACKs following RTO (well, almost, see inline
   * comments). State (ACK number) is kept in frto_counter. When ACK advances
   * window (but not to or beyond highest sequence sent before RTO):
30935cf4f   Ilpo Järvinen   [TCP] FRTO: Comme...
3384
3385
3386
3387
3388
   *   On First ACK,  send two new segments out.
   *   On Second ACK, RTO was likely spurious. Do spurious response (response
   *                  algorithm is not part of the F-RTO detection algorithm
   *                  given in RFC4138 but can be selected separately).
   * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
d551e4541   Ilpo Järvinen   [TCP] FRTO: RFC41...
3389
3390
3391
   * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
   * of Nagle, this is done using frto_counter states 2 and 3, when a new data
   * segment of any size sent during F-RTO, state 2 is upgraded to 3.
30935cf4f   Ilpo Järvinen   [TCP] FRTO: Comme...
3392
3393
3394
3395
   *
   * Rationale: if the RTO was spurious, new ACKs should arrive from the
   * original window even after we transmit two new data segments.
   *
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
3396
3397
3398
3399
   * SACK version:
   *   on first step, wait until first cumulative ACK arrives, then move to
   *   the second step. In second step, the next ACK decides.
   *
30935cf4f   Ilpo Järvinen   [TCP] FRTO: Comme...
3400
3401
3402
3403
3404
3405
3406
3407
3408
   * F-RTO is implemented (mainly) in four functions:
   *   - tcp_use_frto() is used to determine if TCP can use F-RTO
   *   - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
   *     called when tcp_use_frto() showed green light
   *   - tcp_process_frto() handles incoming ACKs during F-RTO algorithm
   *   - tcp_enter_frto_loss() is called if there is not enough evidence
   *     to prove that the RTO is indeed spurious. It transfers the control
   *     from F-RTO to the conventional RTO recovery
   */
2e6052941   Ilpo Järvinen   [TCP]: Also handl...
3409
  static int tcp_process_frto(struct sock *sk, int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3410
3411
  {
  	struct tcp_sock *tp = tcp_sk(sk);
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
3412

005903bc3   Ilpo Järvinen   [TCP]: Left out s...
3413
  	tcp_verify_left_out(tp);
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
3414

7487c48c4   Ilpo Järvinen   [TCP] FRTO: Conse...
3415
  	/* Duplicate the behavior from Loss state (fastretrans_alert) */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3416
  	if (flag & FLAG_DATA_ACKED)
7487c48c4   Ilpo Järvinen   [TCP] FRTO: Conse...
3417
  		inet_csk(sk)->icsk_retransmits = 0;
009a2e3e4   Ilpo Järvinen   [TCP] FRTO: Impro...
3418
3419
3420
  	if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
  	    ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
  		tp->undo_marker = 0;
95c4922bf   Ilpo Järvinen   [TCP] FRTO: fixes...
3421
  	if (!before(tp->snd_una, tp->frto_highmark)) {
d551e4541   Ilpo Järvinen   [TCP] FRTO: RFC41...
3422
  		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
7c9a4a5b6   Ilpo Järvinen   [TCP]: Prevent un...
3423
  		return 1;
95c4922bf   Ilpo Järvinen   [TCP] FRTO: fixes...
3424
  	}
62ab22278   Ilpo Järvinen   tcp FRTO: SACK va...
3425
  	if (!tcp_is_sackfrto(tp)) {
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
3426
3427
3428
3429
  		/* RFC4138 shortcoming in step 2; should also have case c):
  		 * ACK isn't duplicate nor advances window, e.g., opposite dir
  		 * data, winupdate
  		 */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3430
  		if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP))
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
3431
  			return 1;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3432
  		if (!(flag & FLAG_DATA_ACKED)) {
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
3433
3434
3435
3436
3437
  			tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
  					    flag);
  			return 1;
  		}
  	} else {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3438
  		if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
3439
3440
3441
3442
3443
  			/* Prevent sending of new data. */
  			tp->snd_cwnd = min(tp->snd_cwnd,
  					   tcp_packets_in_flight(tp));
  			return 1;
  		}
6408d206c   Ilpo Järvinen   [TCP] FRTO: Ignor...
3444

d551e4541   Ilpo Järvinen   [TCP] FRTO: RFC41...
3445
  		if ((tp->frto_counter >= 2) &&
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3446
3447
3448
  		    (!(flag & FLAG_FORWARD_PROGRESS) ||
  		     ((flag & FLAG_DATA_SACKED) &&
  		      !(flag & FLAG_ONLY_ORIG_SACKED)))) {
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
3449
  			/* RFC4138 shortcoming (see comment above) */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3450
3451
  			if (!(flag & FLAG_FORWARD_PROGRESS) &&
  			    (flag & FLAG_NOT_DUP))
4dc2665e3   Ilpo Järvinen   [TCP]: SACK enhan...
3452
3453
3454
3455
3456
  				return 1;
  
  			tcp_enter_frto_loss(sk, 3, flag);
  			return 1;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3457
3458
3459
  	}
  
  	if (tp->frto_counter == 1) {
3e6f049e0   Ilpo Järvinen   [TCP] FRTO: Use o...
3460
  		/* tcp_may_send_now needs to see updated state */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3461
  		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
94d0ea778   Ilpo Järvinen   [TCP] FRTO: frto_...
3462
  		tp->frto_counter = 2;
3e6f049e0   Ilpo Järvinen   [TCP] FRTO: Use o...
3463
3464
3465
  
  		if (!tcp_may_send_now(sk))
  			tcp_enter_frto_loss(sk, 2, flag);
7c9a4a5b6   Ilpo Järvinen   [TCP]: Prevent un...
3466
  		return 1;
d551e4541   Ilpo Järvinen   [TCP] FRTO: RFC41...
3467
  	} else {
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
3468
3469
  		switch (sysctl_tcp_frto_response) {
  		case 2:
e317f6f69   Ilpo Järvinen   [TCP]: FRTO undo ...
3470
  			tcp_undo_spur_to_response(sk, flag);
3cfe3baaf   Ilpo Järvinen   [TCP]: Add two ne...
3471
3472
3473
3474
3475
3476
3477
  			break;
  		case 1:
  			tcp_conservative_spur_to_response(tp);
  			break;
  		default:
  			tcp_ratehalving_spur_to_response(sk);
  			break;
3ff50b799   Stephen Hemminger   [NET]: cleanup ex...
3478
  		}
94d0ea778   Ilpo Järvinen   [TCP] FRTO: frto_...
3479
  		tp->frto_counter = 0;
009a2e3e4   Ilpo Järvinen   [TCP] FRTO: Impro...
3480
  		tp->undo_marker = 0;
de0744af1   Pavel Emelyanov   mib: add net to N...
3481
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3482
  	}
7c9a4a5b6   Ilpo Järvinen   [TCP]: Prevent un...
3483
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3484
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3485
  /* This routine deals with incoming acks, but not outgoing ones. */
cf533ea53   Eric Dumazet   tcp: add const qu...
3486
  static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3487
  {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3488
  	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3489
3490
3491
3492
  	struct tcp_sock *tp = tcp_sk(sk);
  	u32 prior_snd_una = tp->snd_una;
  	u32 ack_seq = TCP_SKB_CB(skb)->seq;
  	u32 ack = TCP_SKB_CB(skb)->ack_seq;
7d2b55f80   Neal Cardwell   tcp: make is_dupa...
3493
  	bool is_dupack = false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3494
  	u32 prior_in_flight;
c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3495
  	u32 prior_fackets;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3496
  	int prior_packets;
a262f0cdf   Nandita Dukkipati   Proportional Rate...
3497
  	int prior_sacked = tp->sacked_out;
7d2b55f80   Neal Cardwell   tcp: make is_dupa...
3498
  	int pkts_acked = 0;
a262f0cdf   Nandita Dukkipati   Proportional Rate...
3499
  	int newly_acked_sacked = 0;
7c9a4a5b6   Ilpo Järvinen   [TCP]: Prevent un...
3500
  	int frto_cwnd = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3501

96e0bf4b5   John Dykstra   tcp: Discard segm...
3502
  	/* If the ack is older than previous acks
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3503
3504
  	 * then we can probably ignore it.
  	 */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3505
3506
  	if (before(ack, prior_snd_una))
  		goto old_ack;
96e0bf4b5   John Dykstra   tcp: Discard segm...
3507
3508
3509
3510
3511
  	/* If the ack includes data we haven't sent yet, discard
  	 * this segment (RFC793 Section 3.9).
  	 */
  	if (after(ack, tp->snd_nxt))
  		goto invalid_ack;
2e6052941   Ilpo Järvinen   [TCP]: Also handl...
3512
3513
  	if (after(ack, prior_snd_una))
  		flag |= FLAG_SND_UNA_ADVANCED;
3fdf3f0c9   Daikichi Osuga   [TCP]: Two RFC346...
3514
3515
3516
3517
3518
  	if (sysctl_tcp_abc) {
  		if (icsk->icsk_ca_state < TCP_CA_CWR)
  			tp->bytes_acked += ack - prior_snd_una;
  		else if (icsk->icsk_ca_state == TCP_CA_Loss)
  			/* we assume just one segment left network */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3519
3520
  			tp->bytes_acked += min(ack - prior_snd_una,
  					       tp->mss_cache);
3fdf3f0c9   Daikichi Osuga   [TCP]: Two RFC346...
3521
  	}
9772efb97   Stephen Hemminger   [TCP]: Appropriat...
3522

c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3523
  	prior_fackets = tp->fackets_out;
52d340815   Ilpo Järvinen   [TCP]: Move prior...
3524
  	prior_in_flight = tcp_packets_in_flight(tp);
c7caf8d3e   Ilpo Järvinen   [TCP]: Fix reord ...
3525

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3526
  	if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3527
3528
3529
3530
  		/* Window is constant, pure forward advance.
  		 * No more checks are required.
  		 * Note, we use the fact that SND.UNA>=SND.WL2.
  		 */
ee7537b63   Hantzis Fotis   tcp: tcp_init_wl ...
3531
  		tcp_update_wl(tp, ack_seq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3532
  		tp->snd_una = ack;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3533
  		flag |= FLAG_WIN_UPDATE;
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3534
  		tcp_ca_event(sk, CA_EVENT_FAST_ACK);
317a76f9a   Stephen Hemminger   [TCP]: Add plugga...
3535

de0744af1   Pavel Emelyanov   mib: add net to N...
3536
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3537
3538
3539
3540
  	} else {
  		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
  			flag |= FLAG_DATA;
  		else
de0744af1   Pavel Emelyanov   mib: add net to N...
3541
  			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3542

9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
3543
  		flag |= tcp_ack_update_window(sk, skb, ack, ack_seq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3544
3545
3546
  
  		if (TCP_SKB_CB(skb)->sacked)
  			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
aa8223c7b   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
3547
  		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3548
  			flag |= FLAG_ECE;
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3549
  		tcp_ca_event(sk, CA_EVENT_SLOW_ACK);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3550
3551
3552
3553
3554
3555
  	}
  
  	/* We passed data and got it acked, remove any soft error
  	 * log. Something worked...
  	 */
  	sk->sk_err_soft = 0;
4b53fb67e   David S. Miller   tcp: Clear probes...
3556
  	icsk->icsk_probes_out = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3557
3558
3559
3560
  	tp->rcv_tstamp = tcp_time_stamp;
  	prior_packets = tp->packets_out;
  	if (!prior_packets)
  		goto no_queue;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3561
  	/* See if we can take anything off of the retransmit queue. */
33f5f57ee   Ilpo Järvinen   tcp: kill pointle...
3562
  	flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3563

7d2b55f80   Neal Cardwell   tcp: make is_dupa...
3564
  	pkts_acked = prior_packets - tp->packets_out;
a262f0cdf   Nandita Dukkipati   Proportional Rate...
3565
3566
  	newly_acked_sacked = (prior_packets - prior_sacked) -
  			     (tp->packets_out - tp->sacked_out);
e1cd8f78f   Ilpo Järvinen   [TCP] FRTO: Clear...
3567
3568
  	if (tp->frto_counter)
  		frto_cwnd = tcp_process_frto(sk, flag);
3de96471b   Ilpo Järvinen   [TCP]: Wrap-safed...
3569
3570
3571
  	/* Guarantee sacktag reordering detection against wrap-arounds */
  	if (before(tp->frto_highmark, tp->snd_una))
  		tp->frto_highmark = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3572

6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
3573
  	if (tcp_ack_is_dubious(sk, flag)) {
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
3574
  		/* Advance CWND, if state allows this. */
7c9a4a5b6   Ilpo Järvinen   [TCP]: Prevent un...
3575
3576
  		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
  		    tcp_may_raise_cwnd(sk, flag))
c3a05c605   Ilpo Järvinen   [TCP]: Cong.ctrl ...
3577
  			tcp_cong_avoid(sk, ack, prior_in_flight);
7d2b55f80   Neal Cardwell   tcp: make is_dupa...
3578
3579
3580
  		is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
  		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
  				      is_dupack, flag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3581
  	} else {
7c9a4a5b6   Ilpo Järvinen   [TCP]: Prevent un...
3582
  		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
c3a05c605   Ilpo Järvinen   [TCP]: Cong.ctrl ...
3583
  			tcp_cong_avoid(sk, ack, prior_in_flight);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3584
  	}
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3585
  	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
b6c6712a4   Eric Dumazet   net: sk_dst_cache...
3586
  		dst_confirm(__sk_dst_get(sk));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3587
3588
3589
3590
  
  	return 1;
  
  no_queue:
5628adf1a   Neal Cardwell   tcp: use DSACKs t...
3591
3592
3593
3594
  	/* If data was DSACKed, see if we can undo a cwnd reduction. */
  	if (flag & FLAG_DSACKING_ACK)
  		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
  				      is_dupack, flag);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3595
3596
3597
3598
  	/* If this ack opens up a zero window, clear backoff.  It was
  	 * being used to time the probes, and is probably far higher than
  	 * it needs to be for normal retransmission.
  	 */
fe067e8ab   David S. Miller   [TCP]: Abstract o...
3599
  	if (tcp_send_head(sk))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3600
3601
  		tcp_ack_probe(sk);
  	return 1;
96e0bf4b5   John Dykstra   tcp: Discard segm...
3602
3603
3604
3605
  invalid_ack:
  	SOCK_DEBUG(sk, "Ack %u after %u:%u
  ", ack, tp->snd_una, tp->snd_nxt);
  	return -1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3606
  old_ack:
e95ae2f2c   Neal Cardwell   tcp: use SACKs an...
3607
3608
3609
  	/* If data was SACKed, tag it and see if we should send more data.
  	 * If data was DSACKed, see if we can undo a cwnd reduction.
  	 */
8aca6cb11   Ilpo Järvinen   tcp: Fix inconsis...
3610
  	if (TCP_SKB_CB(skb)->sacked) {
e95ae2f2c   Neal Cardwell   tcp: use SACKs an...
3611
3612
3613
3614
  		flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
  		newly_acked_sacked = tp->sacked_out - prior_sacked;
  		tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
  				      is_dupack, flag);
8aca6cb11   Ilpo Järvinen   tcp: Fix inconsis...
3615
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3616

96e0bf4b5   John Dykstra   tcp: Discard segm...
3617
3618
  	SOCK_DEBUG(sk, "Ack %u before %u:%u
  ", ack, tp->snd_una, tp->snd_nxt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3619
3620
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3621
3622
3623
3624
  /* Look for tcp options. Normally only called on SYN and SYNACK packets.
   * But, this can also be called on packets in the established flow when
   * the fast version below fails.
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
3625
3626
  void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx,
  		       const u8 **hvpp, int estab)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3627
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
3628
3629
  	const unsigned char *ptr;
  	const struct tcphdr *th = tcp_hdr(skb);
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3630
  	int length = (th->doff * 4) - sizeof(struct tcphdr);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3631

cf533ea53   Eric Dumazet   tcp: add const qu...
3632
  	ptr = (const unsigned char *)(th + 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3633
  	opt_rx->saw_tstamp = 0;
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
3634
  	while (length > 0) {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3635
  		int opcode = *ptr++;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3636
3637
3638
  		int opsize;
  
  		switch (opcode) {
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3639
3640
3641
3642
3643
3644
3645
3646
  		case TCPOPT_EOL:
  			return;
  		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
  			length--;
  			continue;
  		default:
  			opsize = *ptr++;
  			if (opsize < 2) /* "silly options" */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3647
  				return;
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3648
3649
3650
3651
3652
  			if (opsize > length)
  				return;	/* don't parse partial options */
  			switch (opcode) {
  			case TCPOPT_MSS:
  				if (opsize == TCPOLEN_MSS && th->syn && !estab) {
d3e2ce3bc   Harvey Harrison   net: use get/put_...
3653
  					u16 in_mss = get_unaligned_be16(ptr);
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3654
3655
3656
3657
3658
  					if (in_mss) {
  						if (opt_rx->user_mss &&
  						    opt_rx->user_mss < in_mss)
  							in_mss = opt_rx->user_mss;
  						opt_rx->mss_clamp = in_mss;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3659
  					}
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3660
3661
3662
3663
  				}
  				break;
  			case TCPOPT_WINDOW:
  				if (opsize == TCPOLEN_WINDOW && th->syn &&
bb5b7c112   David S. Miller   tcp: Revert per-r...
3664
  				    !estab && sysctl_tcp_window_scaling) {
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3665
3666
3667
3668
3669
3670
3671
3672
3673
  					__u8 snd_wscale = *(__u8 *)ptr;
  					opt_rx->wscale_ok = 1;
  					if (snd_wscale > 14) {
  						if (net_ratelimit())
  							printk(KERN_INFO "tcp_parse_options: Illegal window "
  							       "scaling value %d >14 received.
  ",
  							       snd_wscale);
  						snd_wscale = 14;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3674
  					}
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3675
3676
3677
3678
3679
3680
  					opt_rx->snd_wscale = snd_wscale;
  				}
  				break;
  			case TCPOPT_TIMESTAMP:
  				if ((opsize == TCPOLEN_TIMESTAMP) &&
  				    ((estab && opt_rx->tstamp_ok) ||
bb5b7c112   David S. Miller   tcp: Revert per-r...
3681
  				     (!estab && sysctl_tcp_timestamps))) {
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3682
  					opt_rx->saw_tstamp = 1;
d3e2ce3bc   Harvey Harrison   net: use get/put_...
3683
3684
  					opt_rx->rcv_tsval = get_unaligned_be32(ptr);
  					opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3685
3686
3687
3688
  				}
  				break;
  			case TCPOPT_SACK_PERM:
  				if (opsize == TCPOLEN_SACK_PERM && th->syn &&
bb5b7c112   David S. Miller   tcp: Revert per-r...
3689
  				    !estab && sysctl_tcp_sack) {
ab56222a3   Vijay Subramanian   tcp: Replace cons...
3690
  					opt_rx->sack_ok = TCP_SACK_SEEN;
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3691
3692
3693
  					tcp_sack_reset(opt_rx);
  				}
  				break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3694

f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3695
3696
3697
3698
3699
3700
3701
  			case TCPOPT_SACK:
  				if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
  				   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
  				   opt_rx->sack_ok) {
  					TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
  				}
  				break;
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
3702
  #ifdef CONFIG_TCP_MD5SIG
f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3703
3704
3705
3706
3707
3708
  			case TCPOPT_MD5SIG:
  				/*
  				 * The MD5 Hash has already been
  				 * checked (see tcp_v{4,6}_do_rcv()).
  				 */
  				break;
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
3709
  #endif
4957faade   William Allen Simpson   TCPCT part 1g: Re...
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
  			case TCPOPT_COOKIE:
  				/* This option is variable length.
  				 */
  				switch (opsize) {
  				case TCPOLEN_COOKIE_BASE:
  					/* not yet implemented */
  					break;
  				case TCPOLEN_COOKIE_PAIR:
  					/* not yet implemented */
  					break;
  				case TCPOLEN_COOKIE_MIN+0:
  				case TCPOLEN_COOKIE_MIN+2:
  				case TCPOLEN_COOKIE_MIN+4:
  				case TCPOLEN_COOKIE_MIN+6:
  				case TCPOLEN_COOKIE_MAX:
  					/* 16-bit multiple */
  					opt_rx->cookie_plus = opsize;
  					*hvpp = ptr;
ccbd6a5a4   Joe Perches   net: Remove unnec...
3728
  					break;
4957faade   William Allen Simpson   TCPCT part 1g: Re...
3729
3730
3731
  				default:
  					/* ignore option */
  					break;
ccbd6a5a4   Joe Perches   net: Remove unnec...
3732
  				}
4957faade   William Allen Simpson   TCPCT part 1g: Re...
3733
  				break;
ccbd6a5a4   Joe Perches   net: Remove unnec...
3734
  			}
3ff50b799   Stephen Hemminger   [NET]: cleanup ex...
3735

f038ac8f9   Ilpo Järvinen   [TCP]: cleanup tc...
3736
3737
  			ptr += opsize-2;
  			length -= opsize;
3ff50b799   Stephen Hemminger   [NET]: cleanup ex...
3738
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3739
3740
  	}
  }
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
3741
  EXPORT_SYMBOL(tcp_parse_options);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3742

cf533ea53   Eric Dumazet   tcp: add const qu...
3743
  static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr *th)
a4356b292   Ilpo Järvinen   tcp: Add tcp_pars...
3744
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
3745
  	const __be32 *ptr = (const __be32 *)(th + 1);
a4356b292   Ilpo Järvinen   tcp: Add tcp_pars...
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
  
  	if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
  			  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
  		tp->rx_opt.saw_tstamp = 1;
  		++ptr;
  		tp->rx_opt.rcv_tsval = ntohl(*ptr);
  		++ptr;
  		tp->rx_opt.rcv_tsecr = ntohl(*ptr);
  		return 1;
  	}
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3758
3759
3760
  /* Fast parse options. This hopes to only see timestamps.
   * If it is wrong it falls back on tcp_parse_options().
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
3761
3762
3763
  static int tcp_fast_parse_options(const struct sk_buff *skb,
  				  const struct tcphdr *th,
  				  struct tcp_sock *tp, const u8 **hvpp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3764
  {
4957faade   William Allen Simpson   TCPCT part 1g: Re...
3765
3766
3767
3768
  	/* In the spirit of fast parsing, compare doff directly to constant
  	 * values.  Because equality is used, short doff can be ignored here.
  	 */
  	if (th->doff == (sizeof(*th) / 4)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3769
3770
3771
  		tp->rx_opt.saw_tstamp = 0;
  		return 0;
  	} else if (tp->rx_opt.tstamp_ok &&
4957faade   William Allen Simpson   TCPCT part 1g: Re...
3772
  		   th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
a4356b292   Ilpo Järvinen   tcp: Add tcp_pars...
3773
  		if (tcp_parse_aligned_timestamp(tp, th))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3774
  			return 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3775
  	}
bb5b7c112   David S. Miller   tcp: Revert per-r...
3776
  	tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3777
3778
  	return 1;
  }
7d5d5525b   YOSHIFUJI Hideaki   tcp md5sig: Share...
3779
3780
3781
3782
  #ifdef CONFIG_TCP_MD5SIG
  /*
   * Parse MD5 Signature option
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
3783
  const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
7d5d5525b   YOSHIFUJI Hideaki   tcp md5sig: Share...
3784
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
3785
3786
  	int length = (th->doff << 2) - sizeof(*th);
  	const u8 *ptr = (const u8 *)(th + 1);
7d5d5525b   YOSHIFUJI Hideaki   tcp md5sig: Share...
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
  
  	/* If the TCP option is too short, we can short cut */
  	if (length < TCPOLEN_MD5SIG)
  		return NULL;
  
  	while (length > 0) {
  		int opcode = *ptr++;
  		int opsize;
  
  		switch(opcode) {
  		case TCPOPT_EOL:
  			return NULL;
  		case TCPOPT_NOP:
  			length--;
  			continue;
  		default:
  			opsize = *ptr++;
  			if (opsize < 2 || opsize > length)
  				return NULL;
  			if (opcode == TCPOPT_MD5SIG)
ba78e2ddc   Dmitry Popov   tcp: no md5sig op...
3807
  				return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
7d5d5525b   YOSHIFUJI Hideaki   tcp md5sig: Share...
3808
3809
3810
3811
3812
3813
  		}
  		ptr += opsize - 2;
  		length -= opsize;
  	}
  	return NULL;
  }
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
3814
  EXPORT_SYMBOL(tcp_parse_md5sig_option);
7d5d5525b   YOSHIFUJI Hideaki   tcp md5sig: Share...
3815
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3816
3817
3818
  static inline void tcp_store_ts_recent(struct tcp_sock *tp)
  {
  	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
9d729f72d   James Morris   [NET]: Convert xt...
3819
  	tp->rx_opt.ts_recent_stamp = get_seconds();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
  }
  
  /* Update ts_recent from the current segment if it carried a timestamp,
   * @seq is not after rcv_wup, and the PAWS check passes.
   */
  static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
  {
  	if (!tp->rx_opt.saw_tstamp)
  		return;

  	if (after(seq, tp->rcv_wup))
  		return;

  	/* PAWS bug workaround wrt. ACK frames, the PAWS discard
  	 * extra check below makes sure this can only happen
  	 * for pure ACK frames.  -DaveM
  	 *
  	 * Not only, also it occurs for expired timestamps.
  	 */
  	if (tcp_paws_check(&tp->rx_opt, 0))
  		tcp_store_ts_recent(tp);
  }
  
  /* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
   *
   * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
   * it can pass through stack. So, the following predicate verifies that
   * this segment is not used for anything but congestion avoidance or
   * fast retransmit. Moreover, we even are able to eliminate most of such
   * second order effects, if we apply some small "replay" window (~RTO)
   * to timestamp space.
   *
   * All these measures still do not guarantee that we reject wrapped ACKs
   * on networks with high bandwidth, when sequence space is recycled fastly,
   * but it guarantees that such events will be very rare and do not affect
   * connection seriously. This doesn't look nice, but alas, PAWS is really
   * buggy extension.
   *
   * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
   * states that events when retransmit arrives after original data are rare.
   * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
   * the biggest problem on large power networks even with minor reordering.
   * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
   * up to bandwidth of 18Gigabit/sec. 8) ]
   */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3858
  static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3859
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
3860
3861
  	const struct tcp_sock *tp = tcp_sk(sk);
  	const struct tcphdr *th = tcp_hdr(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
  	u32 seq = TCP_SKB_CB(skb)->seq;
  	u32 ack = TCP_SKB_CB(skb)->ack_seq;
  
  	return (/* 1. Pure ACK with correct sequence number. */
  		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
  
  		/* 2. ... and duplicate ACK. */
  		ack == tp->snd_una &&
  
  		/* 3. ... and does not update window. */
  		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
  
  		/* 4. ... and sits in replay window. */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3875
  		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3876
  }
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3877
3878
  static inline int tcp_paws_discard(const struct sock *sk,
  				   const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3879
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3880
  	const struct tcp_sock *tp = tcp_sk(sk);
c887e6d2d   Ilpo Järvinen   tcp: consolidate ...
3881
3882
3883
  
  	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
  	       !tcp_disordered_ack(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
  }
  
  /* Check segment sequence number for validity.
   *
   * Segment controls are considered valid, if the segment
   * fits to the window after truncation to the window. Acceptability
   * of data (and SYN, FIN, of course) is checked separately.
   * See tcp_data_queue(), for example.
   *
   * Also, controls (RST is main one) are accepted using RCV.WUP instead
   * of RCV.NXT. Peer still did not advance his SND.UNA when we
   * delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
   * (borrowed from freebsd)
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
3898
  /* Return 1 when the range [@seq, @end_seq] is acceptable: it must not end
   * before rcv_wup and must not start beyond rcv_nxt plus the current
   * receive window.  Return 0 otherwise.
   */
  static inline int tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
  {
  	/* Entirely below the left edge. */
  	if (before(end_seq, tp->rcv_wup))
  		return 0;

  	/* Acceptable unless it starts past the right edge. */
  	return !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
  }
  
  /* Handle a received reset: store the error code matching the current
   * socket state in sk_err, report it unless the socket is dead, and
   * finish the connection with tcp_done().  Nothing is done for a socket
   * already in TCP_CLOSE.
   */
  static void tcp_reset(struct sock *sk)
  {
  	int state = sk->sk_state;
  	int err;

  	if (state == TCP_CLOSE)
  		return;

  	/* We want the right error as BSD sees it (and indeed as we do). */
  	if (state == TCP_SYN_SENT)
  		err = ECONNREFUSED;
  	else if (state == TCP_CLOSE_WAIT)
  		err = EPIPE;
  	else
  		err = ECONNRESET;
  	sk->sk_err = err;

  	/* This barrier is coupled with smp_rmb() in tcp_poll() */
  	smp_wmb();

  	if (!sock_flag(sk, SOCK_DEAD))
  		sk->sk_error_report(sk);

  	tcp_done(sk);
  }
  
  /*
   * 	Process the FIN bit. This now behaves as it is supposed to work
   *	and the FIN takes effect when it is validly part of sequence
   *	space. Not before when we get holes.
   *
   *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
   *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
   *	TIME-WAIT)
   *
   *	If we are in FINWAIT-1, a received FIN indicates simultaneous
   *	close and we go into CLOSING (and later onto TIME-WAIT)
   *
   *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
   */
20c4cb792   Eric Dumazet   tcp: remove unuse...
3943
  static void tcp_fin(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3944
3945
  {
  	struct tcp_sock *tp = tcp_sk(sk);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
3946
  	inet_csk_schedule_ack(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3947
3948
3949
3950
3951
  
  	sk->sk_shutdown |= RCV_SHUTDOWN;
  	sock_set_flag(sk, SOCK_DONE);
  
  	switch (sk->sk_state) {
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3952
3953
3954
3955
3956
3957
  	case TCP_SYN_RECV:
  	case TCP_ESTABLISHED:
  		/* Move to CLOSE_WAIT */
  		tcp_set_state(sk, TCP_CLOSE_WAIT);
  		inet_csk(sk)->icsk_ack.pingpong = 1;
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3958

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3959
3960
3961
3962
3963
3964
3965
3966
3967
  	case TCP_CLOSE_WAIT:
  	case TCP_CLOSING:
  		/* Received a retransmission of the FIN, do
  		 * nothing.
  		 */
  		break;
  	case TCP_LAST_ACK:
  		/* RFC793: Remain in the LAST-ACK state. */
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3968

056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
  	case TCP_FIN_WAIT1:
  		/* This case occurs when a simultaneous close
  		 * happens, we must ack the received FIN and
  		 * enter the CLOSING state.
  		 */
  		tcp_send_ack(sk);
  		tcp_set_state(sk, TCP_CLOSING);
  		break;
  	case TCP_FIN_WAIT2:
  		/* Received a FIN -- send ACK and enter TIME_WAIT. */
  		tcp_send_ack(sk);
  		tcp_time_wait(sk, TCP_TIME_WAIT, 0);
  		break;
  	default:
  		/* Only TCP_LISTEN and TCP_CLOSE are left, in these
  		 * cases we should never reach this piece of code.
  		 */
  		printk(KERN_ERR "%s: Impossible, sk->sk_state=%d
  ",
0dc47877a   Harvey Harrison   net: replace rema...
3988
  		       __func__, sk->sk_state);
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
3989
  		break;
3ff50b799   Stephen Hemminger   [NET]: cleanup ex...
3990
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3991
3992
3993
3994
3995
  
  	/* It _is_ possible, that we have something out-of-order _after_ FIN.
  	 * Probably, we should reset in this case. For now drop them.
  	 */
  	__skb_queue_purge(&tp->out_of_order_queue);
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
3996
  	if (tcp_is_sack(tp))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3997
  		tcp_sack_reset(&tp->rx_opt);
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
3998
  	sk_mem_reclaim(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3999
4000
4001
4002
4003
4004
4005
  
  	if (!sock_flag(sk, SOCK_DEAD)) {
  		sk->sk_state_change(sk);
  
  		/* Do not send POLL_HUP for half duplex close. */
  		if (sk->sk_shutdown == SHUTDOWN_MASK ||
  		    sk->sk_state == TCP_CLOSE)
8d8ad9d7c   Pavel Emelyanov   [NET]: Name magic...
4006
  			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4007
  		else
8d8ad9d7c   Pavel Emelyanov   [NET]: Name magic...
4008
  			sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4009
4010
  	}
  }
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4011
4012
  /* Try to merge the range [@seq, @end_seq] into SACK block @sp.
   *
   * When the range touches or overlaps the block, the block is grown to
   * cover both and 1 is returned.  Otherwise the block is left untouched
   * and 0 is returned.
   */
  static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq,
  				  u32 end_seq)
  {
  	/* Disjoint: the range lies strictly after or strictly before @sp. */
  	if (after(seq, sp->end_seq) || after(sp->start_seq, end_seq))
  		return 0;

  	if (before(seq, sp->start_seq))
  		sp->start_seq = seq;
  	if (after(end_seq, sp->end_seq))
  		sp->end_seq = end_seq;

  	return 1;
  }
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4023
  /* Record [@seq, @end_seq] as the D-SACK block to report and bump the
   * matching MIB counter.  Does nothing unless SACK is in use on the
   * connection and sysctl_tcp_dsack is enabled.
   */
  static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	int mib_idx;

  	if (!tcp_is_sack(tp) || !sysctl_tcp_dsack)
  		return;

  	/* Pick the counter by whether the duplicate starts before rcv_nxt. */
  	mib_idx = before(seq, tp->rcv_nxt) ? LINUX_MIB_TCPDSACKOLDSENT :
  					     LINUX_MIB_TCPDSACKOFOSENT;
  	NET_INC_STATS_BH(sock_net(sk), mib_idx);

  	tp->rx_opt.dsack = 1;
  	tp->duplicate_sack[0].start_seq = seq;
  	tp->duplicate_sack[0].end_seq = end_seq;
  }
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4039
  /* Report [@seq, @end_seq] as duplicate data: grow the pending D-SACK
   * block if one is already set, otherwise start a new one.
   */
  static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
  {
  	struct tcp_sock *tp = tcp_sk(sk);

  	if (tp->rx_opt.dsack) {
  		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
  		return;
  	}

  	tcp_dsack_set(sk, seq, end_seq);
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
4047
  static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4048
4049
4050
4051
4052
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
  	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
de0744af1   Pavel Emelyanov   mib: add net to N...
4053
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
4054
  		tcp_enter_quickack_mode(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4055

bb5b7c112   David S. Miller   tcp: Revert per-r...
4056
  		if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4057
4058
4059
4060
  			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
  
  			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
  				end_seq = tp->rcv_nxt;
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4061
  			tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
  		}
  	}
  
  	tcp_send_ack(sk);
  }
  
  /* These routines update the SACK block as out-of-order packets arrive or
   * in-order packets close up the sequence space.
   */
  static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
  {
  	int this_sack;
  	struct tcp_sack_block *sp = &tp->selective_acks[0];
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4075
  	struct tcp_sack_block *swalk = sp + 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4076
4077
4078
4079
  
  	/* See if the recent change to the first SACK eats into
  	 * or hits the sequence space of other SACK blocks, if so coalesce.
  	 */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4080
  	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks;) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4081
4082
4083
4084
4085
4086
4087
  		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
  			int i;
  
  			/* Zap SWALK, by moving every further SACK up by one slot.
  			 * Decrease num_sacks.
  			 */
  			tp->rx_opt.num_sacks--;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4088
4089
  			for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
  				sp[i] = sp[i + 1];
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4090
4091
4092
4093
4094
  			continue;
  		}
  		this_sack++, swalk++;
  	}
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4095
4096
4097
4098
4099
4100
4101
4102
4103
  /* Account for a new out-of-order range [@seq, @end_seq] in the SACK
   * array.  The block describing the range always ends up in slot 0:
   * either an existing block is extended and rotated to the front, or a
   * fresh block is inserted there and the others are shifted down.
   */
  static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct tcp_sack_block *blocks = &tp->selective_acks[0];
  	int count = tp->rx_opt.num_sacks;
  	int i;

  	for (i = 0; i < count; i++) {
  		if (!tcp_sack_extend(&blocks[i], seq, end_seq))
  			continue;

  		/* Rotate the extended block to the first slot. */
  		while (i > 0) {
  			swap(blocks[i], blocks[i - 1]);
  			i--;
  		}
  		if (count > 1)
  			tcp_sack_maybe_coalesce(tp);
  		return;
  	}

  	/* No adjacent block was found (i == count here), so build a new
  	 * one at the front.  If the array is full, forget the last block.
  	 */
  	if (i >= TCP_NUM_SACKS) {
  		i--;
  		tp->rx_opt.num_sacks--;
  	}

  	/* Shift the surviving blocks down by one; a no-op when empty. */
  	for (; i > 0; i--)
  		blocks[i] = blocks[i - 1];

  	/* Build the new head SACK, and we're done. */
  	blocks[0].start_seq = seq;
  	blocks[0].end_seq = end_seq;
  	tp->rx_opt.num_sacks++;
  }
  
  /* RCV.NXT advances, some SACKs should be eaten. */
  
  static void tcp_sack_remove(struct tcp_sock *tp)
  {
  	struct tcp_sack_block *sp = &tp->selective_acks[0];
  	int num_sacks = tp->rx_opt.num_sacks;
  	int this_sack;
  
  	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
b03efcfb2   David S. Miller   [NET]: Transform ...
4145
  	if (skb_queue_empty(&tp->out_of_order_queue)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4146
  		tp->rx_opt.num_sacks = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4147
4148
  		return;
  	}
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4149
  	for (this_sack = 0; this_sack < num_sacks;) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4150
4151
4152
4153
4154
  		/* Check if the start of the sack is covered by RCV.NXT. */
  		if (!before(tp->rcv_nxt, sp->start_seq)) {
  			int i;
  
  			/* RCV.NXT must cover all the block! */
547b792ca   Ilpo Järvinen   net: convert BUG_...
4155
  			WARN_ON(before(tp->rcv_nxt, sp->end_seq));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
  
  			/* Zap this SACK, by moving forward any other SACKS. */
  			for (i=this_sack+1; i < num_sacks; i++)
  				tp->selective_acks[i-1] = tp->selective_acks[i];
  			num_sacks--;
  			continue;
  		}
  		this_sack++;
  		sp++;
  	}
5861f8e58   Ilpo Järvinen   tcp: remove point...
4166
  	tp->rx_opt.num_sacks = num_sacks;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
  }
  
  /* This one checks to see if we can put data from the
   * out_of_order queue into the receive_queue.
   */
  static void tcp_ofo_queue(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	__u32 dsack_high = tp->rcv_nxt;
  	struct sk_buff *skb;
  
  	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
  		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
  			break;
  
  		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
  			__u32 dsack = dsack_high;
  			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
  				dsack_high = TCP_SKB_CB(skb)->end_seq;
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4186
  			tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4187
4188
4189
  		}
  
  		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
b13833805   Frans Pop   net: remove trail...
4190
4191
  			SOCK_DEBUG(sk, "ofo packet was already received
  ");
8728b834b   David S. Miller   [NET]: Kill skb->...
4192
  			__skb_unlink(skb, &tp->out_of_order_queue);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4193
4194
4195
4196
4197
4198
4199
  			__kfree_skb(skb);
  			continue;
  		}
  		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X
  ",
  			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
  			   TCP_SKB_CB(skb)->end_seq);
8728b834b   David S. Miller   [NET]: Kill skb->...
4200
  		__skb_unlink(skb, &tp->out_of_order_queue);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4201
4202
  		__skb_queue_tail(&sk->sk_receive_queue, skb);
  		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
aa8223c7b   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
4203
  		if (tcp_hdr(skb)->fin)
20c4cb792   Eric Dumazet   tcp: remove unuse...
4204
  			tcp_fin(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4205
4206
  	}
  }
56f367bbf   Vitaliy Gusev   [TCP]: Add return...
4207
  /* Declarations of the two pruning helpers called by
   * tcp_try_rmem_schedule() below; their bodies are not part of this
   * section of the file.
   */
  static int tcp_prune_ofo_queue(struct sock *sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4208
  static int tcp_prune_queue(struct sock *sk);
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4209
4210
4211
4212
4213
4214
4215
4216
4217
  /* Try to get receive memory for @size bytes on @sk, pruning the queues
   * in two stages if the first attempt does not succeed.
   *
   * Returns 0 when the memory could be scheduled, -1 otherwise.
   */
  static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
  {
  	/* Fast path: within the receive buffer limit and schedulable. */
  	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
  	    sk_rmem_schedule(sk, size))
  		return 0;

  	/* First stage: general prune, then retry. */
  	if (tcp_prune_queue(sk) < 0)
  		return -1;
  	if (sk_rmem_schedule(sk, size))
  		return 0;

  	/* Second stage: prune the out-of-order queue; a zero return
  	 * from it is treated as failure.
  	 */
  	if (!tcp_prune_ofo_queue(sk))
  		return -1;

  	return sk_rmem_schedule(sk, size) ? 0 : -1;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4226
4227
  static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
4228
  	const struct tcphdr *th = tcp_hdr(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4229
4230
4231
4232
4233
  	struct tcp_sock *tp = tcp_sk(sk);
  	int eaten = -1;
  
  	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
  		goto drop;
f84af32cb   Eric Dumazet   net: ip_queue_rcv...
4234
  	skb_dst_drop(skb);
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4235
  	__skb_pull(skb, th->doff * 4);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4236
4237
  
  	TCP_ECN_accept_cwr(tp, skb);
5861f8e58   Ilpo Järvinen   tcp: remove point...
4238
  	tp->rx_opt.dsack = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
  
  	/*  Queue data for delivery to the user.
  	 *  Packets in sequence go to the receive queue.
  	 *  Out of sequence packets to the out_of_order_queue.
  	 */
  	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
  		if (tcp_receive_window(tp) == 0)
  			goto out_of_window;
  
  		/* Ok. In sequence. In window. */
  		if (tp->ucopy.task == current &&
  		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
  		    sock_owned_by_user(sk) && !tp->urg_data) {
  			int chunk = min_t(unsigned int, skb->len,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4253
  					  tp->ucopy.len);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4254
4255
4256
4257
4258
4259
4260
  
  			__set_current_state(TASK_RUNNING);
  
  			local_bh_enable();
  			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
  				tp->ucopy.len -= chunk;
  				tp->copied_seq += chunk;
44f5324b5   Jerry Chu   TCP: fix a bug th...
4261
  				eaten = (chunk == skb->len);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4262
4263
4264
4265
4266
4267
4268
4269
  				tcp_rcv_space_adjust(sk);
  			}
  			local_bh_disable();
  		}
  
  		if (eaten <= 0) {
  queue_and_out:
  			if (eaten < 0 &&
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4270
4271
  			    tcp_try_rmem_schedule(sk, skb->truesize))
  				goto drop;
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
4272
  			skb_set_owner_r(skb, sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4273
4274
4275
  			__skb_queue_tail(&sk->sk_receive_queue, skb);
  		}
  		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
4276
  		if (skb->len)
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
4277
  			tcp_event_data_recv(sk, skb);
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
4278
  		if (th->fin)
20c4cb792   Eric Dumazet   tcp: remove unuse...
4279
  			tcp_fin(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4280

b03efcfb2   David S. Miller   [NET]: Transform ...
4281
  		if (!skb_queue_empty(&tp->out_of_order_queue)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4282
4283
4284
4285
4286
  			tcp_ofo_queue(sk);
  
  			/* RFC2581. 4.2. SHOULD send immediate ACK, when
  			 * gap in queue is filled.
  			 */
b03efcfb2   David S. Miller   [NET]: Transform ...
4287
  			if (skb_queue_empty(&tp->out_of_order_queue))
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
4288
  				inet_csk(sk)->icsk_ack.pingpong = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4289
4290
4291
4292
  		}
  
  		if (tp->rx_opt.num_sacks)
  			tcp_sack_remove(tp);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
4293
  		tcp_fast_path_check(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
  
  		if (eaten > 0)
  			__kfree_skb(skb);
  		else if (!sock_flag(sk, SOCK_DEAD))
  			sk->sk_data_ready(sk, 0);
  		return;
  	}
  
  	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
  		/* A retransmit, 2nd most common case.  Force an immediate ack. */
de0744af1   Pavel Emelyanov   mib: add net to N...
4304
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4305
  		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4306
4307
  
  out_of_window:
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
4308
4309
  		tcp_enter_quickack_mode(sk);
  		inet_csk_schedule_ack(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4310
4311
4312
4313
4314
4315
4316
4317
  drop:
  		__kfree_skb(skb);
  		return;
  	}
  
  	/* Out of window. F.e. zero window probe. */
  	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
  		goto out_of_window;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
4318
  	tcp_enter_quickack_mode(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4319
4320
4321
4322
4323
4324
4325
  
  	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
  		/* Partial packet, seq < rcv_next < end_seq */
  		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X
  ",
  			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
  			   TCP_SKB_CB(skb)->end_seq);
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4326
  		tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
4327

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4328
4329
4330
4331
4332
4333
4334
4335
4336
  		/* If window is closed, drop tail of packet. But after
  		 * remembering D-SACK for its head made in previous line.
  		 */
  		if (!tcp_receive_window(tp))
  			goto out_of_window;
  		goto queue_and_out;
  	}
  
  	TCP_ECN_check_ce(tp, skb);
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4337
4338
  	if (tcp_try_rmem_schedule(sk, skb->truesize))
  		goto drop;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4339
4340
4341
  
  	/* Disable header prediction. */
  	tp->pred_flags = 0;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
4342
  	inet_csk_schedule_ack(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4343
4344
4345
4346
  
  	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X
  ",
  		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
4347
  	skb_set_owner_r(skb, sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4348
4349
4350
  
  	if (!skb_peek(&tp->out_of_order_queue)) {
  		/* Initial out of order segment, build 1 SACK. */
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
4351
  		if (tcp_is_sack(tp)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4352
  			tp->rx_opt.num_sacks = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4353
4354
4355
4356
  			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
  			tp->selective_acks[0].end_seq =
  						TCP_SKB_CB(skb)->end_seq;
  		}
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4357
  		__skb_queue_head(&tp->out_of_order_queue, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4358
  	} else {
915219441   David S. Miller   tcp: Use SKB queu...
4359
  		struct sk_buff *skb1 = skb_peek_tail(&tp->out_of_order_queue);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4360
4361
4362
4363
  		u32 seq = TCP_SKB_CB(skb)->seq;
  		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
  
  		if (seq == TCP_SKB_CB(skb1)->end_seq) {
7de6c0333   Gerrit Renker   [SKB]: __skb_appe...
4364
  			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
  
  			if (!tp->rx_opt.num_sacks ||
  			    tp->selective_acks[0].end_seq != seq)
  				goto add_sack;
  
  			/* Common case: data arrive in order after hole. */
  			tp->selective_acks[0].end_seq = end_seq;
  			return;
  		}
  
  		/* Find place to insert this segment. */
915219441   David S. Miller   tcp: Use SKB queu...
4376
  		while (1) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4377
4378
  			if (!after(TCP_SKB_CB(skb1)->seq, seq))
  				break;
915219441   David S. Miller   tcp: Use SKB queu...
4379
4380
4381
4382
4383
4384
  			if (skb_queue_is_first(&tp->out_of_order_queue, skb1)) {
  				skb1 = NULL;
  				break;
  			}
  			skb1 = skb_queue_prev(&tp->out_of_order_queue, skb1);
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4385
4386
  
  		/* Do skb overlap to previous one? */
915219441   David S. Miller   tcp: Use SKB queu...
4387
  		if (skb1 && before(seq, TCP_SKB_CB(skb1)->end_seq)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4388
4389
4390
  			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
  				/* All the bits are present. Drop. */
  				__kfree_skb(skb);
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4391
  				tcp_dsack_set(sk, seq, end_seq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4392
4393
4394
4395
  				goto add_sack;
  			}
  			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
  				/* Partial overlap. */
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4396
  				tcp_dsack_set(sk, seq,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4397
  					      TCP_SKB_CB(skb1)->end_seq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4398
  			} else {
915219441   David S. Miller   tcp: Use SKB queu...
4399
4400
4401
4402
4403
4404
4405
  				if (skb_queue_is_first(&tp->out_of_order_queue,
  						       skb1))
  					skb1 = NULL;
  				else
  					skb1 = skb_queue_prev(
  						&tp->out_of_order_queue,
  						skb1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4406
4407
  			}
  		}
915219441   David S. Miller   tcp: Use SKB queu...
4408
4409
4410
4411
  		if (!skb1)
  			__skb_queue_head(&tp->out_of_order_queue, skb);
  		else
  			__skb_queue_after(&tp->out_of_order_queue, skb1, skb);
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
4412

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4413
  		/* And clean segments covered by new one as whole. */
2df9001ed   Ilpo Järvinen   tcp: fix loop in ...
4414
4415
  		while (!skb_queue_is_last(&tp->out_of_order_queue, skb)) {
  			skb1 = skb_queue_next(&tp->out_of_order_queue, skb);
915219441   David S. Miller   tcp: Use SKB queu...
4416

2df9001ed   Ilpo Järvinen   tcp: fix loop in ...
4417
4418
4419
  			if (!after(end_seq, TCP_SKB_CB(skb1)->seq))
  				break;
  			if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
1ed834655   Pavel Emelyanov   tcp: replace tcp_...
4420
  				tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
2df9001ed   Ilpo Järvinen   tcp: fix loop in ...
4421
4422
  						 end_seq);
  				break;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4423
  			}
2df9001ed   Ilpo Järvinen   tcp: fix loop in ...
4424
4425
4426
4427
  			__skb_unlink(skb1, &tp->out_of_order_queue);
  			tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
  					 TCP_SKB_CB(skb1)->end_seq);
  			__kfree_skb(skb1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4428
4429
4430
  		}
  
  add_sack:
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
4431
  		if (tcp_is_sack(tp))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4432
4433
4434
  			tcp_sack_new_ofo_skb(sk, seq, end_seq);
  	}
  }
2cf46637b   Ilpo Järvinen   tcp: Add tcp_coll...
4435
4436
4437
  static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
  					struct sk_buff_head *list)
  {
915219441   David S. Miller   tcp: Use SKB queu...
4438
4439
4440
4441
  	struct sk_buff *next = NULL;
  
  	if (!skb_queue_is_last(list, skb))
  		next = skb_queue_next(list, skb);
2cf46637b   Ilpo Järvinen   tcp: Add tcp_coll...
4442
4443
4444
4445
4446
4447
4448
  
  	__skb_unlink(skb, list);
  	__kfree_skb(skb);
  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED);
  
  	return next;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4449
4450
  /* Collapse contiguous sequence of skbs head..tail with
   * sequence numbers start..end.
915219441   David S. Miller   tcp: Use SKB queu...
4451
4452
4453
   *
   * If tail is NULL, this means until the end of the list.
   *
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4454
4455
4456
4457
   * Segments with FIN/SYN are not collapsed (only because this
   * simplifies code)
   */
  static void
8728b834b   David S. Miller   [NET]: Kill skb->...
4458
4459
4460
  tcp_collapse(struct sock *sk, struct sk_buff_head *list,
  	     struct sk_buff *head, struct sk_buff *tail,
  	     u32 start, u32 end)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4461
  {
915219441   David S. Miller   tcp: Use SKB queu...
4462
4463
  	struct sk_buff *skb, *n;
  	bool end_of_skbs;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4464

caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
4465
  	/* First, check that queue is collapsible and find
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4466
  	 * the point where collapsing can be useful. */
915219441   David S. Miller   tcp: Use SKB queu...
4467
4468
4469
4470
4471
4472
  	skb = head;
  restart:
  	end_of_skbs = true;
  	skb_queue_walk_from_safe(list, skb, n) {
  		if (skb == tail)
  			break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4473
4474
  		/* No new bits? It is possible on ofo queue. */
  		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
2cf46637b   Ilpo Järvinen   tcp: Add tcp_coll...
4475
  			skb = tcp_collapse_one(sk, skb, list);
915219441   David S. Miller   tcp: Use SKB queu...
4476
4477
4478
  			if (!skb)
  				break;
  			goto restart;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4479
4480
4481
4482
4483
4484
4485
  		}
  
  		/* The first skb to collapse is:
  		 * - not SYN/FIN and
  		 * - bloated or contains data before "start" or
  		 *   overlaps to the next one.
  		 */
aa8223c7b   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
4486
  		if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4487
  		    (tcp_win_from_space(skb->truesize) > skb->len ||
915219441   David S. Miller   tcp: Use SKB queu...
4488
4489
  		     before(TCP_SKB_CB(skb)->seq, start))) {
  			end_of_skbs = false;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4490
  			break;
915219441   David S. Miller   tcp: Use SKB queu...
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
  		}
  
  		if (!skb_queue_is_last(list, skb)) {
  			struct sk_buff *next = skb_queue_next(list, skb);
  			if (next != tail &&
  			    TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(next)->seq) {
  				end_of_skbs = false;
  				break;
  			}
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4501
4502
4503
  
  		/* Decided to skip this, advance start seq. */
  		start = TCP_SKB_CB(skb)->end_seq;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4504
  	}
915219441   David S. Miller   tcp: Use SKB queu...
4505
  	if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4506
4507
4508
4509
  		return;
  
  	while (before(start, end)) {
  		struct sk_buff *nskb;
c2636b4d9   Chuck Lever   [NET]: Treat the ...
4510
  		unsigned int header = skb_headroom(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4511
4512
4513
4514
4515
  		int copy = SKB_MAX_ORDER(header, 0);
  
  		/* Too big header? This can happen with IPv6. */
  		if (copy < 0)
  			return;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4516
4517
4518
  		if (end - start < copy)
  			copy = end - start;
  		nskb = alloc_skb(copy + header, GFP_ATOMIC);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4519
4520
  		if (!nskb)
  			return;
c51957daf   Arnaldo Carvalho de Melo   [TCP]: Do the lay...
4521

98e399f82   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
4522
  		skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head);
9c70220b7   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
4523
4524
4525
4526
  		skb_set_network_header(nskb, (skb_network_header(skb) -
  					      skb->head));
  		skb_set_transport_header(nskb, (skb_transport_header(skb) -
  						skb->head));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4527
4528
  		skb_reserve(nskb, header);
  		memcpy(nskb->head, skb->head, header);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4529
4530
  		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
  		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
43f59c893   David S. Miller   net: Remove __skb...
4531
  		__skb_queue_before(list, skb, nskb);
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
4532
  		skb_set_owner_r(nskb, sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4533
4534
4535
4536
4537
  
  		/* Copy data, releasing collapsed skbs. */
  		while (copy > 0) {
  			int offset = start - TCP_SKB_CB(skb)->seq;
  			int size = TCP_SKB_CB(skb)->end_seq - start;
09a626600   Kris Katterjohn   [NET]: Change som...
4538
  			BUG_ON(offset < 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4539
4540
4541
4542
4543
4544
4545
4546
4547
  			if (size > 0) {
  				size = min(copy, size);
  				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
  					BUG();
  				TCP_SKB_CB(nskb)->end_seq += size;
  				copy -= size;
  				start += size;
  			}
  			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
2cf46637b   Ilpo Järvinen   tcp: Add tcp_coll...
4548
  				skb = tcp_collapse_one(sk, skb, list);
915219441   David S. Miller   tcp: Use SKB queu...
4549
4550
  				if (!skb ||
  				    skb == tail ||
aa8223c7b   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
4551
4552
  				    tcp_hdr(skb)->syn ||
  				    tcp_hdr(skb)->fin)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
  					return;
  			}
  		}
  	}
  }
  
  /* Collapse the out-of-order queue: walk it, group skbs into runs whose
   * sequence ranges touch or overlap, and hand each run to tcp_collapse()
   * until the whole queue has been processed.
   */
  static void tcp_collapse_ofo_queue(struct sock *sk)
  {
  	struct sk_buff_head *ofo = &tcp_sk(sk)->out_of_order_queue;
  	struct sk_buff *cur = skb_peek(ofo);
  	struct sk_buff *run_head = cur;
  	u32 start, end;

  	if (!cur)
  		return;

  	start = TCP_SKB_CB(cur)->seq;
  	end = TCP_SKB_CB(cur)->end_seq;

  	while (1) {
  		/* Step to the next skb; NULL once the queue is exhausted. */
  		cur = skb_queue_is_last(ofo, cur) ?
  		      NULL : skb_queue_next(ofo, cur);

  		if (cur &&
  		    !after(TCP_SKB_CB(cur)->seq, end) &&
  		    !before(TCP_SKB_CB(cur)->end_seq, start)) {
  			/* Still part of the current run: widen its range. */
  			if (before(TCP_SKB_CB(cur)->seq, start))
  				start = TCP_SKB_CB(cur)->seq;
  			if (after(TCP_SKB_CB(cur)->end_seq, end))
  				end = TCP_SKB_CB(cur)->end_seq;
  			continue;
  		}

  		/* A gap or the end of the queue terminates the run. */
  		tcp_collapse(sk, ofo, run_head, cur, start, end);
  		run_head = cur;
  		if (!cur)
  			break;

  		/* Open a new run at cur. */
  		start = TCP_SKB_CB(cur)->seq;
  		end = TCP_SKB_CB(cur)->end_seq;
  	}
  }
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4604
4605
  /*
   * Purge the out-of-order queue.
56f367bbf   Vitaliy Gusev   [TCP]: Add return...
4606
   * Return true if queue was pruned.
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4607
   */
56f367bbf   Vitaliy Gusev   [TCP]: Add return...
4608
  static int tcp_prune_ofo_queue(struct sock *sk)
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4609
4610
  {
  	struct tcp_sock *tp = tcp_sk(sk);
56f367bbf   Vitaliy Gusev   [TCP]: Add return...
4611
  	int res = 0;
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4612
4613
  
  	if (!skb_queue_empty(&tp->out_of_order_queue)) {
de0744af1   Pavel Emelyanov   mib: add net to N...
4614
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED);
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
  		__skb_queue_purge(&tp->out_of_order_queue);
  
  		/* Reset SACK state.  A conforming SACK implementation will
  		 * do the same at a timeout based retransmit.  When a connection
  		 * is in a sad state like this, we care only about integrity
  		 * of the connection not performance.
  		 */
  		if (tp->rx_opt.sack_ok)
  			tcp_sack_reset(&tp->rx_opt);
  		sk_mem_reclaim(sk);
56f367bbf   Vitaliy Gusev   [TCP]: Add return...
4625
  		res = 1;
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4626
  	}
56f367bbf   Vitaliy Gusev   [TCP]: Add return...
4627
  	return res;
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4628
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4629
4630
4631
4632
4633
4634
4635
4636
4637
  /* Reduce allocated memory if we can, trying to get
   * the socket within its memory limits again.
   *
   * Return less than zero if we should start dropping frames
   * until the socket owning process reads some of the data
   * to stabilize the situation.
   */
  static int tcp_prune_queue(struct sock *sk)
  {
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
4638
  	struct tcp_sock *tp = tcp_sk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4639
4640
4641
  
  	SOCK_DEBUG(sk, "prune_queue: c=%x
  ", tp->copied_seq);
de0744af1   Pavel Emelyanov   mib: add net to N...
4642
  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4643
4644
  
  	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
4645
  		tcp_clamp_window(sk);
180d8cd94   Glauber Costa   foundations of pe...
4646
  	else if (sk_under_memory_pressure(sk))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4647
4648
4649
  		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
  
  	tcp_collapse_ofo_queue(sk);
915219441   David S. Miller   tcp: Use SKB queu...
4650
4651
4652
4653
4654
  	if (!skb_queue_empty(&sk->sk_receive_queue))
  		tcp_collapse(sk, &sk->sk_receive_queue,
  			     skb_peek(&sk->sk_receive_queue),
  			     NULL,
  			     tp->copied_seq, tp->rcv_nxt);
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
4655
  	sk_mem_reclaim(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4656
4657
4658
4659
4660
4661
  
  	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
  		return 0;
  
  	/* Collapsing did not help, destructive actions follow.
  	 * This must not ever occur. */
b000cd370   Vitaliy Gusev   [TCP]: Fix never ...
4662
  	tcp_prune_ofo_queue(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4663
4664
4665
4666
4667
4668
4669
4670
  
  	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
  		return 0;
  
  	/* If we are really being abused, tell the caller to silently
  	 * drop receive data on the floor.  It will get retransmitted
  	 * and hopefully then we'll have sufficient space.
  	 */
de0744af1   Pavel Emelyanov   mib: add net to N...
4671
  	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4672
4673
4674
4675
4676
  
  	/* Massive buffer overcommit. */
  	tp->pred_flags = 0;
  	return -1;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4677
4678
4679
4680
4681
4682
4683
  /* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
   * As additional protections, we do not touch cwnd in retransmission phases,
   * and if application hit its sndbuf limit recently.
   */
  void tcp_cwnd_application_limited(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
4684
  	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4685
4686
  	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
  		/* Limited by application or receiver window. */
d254bcdbf   Ilpo Järvinen   [TCP]: Fixes IW >...
4687
4688
  		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
  		u32 win_used = max(tp->snd_cwnd_used, init_win);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4689
  		if (win_used < tp->snd_cwnd) {
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
4690
  			tp->snd_ssthresh = tcp_current_ssthresh(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4691
4692
4693
4694
4695
4696
  			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
  		}
  		tp->snd_cwnd_used = 0;
  	}
  	tp->snd_cwnd_stamp = tcp_time_stamp;
  }
cf533ea53   Eric Dumazet   tcp: add const qu...
4697
  static int tcp_should_expand_sndbuf(const struct sock *sk)
0d9901df6   David S. Miller   [TCP]: Break out ...
4698
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
4699
  	const struct tcp_sock *tp = tcp_sk(sk);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
4700

0d9901df6   David S. Miller   [TCP]: Break out ...
4701
4702
4703
4704
4705
4706
4707
  	/* If the user specified a specific send buffer setting, do
  	 * not modify it.
  	 */
  	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
  		return 0;
  
  	/* If we are under global TCP memory pressure, do not expand.  */
180d8cd94   Glauber Costa   foundations of pe...
4708
  	if (sk_under_memory_pressure(sk))
0d9901df6   David S. Miller   [TCP]: Break out ...
4709
4710
4711
  		return 0;
  
  	/* If we are under soft global TCP memory pressure, do not expand.  */
180d8cd94   Glauber Costa   foundations of pe...
4712
  	if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
0d9901df6   David S. Miller   [TCP]: Break out ...
4713
4714
4715
4716
4717
4718
4719
4720
  		return 0;
  
  	/* If we filled the congestion window, do not expand.  */
  	if (tp->packets_out >= tp->snd_cwnd)
  		return 0;
  
  	return 1;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
  
  /* When incoming ACK allowed to free some skb from write_queue,
   * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
   * on the exit from tcp input handler.
   *
   * PROBLEM: sndbuf expansion does not work well with largesend.
   */
  static void tcp_new_space(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
4731
  	if (tcp_should_expand_sndbuf(sk)) {
87fb4b7b5   Eric Dumazet   net: more accurat...
4732
4733
4734
4735
  		int sndmem = SKB_TRUESIZE(max_t(u32,
  						tp->rx_opt.mss_clamp,
  						tp->mss_cache) +
  					  MAX_TCP_HEADER);
4a7e56098   Ilpo Järvinen   tcp: cleanup mess...
4736
  		int demanded = max_t(unsigned int, tp->snd_cwnd,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4737
4738
  				     tp->reordering + 1);
  		sndmem *= 2 * demanded;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4739
4740
4741
4742
4743
4744
4745
  		if (sndmem > sk->sk_sndbuf)
  			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
  		tp->snd_cwnd_stamp = tcp_time_stamp;
  	}
  
  	sk->sk_write_space(sk);
  }
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
4746
  static void tcp_check_space(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4747
4748
4749
4750
4751
4752
4753
4754
  {
  	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
  		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
  		if (sk->sk_socket &&
  		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
  			tcp_new_space(sk);
  	}
  }
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
4755
  /* Send-side check: push any pending frames, then run the send-space
   * check (tcp_check_space()).
   */
  static inline void tcp_data_snd_check(struct sock *sk)
  {
  	tcp_push_pending_frames(sk);

  	tcp_check_space(sk);
  }
  
  /*
   * Check if sending an ack is needed.
   */
  static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	    /* More than one full frame received... */
9d4fb27db   Joe Perches   net/ipv4: Move &&...
4769
  	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4770
4771
4772
  	     /* ... and right edge of window advances far enough.
  	      * (tcp_recvmsg() will send ACK otherwise). Or...
  	      */
9d4fb27db   Joe Perches   net/ipv4: Move &&...
4773
  	     __tcp_select_window(sk) >= tp->rcv_wnd) ||
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4774
  	    /* We ACK each frame or... */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
4775
  	    tcp_in_quickack_mode(sk) ||
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4776
  	    /* We have out of order data. */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4777
  	    (ofo_possible && skb_peek(&tp->out_of_order_queue))) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4778
4779
4780
4781
4782
4783
4784
  		/* Then ack it now */
  		tcp_send_ack(sk);
  	} else {
  		/* Else, send delayed ack. */
  		tcp_send_delayed_ack(sk);
  	}
  }
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
4785
  /* Run the ack check only while an ACK is still scheduled; when none is,
   * we sent a data segment already and nothing needs doing.
   */
  static inline void tcp_ack_snd_check(struct sock *sk)
  {
  	if (inet_csk_ack_scheduled(sk))
  		__tcp_ack_snd_check(sk, 1);
  }
  
  /*
   *	This routine is only called when we have urgent data
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
4796
   *	signaled. Its the 'slow' part of tcp_urg. It could be
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4797
4798
4799
4800
4801
4802
   *	moved inline now as tcp_urg is only called from one
   *	place. We handle URGent data wrong. We have to - as
   *	BSD still doesn't use the correction from RFC961.
   *	For 1003.1g we should support a new option TCP_STDURG to permit
   *	either form (or just set the sysctl tcp_stdurg).
   */
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
4803

cf533ea53   Eric Dumazet   tcp: add const qu...
4804
  static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	u32 ptr = ntohs(th->urg_ptr);
  
  	if (ptr && !sysctl_tcp_stdurg)
  		ptr--;
  	ptr += ntohl(th->seq);
  
  	/* Ignore urgent data that we've already seen and read. */
  	if (after(tp->copied_seq, ptr))
  		return;
  
  	/* Do not replay urg ptr.
  	 *
  	 * NOTE: interesting situation not covered by specs.
  	 * Misbehaving sender may send urg ptr, pointing to segment,
  	 * which we already have in ofo queue. We are not able to fetch
  	 * such data and will stay in TCP_URG_NOTYET until will be eaten
  	 * by recvmsg(). Seems, we are not obliged to handle such wicked
  	 * situations. But it is worth to think about possibility of some
  	 * DoSes using some hypothetical application level deadlock.
  	 */
  	if (before(ptr, tp->rcv_nxt))
  		return;
  
  	/* Do we already have a newer (or duplicate) urgent pointer? */
  	if (tp->urg_data && !after(ptr, tp->urg_seq))
  		return;
  
  	/* Tell the world about our new urgent pointer. */
  	sk_send_sigurg(sk);
  
  	/* We may be adding urgent data when the last byte read was
  	 * urgent. To do this requires some care. We cannot just ignore
  	 * tp->copied_seq since we would read the last urgent byte again
  	 * as data, nor can we alter copied_seq until this data arrives
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
4841
  	 * or we break the semantics of SIOCATMARK (and thus sockatmark())
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
  	 *
  	 * NOTE. Double Dutch. Rendering to plain English: author of comment
  	 * above did something sort of 	send("A", MSG_OOB); send("B", MSG_OOB);
  	 * and expect that both A and B disappear from stream. This is _wrong_.
  	 * Though this happens in BSD with high probability, this is occasional.
  	 * Any application relying on this is buggy. Note also, that fix "works"
  	 * only in this artificial test. Insert some normal data between A and B and we will
  	 * decline of BSD again. Verdict: it is better to remove to trap
  	 * buggy users.
  	 */
  	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4853
  	    !sock_flag(sk, SOCK_URGINLINE) && tp->copied_seq != tp->rcv_nxt) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4854
4855
4856
  		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
  		tp->copied_seq++;
  		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
8728b834b   David S. Miller   [NET]: Kill skb->...
4857
  			__skb_unlink(skb, &sk->sk_receive_queue);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4858
4859
4860
  			__kfree_skb(skb);
  		}
  	}
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4861
4862
  	tp->urg_data = TCP_URG_NOTYET;
  	tp->urg_seq = ptr;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4863
4864
4865
4866
4867
4868
  
  	/* Disable header prediction. */
  	tp->pred_flags = 0;
  }
  
  /* This is the 'fast' part of urgent handling. */
cf533ea53   Eric Dumazet   tcp: add const qu...
4869
  static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4870
4871
4872
4873
4874
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	/* Check if we get a new urgent pointer - normally not. */
  	if (th->urg)
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4875
  		tcp_check_urg(sk, th);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4876
4877
4878
4879
4880
  
  	/* Do we wait for any urgent data? - normally not... */
  	if (tp->urg_data == TCP_URG_NOTYET) {
  		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
  			  th->syn;
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
4881
  		/* Is the urgent pointer pointing into this packet? */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
  		if (ptr < skb->len) {
  			u8 tmp;
  			if (skb_copy_bits(skb, ptr, &tmp, 1))
  				BUG();
  			tp->urg_data = TCP_URG_VALID | tmp;
  			if (!sock_flag(sk, SOCK_DEAD))
  				sk->sk_data_ready(sk, 0);
  		}
  	}
  }
  
  /* Copy the payload of @skb (everything past the first @hlen bytes) to
   * the iovec in tp->ucopy, verifying the checksum on the way unless it
   * is already known to be unnecessary.
   *
   * Bottom halves are enabled for the duration of the copy and disabled
   * again before returning.  On success ucopy.len and copied_seq are
   * advanced by the payload size.  Returns 0 on success or the error
   * reported by the copy helper.
   */
  static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	int payload = skb->len - hlen;
  	int err;

  	local_bh_enable();

  	if (!skb_csum_unnecessary(skb))
  		err = skb_copy_and_csum_datagram_iovec(skb, hlen,
  						       tp->ucopy.iov);
  	else
  		err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov,
  					      payload);

  	if (err == 0) {
  		tp->ucopy.len -= payload;
  		tp->copied_seq += payload;
  		tcp_rcv_space_adjust(sk);
  	}

  	local_bh_disable();
  	return err;
  }
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4915
4916
  static __sum16 __tcp_checksum_complete_user(struct sock *sk,
  					    struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4917
  {
b51655b95   Al Viro   [NET]: Annotate _...
4918
  	__sum16 result;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
  
  	if (sock_owned_by_user(sk)) {
  		local_bh_enable();
  		result = __tcp_checksum_complete(skb);
  		local_bh_disable();
  	} else {
  		result = __tcp_checksum_complete(skb);
  	}
  	return result;
  }
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4929
4930
  /* Returns 0 when @skb needs no checksum verification or when
   * __tcp_checksum_complete_user() returns zero, and 1 otherwise.
   */
  static inline int tcp_checksum_complete_user(struct sock *sk,
  					     struct sk_buff *skb)
  {
  	if (skb_csum_unnecessary(skb))
  		return 0;

  	return __tcp_checksum_complete_user(sk, skb) != 0;
  }
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
4935
  #ifdef CONFIG_NET_DMA
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4936
4937
  static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
  				  int hlen)
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
4938
4939
4940
4941
4942
4943
4944
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	int chunk = skb->len - hlen;
  	int dma_cookie;
  	int copied_early = 0;
  
  	if (tp->ucopy.wakeup)
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
4945
  		return 0;
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
4946
4947
  
  	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
f67b45999   Dan Williams   net_dma: convert ...
4948
  		tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
4949

604763722   Herbert Xu   [NET]: Treat CHEC...
4950
  	if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
4951
4952
  
  		dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
4953
4954
4955
  							 skb, hlen,
  							 tp->ucopy.iov, chunk,
  							 tp->ucopy.pinned_list);
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
  
  		if (dma_cookie < 0)
  			goto out;
  
  		tp->ucopy.dma_cookie = dma_cookie;
  		copied_early = 1;
  
  		tp->ucopy.len -= chunk;
  		tp->copied_seq += chunk;
  		tcp_rcv_space_adjust(sk);
  
  		if ((tp->ucopy.len == 0) ||
aa8223c7b   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
4968
  		    (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) ||
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
  		    (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
  			tp->ucopy.wakeup = 1;
  			sk->sk_data_ready(sk, 0);
  		}
  	} else if (chunk > 0) {
  		tp->ucopy.wakeup = 1;
  		sk->sk_data_ready(sk, 0);
  	}
  out:
  	return copied_early;
  }
  #endif /* CONFIG_NET_DMA */
cbe2d128a   Ilpo Järvinen   tcp: Add tcp_vali...
4981
4982
4983
4984
  /* Does PAWS and seqno based validation of an incoming segment, flags will
   * play significant role here.
   */
  static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
cf533ea53   Eric Dumazet   tcp: add const qu...
4985
  			      const struct tcphdr *th, int syn_inerr)
cbe2d128a   Ilpo Järvinen   tcp: Add tcp_vali...
4986
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
4987
  	const u8 *hash_location;
cbe2d128a   Ilpo Järvinen   tcp: Add tcp_vali...
4988
4989
4990
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	/* RFC1323: H1. Apply PAWS check first. */
4957faade   William Allen Simpson   TCPCT part 1g: Re...
4991
4992
  	if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
  	    tp->rx_opt.saw_tstamp &&
cbe2d128a   Ilpo Järvinen   tcp: Add tcp_vali...
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
  	    tcp_paws_discard(sk, skb)) {
  		if (!th->rst) {
  			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
  			tcp_send_dupack(sk, skb);
  			goto discard;
  		}
  		/* Reset is accepted even if it did not pass PAWS. */
  	}
  
  	/* Step 1: check sequence number */
  	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
  		/* RFC793, page 37: "In all states except SYN-SENT, all reset
  		 * (RST) segments are validated by checking their SEQ-fields."
  		 * And page 69: "If an incoming segment is not acceptable,
  		 * an acknowledgment should be sent in reply (unless the RST
  		 * bit is set, if so drop the segment and return)".
  		 */
  		if (!th->rst)
  			tcp_send_dupack(sk, skb);
  		goto discard;
  	}
  
  	/* Step 2: check RST bit */
  	if (th->rst) {
  		tcp_reset(sk);
  		goto discard;
  	}
  
  	/* ts_recent update must be made after we are sure that the packet
  	 * is in window.
  	 */
  	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
  
  	/* step 3: check security and precedence [ignored] */
  
  	/* step 4: Check for a SYN in window. */
  	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
  		if (syn_inerr)
  			TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
  		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN);
  		tcp_reset(sk);
  		return -1;
  	}
  
  	return 1;
  
  discard:
  	__kfree_skb(skb);
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5043
  /*
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5044
   *	TCP receive function for the ESTABLISHED state.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5045
   *
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5046
   *	It is split into a fast path and a slow path. The fast path is
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5047
5048
   * 	disabled when:
   *	- A zero window was announced from us - zero window probing
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5049
   *        is only handled properly in the slow path.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5050
5051
5052
5053
   *	- Out of order segments arrived.
   *	- Urgent data is expected.
   *	- There is no buffer space left
   *	- Unexpected TCP flags/window values/header lengths are received
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5054
   *	  (detected by checking the TCP header against pred_flags)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5055
5056
5057
5058
5059
   *	- Data is sent in both directions. Fast path only supports pure senders
   *	  or pure receivers (this means either the sequence number or the ack
   *	  value must stay constant)
   *	- Unexpected TCP option.
   *
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5060
   *	When these conditions are not satisfied it drops into a standard
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5061
5062
   *	receive procedure patterned after RFC793 to handle all cases.
   *	The first three cases are guaranteed by proper pred_flags setting,
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5063
   *	the rest is checked inline. Fast processing is turned on in
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5064
5065
5066
   *	tcp_data_queue when everything is OK.
   */
  int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
cf533ea53   Eric Dumazet   tcp: add const qu...
5067
  			const struct tcphdr *th, unsigned int len)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5068
5069
  {
  	struct tcp_sock *tp = tcp_sk(sk);
cbe2d128a   Ilpo Järvinen   tcp: Add tcp_vali...
5070
  	int res;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5071
5072
5073
  
  	/*
  	 *	Header prediction.
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5074
  	 *	The code loosely follows the one in the famous
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5075
  	 *	"30 instruction TCP receive" Van Jacobson mail.
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5076
5077
  	 *
  	 *	Van's trick is to deposit buffers into socket queue
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5078
5079
5080
5081
  	 *	on a device interrupt, to call tcp_recv function
  	 *	on the receive process context and checksum and copy
  	 *	the buffer to user space. smart...
  	 *
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5082
  	 *	Our current scheme is not silly either but we take the
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5083
5084
5085
5086
5087
5088
5089
  	 *	extra cost of the net_bh soft interrupt processing...
  	 *	We do checksum and copy also but from device to kernel.
  	 */
  
  	tp->rx_opt.saw_tstamp = 0;
  
  	/*	pred_flags is 0xS?10 << 16 + snd_wnd
caa20d9ab   Stephen Hemminger   [TCP]: spelling f...
5090
  	 *	if header_prediction is to be made
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5091
5092
  	 *	'S' will always be tp->tcp_header_len >> 2
  	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5093
  	 *  turn it off	(when there are holes in the receive
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5094
5095
5096
5097
5098
  	 *	 space for instance)
  	 *	PSH flag is ignored.
  	 */
  
  	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
96e0bf4b5   John Dykstra   tcp: Discard segm...
5099
5100
  	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
  	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5101
5102
5103
5104
5105
5106
5107
5108
5109
  		int tcp_header_len = tp->tcp_header_len;
  
  		/* Timestamp header prediction: tcp_header_len
  		 * is automatically equal to th->doff*4 due to pred_flags
  		 * match.
  		 */
  
  		/* Check timestamp */
  		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5110
  			/* No? Slow path! */
a4356b292   Ilpo Järvinen   tcp: Add tcp_pars...
5111
  			if (!tcp_parse_aligned_timestamp(tp, th))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5112
  				goto slow_path;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
  			/* If PAWS failed, check it more carefully in slow path */
  			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
  				goto slow_path;
  
  			/* DO NOT update ts_recent here, if checksum fails
  			 * and timestamp was corrupted part, it will result
  			 * in a hung connection since we will drop all
  			 * future packets due to the PAWS test.
  			 */
  		}
  
  		if (len <= tcp_header_len) {
  			/* Bulk data transfer: sender */
  			if (len == tcp_header_len) {
  				/* Predicted packet is in window by definition.
  				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
  				 * Hence, check seq<=rcv_wup reduces to:
  				 */
  				if (tcp_header_len ==
  				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
  				    tp->rcv_nxt == tp->rcv_wup)
  					tcp_store_ts_recent(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5135
5136
5137
5138
  				/* We know that such packets are checksummed
  				 * on entry.
  				 */
  				tcp_ack(sk, skb, 0);
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5139
  				__kfree_skb(skb);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
5140
  				tcp_data_snd_check(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5141
5142
  				return 0;
  			} else { /* Header too small */
63231bddf   Pavel Emelyanov   mib: add net to T...
5143
  				TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5144
5145
5146
5147
  				goto discard;
  			}
  		} else {
  			int eaten = 0;
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
5148
  			int copied_early = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5149

1a2449a87   Chris Leech   [I/OAT]: TCP recv...
5150
5151
5152
5153
5154
5155
5156
5157
  			if (tp->copied_seq == tp->rcv_nxt &&
  			    len - tcp_header_len <= tp->ucopy.len) {
  #ifdef CONFIG_NET_DMA
  				if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
  					copied_early = 1;
  					eaten = 1;
  				}
  #endif
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
5158
5159
  				if (tp->ucopy.task == current &&
  				    sock_owned_by_user(sk) && !copied_early) {
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
5160
  					__set_current_state(TASK_RUNNING);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5161

1a2449a87   Chris Leech   [I/OAT]: TCP recv...
5162
5163
5164
5165
  					if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
  						eaten = 1;
  				}
  				if (eaten) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5166
5167
5168
5169
5170
5171
5172
5173
5174
  					/* Predicted packet is in window by definition.
  					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
  					 * Hence, check seq<=rcv_wup reduces to:
  					 */
  					if (tcp_header_len ==
  					    (sizeof(struct tcphdr) +
  					     TCPOLEN_TSTAMP_ALIGNED) &&
  					    tp->rcv_nxt == tp->rcv_wup)
  						tcp_store_ts_recent(tp);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5175
  					tcp_rcv_rtt_measure_ts(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5176
5177
5178
  
  					__skb_pull(skb, tcp_header_len);
  					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
de0744af1   Pavel Emelyanov   mib: add net to N...
5179
  					NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5180
  				}
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
5181
5182
  				if (copied_early)
  					tcp_cleanup_rbuf(sk, skb->len);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
  			}
  			if (!eaten) {
  				if (tcp_checksum_complete_user(sk, skb))
  					goto csum_error;
  
  				/* Predicted packet is in window by definition.
  				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
  				 * Hence, check seq<=rcv_wup reduces to:
  				 */
  				if (tcp_header_len ==
  				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
  				    tp->rcv_nxt == tp->rcv_wup)
  					tcp_store_ts_recent(tp);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5196
  				tcp_rcv_rtt_measure_ts(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5197
5198
5199
  
  				if ((int)skb->truesize > sk->sk_forward_alloc)
  					goto step5;
de0744af1   Pavel Emelyanov   mib: add net to N...
5200
  				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5201
5202
  
  				/* Bulk data transfer: receiver */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
5203
  				__skb_pull(skb, tcp_header_len);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5204
  				__skb_queue_tail(&sk->sk_receive_queue, skb);
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
5205
  				skb_set_owner_r(skb, sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5206
5207
  				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
  			}
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
5208
  			tcp_event_data_recv(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5209
5210
5211
5212
  
  			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
  				/* Well, only one small jumplet in fast path... */
  				tcp_ack(sk, skb, FLAG_DATA);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
5213
  				tcp_data_snd_check(sk);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5214
  				if (!inet_csk_ack_scheduled(sk))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5215
5216
  					goto no_ack;
  			}
53240c208   Ali Saidi   tcp: Fix possible...
5217
5218
  			if (!copied_early || tp->rcv_nxt != tp->rcv_wup)
  				__tcp_ack_snd_check(sk, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5219
  no_ack:
1a2449a87   Chris Leech   [I/OAT]: TCP recv...
5220
5221
5222
5223
5224
  #ifdef CONFIG_NET_DMA
  			if (copied_early)
  				__skb_queue_tail(&sk->sk_async_wait_queue, skb);
  			else
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5225
5226
5227
5228
5229
5230
5231
5232
5233
  			if (eaten)
  				__kfree_skb(skb);
  			else
  				sk->sk_data_ready(sk, 0);
  			return 0;
  		}
  	}
  
  slow_path:
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
5234
  	if (len < (th->doff << 2) || tcp_checksum_complete_user(sk, skb))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5235
5236
5237
  		goto csum_error;
  
  	/*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5238
5239
  	 *	Standard slow path.
  	 */
cbe2d128a   Ilpo Järvinen   tcp: Add tcp_vali...
5240
5241
5242
  	res = tcp_validate_incoming(sk, skb, th, 1);
  	if (res <= 0)
  		return -res;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5243
5244
  
  step5:
96e0bf4b5   John Dykstra   tcp: Discard segm...
5245
5246
  	if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
  		goto discard;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5247

463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5248
  	tcp_rcv_rtt_measure_ts(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5249
5250
5251
5252
5253
5254
  
  	/* Process urgent data. */
  	tcp_urg(sk, skb, th);
  
  	/* step 7: process the segment text */
  	tcp_data_queue(sk, skb);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
5255
  	tcp_data_snd_check(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5256
5257
5258
5259
  	tcp_ack_snd_check(sk);
  	return 0;
  
  csum_error:
63231bddf   Pavel Emelyanov   mib: add net to T...
5260
  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5261
5262
5263
5264
5265
  
  discard:
  	__kfree_skb(skb);
  	return 0;
  }
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
5266
  EXPORT_SYMBOL(tcp_rcv_established);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5267
5268
  
  static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
cf533ea53   Eric Dumazet   tcp: add const qu...
5269
  					 const struct tcphdr *th, unsigned int len)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5270
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
5271
  	const u8 *hash_location;
d83d8461f   Arnaldo Carvalho de Melo   [IP_SOCKGLUE]: Re...
5272
  	struct inet_connection_sock *icsk = inet_csk(sk);
4957faade   William Allen Simpson   TCPCT part 1g: Re...
5273
  	struct tcp_sock *tp = tcp_sk(sk);
4957faade   William Allen Simpson   TCPCT part 1g: Re...
5274
5275
  	struct tcp_cookie_values *cvp = tp->cookie_values;
  	int saved_clamp = tp->rx_opt.mss_clamp;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5276

bb5b7c112   David S. Miller   tcp: Revert per-r...
5277
  	tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
  
  	if (th->ack) {
  		/* rfc793:
  		 * "If the state is SYN-SENT then
  		 *    first check the ACK bit
  		 *      If the ACK bit is set
  		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
  		 *        a reset (unless the RST bit is set, if so drop
  		 *        the segment and return)"
  		 *
  		 *  We do not send data with SYN, so that RFC-correct
  		 *  test reduces to:
  		 */
  		if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
  			goto reset_and_undo;
  
  		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
  		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
  			     tcp_time_stamp)) {
de0744af1   Pavel Emelyanov   mib: add net to N...
5297
  			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
  			goto reset_and_undo;
  		}
  
  		/* Now ACK is acceptable.
  		 *
  		 * "If the RST bit is set
  		 *    If the ACK was acceptable then signal the user "error:
  		 *    connection reset", drop the segment, enter CLOSED state,
  		 *    delete TCB, and return."
  		 */
  
  		if (th->rst) {
  			tcp_reset(sk);
  			goto discard;
  		}
  
  		/* rfc793:
  		 *   "fifth, if neither of the SYN or RST bits is set then
  		 *    drop the segment and return."
  		 *
  		 *    See note below!
  		 *                                        --ANK(990513)
  		 */
  		if (!th->syn)
  			goto discard_and_undo;
  
  		/* rfc793:
  		 *   "If the SYN bit is on ...
  		 *    are acceptable then ...
  		 *    (our SYN has been ACKed), change the connection
  		 *    state to ESTABLISHED..."
  		 */
  
  		TCP_ECN_rcv_synack(tp, th);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
  
  		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
  		tcp_ack(sk, skb, FLAG_SLOWPATH);
  
  		/* Ok.. it's good. Set up sequence numbers and
  		 * move to established.
  		 */
  		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
  		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
  
  		/* RFC1323: The window in SYN & SYN/ACK segments is
  		 * never scaled.
  		 */
  		tp->snd_wnd = ntohs(th->window);
ee7537b63   Hantzis Fotis   tcp: tcp_init_wl ...
5346
  		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
  
  		if (!tp->rx_opt.wscale_ok) {
  			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
  			tp->window_clamp = min(tp->window_clamp, 65535U);
  		}
  
  		if (tp->rx_opt.saw_tstamp) {
  			tp->rx_opt.tstamp_ok	   = 1;
  			tp->tcp_header_len =
  				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
  			tp->advmss	    -= TCPOLEN_TSTAMP_ALIGNED;
  			tcp_store_ts_recent(tp);
  		} else {
  			tp->tcp_header_len = sizeof(struct tcphdr);
  		}
e60402d0a   Ilpo Järvinen   [TCP]: Move sack_...
5362
5363
  		if (tcp_is_sack(tp) && sysctl_tcp_fack)
  			tcp_enable_fack(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5364

5d424d5a6   John Heffner   [TCP]: MTU probing
5365
  		tcp_mtup_init(sk);
d83d8461f   Arnaldo Carvalho de Melo   [IP_SOCKGLUE]: Re...
5366
  		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5367
5368
5369
5370
5371
5372
  		tcp_initialize_rcv_mss(sk);
  
  		/* Remember, tcp_poll() does not lock socket!
  		 * Change state from SYN-SENT only after copied_seq
  		 * is initialized. */
  		tp->copied_seq = tp->rcv_nxt;
4957faade   William Allen Simpson   TCPCT part 1g: Re...
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
  
  		if (cvp != NULL &&
  		    cvp->cookie_pair_size > 0 &&
  		    tp->rx_opt.cookie_plus > 0) {
  			int cookie_size = tp->rx_opt.cookie_plus
  					- TCPOLEN_COOKIE_BASE;
  			int cookie_pair_size = cookie_size
  					     + cvp->cookie_desired;
  
  			/* A cookie extension option was sent and returned.
  			 * Note that each incoming SYNACK replaces the
  			 * Responder cookie.  The initial exchange is most
  			 * fragile, as protection against spoofing relies
  			 * entirely upon the sequence and timestamp (above).
  			 * This replacement strategy allows the correct pair to
  			 * pass through, while any others will be filtered via
  			 * Responder verification later.
  			 */
  			if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
  				memcpy(&cvp->cookie_pair[cvp->cookie_desired],
  				       hash_location, cookie_size);
  				cvp->cookie_pair_size = cookie_pair_size;
  			}
  		}
e16aa207c   Ralf Baechle   [NET]: Memory bar...
5397
  		smp_mb();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5398
  		tcp_set_state(sk, TCP_ESTABLISHED);
6b877699c   Venkat Yekkirala   SELinux: Return c...
5399
  		security_inet_conn_established(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5400
  		/* Make sure socket is routed, for correct metrics.  */
8292a17a3   Arnaldo Carvalho de Melo   [ICSK]: Rename st...
5401
  		icsk->icsk_af_ops->rebuild_header(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5402
5403
  
  		tcp_init_metrics(sk);
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
5404
  		tcp_init_congestion_control(sk);
317a76f9a   Stephen Hemminger   [TCP]: Add plugga...
5405

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5406
5407
5408
5409
5410
5411
5412
5413
  		/* Prevent spurious tcp_cwnd_restart() on first data
  		 * packet.
  		 */
  		tp->lsndtime = tcp_time_stamp;
  
  		tcp_init_buffer_space(sk);
  
  		if (sock_flag(sk, SOCK_KEEPOPEN))
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5414
  			inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5415
5416
5417
5418
5419
5420
5421
5422
  
  		if (!tp->rx_opt.snd_wscale)
  			__tcp_fast_path_on(tp, tp->snd_wnd);
  		else
  			tp->pred_flags = 0;
  
  		if (!sock_flag(sk, SOCK_DEAD)) {
  			sk->sk_state_change(sk);
8d8ad9d7c   Pavel Emelyanov   [NET]: Name magic...
5423
  			sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5424
  		}
295f7324f   Arnaldo Carvalho de Melo   [ICSK]: Introduce...
5425
5426
5427
  		if (sk->sk_write_pending ||
  		    icsk->icsk_accept_queue.rskq_defer_accept ||
  		    icsk->icsk_ack.pingpong) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5428
5429
5430
5431
5432
5433
5434
  			/* Save one ACK. Data will be ready after
  			 * several ticks, if write_pending is set.
  			 *
  			 * It may be deleted, but with this feature tcpdumps
  			 * look so _wonderfully_ clever, that I was not able
  			 * to stand against the temptation 8)     --ANK
  			 */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5435
  			inet_csk_schedule_ack(sk);
295f7324f   Arnaldo Carvalho de Melo   [ICSK]: Introduce...
5436
5437
  			icsk->icsk_ack.lrcvtime = tcp_time_stamp;
  			icsk->icsk_ack.ato	 = TCP_ATO_MIN;
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5438
5439
  			tcp_incr_quickack(sk);
  			tcp_enter_quickack_mode(sk);
3f421baa4   Arnaldo Carvalho de Melo   [NET]: Just move ...
5440
5441
  			inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
  						  TCP_DELACK_MAX, TCP_RTO_MAX);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
  
  discard:
  			__kfree_skb(skb);
  			return 0;
  		} else {
  			tcp_send_ack(sk);
  		}
  		return -1;
  	}
  
  	/* No ACK in the segment */
  
  	if (th->rst) {
  		/* rfc793:
  		 * "If the RST bit is set
  		 *
  		 *      Otherwise (no ACK) drop the segment and return."
  		 */
  
  		goto discard_and_undo;
  	}
  
  	/* PAWS check. */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
5465
  	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
c887e6d2d   Ilpo Järvinen   tcp: consolidate ...
5466
  	    tcp_paws_reject(&tp->rx_opt, 0))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
  		goto discard_and_undo;
  
  	if (th->syn) {
  		/* We see SYN without ACK. It is attempt of
  		 * simultaneous connect with crossed SYNs.
  		 * Particularly, it can be connect to self.
  		 */
  		tcp_set_state(sk, TCP_SYN_RECV);
  
  		if (tp->rx_opt.saw_tstamp) {
  			tp->rx_opt.tstamp_ok = 1;
  			tcp_store_ts_recent(tp);
  			tp->tcp_header_len =
  				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
  		} else {
  			tp->tcp_header_len = sizeof(struct tcphdr);
  		}
  
  		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
  		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
  
  		/* RFC1323: The window in SYN & SYN/ACK segments is
  		 * never scaled.
  		 */
  		tp->snd_wnd    = ntohs(th->window);
  		tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
  		tp->max_window = tp->snd_wnd;
  
  		TCP_ECN_rcv_syn(tp, th);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5496

5d424d5a6   John Heffner   [TCP]: MTU probing
5497
  		tcp_mtup_init(sk);
d83d8461f   Arnaldo Carvalho de Melo   [IP_SOCKGLUE]: Re...
5498
  		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5499
  		tcp_initialize_rcv_mss(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
  		tcp_send_synack(sk);
  #if 0
  		/* Note, we could accept data and URG from this segment.
  		 * There are no obstacles to make this.
  		 *
  		 * However, if we ignore data in ACKless segments sometimes,
  		 * we have no reasons to accept it sometimes.
  		 * Also, seems the code doing it in step6 of tcp_rcv_state_process
  		 * is not flawless. So, discard packet for sanity.
  		 * Uncomment this return to process the data.
  		 */
  		return -1;
  #else
  		goto discard;
  #endif
  	}
  	/* "fifth, if neither of the SYN or RST bits is set then
  	 * drop the segment and return."
  	 */
  
  discard_and_undo:
  	tcp_clear_options(&tp->rx_opt);
  	tp->rx_opt.mss_clamp = saved_clamp;
  	goto discard;
  
  reset_and_undo:
  	tcp_clear_options(&tp->rx_opt);
  	tp->rx_opt.mss_clamp = saved_clamp;
  	return 1;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5530
5531
  /*
   *	This function implements the receiving procedure of RFC 793 for
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5532
   *	all states except ESTABLISHED and TIME_WAIT.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5533
5534
5535
   *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
   *	address independent.
   */
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5536

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5537
  int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
cf533ea53   Eric Dumazet   tcp: add const qu...
5538
  			  const struct tcphdr *th, unsigned int len)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5539
5540
  {
  	struct tcp_sock *tp = tcp_sk(sk);
8292a17a3   Arnaldo Carvalho de Melo   [ICSK]: Rename st...
5541
  	struct inet_connection_sock *icsk = inet_csk(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5542
  	int queued = 0;
cbe2d128a   Ilpo Järvinen   tcp: Add tcp_vali...
5543
  	int res;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5544
5545
5546
5547
5548
5549
5550
5551
  
  	tp->rx_opt.saw_tstamp = 0;
  
  	switch (sk->sk_state) {
  	case TCP_CLOSE:
  		goto discard;
  
  	case TCP_LISTEN:
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
5552
  		if (th->ack)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5553
  			return 1;
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
5554
  		if (th->rst)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5555
  			goto discard;
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
5556
  		if (th->syn) {
fdf5af0da   Eric Dumazet   tcp: drop SYN+FIN...
5557
5558
  			if (th->fin)
  				goto discard;
8292a17a3   Arnaldo Carvalho de Melo   [ICSK]: Rename st...
5559
  			if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5560
  				return 1;
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5561
5562
  			/* Now we have several options: In theory there is
  			 * nothing else in the frame. KA9Q has an option to
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5563
  			 * send data with the syn, BSD accepts data with the
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5564
5565
5566
  			 * syn up to the [to be] advertised window and
  			 * Solaris 2.1 gives you a protocol error. For now
  			 * we just ignore it, that fits the spec precisely
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5567
5568
5569
  			 * and avoids incompatibilities. It would be nice in
  			 * future to drop through and process the data.
  			 *
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5570
  			 * Now that TTCP is starting to be used we ought to
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5571
5572
  			 * queue this data.
  			 * But, this leaves one open to an easy denial of
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5573
  			 * service attack, and SYN cookies can't defend
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5574
  			 * against this problem. So, we drop the data
fb7e2399e   Masayuki Nakagawa   [TCP]: skb is une...
5575
5576
  			 * in the interest of security over speed unless
  			 * it's still in use.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5577
  			 */
fb7e2399e   Masayuki Nakagawa   [TCP]: skb is une...
5578
5579
  			kfree_skb(skb);
  			return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5580
5581
5582
5583
  		}
  		goto discard;
  
  	case TCP_SYN_SENT:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5584
5585
5586
5587
5588
5589
5590
  		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
  		if (queued >= 0)
  			return queued;
  
  		/* Do step6 onward by hand. */
  		tcp_urg(sk, skb, th);
  		__kfree_skb(skb);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
5591
  		tcp_data_snd_check(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5592
5593
  		return 0;
  	}
cbe2d128a   Ilpo Järvinen   tcp: Add tcp_vali...
5594
5595
5596
  	res = tcp_validate_incoming(sk, skb, th, 0);
  	if (res <= 0)
  		return -res;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5597
5598
5599
  
  	/* step 5: check the ACK field */
  	if (th->ack) {
96e0bf4b5   John Dykstra   tcp: Discard segm...
5600
  		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5601

2de979bd7   Stephen Hemminger   [TCP]: whitespace...
5602
  		switch (sk->sk_state) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5603
5604
5605
  		case TCP_SYN_RECV:
  			if (acceptable) {
  				tp->copied_seq = tp->rcv_nxt;
e16aa207c   Ralf Baechle   [NET]: Memory bar...
5606
  				smp_mb();
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5607
5608
5609
5610
5611
5612
5613
5614
  				tcp_set_state(sk, TCP_ESTABLISHED);
  				sk->sk_state_change(sk);
  
  				/* Note, that this wakeup is only for marginal
  				 * crossed SYN case. Passively open sockets
  				 * are not waked up, because sk->sk_sleep ==
  				 * NULL and sk->sk_socket == NULL.
  				 */
8d8ad9d7c   Pavel Emelyanov   [NET]: Name magic...
5615
5616
  				if (sk->sk_socket)
  					sk_wake_async(sk,
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
5617
  						      SOCK_WAKE_IO, POLL_OUT);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5618
5619
5620
5621
  
  				tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
  				tp->snd_wnd = ntohs(th->window) <<
  					      tp->rx_opt.snd_wscale;
ee7537b63   Hantzis Fotis   tcp: tcp_init_wl ...
5622
  				tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5623

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5624
5625
5626
5627
5628
5629
  				if (tp->rx_opt.tstamp_ok)
  					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
  
  				/* Make sure socket is routed, for
  				 * correct metrics.
  				 */
8292a17a3   Arnaldo Carvalho de Melo   [ICSK]: Rename st...
5630
  				icsk->icsk_af_ops->rebuild_header(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5631
5632
  
  				tcp_init_metrics(sk);
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
5633
  				tcp_init_congestion_control(sk);
317a76f9a   Stephen Hemminger   [TCP]: Add plugga...
5634

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5635
5636
5637
5638
  				/* Prevent spurious tcp_cwnd_restart() on
  				 * first data packet.
  				 */
  				tp->lsndtime = tcp_time_stamp;
5d424d5a6   John Heffner   [TCP]: MTU probing
5639
  				tcp_mtup_init(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
  				tcp_initialize_rcv_mss(sk);
  				tcp_init_buffer_space(sk);
  				tcp_fast_path_on(tp);
  			} else {
  				return 1;
  			}
  			break;
  
  		case TCP_FIN_WAIT1:
  			if (tp->snd_una == tp->write_seq) {
  				tcp_set_state(sk, TCP_FIN_WAIT2);
  				sk->sk_shutdown |= SEND_SHUTDOWN;
b6c6712a4   Eric Dumazet   net: sk_dst_cache...
5652
  				dst_confirm(__sk_dst_get(sk));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
  
  				if (!sock_flag(sk, SOCK_DEAD))
  					/* Wake up lingering close() */
  					sk->sk_state_change(sk);
  				else {
  					int tmo;
  
  					if (tp->linger2 < 0 ||
  					    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
  					     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
  						tcp_done(sk);
de0744af1   Pavel Emelyanov   mib: add net to N...
5664
  						NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5665
5666
  						return 1;
  					}
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5667
  					tmo = tcp_fin_time(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5668
  					if (tmo > TCP_TIMEWAIT_LEN) {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5669
  						inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5670
5671
5672
5673
5674
5675
5676
  					} else if (th->fin || sock_owned_by_user(sk)) {
  						/* Bad case. We could lose such FIN otherwise.
  						 * It is not a big problem, but it looks confusing
  						 * and not so rare event. We still can lose it now,
  						 * if it spins in bh_lock_sock(), but it is really
  						 * marginal case.
  						 */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
5677
  						inet_csk_reset_keepalive_timer(sk, tmo);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
  					} else {
  						tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
  						goto discard;
  					}
  				}
  			}
  			break;
  
  		case TCP_CLOSING:
  			if (tp->snd_una == tp->write_seq) {
  				tcp_time_wait(sk, TCP_TIME_WAIT, 0);
  				goto discard;
  			}
  			break;
  
  		case TCP_LAST_ACK:
  			if (tp->snd_una == tp->write_seq) {
  				tcp_update_metrics(sk);
  				tcp_done(sk);
  				goto discard;
  			}
  			break;
  		}
  	} else
  		goto discard;
  
  	/* step 6: check the URG bit */
  	tcp_urg(sk, skb, th);
  
  	/* step 7: process the segment text */
  	switch (sk->sk_state) {
  	case TCP_CLOSE_WAIT:
  	case TCP_CLOSING:
  	case TCP_LAST_ACK:
  		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
  			break;
  	case TCP_FIN_WAIT1:
  	case TCP_FIN_WAIT2:
  		/* RFC 793 says to queue data in these states,
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5717
  		 * RFC 1122 says we MUST send a reset.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5718
5719
5720
5721
5722
  		 * BSD 4.4 also does reset.
  		 */
  		if (sk->sk_shutdown & RCV_SHUTDOWN) {
  			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
  			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
de0744af1   Pavel Emelyanov   mib: add net to N...
5723
  				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5724
5725
5726
5727
5728
  				tcp_reset(sk);
  				return 1;
  			}
  		}
  		/* Fall through */
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5729
  	case TCP_ESTABLISHED:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5730
5731
5732
5733
5734
5735
5736
  		tcp_data_queue(sk, skb);
  		queued = 1;
  		break;
  	}
  
  	/* tcp_data could move socket to TIME-WAIT */
  	if (sk->sk_state != TCP_CLOSE) {
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
5737
  		tcp_data_snd_check(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5738
5739
  		tcp_ack_snd_check(sk);
  	}
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
5740
  	if (!queued) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5741
5742
5743
5744
5745
  discard:
  		__kfree_skb(skb);
  	}
  	return 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
5746
  EXPORT_SYMBOL(tcp_rcv_state_process);