Blame view

net/ipv4/tcp_output.c 107 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
7
  /*
   * INET		An implementation of the TCP/IP protocol suite for the LINUX
   *		operating system.  INET is implemented using the  BSD Socket
   *		interface as the means of communication with the user level.
   *
   *		Implementation of the Transmission Control Protocol(TCP).
   *
02c30a84e   Jesper Juhl   [PATCH] update Ro...
8
   * Authors:	Ross Biro
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
   *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   *		Mark Evans, <evansmp@uhura.aston.ac.uk>
   *		Corey Minyard <wf-rch!minyard@relay.EU.net>
   *		Florian La Roche, <flla@stud.uni-sb.de>
   *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
   *		Linus Torvalds, <torvalds@cs.helsinki.fi>
   *		Alan Cox, <gw4pts@gw4pts.ampr.org>
   *		Matthew Dillon, <dillon@apollo.west.oic.com>
   *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
   *		Jorge Cwik, <jorge@laser.satlink.net>
   */
  
  /*
   * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
   *				:	Fragmentation on mtu decrease
   *				:	Segment collapse on retransmit
   *				:	AF independence
   *
   *		Linus Torvalds	:	send_delayed_ack
   *		David S. Miller	:	Charge memory using the right skb
   *					during syn/ack processing.
   *		David S. Miller :	Output engine completely rewritten.
   *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
   *		Cacophonix Gaul :	draft-minshall-nagle-01
   *		J Hadi Salim	:	ECN support
   *
   */
91df42bed   Joe Perches   net: ipv4 and ipv...
36
  #define pr_fmt(fmt) "TCP: " fmt
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
37
38
39
  #include <net/tcp.h>
  
  #include <linux/compiler.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
40
  #include <linux/gfp.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
41
  #include <linux/module.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
43
  
  /* People can turn this off for buggy TCP's found in printers etc. */
ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
44
  int sysctl_tcp_retrans_collapse __read_mostly = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
45

09cb105ea   Jianjun Kong   net: clean up net...
46
  /* People can turn this on to work with those rare, broken TCPs that
15d99e02b   Rick Jones   [TCP]: sysctl to ...
47
48
   * interpret the window field as a signed quantity.
   */
ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
49
  int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
15d99e02b   Rick Jones   [TCP]: sysctl to ...
50

c39c4c6ab   Wei Liu   tcp: double defau...
51
52
  /* Default TSQ limit of four TSO segments */
  int sysctl_tcp_limit_output_bytes __read_mostly = 262144;
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
53

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
54
55
56
57
  /* This limits the percentage of the congestion window which we
   * will allow a single TSO frame to consume.  Building TSO frames
   * which are too large can cause TCP streams to be bursty.
   */
ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
58
  int sysctl_tcp_tso_win_divisor __read_mostly = 3;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
59

35089bb20   David S. Miller   [TCP]: Add tcp_sl...
60
  /* By default, RFC2861 behavior.  */
ab32ea5d8   Brian Haley   [NET/IPV4/IPV6]: ...
61
  int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
35089bb20   David S. Miller   [TCP]: Add tcp_sl...
62

46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
63
64
  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  			   int push_one, gfp_t gfp);
519855c50   William Allen Simpson   TCPCT part 1c: sy...
65

67edfef78   Andi Kleen   TCP: Add comments...
66
  /* Account for new data that has been sent to the network. */
cf533ea53   Eric Dumazet   tcp: add const qu...
67
  static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
68
  {
6ba8a3b19   Nandita Dukkipati   tcp: Tail loss pr...
69
  	struct inet_connection_sock *icsk = inet_csk(sk);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
70
  	struct tcp_sock *tp = tcp_sk(sk);
66f5fe624   Ilpo Järvinen   [TCP]: Rename upd...
71
  	unsigned int prior_packets = tp->packets_out;
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
72

fe067e8ab   David S. Miller   [TCP]: Abstract o...
73
  	tcp_advance_send_head(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
74
  	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
8512430e5   Ilpo Järvinen   [TCP]: Move FRTO ...
75

66f5fe624   Ilpo Järvinen   [TCP]: Rename upd...
76
  	tp->packets_out += tcp_skb_pcount(skb);
bec41a11d   Yuchung Cheng   tcp: remove early...
77
  	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
750ea2baf   Yuchung Cheng   tcp: early retran...
78
  		tcp_rearm_rto(sk);
f19c29e3e   Yuchung Cheng   tcp: snmp stats f...
79

f7324acd9   David S. Miller   tcp: Use NET_ADD_...
80
81
  	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
  		      tcp_skb_pcount(skb));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
82
  }
a4ecb15a2   Cui, Cheng   tcp: accommodate ...
83
84
  /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
   * window scaling factor due to loss of precision.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
85
86
87
88
89
   * If window has been shrunk, what should we make? It is not clear at all.
   * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
   * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
   * invalid. OK, let's make this for now:
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
90
  static inline __u32 tcp_acceptable_seq(const struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
91
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
92
  	const struct tcp_sock *tp = tcp_sk(sk);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
93

a4ecb15a2   Cui, Cheng   tcp: accommodate ...
94
95
96
  	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
  	    (tp->rx_opt.wscale_ok &&
  	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
97
98
  		return tp->snd_nxt;
  	else
90840defa   Ilpo Järvinen   [TCP]: Introduce ...
99
  		return tcp_wnd_end(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
  }
  
  /* Calculate mss to advertise in SYN segment.
   * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
   *
   * 1. It is independent of path mtu.
   * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
   * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
   *    attached devices, because some buggy hosts are confused by
   *    large MSS.
   * 4. We do not make 3, we advertise MSS, calculated from first
   *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
   *    This may be overridden via information stored in routing table.
   * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
   *    probably even Jumbo".
   */
  static __u16 tcp_advertise_mss(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
cf533ea53   Eric Dumazet   tcp: add const qu...
119
  	const struct dst_entry *dst = __sk_dst_get(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
120
  	int mss = tp->advmss;
0dbaee3b3   David S. Miller   net: Abstract def...
121
122
123
124
125
126
127
  	if (dst) {
  		unsigned int metric = dst_metric_advmss(dst);
  
  		if (metric < mss) {
  			mss = metric;
  			tp->advmss = mss;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
128
129
130
131
132
133
  	}
  
  	return (__u16)mss;
  }
  
  /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
6f021c62d   Eric Dumazet   tcp: fix slow sta...
134
135
136
   * This is the first part of cwnd validation mechanism.
   */
  void tcp_cwnd_restart(struct sock *sk, s32 delta)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
137
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
138
  	struct tcp_sock *tp = tcp_sk(sk);
6f021c62d   Eric Dumazet   tcp: fix slow sta...
139
  	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
140
  	u32 cwnd = tp->snd_cwnd;
6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
141
  	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
142

6687e988d   Arnaldo Carvalho de Melo   [ICSK]: Move TCP ...
143
  	tp->snd_ssthresh = tcp_current_ssthresh(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
144
  	restart_cwnd = min(restart_cwnd, cwnd);
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
145
  	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
146
147
  		cwnd >>= 1;
  	tp->snd_cwnd = max(cwnd, restart_cwnd);
c2203cf75   Eric Dumazet   tcp: use tcp_jiff...
148
  	tp->snd_cwnd_stamp = tcp_jiffies32;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
149
150
  	tp->snd_cwnd_used = 0;
  }
67edfef78   Andi Kleen   TCP: Add comments...
151
  /* Congestion state accounting after a packet has been sent. */
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
152
  static void tcp_event_data_sent(struct tcp_sock *tp,
cf533ea53   Eric Dumazet   tcp: add const qu...
153
  				struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
154
  {
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
155
  	struct inet_connection_sock *icsk = inet_csk(sk);
d635fbe27   Eric Dumazet   tcp: use tcp_jiff...
156
  	const u32 now = tcp_jiffies32;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
157

05c5a46d7   Neal Cardwell   tcp: generate CA_...
158
159
  	if (tcp_packets_in_flight(tp) == 0)
  		tcp_ca_event(sk, CA_EVENT_TX_START);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
160
161
162
163
164
  	tp->lsndtime = now;
  
  	/* If it is a reply for ato after last received
  	 * packet, enter pingpong mode.
  	 */
2251ae46a   Jon Maxwell   tcp: v1 always se...
165
166
  	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
  		icsk->icsk_ack.pingpong = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
167
  }
67edfef78   Andi Kleen   TCP: Add comments...
168
  /* Account for an ACK we sent. */
78636179f   Yuchung Cheng   tcp: do not cance...
169
170
  static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
  				      u32 rcv_nxt)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
171
  {
78636179f   Yuchung Cheng   tcp: do not cance...
172
173
174
175
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	if (unlikely(rcv_nxt != tp->rcv_nxt))
  		return;  /* Special ACK sent by DCTCP to reflect ECN */
463c84b97   Arnaldo Carvalho de Melo   [NET]: Introduce ...
176
177
  	tcp_dec_quickack_mode(sk, pkts);
  	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
178
  }
85f16525a   Yuchung Cheng   tcp: properly sen...
179
180
181
182
  
  u32 tcp_default_init_rwnd(u32 mss)
  {
  	/* Initial receive window should be twice of TCP_INIT_CWND to
9ef71e0c8   Weiping Pan   tcp:typo unset sh...
183
  	 * enable proper sending of new unsent data during fast recovery
85f16525a   Yuchung Cheng   tcp: properly sen...
184
185
186
187
188
189
190
191
192
  	 * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
  	 * limit when mss is larger than 1460.
  	 */
  	u32 init_rwnd = TCP_INIT_CWND * 2;
  
  	if (mss > 1460)
  		init_rwnd = max((1460 * init_rwnd) / mss, 2U);
  	return init_rwnd;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
193
194
195
196
197
198
199
200
201
  /* Determine a window scaling and initial window to offer.
   * Based on the assumption that the given amount of space
   * will be offered. Store the results in the tp structure.
   * NOTE: for smooth operation initial space offering should
   * be a multiple of mss if possible. We assume here that mss >= 1.
   * This MUST be enforced by all callers.
   */
  void tcp_select_initial_window(int __space, __u32 mss,
  			       __u32 *rcv_wnd, __u32 *window_clamp,
31d12926e   laurent chavey   net: Add rtnetlin...
202
203
  			       int wscale_ok, __u8 *rcv_wscale,
  			       __u32 init_rcv_wnd)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
204
205
206
207
208
  {
  	unsigned int space = (__space < 0 ? 0 : __space);
  
  	/* If no clamp set the clamp to the max possible scaled window */
  	if (*window_clamp == 0)
589c49cbf   Gao Feng   net: tcp: Define ...
209
  		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
210
211
212
213
  	space = min(*window_clamp, space);
  
  	/* Quantize space offering to a multiple of mss if possible. */
  	if (space > mss)
589c49cbf   Gao Feng   net: tcp: Define ...
214
  		space = rounddown(space, mss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
215
216
  
  	/* NOTE: offering an initial window larger than 32767
15d99e02b   Rick Jones   [TCP]: sysctl to ...
217
218
219
220
221
222
  	 * will break some buggy TCP stacks. If the admin tells us
  	 * it is likely we could be speaking with such a buggy stack
  	 * we will truncate our initial window offering to 32K-1
  	 * unless the remote has sent us a window scaling option,
  	 * which we interpret as a sign the remote TCP is not
  	 * misinterpreting the window field as a signed quantity.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
223
  	 */
15d99e02b   Rick Jones   [TCP]: sysctl to ...
224
225
226
227
  	if (sysctl_tcp_workaround_signed_windows)
  		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
  	else
  		(*rcv_wnd) = space;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
228
229
  	(*rcv_wscale) = 0;
  	if (wscale_ok) {
589c49cbf   Gao Feng   net: tcp: Define ...
230
  		/* Set window scaling on max possible window */
f626300a3   Soheil Hassas Yeganeh   tcp: consider rec...
231
232
  		space = max_t(u32, space, sysctl_tcp_rmem[2]);
  		space = max_t(u32, space, sysctl_rmem_max);
316c1592b   Stephen Hemminger   [TCP]: Limit wind...
233
  		space = min_t(u32, space, *window_clamp);
589c49cbf   Gao Feng   net: tcp: Define ...
234
  		while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
235
236
237
238
  			space >>= 1;
  			(*rcv_wscale)++;
  		}
  	}
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
239
  	if (mss > (1 << *rcv_wscale)) {
85f16525a   Yuchung Cheng   tcp: properly sen...
240
241
242
  		if (!init_rcv_wnd) /* Use default unless specified otherwise */
  			init_rcv_wnd = tcp_default_init_rwnd(mss);
  		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
243
244
245
  	}
  
  	/* Set the clamp no higher than max representable value */
589c49cbf   Gao Feng   net: tcp: Define ...
246
  	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
247
  }
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
248
  EXPORT_SYMBOL(tcp_select_initial_window);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
249
250
251
252
253
254
  
  /* Chose a new window to advertise, update state in tcp_sock for the
   * socket, and return result with RFC1323 scaling applied.  The return
   * value can be stuffed directly into th->window for an outgoing
   * frame.
   */
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
255
  static u16 tcp_select_window(struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
256
257
  {
  	struct tcp_sock *tp = tcp_sk(sk);
8e165e203   Florian Westphal   net: tcp: add mib...
258
  	u32 old_win = tp->rcv_wnd;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
259
260
261
262
  	u32 cur_win = tcp_receive_window(tp);
  	u32 new_win = __tcp_select_window(sk);
  
  	/* Never shrink the offered window */
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
263
  	if (new_win < cur_win) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
264
265
266
267
268
269
270
  		/* Danger Will Robinson!
  		 * Don't update rcv_wup/rcv_wnd here or else
  		 * we will not be able to advertise a zero
  		 * window in time.  --DaveM
  		 *
  		 * Relax Will Robinson.
  		 */
8e165e203   Florian Westphal   net: tcp: add mib...
271
272
273
  		if (new_win == 0)
  			NET_INC_STATS(sock_net(sk),
  				      LINUX_MIB_TCPWANTZEROWINDOWADV);
607bfbf2d   Patrick McHardy   [TCP]: Fix shrink...
274
  		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
275
276
277
278
279
280
281
  	}
  	tp->rcv_wnd = new_win;
  	tp->rcv_wup = tp->rcv_nxt;
  
  	/* Make sure we do not exceed the maximum possible
  	 * scaled window.
  	 */
15d99e02b   Rick Jones   [TCP]: sysctl to ...
282
  	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
283
284
285
286
287
288
  		new_win = min(new_win, MAX_TCP_WINDOW);
  	else
  		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
  
  	/* RFC1323 scaling applied */
  	new_win >>= tp->rx_opt.rcv_wscale;
31770e34e   Florian Westphal   tcp: Revert "tcp:...
289
  	/* If we advertise zero window, disable fast path. */
8e165e203   Florian Westphal   net: tcp: add mib...
290
  	if (new_win == 0) {
31770e34e   Florian Westphal   tcp: Revert "tcp:...
291
  		tp->pred_flags = 0;
8e165e203   Florian Westphal   net: tcp: add mib...
292
293
294
295
296
297
  		if (old_win)
  			NET_INC_STATS(sock_net(sk),
  				      LINUX_MIB_TCPTOZEROWINDOWADV);
  	} else if (old_win == 0) {
  		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
298
299
300
  
  	return new_win;
  }
67edfef78   Andi Kleen   TCP: Add comments...
301
  /* Packet ECN state for a SYN-ACK */
735d38311   Florian Westphal   tcp: change TCP_E...
302
  static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
303
  {
30e502a34   Daniel Borkmann   net: tcp: add fla...
304
  	const struct tcp_sock *tp = tcp_sk(sk);
4de075e04   Eric Dumazet   tcp: rename tcp_s...
305
  	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
306
  	if (!(tp->ecn_flags & TCP_ECN_OK))
4de075e04   Eric Dumazet   tcp: rename tcp_s...
307
  		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
91b5b21c7   Lawrence Brakmo   bpf: Add support ...
308
309
  	else if (tcp_ca_needs_ecn(sk) ||
  		 tcp_bpf_ca_needs_ecn(sk))
30e502a34   Daniel Borkmann   net: tcp: add fla...
310
  		INET_ECN_xmit(sk);
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
311
  }
67edfef78   Andi Kleen   TCP: Add comments...
312
  /* Packet ECN state for a SYN.  */
735d38311   Florian Westphal   tcp: change TCP_E...
313
  static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
314
315
  {
  	struct tcp_sock *tp = tcp_sk(sk);
91b5b21c7   Lawrence Brakmo   bpf: Add support ...
316
  	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
f7b3bec6f   Florian Westphal   net: allow settin...
317
  	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
91b5b21c7   Lawrence Brakmo   bpf: Add support ...
318
  		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
f7b3bec6f   Florian Westphal   net: allow settin...
319
320
321
322
323
324
325
  
  	if (!use_ecn) {
  		const struct dst_entry *dst = __sk_dst_get(sk);
  
  		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
  			use_ecn = true;
  	}
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
326
327
  
  	tp->ecn_flags = 0;
f7b3bec6f   Florian Westphal   net: allow settin...
328
329
  
  	if (use_ecn) {
4de075e04   Eric Dumazet   tcp: rename tcp_s...
330
  		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
331
  		tp->ecn_flags = TCP_ECN_OK;
91b5b21c7   Lawrence Brakmo   bpf: Add support ...
332
  		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
30e502a34   Daniel Borkmann   net: tcp: add fla...
333
  			INET_ECN_xmit(sk);
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
334
335
  	}
  }
492135557   Daniel Borkmann   tcp: add rfc3168,...
336
337
338
339
340
341
342
343
  static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
  {
  	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
  		/* tp->ecn_flags are cleared at a later point in time when
  		 * SYN ACK is ultimatively being received.
  		 */
  		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
  }
735d38311   Florian Westphal   tcp: change TCP_E...
344
  static void
6ac705b18   Eric Dumazet   tcp: remove tcp_e...
345
  tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
346
  {
6ac705b18   Eric Dumazet   tcp: remove tcp_e...
347
  	if (inet_rsk(req)->ecn_ok)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
348
349
  		th->ece = 1;
  }
67edfef78   Andi Kleen   TCP: Add comments...
350
351
352
  /* Set up ECN state for a packet on a ESTABLISHED socket that is about to
   * be sent.
   */
735d38311   Florian Westphal   tcp: change TCP_E...
353
  static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
ea1627c20   Eric Dumazet   tcp: minor optimi...
354
  			 struct tcphdr *th, int tcp_header_len)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
355
356
357
358
359
360
361
362
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	if (tp->ecn_flags & TCP_ECN_OK) {
  		/* Not-retransmitted data segment: set ECT and inject CWR. */
  		if (skb->len != tcp_header_len &&
  		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
  			INET_ECN_xmit(sk);
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
363
  			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
364
  				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
ea1627c20   Eric Dumazet   tcp: minor optimi...
365
  				th->cwr = 1;
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
366
367
  				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
  			}
30e502a34   Daniel Borkmann   net: tcp: add fla...
368
  		} else if (!tcp_ca_needs_ecn(sk)) {
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
369
370
371
372
  			/* ACK or retransmitted segment: clear ECT|CE */
  			INET_ECN_dontxmit(sk);
  		}
  		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
ea1627c20   Eric Dumazet   tcp: minor optimi...
373
  			th->ece = 1;
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
374
375
  	}
  }
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
376
377
378
379
380
  /* Constructs common control bits of non-data skb. If SYN/FIN is present,
   * auto increment end seqno.
   */
  static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
  {
2e8e18ef5   David S. Miller   tcp: Set CHECKSUM...
381
  	skb->ip_summed = CHECKSUM_PARTIAL;
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
382
  	skb->csum = 0;
4de075e04   Eric Dumazet   tcp: rename tcp_s...
383
  	TCP_SKB_CB(skb)->tcp_flags = flags;
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
384
  	TCP_SKB_CB(skb)->sacked = 0;
cd7d8498c   Eric Dumazet   tcp: change tcp_s...
385
  	tcp_skb_pcount_set(skb, 1);
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
386
387
  
  	TCP_SKB_CB(skb)->seq = seq;
a3433f35a   Changli Gao   tcp: unify tcp fl...
388
  	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
389
390
391
  		seq++;
  	TCP_SKB_CB(skb)->end_seq = seq;
  }
a2a385d62   Eric Dumazet   tcp: bool convers...
392
  static inline bool tcp_urg_mode(const struct tcp_sock *tp)
33f5f57ee   Ilpo Järvinen   tcp: kill pointle...
393
394
395
  {
  	return tp->snd_una != tp->snd_up;
  }
33ad798c9   Adam Langley   tcp: options clea...
396
397
398
  #define OPTION_SACK_ADVERTISE	(1 << 0)
  #define OPTION_TS		(1 << 1)
  #define OPTION_MD5		(1 << 2)
89e95a613   Ori Finkelman   IPv4 TCP fails to...
399
  #define OPTION_WSCALE		(1 << 3)
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
400
  #define OPTION_FAST_OPEN_COOKIE	(1 << 8)
33ad798c9   Adam Langley   tcp: options clea...
401
402
  
  struct tcp_out_options {
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
403
404
  	u16 options;		/* bit field of OPTION_* */
  	u16 mss;		/* 0 to disable */
33ad798c9   Adam Langley   tcp: options clea...
405
406
  	u8 ws;			/* window scale, 0 to disable */
  	u8 num_sack_blocks;	/* number of SACK blocks to include */
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
407
  	u8 hash_size;		/* bytes in hash_location */
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
408
  	__u8 *hash_location;	/* temporary pointer, overloaded */
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
409
410
  	__u32 tsval, tsecr;	/* need to include OPTION_TS */
  	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
33ad798c9   Adam Langley   tcp: options clea...
411
  };
67edfef78   Andi Kleen   TCP: Add comments...
412
413
414
  /* Write previously computed TCP options to the packet.
   *
   * Beware: Something in the Internet is very sensitive to the ordering of
fd6149d33   Ilpo Järvinen   tcp: Restore orde...
415
416
   * TCP options, we learned this through the hard way, so be careful here.
   * Luckily we can at least blame others for their non-compliance but from
8e3bff96a   stephen hemminger   net: more spellin...
417
   * inter-operability perspective it seems that we're somewhat stuck with
fd6149d33   Ilpo Järvinen   tcp: Restore orde...
418
419
420
421
422
423
424
   * the ordering which we have been using if we want to keep working with
   * those broken things (not that it currently hurts anybody as there isn't
   * particular reason why the ordering would need to be changed).
   *
   * At least SACK_PERM as the first option is known to lead to a disaster
   * (but it may well be that other scenarios fail similarly).
   */
33ad798c9   Adam Langley   tcp: options clea...
425
  static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
426
427
  			      struct tcp_out_options *opts)
  {
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
428
  	u16 options = opts->options;	/* mungable copy */
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
429

bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
430
  	if (unlikely(OPTION_MD5 & options)) {
1a2c6181c   Christoph Paasch   tcp: Remove TCPCT
431
432
  		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
  			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
433
434
  		/* overload cookie hash location */
  		opts->hash_location = (__u8 *)ptr;
33ad798c9   Adam Langley   tcp: options clea...
435
  		ptr += 4;
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
436
  	}
33ad798c9   Adam Langley   tcp: options clea...
437

fd6149d33   Ilpo Järvinen   tcp: Restore orde...
438
439
440
441
442
  	if (unlikely(opts->mss)) {
  		*ptr++ = htonl((TCPOPT_MSS << 24) |
  			       (TCPOLEN_MSS << 16) |
  			       opts->mss);
  	}
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
443
444
  	if (likely(OPTION_TS & options)) {
  		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
33ad798c9   Adam Langley   tcp: options clea...
445
446
447
448
  			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
  				       (TCPOLEN_SACK_PERM << 16) |
  				       (TCPOPT_TIMESTAMP << 8) |
  				       TCPOLEN_TIMESTAMP);
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
449
  			options &= ~OPTION_SACK_ADVERTISE;
33ad798c9   Adam Langley   tcp: options clea...
450
451
452
453
454
455
456
457
458
  		} else {
  			*ptr++ = htonl((TCPOPT_NOP << 24) |
  				       (TCPOPT_NOP << 16) |
  				       (TCPOPT_TIMESTAMP << 8) |
  				       TCPOLEN_TIMESTAMP);
  		}
  		*ptr++ = htonl(opts->tsval);
  		*ptr++ = htonl(opts->tsecr);
  	}
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
459
  	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
33ad798c9   Adam Langley   tcp: options clea...
460
461
462
463
464
  		*ptr++ = htonl((TCPOPT_NOP << 24) |
  			       (TCPOPT_NOP << 16) |
  			       (TCPOPT_SACK_PERM << 8) |
  			       TCPOLEN_SACK_PERM);
  	}
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
465
  	if (unlikely(OPTION_WSCALE & options)) {
33ad798c9   Adam Langley   tcp: options clea...
466
467
468
469
470
471
472
473
474
  		*ptr++ = htonl((TCPOPT_NOP << 24) |
  			       (TCPOPT_WINDOW << 16) |
  			       (TCPOLEN_WINDOW << 8) |
  			       opts->ws);
  	}
  
  	if (unlikely(opts->num_sack_blocks)) {
  		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
  			tp->duplicate_sack : tp->selective_acks;
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
475
476
477
478
479
  		int this_sack;
  
  		*ptr++ = htonl((TCPOPT_NOP  << 24) |
  			       (TCPOPT_NOP  << 16) |
  			       (TCPOPT_SACK <<  8) |
33ad798c9   Adam Langley   tcp: options clea...
480
  			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
481
  						     TCPOLEN_SACK_PERBLOCK)));
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
482

33ad798c9   Adam Langley   tcp: options clea...
483
484
  		for (this_sack = 0; this_sack < opts->num_sack_blocks;
  		     ++this_sack) {
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
485
486
487
  			*ptr++ = htonl(sp[this_sack].start_seq);
  			*ptr++ = htonl(sp[this_sack].end_seq);
  		}
2de979bd7   Stephen Hemminger   [TCP]: whitespace...
488

5861f8e58   Ilpo Järvinen   tcp: remove point...
489
  		tp->rx_opt.dsack = 0;
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
490
  	}
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
491
492
493
  
  	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
  		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
7f9b838b7   Daniel Lee   tcp: RFC7413 opti...
494
495
496
497
498
499
500
501
502
503
504
505
506
  		u8 *p = (u8 *)ptr;
  		u32 len; /* Fast Open option length */
  
  		if (foc->exp) {
  			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
  			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
  				     TCPOPT_FASTOPEN_MAGIC);
  			p += TCPOLEN_EXP_FASTOPEN_BASE;
  		} else {
  			len = TCPOLEN_FASTOPEN_BASE + foc->len;
  			*p++ = TCPOPT_FASTOPEN;
  			*p++ = len;
  		}
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
507

7f9b838b7   Daniel Lee   tcp: RFC7413 opti...
508
509
510
511
  		memcpy(p, foc->val, foc->len);
  		if ((len & 3) == 2) {
  			p[foc->len] = TCPOPT_NOP;
  			p[foc->len + 1] = TCPOPT_NOP;
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
512
  		}
7f9b838b7   Daniel Lee   tcp: RFC7413 opti...
513
  		ptr += (len + 3) >> 2;
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
514
  	}
33ad798c9   Adam Langley   tcp: options clea...
515
  }
67edfef78   Andi Kleen   TCP: Add comments...
516
517
518
  /* Compute TCP options for SYN packets. This is not the final
   * network wire format yet.
   */
95c961747   Eric Dumazet   net: cleanup unsi...
519
  static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
33ad798c9   Adam Langley   tcp: options clea...
520
  				struct tcp_out_options *opts,
cf533ea53   Eric Dumazet   tcp: add const qu...
521
522
  				struct tcp_md5sig_key **md5)
  {
33ad798c9   Adam Langley   tcp: options clea...
523
  	struct tcp_sock *tp = tcp_sk(sk);
95c961747   Eric Dumazet   net: cleanup unsi...
524
  	unsigned int remaining = MAX_TCP_OPTION_SPACE;
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
525
  	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
33ad798c9   Adam Langley   tcp: options clea...
526

cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
527
  #ifdef CONFIG_TCP_MD5SIG
33ad798c9   Adam Langley   tcp: options clea...
528
529
530
  	*md5 = tp->af_specific->md5_lookup(sk, sk);
  	if (*md5) {
  		opts->options |= OPTION_MD5;
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
531
  		remaining -= TCPOLEN_MD5SIG_ALIGNED;
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
532
  	}
33ad798c9   Adam Langley   tcp: options clea...
533
534
  #else
  	*md5 = NULL;
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
535
  #endif
33ad798c9   Adam Langley   tcp: options clea...
536
537
538
539
540
541
542
543
544
545
546
  
  	/* We always get an MSS option.  The option bytes which will be seen in
  	 * normal data packets should timestamps be used, must be in the MSS
  	 * advertised.  But we subtract them from tp->mss_cache so that
  	 * calculations in tcp_sendmsg are simpler etc.  So account for this
  	 * fact here if necessary.  If we don't do this correctly, as a
  	 * receiver we won't recognize data packets as being full sized when we
  	 * should, and thus we won't abide by the delayed ACK rules correctly.
  	 * SACKs don't matter, we never delay an ACK when we have any of those
  	 * going out.  */
  	opts->mss = tcp_advertise_mss(sk);
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
547
  	remaining -= TCPOLEN_MSS_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
548

5d2ed0521   Eric Dumazet   tcp: Namespaceify...
549
  	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
33ad798c9   Adam Langley   tcp: options clea...
550
  		opts->options |= OPTION_TS;
7faee5c0d   Eric Dumazet   tcp: remove TCP_S...
551
  		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
33ad798c9   Adam Langley   tcp: options clea...
552
  		opts->tsecr = tp->rx_opt.ts_recent;
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
553
  		remaining -= TCPOLEN_TSTAMP_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
554
  	}
9bb37ef00   Eric Dumazet   tcp: Namespaceify...
555
  	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
33ad798c9   Adam Langley   tcp: options clea...
556
  		opts->ws = tp->rx_opt.rcv_wscale;
89e95a613   Ori Finkelman   IPv4 TCP fails to...
557
  		opts->options |= OPTION_WSCALE;
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
558
  		remaining -= TCPOLEN_WSCALE_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
559
  	}
f93010342   Eric Dumazet   tcp: Namespaceify...
560
  	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
33ad798c9   Adam Langley   tcp: options clea...
561
  		opts->options |= OPTION_SACK_ADVERTISE;
b32d13102   David S. Miller   tcp: Fix bitmask ...
562
  		if (unlikely(!(OPTION_TS & opts->options)))
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
563
  			remaining -= TCPOLEN_SACKPERM_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
564
  	}
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
565
  	if (fastopen && fastopen->cookie.len >= 0) {
2646c831c   Daniel Lee   tcp: RFC7413 opti...
566
567
568
569
  		u32 need = fastopen->cookie.len;
  
  		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
  					       TCPOLEN_FASTOPEN_BASE;
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
570
571
572
573
574
575
  		need = (need + 3) & ~3U;  /* Align to 32 bits */
  		if (remaining >= need) {
  			opts->options |= OPTION_FAST_OPEN_COOKIE;
  			opts->fastopen_cookie = &fastopen->cookie;
  			remaining -= need;
  			tp->syn_fastopen = 1;
2646c831c   Daniel Lee   tcp: RFC7413 opti...
576
  			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
577
578
  		}
  	}
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
579

bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
580
  	return MAX_TCP_OPTION_SPACE - remaining;
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
581
  }
67edfef78   Andi Kleen   TCP: Add comments...
582
/* Set up TCP options for SYN-ACKs.
 *
 * Like tcp_syn_options() but driven by the request sock instead of a
 * full socket.  Fills in @opts and returns the number of option bytes
 * used (a multiple of 4, at most MAX_TCP_OPTION_SPACE).
 */
static unsigned int tcp_synack_options(struct request_sock *req,
				       unsigned int mss, struct sk_buff *skb,
				       struct tcp_out_options *opts,
				       const struct tcp_md5sig_key *md5,
				       struct tcp_fastopen_cookie *foc)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	unsigned int remaining = MAX_TCP_OPTION_SPACE;

#ifdef CONFIG_TCP_MD5SIG
	if (md5) {
		opts->options |= OPTION_MD5;
		remaining -= TCPOLEN_MD5SIG_ALIGNED;

		/* We can't fit any SACK blocks in a packet with MD5 + TS
		 * options. There was discussion about disabling SACK
		 * rather than TS in order to fit in better with old,
		 * buggy kernels, but that was deemed to be unnecessary.
		 */
		ireq->tstamp_ok &= !ireq->sack_ok;
	}
#endif

	/* We always send an MSS option. */
	opts->mss = mss;
	remaining -= TCPOLEN_MSS_ALIGNED;

	if (likely(ireq->wscale_ok)) {
		opts->ws = ireq->rcv_wscale;
		opts->options |= OPTION_WSCALE;
		remaining -= TCPOLEN_WSCALE_ALIGNED;
	}
	if (likely(ireq->tstamp_ok)) {
		opts->options |= OPTION_TS;
		/* ts_off randomizes the timestamp offset per connection */
		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
		opts->tsecr = req->ts_recent;
		remaining -= TCPOLEN_TSTAMP_ALIGNED;
	}
	if (likely(ireq->sack_ok)) {
		opts->options |= OPTION_SACK_ADVERTISE;
		/* SACKPERM rides in the timestamp option padding, so it
		 * only costs space when timestamps are off.
		 */
		if (unlikely(!ireq->tstamp_ok))
			remaining -= TCPOLEN_SACKPERM_ALIGNED;
	}
	if (foc != NULL && foc->len >= 0) {
		u32 need = foc->len;

		/* RFC 7413 option or the older experimental encoding */
		need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
				   TCPOLEN_FASTOPEN_BASE;
		need = (need + 3) & ~3U;  /* Align to 32 bits */
		if (remaining >= need) {
			opts->options |= OPTION_FAST_OPEN_COOKIE;
			opts->fastopen_cookie = foc;
			remaining -= need;
		}
	}

	return MAX_TCP_OPTION_SPACE - remaining;
}
67edfef78   Andi Kleen   TCP: Add comments...
641
642
643
/* Compute TCP options for ESTABLISHED sockets. This is not the
 * final wire format yet.
 *
 * Fills in @opts and returns the number of option bytes used.
 * *@md5 is set to the key to sign with, or NULL.  @skb may be NULL
 * (tsval is then 0 - see the ternary below).
 */
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
					struct tcp_out_options *opts,
					struct tcp_md5sig_key **md5)
{
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int size = 0;
	unsigned int eff_sacks;

	opts->options = 0;

#ifdef CONFIG_TCP_MD5SIG
	*md5 = tp->af_specific->md5_lookup(sk, sk);
	if (unlikely(*md5)) {
		opts->options |= OPTION_MD5;
		size += TCPOLEN_MD5SIG_ALIGNED;
	}
#else
	*md5 = NULL;
#endif

	if (likely(tp->rx_opt.tstamp_ok)) {
		opts->options |= OPTION_TS;
		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
		opts->tsecr = tp->rx_opt.ts_recent;
		size += TCPOLEN_TSTAMP_ALIGNED;
	}
	/* Fit as many SACK blocks (incl. a possible D-SACK) as the
	 * remaining option space allows.
	 */
	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
	if (unlikely(eff_sacks)) {
		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
		opts->num_sack_blocks =
			min_t(unsigned int, eff_sacks,
			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
			      TCPOLEN_SACK_PERBLOCK);
		size += TCPOLEN_SACK_BASE_ALIGNED +
			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
	}

	return size;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
682

46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
683
684
685
686
687
688
689
690
691
  
/* TCP SMALL QUEUES (TSQ)
 *
 * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
 * to reduce RTT and bufferbloat.
 * We do this using a special skb destructor (tcp_wfree).
 *
 * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
 * needs to be reallocated in a driver.
 * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
 *
 * Since transmit from skb destructor is forbidden, we use a tasklet
 * to process all sockets that eventually need to send more skbs.
 * We use one tasklet per cpu, with its own queue of sockets.
 */
struct tsq_tasklet {
	struct tasklet_struct	tasklet;
	struct list_head	head; /* queue of tcp sockets (linked via tp->tsq_node) */
};
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
6f458dfb4   Eric Dumazet   tcp: improve late...
703
704
705
706
/* (Re)transmit on behalf of a socket whose transmission was deferred by
 * TSQ.  Only acts when the connection is in a state that may carry data.
 */
static void tcp_tsq_handler(struct sock *sk)
{
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
		struct tcp_sock *tp = tcp_sk(sk);

		/* Unrepaired losses with room in cwnd: refresh the clock
		 * and push the retransmit queue before sending new data.
		 */
		if (tp->lost_out > tp->retrans_out &&
		    tp->snd_cwnd > tcp_packets_in_flight(tp)) {
			tcp_mstamp_refresh(tp);
			tcp_xmit_retransmit_queue(sk);
		}

		/* GFP_ATOMIC: we run in tasklet (softirq) context */
		tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
			       0, GFP_ATOMIC);
	}
}
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
720
/*
 * One tasklet per cpu tries to send more skbs.
 * We run in tasklet context but need to disable irqs when
 * transferring tsq->head because tcp_wfree() might
 * interrupt us (non NAPI drivers)
 */
static void tcp_tasklet_func(unsigned long data)
{
	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
	LIST_HEAD(list);
	unsigned long flags;
	struct list_head *q, *n;
	struct tcp_sock *tp;
	struct sock *sk;

	/* Grab the whole pending queue in one go, then work on the
	 * private list with interrupts re-enabled.
	 */
	local_irq_save(flags);
	list_splice_init(&tsq->head, &list);
	local_irq_restore(flags);

	list_for_each_safe(q, n, &list) {
		tp = list_entry(q, struct tcp_sock, tsq_node);
		list_del(&tp->tsq_node);

		sk = (struct sock *)tp;
		/* barrier orders the list_del above vs the bit clear */
		smp_mb__before_atomic();
		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);

		/* Fast-path check before taking the bh lock: if the socket
		 * is owned by user context, tcp_release_cb() will run the
		 * deferred work instead (TCP_TSQ_DEFERRED stays set).
		 */
		if (!sk->sk_lock.owned &&
		    test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) {
			bh_lock_sock(sk);
			if (!sock_owned_by_user(sk)) {
				clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
				tcp_tsq_handler(sk);
			}
			bh_unlock_sock(sk);
		}

		/* Drop the reference tcp_wfree() kept when queueing us
		 * (see the "Keep one reference" comment there).
		 */
		sk_free(sk);
	}
}
40fc3423b   Eric Dumazet   tcp: tsq: add tsq...
759
760
761
762
  #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
  			  TCPF_WRITE_TIMER_DEFERRED |	\
  			  TCPF_DELACK_TIMER_DEFERRED |	\
  			  TCPF_MTU_REDUCED_DEFERRED)
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
763
764
765
766
767
768
769
770
771
/**
 * tcp_release_cb - tcp release_sock() callback
 * @sk: socket
 *
 * called from release_sock() to perform protocol dependent
 * actions before socket release.  Runs any work that was deferred
 * (via the TCP_DEFERRED_ALL flag bits) while the socket was owned
 * by user context.
 */
void tcp_release_cb(struct sock *sk)
{
	unsigned long flags, nflags;

	/* perform an atomic operation only if at least one flag is set */
	do {
		flags = sk->sk_tsq_flags;
		if (!(flags & TCP_DEFERRED_ALL))
			return;
		nflags = flags & ~TCP_DEFERRED_ALL;
	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);

	/* Deferred transmit can run while we still own the socket. */
	if (flags & TCPF_TSQ_DEFERRED)
		tcp_tsq_handler(sk);

	/* Here begins the tricky part :
	 * We are called from release_sock() with :
	 * 1) BH disabled
	 * 2) sk_lock.slock spinlock held
	 * 3) socket owned by us (sk->sk_lock.owned == 1)
	 *
	 * But following code is meant to be called from BH handlers,
	 * so we should keep BH disabled, but early release socket ownership
	 */
	sock_release_ownership(sk);

	/* NOTE(review): each __sock_put() below appears to drop a reference
	 * taken when the corresponding work was deferred - verify against
	 * the timer handlers before changing.
	 */
	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
		tcp_write_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
		tcp_delack_timer_handler(sk);
		__sock_put(sk);
	}
	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
		__sock_put(sk);
	}
}
EXPORT_SYMBOL(tcp_release_cb);
  
/* Boot-time setup: initialize each possible cpu's TSQ tasklet and its
 * (initially empty) socket queue.
 */
void __init tcp_tasklet_init(void)
{
	int i;

	for_each_possible_cpu(i) {
		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);

		INIT_LIST_HEAD(&tsq->head);
		tasklet_init(&tsq->tasklet,
			     tcp_tasklet_func,
			     (unsigned long)tsq);
	}
}
  
/*
 * Write buffer destructor automatically called from kfree_skb.
 * We can't xmit new skbs from this context, as we might already
 * hold qdisc lock.
 */
void tcp_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned long flags, nval, oval;

	/* Keep one reference on sk_wmem_alloc.
	 * Will be released by sk_free() from here or tcp_tasklet_func()
	 */
	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));

	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
	 * Wait until our queues (qdisc + devices) are drained.
	 * This gives :
	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
	 * - chance for incoming ACK (processed by another cpu maybe)
	 *   to migrate this flow (skb->ooo_okay will be eventually set)
	 */
	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
		goto out;

	/* cmpxchg loop: atomically turn THROTTLED into QUEUED exactly once,
	 * then enqueue the socket on this cpu's tasklet list.
	 */
	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
		struct tsq_tasklet *tsq;
		bool empty;

		/* Only queue if throttled and not already queued. */
		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
			goto out;

		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
		if (nval != oval)
			continue;	/* raced with another updater; retry */

		/* queue this socket to tasklet queue */
		local_irq_save(flags);
		tsq = this_cpu_ptr(&tsq_tasklet);
		empty = list_empty(&tsq->head);
		list_add(&tp->tsq_node, &tsq->head);
		if (empty)
			tasklet_schedule(&tsq->tasklet);
		local_irq_restore(flags);
		/* the extra sk reference is now owned by tcp_tasklet_func() */
		return;
	}
out:
	sk_free(sk);
}
218af599f   Eric Dumazet   tcp: internal imp...
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
/* Note: Called under hard irq.
 * We can not call TCP stack right away, so defer the transmit to the
 * per-cpu TSQ tasklet, using the same flag protocol as tcp_wfree().
 */
enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
{
	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
	struct sock *sk = (struct sock *)tp;
	unsigned long nval, oval;

	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
		struct tsq_tasklet *tsq;
		bool empty;

		/* Already queued: nothing to do. */
		if (oval & TSQF_QUEUED)
			break;

		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
		if (nval != oval)
			continue;	/* raced with another updater; retry */

		/* Take a reference for the tasklet; bail if the socket's
		 * write allocation already dropped to zero.
		 */
		if (!refcount_inc_not_zero(&sk->sk_wmem_alloc))
			break;
		/* queue this socket to tasklet queue */
		tsq = this_cpu_ptr(&tsq_tasklet);
		empty = list_empty(&tsq->head);
		list_add(&tp->tsq_node, &tsq->head);
		if (empty)
			tasklet_schedule(&tsq->tasklet);
		break;
	}
	return HRTIMER_NORESTART;
}
  
/* BBR congestion control needs pacing.
 * Same remark for SO_MAX_PACING_RATE.
 * sch_fq packet scheduler is efficiently handling pacing,
 * but is not always installed/used.
 * Return true if TCP stack should pace packets itself.
 */
static bool tcp_needs_internal_pacing(const struct sock *sk)
{
	/* acquire pairs with the store publishing SK_PACING_NEEDED */
	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
}
  
/* Arm the per-socket pacing hrtimer after transmitting @skb, so the
 * next transmit is delayed by roughly skb->len / sk_pacing_rate.
 * No-op when internal pacing is not needed or the rate is unset/unlimited.
 */
static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
{
	u64 len_ns;
	u32 rate;

	if (!tcp_needs_internal_pacing(sk))
		return;
	rate = sk->sk_pacing_rate;
	if (!rate || rate == ~0U)	/* 0 = unset, ~0U = unlimited */
		return;

	/* Should account for header sizes as sch_fq does,
	 * but lets make things simple.
	 */
	len_ns = (u64)skb->len * NSEC_PER_SEC;
	do_div(len_ns, rate);
	hrtimer_start(&tcp_sk(sk)->pacing_timer,
		      ktime_add_ns(ktime_get(), len_ns),
		      HRTIMER_MODE_ABS_PINNED);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
936
937
938
939
940
941
942
943
944
945
946
/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 *
 * @clone_it: transmit a clone/copy so the original skb stays queued.
 * @rcv_nxt:  value placed in the ACK field of the header.
 * Returns 0 on success or a negative error (e.g. -ENOBUFS if the
 * clone/copy allocation fails, or a normalized queue_xmit error).
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned int tcp_options_size, tcp_header_size;
	struct sk_buff *oskb = NULL;	/* original skb when clone_it */
	struct tcp_md5sig_key *md5;
	struct tcphdr *th;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));
	tp = tcp_sk(sk);

	if (clone_it) {
		/* Record bytes in flight at transmit time (rate sampling). */
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
			- tp->snd_una;
		oskb = skb;
		/* Already-cloned skbs need a private copy of the header. */
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}
	skb->skb_mstamp = tp->tcp_mstamp;

	inet = inet_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	else
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	/* if no packet is in qdisc/device queue, then allow XPS to select
	 * another queue. We can be called from tcp_tsq_handler()
	 * which holds one reference to sk_wmem_alloc.
	 *
	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
	 * One way to get this would be to set skb->truesize = 2 on them.
	 */
	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);

	/* If we had to use memory reserve to allocate this skb,
	 * this might cause drops if packet is looped back :
	 * Other socket might not have SOCK_MEMALLOC.
	 * Packets not looped back do not care about pfmemalloc.
	 */
	skb->pfmemalloc = 0;

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	/* Attach our destructor: tcp_wfree for data (enables TSQ),
	 * __sock_wfree for pure ACKs.
	 */
	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
	skb_set_hash_from_sk(skb, sk);
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);

	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);

	/* Build TCP header and checksum it. */
	th = (struct tcphdr *)skb->data;
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(rcv_nxt);
	/* data offset (header words) + flags, written as one 16bit field */
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->tcp_flags);

	th->check		= 0;
	th->urg_ptr		= 0;

	/* The urg_mode check is necessary during a below snd_una win probe */
	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
		if (before(tp->snd_up, tcb->seq + 0x10000)) {
			th->urg_ptr = htons(tp->snd_up - tcb->seq);
			th->urg = 1;
		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
			/* urgent point beyond 16bit reach: saturate */
			th->urg_ptr = htons(0xFFFF);
			th->urg = 1;
		}
	}

	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
		th->window      = htons(tcp_select_window(sk));
		tcp_ecn_send(sk, skb, th, tcp_header_size);
	} else {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	}
#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		tp->af_specific->calc_md5_hash(opts.hash_location,
					       md5, sk, skb);
	}
#endif

	icsk->icsk_af_ops->send_check(sk, skb);

	if (likely(tcb->tcp_flags & TCPHDR_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);

	if (skb->len != tcp_header_size) {
		/* Segment carries payload: update data-sent bookkeeping
		 * and arm internal pacing if needed.
		 */
		tcp_event_data_sent(tp, sk);
		tp->data_segs_out += tcp_skb_pcount(skb);
		tcp_internal_pacing(sk, skb);
	}

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
			      tcp_skb_pcount(skb));

	tp->segs_out += tcp_skb_pcount(skb);
	/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

	/* Our usage of tstamp should remain private */
	skb->tstamp = 0;

	/* Cleanup our debris for IP stacks */
	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
			       sizeof(struct inet6_skb_parm)));

	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);

	if (unlikely(err > 0)) {
		tcp_enter_cwr(sk);
		err = net_xmit_eval(err);
	}
	if (!err && oskb) {
		/* Sent successfully: stamp the original (still-queued) skb
		 * and feed the rate sampler.
		 */
		oskb->skb_mstamp = tp->tcp_mstamp;
		tcp_rate_skb_sent(sk, oskb);
	}
	return err;
}
f7f24b369   Yuchung Cheng   tcp: helpers to s...
1093
1094
1095
1096
1097
1098
/* Transmit @skb, advertising the socket's current rcv_nxt as the ACK
 * sequence.  Thin convenience wrapper around __tcp_transmit_skb().
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
				  tcp_sk(sk)->rcv_nxt);
}
67edfef78   Andi Kleen   TCP: Add comments...
1099
/* This routine just queues the buffer for sending.
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_header_release(skb);
	tcp_add_write_queue_tail(sk, skb);
	/* Charge the queued bytes against the socket's send-buffer
	 * accounting.
	 */
	sk->sk_wmem_queued += skb->truesize;
	sk_mem_charge(sk, skb->truesize);
}
67edfef78   Andi Kleen   TCP: Add comments...
1115
/* Initialize TSO segments for a packet.
 * Sets the segment count (pcount) and per-segment gso size in the skb.
 * An skb that fits in one MSS, or one without checksum offload
 * (CHECKSUM_NONE), is treated as a single non-GSO segment.
 */
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
	if (skb->len <= mss_now || skb->ip_summed == CHECKSUM_NONE) {
		/* Avoid the costly divide in the normal
		 * non-TSO case.
		 */
		tcp_skb_pcount_set(skb, 1);
		TCP_SKB_CB(skb)->tcp_gso_size = 0;
	} else {
		/* Round up: a trailing sub-mss chunk still counts as a
		 * segment.
		 */
		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
	}
}
91fed7a15   Ilpo Järvinen   [TCP]: Make facke...
1129
/* When a modification to fackets out becomes necessary, we need to check
 * skb is counted to fackets_out or not.
 */
static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
				   int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* FACK accounting is meaningless without SACK information. */
	if (!tp->sacked_out || tcp_is_reno(tp))
		return;

	/* Only skbs starting below the highest SACKed sequence are
	 * counted in fackets_out; adjust only then.
	 */
	if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
		tp->fackets_out -= decr;
}
797108d13   Ilpo Järvinen   tcp: add helper f...
1141
1142
1143
/* Pcount in the middle of the write queue got changed, we need to do various
 * tweaks to fix counters
 *
 * @decr is the number of segments removed from @skb's pcount; every
 * counter that classified those segments (packets_out, sacked_out,
 * retrans_out, lost_out) must shrink by the same amount, based on the
 * skb's sacked flags.
 */
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->packets_out -= decr;

	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
		tp->sacked_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
		tp->retrans_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
		tp->lost_out -= decr;

	/* Reno case is special. Sigh... */
	if (tcp_is_reno(tp) && decr > 0)
		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);

	tcp_adjust_fackets_out(sk, skb, decr);

	/* Keep the lost-retransmit hint consistent when the changed skb
	 * lies before the hint in the queue.
	 */
	if (tp->lost_skb_hint &&
	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
	    (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
		tp->lost_cnt_hint -= decr;

	/* Sanity check left_out invariants after the adjustment. */
	tcp_verify_left_out(tp);
}
0a2cf20c3   Soheil Hassas Yeganeh   tcp: remove SKBTX...
1170
1171
1172
1173
1174
  static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
  {
  	return TCP_SKB_CB(skb)->txstamp_ack ||
  		(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
  }
490cc7d03   Willem de Bruijn   net-timestamp: fi...
1175
1176
1177
/* When @skb is split and @skb2 carries the tail, move any tx-timestamp
 * request to @skb2 if the timestamp key (tskey) now falls in skb2's
 * sequence range, so the timestamp fires for the correct byte.
 */
static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (unlikely(tcp_has_tx_tstamp(skb)) &&
	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;

		/* Transfer the timestamp flags and key to the tail skb. */
		shinfo->tx_flags &= ~tsflags;
		shinfo2->tx_flags |= tsflags;
		swap(shinfo->tskey, shinfo2->tskey);
		TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
		TCP_SKB_CB(skb)->txstamp_ack = 0;
	}
}
a166140e8   Martin KaFai Lau   tcp: Handle eor b...
1190
1191
1192
1193
1194
  static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
  {
  	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
  	TCP_SKB_CB(skb)->eor = 0;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1195
1196
  /* Function to create two new TCP segments.  Shrinks the given segment
   * to the specified size and appends a new segment with the rest of the
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
1197
   * packet to the list.  This won't be called frequently, I hope.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1198
1199
   * Remember, these are still headerless SKBs at this point.
   */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1200
  int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
6cc55e096   Octavian Purdila   tcp: add gfp para...
1201
  		 unsigned int mss_now, gfp_t gfp)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1202
1203
1204
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *buff;
6475be16f   David S. Miller   [TCP]: Keep TSO e...
1205
  	int nsize, old_factor;
b60b49ea6   Herbert Xu   [TCP]: Account sk...
1206
  	int nlen;
9ce014610   Ilpo Järvinen   tcp: get rid of t...
1207
  	u8 flags;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1208

2fceec133   Ilpo Järvinen   tcp: len check is...
1209
1210
  	if (WARN_ON(len > skb->len))
  		return -EINVAL;
6a438bbe6   Stephen Hemminger   [TCP]: speed up S...
1211

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1212
1213
1214
  	nsize = skb_headlen(skb) - len;
  	if (nsize < 0)
  		nsize = 0;
6cc55e096   Octavian Purdila   tcp: add gfp para...
1215
  	if (skb_unclone(skb, gfp))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1216
1217
1218
  		return -ENOMEM;
  
  	/* Get a new skb... force flag on. */
eb9344781   Eric Dumazet   tcp: add a force_...
1219
  	buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
51456b291   Ian Morris   ipv4: coding styl...
1220
  	if (!buff)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1221
  		return -ENOMEM; /* We'll just try again later. */
ef5cb9738   Herbert Xu   [TCP]: Fix truesi...
1222

3ab224be6   Hideo Aoki   [NET] CORE: Intro...
1223
1224
  	sk->sk_wmem_queued += buff->truesize;
  	sk_mem_charge(sk, buff->truesize);
b60b49ea6   Herbert Xu   [TCP]: Account sk...
1225
1226
1227
  	nlen = skb->len - len - nsize;
  	buff->truesize += nlen;
  	skb->truesize -= nlen;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1228
1229
1230
1231
1232
1233
1234
  
  	/* Correct the sequence numbers. */
  	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
  	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
  	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
  
  	/* PSH and FIN should only be set in the second packet. */
4de075e04   Eric Dumazet   tcp: rename tcp_s...
1235
1236
1237
  	flags = TCP_SKB_CB(skb)->tcp_flags;
  	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
  	TCP_SKB_CB(buff)->tcp_flags = flags;
e14c3caf6   Herbert Xu   [TCP]: Handle SAC...
1238
  	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
a166140e8   Martin KaFai Lau   tcp: Handle eor b...
1239
  	tcp_skb_fragment_eor(skb, buff);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1240

84fa7933a   Patrick McHardy   [NET]: Replace CH...
1241
  	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1242
  		/* Copy and checksum data tail into the new buffer. */
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
1243
1244
  		buff->csum = csum_partial_copy_nocheck(skb->data + len,
  						       skb_put(buff, nsize),
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1245
1246
1247
1248
1249
1250
  						       nsize, 0);
  
  		skb_trim(skb, len);
  
  		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
  	} else {
84fa7933a   Patrick McHardy   [NET]: Replace CH...
1251
  		skb->ip_summed = CHECKSUM_PARTIAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1252
1253
1254
1255
  		skb_split(skb, buff, len);
  	}
  
  	buff->ip_summed = skb->ip_summed;
a61bbcf28   Patrick McHardy   [NET]: Store skb-...
1256
  	buff->tstamp = skb->tstamp;
490cc7d03   Willem de Bruijn   net-timestamp: fi...
1257
  	tcp_fragment_tstamp(skb, buff);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1258

6475be16f   David S. Miller   [TCP]: Keep TSO e...
1259
  	old_factor = tcp_skb_pcount(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1260
  	/* Fix up tso_factor for both original and new SKB.  */
5bbb432c8   Eric Dumazet   tcp: tcp_set_skb_...
1261
1262
  	tcp_set_skb_tso_segs(skb, mss_now);
  	tcp_set_skb_tso_segs(buff, mss_now);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1263

b9f64820f   Yuchung Cheng   tcp: track data d...
1264
1265
  	/* Update delivered info for the new segment */
  	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;
6475be16f   David S. Miller   [TCP]: Keep TSO e...
1266
1267
1268
  	/* If this packet has been sent out already, we must
  	 * adjust the various packet counters.
  	 */
cf0b450cd   Herbert Xu   [TCP]: Fix off by...
1269
  	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
6475be16f   David S. Miller   [TCP]: Keep TSO e...
1270
1271
  		int diff = old_factor - tcp_skb_pcount(skb) -
  			tcp_skb_pcount(buff);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1272

797108d13   Ilpo Järvinen   tcp: add helper f...
1273
1274
  		if (diff)
  			tcp_adjust_pcount(sk, skb, diff);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1275
1276
1277
  	}
  
  	/* Link BUFF into the send queue. */
f4a775d14   Eric Dumazet   net: introduce __...
1278
  	__skb_header_release(buff);
fe067e8ab   David S. Miller   [TCP]: Abstract o...
1279
  	tcp_insert_write_queue_after(skb, buff, sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1280
1281
1282
  
  	return 0;
  }
f4d016666   Eric Dumazet   tcp: remove unnec...
1283
1284
/* This is similar to __pskb_pull_tail(). The difference is that pulled
 * data is not copied, but immediately discarded.
 *
 * Trims @len bytes from the front of @skb's payload: first from the
 * linear head, then from the page frags.  Returns the number of bytes
 * removed from paged data; the caller (tcp_trim_head) uses that as the
 * truesize reduction — pulling linear head bytes does not shrink
 * truesize, so they are not counted.
 */
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
	struct skb_shared_info *shinfo;
	int i, k, eat;

	/* Consume as much as possible from the linear header area. */
	eat = min_t(int, len, skb_headlen(skb));
	if (eat) {
		__skb_pull(skb, eat);
		len -= eat;
		if (!len)
			return 0;	/* nothing taken from frags */
	}
	/* Discard the remaining @len bytes from the page frags,
	 * compacting the frag array as fully-eaten frags are released.
	 */
	eat = len;
	k = 0;

	shinfo = skb_shinfo(skb);
	for (i = 0; i < shinfo->nr_frags; i++) {
		int size = skb_frag_size(&shinfo->frags[i]);

		if (size <= eat) {
			/* Whole frag consumed: drop the page reference. */
			skb_frag_unref(skb, i);
			eat -= size;
		} else {
			shinfo->frags[k] = shinfo->frags[i];
			if (eat) {
				/* Partially consumed: advance the offset
				 * and shrink the frag size.
				 */
				shinfo->frags[k].page_offset += eat;
				skb_frag_size_sub(&shinfo->frags[k], eat);
				eat = 0;
			}
			k++;
		}
	}
	shinfo->nr_frags = k;

	skb->data_len -= len;
	skb->len = skb->data_len;
	return len;
}
67edfef78   Andi Kleen   TCP: Add comments...
1322
/* Remove acked data from a packet in the transmit queue.
 * Trims @len already-acknowledged bytes from the front of @skb,
 * advances its starting sequence, and releases the corresponding
 * memory accounting.  Returns 0, or -ENOMEM if the skb cannot be
 * uncloned.
 */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	u32 delta_truesize;

	if (skb_unclone(skb, GFP_ATOMIC))
		return -ENOMEM;

	/* delta_truesize = paged bytes actually freed by the trim. */
	delta_truesize = __pskb_trim_head(skb, len);

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_PARTIAL;

	if (delta_truesize) {
		/* Return the freed memory to the socket accounting and
		 * note that the queue shrank.
		 */
		skb->truesize	   -= delta_truesize;
		sk->sk_wmem_queued -= delta_truesize;
		sk_mem_uncharge(sk, delta_truesize);
		sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
	}

	/* Any change of skb->len requires recalculation of tso factor. */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));

	return 0;
}
1b63edd6e   Yuchung Cheng   tcp: fix SYN-data...
1346
1347
/* Calculate MSS not accounting any TCP options.
 * Converts a path MTU to a base MSS: strips network and TCP headers,
 * clamps to the negotiated mss_clamp, subtracts extension headers,
 * and enforces a floor of 48 bytes.
 */
static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
	}

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->rx_opt.mss_clamp)
		mss_now = tp->rx_opt.mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= icsk->icsk_ext_hdr_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	if (mss_now < 48)
		mss_now = 48;
	return mss_now;
}
1b63edd6e   Yuchung Cheng   tcp: fix SYN-data...
1376
1377
1378
1379
1380
1381
1382
  /* Calculate MSS. Not accounting for SACKs here.  */
  int tcp_mtu_to_mss(struct sock *sk, int pmtu)
  {
  	/* Subtract TCP options size, not including SACKs */
  	return __tcp_mtu_to_mss(sk, pmtu) -
  	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
  }
5d424d5a6   John Heffner   [TCP]: MTU probing
1383
/* Inverse of above: rebuild the path MTU that corresponds to @mss by
 * adding back TCP header (with options), extension headers, and the
 * network header.
 */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mtu;

	mtu = mss +
	      tp->tcp_header_len +
	      icsk->icsk_ext_hdr_len +
	      icsk->icsk_af_ops->net_header_len;

	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mtu += icsk->icsk_af_ops->net_frag_header_len;
	}
	return mtu;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);
5d424d5a6   John Heffner   [TCP]: MTU probing
1404

67edfef78   Andi Kleen   TCP: Add comments...
1405
/* MTU probing init per socket.
 * Seeds the MTU-probe search range from the per-netns sysctls:
 * enabled when sysctl_tcp_mtu_probing > 1, search_high from the
 * negotiated mss_clamp, search_low from sysctl_tcp_base_mss.
 */
void tcp_mtup_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct net *net = sock_net(sk);

	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
			       icsk->icsk_af_ops->net_header_len;
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
	icsk->icsk_mtup.probe_size = 0;
	if (icsk->icsk_mtup.enabled)
		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
EXPORT_SYMBOL(tcp_mtup_init);
5d424d5a6   John Heffner   [TCP]: MTU probing
1421

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1422
1423
1424
1425
1426
1427
/* This function synchronize snd mss to current pmtu/exthdr set.

   tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
   for TCP options, but includes only bare TCP header.

   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->rx_opt.mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   are READ ONLY outside this function.		--ANK (980731)
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* A smaller pmtu bounds the MTU-probe search range too. */
	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss_now = tcp_mtu_to_mss(sk, pmtu);
	mss_now = tcp_bound_to_half_wnd(tp, mss_now);

	/* And store cached results */
	icsk->icsk_pmtu_cookie = pmtu;
	/* While probing, never send above the lower probe bound. */
	if (icsk->icsk_mtup.enabled)
		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
	tp->mss_cache = mss_now;

	return mss_now;
}
EXPORT_SYMBOL(tcp_sync_mss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1463
1464
1465
  
/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 */
unsigned int tcp_current_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	u32 mss_now;
	unsigned int header_len;
	struct tcp_out_options opts;
	struct tcp_md5sig_key *md5;

	mss_now = tp->mss_cache;

	/* Re-sync the cached mss if the route's MTU changed. */
	if (dst) {
		u32 mtu = dst_mtu(dst);
		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
			mss_now = tcp_sync_mss(sk, mtu);
	}

	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
		     sizeof(struct tcphdr);
	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
	 * some common options. If this is an odd packet (because we have SACK
	 * blocks etc) then our calculated header_len will be different, and
	 * we have to adjust mss_now correspondingly */
	if (header_len != tp->tcp_header_len) {
		int delta = (int) header_len - tp->tcp_header_len;
		mss_now -= delta;
	}

	return mss_now;
}
86fd14ad1   Weiping Pan   tcp: make tcp_cwn...
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
 * As additional protections, we do not touch cwnd in retransmission phases,
 * and if application hit its sndbuf limit recently.
 */
static void tcp_cwnd_application_limited(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		/* Limited by application or receiver window. */
		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
		u32 win_used = max(tp->snd_cwnd_used, init_win);
		if (win_used < tp->snd_cwnd) {
			/* Decay cwnd halfway toward what was actually used,
			 * saving ssthresh first.
			 */
			tp->snd_ssthresh = tcp_current_ssthresh(sk);
			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
		}
		tp->snd_cwnd_used = 0;
	}
	tp->snd_cwnd_stamp = tcp_jiffies32;
}
ca8a22634   Neal Cardwell   tcp: make cwnd-li...
1516
/* Congestion-window validation (RFC2861 fast path) plus bookkeeping of
 * whether the flow is cwnd-limited, called after transmitting data.
 */
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	struct tcp_sock *tp = tcp_sk(sk);

	/* Track the maximum number of outstanding packets in each
	 * window, and remember whether we were cwnd-limited then.
	 */
	if (!before(tp->snd_una, tp->max_packets_seq) ||
	    tp->packets_out > tp->max_packets_out) {
		tp->max_packets_out = tp->packets_out;
		tp->max_packets_seq = tp->snd_nxt;
		tp->is_cwnd_limited = is_cwnd_limited;
	}

	if (tcp_is_cwnd_limited(sk)) {
		/* Network is feed fully. */
		tp->snd_cwnd_used = 0;
		tp->snd_cwnd_stamp = tcp_jiffies32;
	} else {
		/* Network starves. */
		if (tp->packets_out > tp->snd_cwnd_used)
			tp->snd_cwnd_used = tp->packets_out;

		/* Decay an idle cwnd (RFC2861 slow part), unless the
		 * congestion-control module owns cwnd via cong_control.
		 */
		if (sysctl_tcp_slow_start_after_idle &&
		    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
		    !ca_ops->cong_control)
			tcp_cwnd_application_limited(sk);

		/* The following conditions together indicate the starvation
		 * is caused by insufficient sender buffer:
		 * 1) just sent some data (see tcp_write_xmit)
		 * 2) not cwnd limited (this else condition)
		 * 3) no more data to send (null tcp_send_head )
		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
		 */
		if (!tcp_send_head(sk) && sk->sk_socket &&
		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
}
d4589926d   Eric Dumazet   tcp: refine TSO s...
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
  /* Minshall's variant of the Nagle send check. */
  static bool tcp_minshall_check(const struct tcp_sock *tp)
  {
  	return after(tp->snd_sml, tp->snd_una) &&
  		!after(tp->snd_sml, tp->snd_nxt);
  }
  
  /* Update snd_sml if this skb is under mss
   * Note that a TSO packet might end with a sub-mss segment
   * The test is really :
   * if ((skb->len % mss) != 0)
   *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
   * But we can avoid doing the divide again given we already have
   *  skb_pcount = skb->len / mss_now
0e3a4803a   Ilpo Järvinen   [TCP]: Force TSO ...
1571
   */
d4589926d   Eric Dumazet   tcp: refine TSO s...
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
  static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
  				const struct sk_buff *skb)
  {
  	if (skb->len < tcp_skb_pcount(skb) * mss_now)
  		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
  }
  
  /* Return false, if packet can be sent now without violation Nagle's rules:
   * 1. It is full sized. (provided by caller in %partial bool)
   * 2. Or it contains FIN. (already checked by caller)
   * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
   * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
   *    With Minshall's modification: all sent small packets are ACKed.
   */
  static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
cc93fc51f   Peter Pan(潘卫平)   tcp: delete unuse...
1587
  			    int nonagle)
d4589926d   Eric Dumazet   tcp: refine TSO s...
1588
1589
1590
1591
1592
  {
  	return partial &&
  		((nonagle & TCP_NAGLE_CORK) ||
  		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
  }
605ad7f18   Eric Dumazet   tcp: refine TSO a...
1593
1594
1595
1596
  
/* Return how many segs we'd like on a TSO packet,
 * to send one TSO packet per ms
 *
 * The pacing rate is divided by ~1024 (>> 10) to get the byte budget
 * for one millisecond, capped by the device's gso_max_size; the result
 * is converted to segments with a floor of @min_tso_segs.
 */
u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
		     int min_tso_segs)
{
	u32 bytes, segs;

	bytes = min(sk->sk_pacing_rate >> 10,
		    sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);

	/* Goal is to send at least one packet per ms,
	 * not one big TSO packet every 100 ms.
	 * This preserves ACK clocking and is consistent
	 * with tcp_tso_should_defer() heuristic.
	 */
	segs = max_t(u32, bytes / mss_now, min_tso_segs);

	return segs;
}
EXPORT_SYMBOL(tcp_tso_autosize);
605ad7f18   Eric Dumazet   tcp: refine TSO a...
1615

ed6e7268b   Neal Cardwell   tcp: allow conges...
1616
1617
1618
1619
1620
1621
1622
/* Return the number of segments we want in the skb we are transmitting.
 * See if congestion control module wants to decide; otherwise, autosize.
 * Result is always capped by the device's gso_max_segs.
 */
static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;

	if (!tso_segs)
		tso_segs = tcp_tso_autosize(sk, mss_now,
					    sysctl_tcp_min_tso_segs);
	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
}
d4589926d   Eric Dumazet   tcp: refine TSO s...
1628
1629
1630
1631
1632
1633
  /* Returns the portion of skb which can be sent right away */
  static unsigned int tcp_mss_split_point(const struct sock *sk,
  					const struct sk_buff *skb,
  					unsigned int mss_now,
  					unsigned int max_segs,
  					int nonagle)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1634
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
1635
  	const struct tcp_sock *tp = tcp_sk(sk);
d4589926d   Eric Dumazet   tcp: refine TSO s...
1636
  	u32 partial, needed, window, max_len;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1637

90840defa   Ilpo Järvinen   [TCP]: Introduce ...
1638
  	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1485348d2   Ben Hutchings   tcp: Apply device...
1639
  	max_len = mss_now * max_segs;
0e3a4803a   Ilpo Järvinen   [TCP]: Force TSO ...
1640

1485348d2   Ben Hutchings   tcp: Apply device...
1641
1642
  	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
  		return max_len;
0e3a4803a   Ilpo Järvinen   [TCP]: Force TSO ...
1643

5ea3a7480   Ilpo Järvinen   [TCP]: Prevent se...
1644
  	needed = min(skb->len, window);
1485348d2   Ben Hutchings   tcp: Apply device...
1645
1646
  	if (max_len <= needed)
  		return max_len;
0e3a4803a   Ilpo Järvinen   [TCP]: Force TSO ...
1647

d4589926d   Eric Dumazet   tcp: refine TSO s...
1648
1649
1650
1651
1652
  	partial = needed % mss_now;
  	/* If last segment is not a full MSS, check if Nagle rules allow us
  	 * to include this last segment in this skb.
  	 * Otherwise, we'll split the skb at last MSS boundary
  	 */
cc93fc51f   Peter Pan(潘卫平)   tcp: delete unuse...
1653
  	if (tcp_nagle_check(partial != 0, tp, nonagle))
d4589926d   Eric Dumazet   tcp: refine TSO s...
1654
1655
1656
  		return needed - partial;
  
  	return needed;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1657
1658
1659
1660
1661
  }
  
  /* Can at least one segment of SKB be sent right now, according to the
   * congestion window rules?  If so, return how many segments are allowed.
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
1662
1663
  static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
  					 const struct sk_buff *skb)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1664
  {
d649a7a81   Eric Dumazet   tcp: limit GSO pa...
1665
  	u32 in_flight, cwnd, halfcwnd;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1666
1667
  
  	/* Don't be strict about the congestion window for the final FIN.  */
4de075e04   Eric Dumazet   tcp: rename tcp_s...
1668
1669
  	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
  	    tcp_skb_pcount(skb) == 1)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1670
1671
1672
1673
  		return 1;
  
  	in_flight = tcp_packets_in_flight(tp);
  	cwnd = tp->snd_cwnd;
d649a7a81   Eric Dumazet   tcp: limit GSO pa...
1674
1675
  	if (in_flight >= cwnd)
  		return 0;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1676

d649a7a81   Eric Dumazet   tcp: limit GSO pa...
1677
1678
1679
1680
1681
  	/* For better scheduling, ensure we have at least
  	 * 2 GSO packets in flight.
  	 */
  	halfcwnd = max(cwnd >> 1, 1U);
  	return min(halfcwnd, cwnd - in_flight);
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1682
  }
b595076a1   Uwe Kleine-König   tree-wide: fix co...
1683
  /* Initialize TSO state of a skb.
67edfef78   Andi Kleen   TCP: Add comments...
1684
   * This must be invoked the first time we consider transmitting
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1685
1686
   * SKB onto the wire.
   */
5bbb432c8   Eric Dumazet   tcp: tcp_set_skb_...
1687
  static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1688
1689
  {
  	int tso_segs = tcp_skb_pcount(skb);
f8269a495   Ilpo Järvinen   tcp: make urg+gso...
1690
  	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
5bbb432c8   Eric Dumazet   tcp: tcp_set_skb_...
1691
  		tcp_set_skb_tso_segs(skb, mss_now);
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1692
1693
1694
1695
  		tso_segs = tcp_skb_pcount(skb);
  	}
  	return tso_segs;
  }
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1696

a2a385d62   Eric Dumazet   tcp: bool convers...
1697
  /* Return true if the Nagle test allows this packet to be
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1698
1699
   * sent now.
   */
a2a385d62   Eric Dumazet   tcp: bool convers...
1700
1701
  static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
  				  unsigned int cur_mss, int nonagle)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1702
1703
1704
1705
1706
1707
1708
1709
  {
  	/* Nagle rule does not apply to frames, which sit in the middle of the
  	 * write_queue (they have no chances to get new data).
  	 *
  	 * This is implemented in the callers, where they modify the 'nonagle'
  	 * argument based upon the location of SKB in the send queue.
  	 */
  	if (nonagle & TCP_NAGLE_PUSH)
a2a385d62   Eric Dumazet   tcp: bool convers...
1710
  		return true;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1711

9b44190dc   Yuchung Cheng   tcp: refactor F-RTO
1712
1713
  	/* Don't use the nagle rule for urgent data (or for the final FIN). */
  	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
a2a385d62   Eric Dumazet   tcp: bool convers...
1714
  		return true;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1715

cc93fc51f   Peter Pan(潘卫平)   tcp: delete unuse...
1716
  	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
a2a385d62   Eric Dumazet   tcp: bool convers...
1717
  		return true;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1718

a2a385d62   Eric Dumazet   tcp: bool convers...
1719
  	return false;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1720
1721
1722
  }
  
  /* Does at least the first segment of SKB fit into the send window? */
a2a385d62   Eric Dumazet   tcp: bool convers...
1723
1724
1725
  static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  			     const struct sk_buff *skb,
  			     unsigned int cur_mss)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1726
1727
1728
1729
1730
  {
  	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
  
  	if (skb->len > cur_mss)
  		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
90840defa   Ilpo Järvinen   [TCP]: Introduce ...
1731
  	return !after(end_seq, tcp_wnd_end(tp));
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1732
  }
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1733
1734
1735
1736
1737
1738
1739
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 *
 * Returns 0 on success, -ENOMEM if the new skb cannot be allocated,
 * or whatever tcp_fragment() returns on the slow path.
 */
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
			unsigned int mss_now, gfp_t gfp)
{
	struct sk_buff *buff;
	int nlen = skb->len - len;	/* bytes that move to the new skb */
	u8 flags;

	/* All of a TSO frame must be composed of paged data.  */
	if (skb->len != skb->data_len)
		return tcp_fragment(sk, skb, len, mss_now, gfp);

	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
	if (unlikely(!buff))
		return -ENOMEM;

	/* Charge the new skb to the socket's write-queue accounting. */
	sk->sk_wmem_queued += buff->truesize;
	sk_mem_charge(sk, buff->truesize);
	/* Shift truesize for the bytes that will migrate via skb_split(). */
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
	TCP_SKB_CB(buff)->tcp_flags = flags;

	/* This packet was never sent out yet, so no SACK bits. */
	TCP_SKB_CB(buff)->sacked = 0;

	/* Propagate the EOR marker to the tail fragment if set. */
	tcp_skb_fragment_eor(skb, buff);

	buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
	/* Move the trailing nlen bytes of page data into buff. */
	skb_split(skb, buff, len);
	/* Keep tx timestamp flags with the fragment that carries them. */
	tcp_fragment_tstamp(skb, buff);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(skb, mss_now);
	tcp_set_skb_tso_segs(buff, mss_now);

	/* Link BUFF into the send queue. */
	__skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk);

	return 0;
}
  
/* Try to defer sending, if possible, in order to minimize the amount
 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
 *
 * This algorithm is from John Heffner.
 *
 * Returns true if transmission of @skb should be deferred in the hope
 * of coalescing more data; false means "send now".  On a defer that is
 * limited by cwnd rather than the receiver window, *is_cwnd_limited is
 * set for the caller's CWV accounting.
 */
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
				 bool *is_cwnd_limited, u32 max_segs)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 age, send_win, cong_win, limit, in_flight;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *head;
	int win_divisor;

	/* Never hold back the final FIN. */
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
		goto send_now;

	/* Do not defer while in recovery/loss states. */
	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
		goto send_now;

	/* Avoid bursty behavior by allowing defer
	 * only if the last write was recent.
	 */
	if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
		goto send_now;

	in_flight = tcp_packets_in_flight(tp);

	/* Callers guarantee a multi-segment skb and available cwnd. */
	BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight));

	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

	/* From in_flight test above, we know that cwnd > in_flight.  */
	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;

	/* Sendable bytes right now: min of receiver window and cwnd room. */
	limit = min(send_win, cong_win);

	/* If a full-sized TSO skb can be sent, do it. */
	if (limit >= max_segs * tp->mss_cache)
		goto send_now;

	/* Middle in queue won't get any more data, full sendable already? */
	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
		goto send_now;

	win_divisor = ACCESS_ONCE(sysctl_tcp_tso_win_divisor);
	if (win_divisor) {
		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);

		/* If at least some fraction of a window is available,
		 * just use it.
		 */
		chunk /= win_divisor;
		if (limit >= chunk)
			goto send_now;
	} else {
		/* Different approach, try not to defer past a single
		 * ACK.  Receiver should ACK every other full sized
		 * frame, so if we have space for more than 3 frames
		 * then send now.
		 */
		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
			goto send_now;
	}

	head = tcp_write_queue_head(sk);

	/* Age of the oldest unacked skb, in usec. */
	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
	/* If next ACK is likely to come too late (half srtt), do not defer.
	 * srtt_us stores srtt << 3, so >> 4 yields srtt/2.
	 */
	if (age < (tp->srtt_us >> 4))
		goto send_now;

	/* Ok, it looks like it is advisable to defer. */

	if (cong_win < send_win && cong_win <= skb->len)
		*is_cwnd_limited = true;

	return true;

send_now:
	return false;
}
05cbc0db0   Fan Du   ipv4: Create prob...
1862
1863
1864
1865
1866
1867
1868
1869
1870
  static inline void tcp_mtu_check_reprobe(struct sock *sk)
  {
  	struct inet_connection_sock *icsk = inet_csk(sk);
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct net *net = sock_net(sk);
  	u32 interval;
  	s32 delta;
  
  	interval = net->ipv4.sysctl_tcp_probe_interval;
c74df29a8   Eric Dumazet   tcp: use tcp_jiff...
1871
  	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
05cbc0db0   Fan Du   ipv4: Create prob...
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
  	if (unlikely(delta >= interval * HZ)) {
  		int mss = tcp_current_mss(sk);
  
  		/* Update current search range */
  		icsk->icsk_mtup.probe_size = 0;
  		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
  			sizeof(struct tcphdr) +
  			icsk->icsk_af_ops->net_header_len;
  		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
  
  		/* Update probe time stamp */
c74df29a8   Eric Dumazet   tcp: use tcp_jiff...
1883
  		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
05cbc0db0   Fan Du   ipv4: Create prob...
1884
1885
  	}
  }
17634603d   Ilya Lesokhin   tcp: Honor the eo...
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
  static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
  {
  	struct sk_buff *skb, *next;
  
  	skb = tcp_send_head(sk);
  	tcp_for_write_queue_from_safe(skb, next, sk) {
  		if (len <= skb->len)
  			break;
  
  		if (unlikely(TCP_SKB_CB(skb)->eor))
  			return false;
  
  		len -= skb->len;
  	}
  
  	return true;
  }
5d424d5a6   John Heffner   [TCP]: MTU probing
1903
/* Create a new MTU probe if we are ready.
 * MTU probe is regularly attempting to increase the path MTU by
 * deliberately sending larger packets.  This discovers routing
 * changes resulting in larger path MTUs.
 *
 * Returns 0 if we should wait to probe (no cwnd available),
 *         1 if a probe was sent,
 *         -1 otherwise
 */
static int tcp_mtu_probe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb, *nskb, *next;
	struct net *net = sock_net(sk);
	int probe_size;
	int size_needed;
	int copy, len;
	int mss_now;
	int interval;

	/* Not currently probing/verifying,
	 * not in recovery,
	 * have enough cwnd, and
	 * not SACKing (the variable headers throw things off)
	 */
	if (likely(!icsk->icsk_mtup.enabled ||
		   icsk->icsk_mtup.probe_size ||
		   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
		   tp->snd_cwnd < 11 ||
		   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
		return -1;

	/* Use binary search for probe_size between tcp_mss_base,
	 * and current mss_clamp. if (search_high - search_low)
	 * smaller than a threshold, backoff from probing.
	 */
	mss_now = tcp_current_mss(sk);
	/* Midpoint of the current MTU search range, expressed as MSS. */
	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
				    icsk->icsk_mtup.search_low) >> 1);
	/* Probe plus enough following data so a lost probe is detectable. */
	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
	/* When misfortune happens, we are reprobing actively,
	 * and then reprobe timer has expired. We stick with current
	 * probing process by not resetting search range to its orignal.
	 */
	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
		interval < net->ipv4.sysctl_tcp_probe_threshold) {
		/* Check whether enough time has elaplased for
		 * another round of probing.
		 */
		tcp_mtu_check_reprobe(sk);
		return -1;
	}

	/* Have enough data in the send queue to probe? */
	if (tp->write_seq - tp->snd_nxt < size_needed)
		return -1;

	/* Receiver window must be able to ever hold the probe + followers. */
	if (tp->snd_wnd < size_needed)
		return -1;
	/* Window currently too small: wait (return 0) rather than give up. */
	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
		return 0;

	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
		if (!tcp_packets_in_flight(tp))
			return -1;
		else
			return 0;
	}

	/* Coalescing must not cross an EOR boundary. */
	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
		return -1;

	/* We're allowed to probe.  Build it now. */
	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
	if (!nskb)
		return -1;
	sk->sk_wmem_queued += nskb->truesize;
	sk_mem_charge(sk, nskb->truesize);

	skb = tcp_send_head(sk);

	/* Probe covers [seq, seq + probe_size) of the current send head. */
	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
	TCP_SKB_CB(nskb)->sacked = 0;
	nskb->csum = 0;
	nskb->ip_summed = skb->ip_summed;

	tcp_insert_write_queue_before(nskb, skb, sk);
	/* Keep the highest-SACK pointer valid after the queue edit. */
	tcp_highest_sack_replace(sk, skb, nskb);

	/* Absorb probe_size bytes from the following skbs into nskb. */
	len = 0;
	tcp_for_write_queue_from_safe(skb, next, sk) {
		copy = min_t(int, skb->len, probe_size - len);
		if (nskb->ip_summed) {
			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
		} else {
			/* Software checksum path: fold in each chunk. */
			__wsum csum = skb_copy_and_csum_bits(skb, 0,
							     skb_put(nskb, copy),
							     copy, 0);
			nskb->csum = csum_block_add(nskb->csum, csum, len);
		}

		if (skb->len <= copy) {
			/* We've eaten all the data from this skb.
			 * Throw it away. */
			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
			/* If this is the last SKB we copy and eor is set
			 * we need to propagate it to the new skb.
			 */
			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
			tcp_unlink_write_queue(skb, sk);
			sk_wmem_free_skb(sk, skb);
		} else {
			/* Partially consumed: FIN/PSH stay on the tail. */
			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
						   ~(TCPHDR_FIN|TCPHDR_PSH);
			if (!skb_shinfo(skb)->nr_frags) {
				skb_pull(skb, copy);
				if (skb->ip_summed != CHECKSUM_PARTIAL)
					skb->csum = csum_partial(skb->data,
								 skb->len, 0);
			} else {
				__pskb_trim_head(skb, copy);
				tcp_set_skb_tso_segs(skb, mss_now);
			}
			TCP_SKB_CB(skb)->seq += copy;
		}

		len += copy;

		if (len >= probe_size)
			break;
	}
	tcp_init_tso_segs(nskb, nskb->len);

	/* We're ready to send.  If this fails, the probe will
	 * be resegmented into mss-sized pieces by tcp_write_xmit().
	 */
	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
		/* Decrement cwnd here because we are sending
		 * effectively two packets. */
		tp->snd_cwnd--;
		tcp_event_new_data_sent(sk, nskb);

		/* Record the in-flight probe so ACK processing can verify it. */
		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;

		return 1;
	}

	return -1;
}
218af599f   Eric Dumazet   tcp: internal imp...
2055
2056
2057
2058
2059
  static bool tcp_pacing_check(const struct sock *sk)
  {
  	return tcp_needs_internal_pacing(sk) &&
  	       hrtimer_active(&tcp_sk(sk)->pacing_timer);
  }
f9616c35a   Eric Dumazet   tcp: implement TS...
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
/* TCP Small Queues :
 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 * (These limits are doubled for retransmits)
 * This allows for :
 *  - better RTT estimation and ACK scheduling
 *  - faster recovery
 *  - high rates
 * Alas, some drivers / subsystems require a fair amount
 * of queued bytes to ensure line rate.
 * One example is wifi aggregation (802.11 AMPDU)
 *
 * Returns true if @skb must be throttled (caller should stop sending),
 * false if transmission may proceed.  @factor doubles the limit for
 * retransmits.
 */
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
				  unsigned int factor)
{
	unsigned int limit;

	/* ~1ms of payload at the pacing rate, but at least two skbs. */
	limit = max(2 * skb->truesize, sk->sk_pacing_rate >> 10);
	limit = min_t(u32, limit, sysctl_tcp_limit_output_bytes);
	limit <<= factor;

	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
		/* Always send the 1st or 2nd skb in write queue.
		 * No need to wait for TX completion to call us back,
		 * after softirq/tasklet schedule.
		 * This helps when TX completions are delayed too much.
		 */
		if (skb == sk->sk_write_queue.next ||
		    skb->prev == sk->sk_write_queue.next)
			return false;

		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED, so we must
		 * test again the condition.
		 */
		smp_mb__after_atomic();
		if (refcount_read(&sk->sk_wmem_alloc) > limit)
			return true;
	}
	return false;
}
05b055e89   Francis Yan   tcp: instrument t...
2099
2100
  static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
  {
628174ccc   Eric Dumazet   tcp: uses jiffies...
2101
  	const u32 now = tcp_jiffies32;
efe967cde   Arnd Bergmann   tcp: avoid bogus ...
2102
  	enum tcp_chrono old = tp->chrono_type;
05b055e89   Francis Yan   tcp: instrument t...
2103

efe967cde   Arnd Bergmann   tcp: avoid bogus ...
2104
2105
  	if (old > TCP_CHRONO_UNSPEC)
  		tp->chrono_stat[old - 1] += now - tp->chrono_start;
05b055e89   Francis Yan   tcp: instrument t...
2106
2107
2108
2109
2110
2111
2112
2113
2114
  	tp->chrono_start = now;
  	tp->chrono_type = new;
  }
  
  void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	/* If there are multiple conditions worthy of tracking in a
0f87230d1   Francis Yan   tcp: instrument h...
2115
2116
  	 * chronograph then the highest priority enum takes precedence
  	 * over the other conditions. So that if something "more interesting"
05b055e89   Francis Yan   tcp: instrument t...
2117
2118
2119
2120
2121
2122
2123
2124
2125
  	 * starts happening, stop the previous chrono and start a new one.
  	 */
  	if (type > tp->chrono_type)
  		tcp_chrono_set(tp, type);
  }
  
  void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
0f87230d1   Francis Yan   tcp: instrument h...
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
  
  	/* There are multiple conditions worthy of tracking in a
  	 * chronograph, so that the highest priority enum takes
  	 * precedence over the other conditions (see tcp_chrono_start).
  	 * If a condition stops, we only stop chrono tracking if
  	 * it's the "most interesting" or current chrono we are
  	 * tracking and starts busy chrono if we have pending data.
  	 */
  	if (tcp_write_queue_empty(sk))
  		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
  	else if (type == tp->chrono_type)
  		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
05b055e89   Francis Yan   tcp: instrument t...
2138
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2139
2140
2141
2142
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
 * account rare use of URG, this is not a big flaw.
 *
 * Send at most one packet when push_one > 0. Temporarily ignore
 * cwnd limit to force at most one packet out when push_one == 2.
 *
 * Returns true, if no segments are in flight and we have queued segments,
 * but cannot send anything now because of SWS or another problem.
 */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;
	bool is_cwnd_limited = false, is_rwnd_limited = false;
	u32 max_segs;

	sent_pkts = 0;

	tcp_mstamp_refresh(tp);
	if (!push_one) {
		/* Do MTU probing. */
		result = tcp_mtu_probe(sk);
		if (!result) {
			/* Probe is in flight or pending; do not send more now */
			return false;
		} else if (result > 0) {
			sent_pkts = 1;
		}
	}

	max_segs = tcp_tso_segs(sk, mss_now);
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		if (tcp_pacing_check(sk))
			break;

		tso_segs = tcp_init_tso_segs(skb, mss_now);
		BUG_ON(!tso_segs);

		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
			/* "skb_mstamp" is used as a start point for the retransmit timer */
			skb->skb_mstamp = tp->tcp_mstamp;
			goto repair; /* Skip network transmission */
		}

		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota) {
			if (push_one == 2)
				/* Force out a loss probe pkt. */
				cwnd_quota = 1;
			else
				break;
		}

		/* Receive window exhausted: remember it so the caller can
		 * start the RWND_LIMITED chronograph below.
		 */
		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
			is_rwnd_limited = true;
			break;
		}

		if (tso_segs == 1) {
			/* Nagle applies; the last skb in the queue may be
			 * pushed out regardless (TCP_NAGLE_PUSH).
			 */
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			if (!push_one &&
			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
						 max_segs))
				break;
		}

		limit = mss_now;
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    min_t(unsigned int,
							  cwnd_quota,
							  max_segs),
						    nonagle);

		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
			break;

		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
			clear_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags);
		if (tcp_small_queue_check(sk, skb, 0))
			break;

		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

repair:
		/* Advance the send_head.  This one is sent out.
		 * This call will increment packets_out.
		 */
		tcp_event_new_data_sent(sk, skb);

		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts += tcp_skb_pcount(skb);

		if (push_one)
			break;
	}

	if (is_rwnd_limited)
		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
	else
		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);

	if (likely(sent_pkts)) {
		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += sent_pkts;

		/* Send one loss probe per tail loss episode. */
		if (push_one != 2)
			tcp_schedule_loss_probe(sk, false);
		is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
		tcp_cwnd_validate(sk, is_cwnd_limited);
		return false;
	}
	return !tp->packets_out && tcp_send_head(sk);
}
241eb29c0   Neal Cardwell   tcp: when schedul...
2264
/* Arm the tail loss probe (TLP) timer, if the connection qualifies.
 *
 * @advancing_rto: when true, the probe deadline is bounded by icsk_rto
 *	itself (the caller is re-arming the RTO from "now"); otherwise it
 *	is bounded by the time remaining until the pending RTO fires.
 *
 * Returns true if the ICSK_TIME_LOSS_PROBE timer was scheduled.
 */
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, rto_delta_us;

	/* Don't do any loss probe on a Fast Open connection before 3WHS
	 * finishes.
	 */
	if (tp->fastopen_rsk)
		return false;

	/* Schedule a loss probe in 2*RTT for SACK capable connections
	 * in Open state, that are either limited by cwnd or application.
	 */
	if ((sysctl_tcp_early_retrans != 3 && sysctl_tcp_early_retrans != 4) ||
	    !tp->packets_out || !tcp_is_sack(tp) ||
	    icsk->icsk_ca_state != TCP_CA_Open)
		return false;

	/* Not cwnd-limited and still data queued: the regular xmit path
	 * will make progress, no probe needed.
	 */
	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
	     tcp_send_head(sk))
		return false;

	/* Probe timeout is 2*rtt. Add minimum RTO to account
	 * for delayed ack when there's one outstanding packet. If no RTT
	 * sample is available then probe after TCP_TIMEOUT_INIT.
	 */
	if (tp->srtt_us) {
		/* srtt_us is srtt << 3, so >> 2 yields 2 * srtt */
		timeout = usecs_to_jiffies(tp->srtt_us >> 2);
		if (tp->packets_out == 1)
			timeout += TCP_RTO_MIN;
		else
			timeout += TCP_TIMEOUT_MIN;
	} else {
		timeout = TCP_TIMEOUT_INIT;
	}

	/* If the RTO formula yields an earlier time, then use that time. */
	rto_delta_us = advancing_rto ?
			jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
			tcp_rto_delta_us(sk);  /* How far in future is RTO? */
	if (rto_delta_us > 0)
		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));

	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
				  TCP_RTO_MAX);
	return true;
}
1f3279ae0   Eric Dumazet   tcp: avoid retran...
2311
2312
2313
  /* Thanks to skb fast clones, we can detect if a prior transmit of
   * a packet is still in a qdisc or driver queue.
   * In this case, there is very little point doing a retransmit !
1f3279ae0   Eric Dumazet   tcp: avoid retran...
2314
2315
2316
2317
   */
  static bool skb_still_in_host_queue(const struct sock *sk,
  				    const struct sk_buff *skb)
  {
39bb5e628   Eric Dumazet   net: skb_fclone_b...
2318
  	if (unlikely(skb_fclone_busy(sk, skb))) {
c10d9310e   Eric Dumazet   tcp: do not assum...
2319
2320
  		NET_INC_STATS(sock_net(sk),
  			      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
1f3279ae0   Eric Dumazet   tcp: avoid retran...
2321
2322
2323
2324
  		return true;
  	}
  	return false;
  }
b340b2645   Yuchung Cheng   tcp: TLP retransm...
2325
/* When probe timeout (PTO) fires, try send a new segment if possible, else
 * retransmit the last segment.
 */
void tcp_send_loss_probe(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int pcount;
	int mss = tcp_current_mss(sk);

	skb = tcp_send_head(sk);
	if (skb) {
		/* Prefer sending fresh data if the receive window allows. */
		if (tcp_snd_wnd_test(tp, skb, mss)) {
			pcount = tp->packets_out;
			tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
			if (tp->packets_out > pcount)
				goto probe_sent;
			goto rearm_timer;
		}
		/* Cannot send new data: fall back to the last sent skb. */
		skb = tcp_write_queue_prev(sk, skb);
	} else {
		skb = tcp_write_queue_tail(sk);
	}

	/* At most one outstanding TLP retransmission. */
	if (tp->tlp_high_seq)
		goto rearm_timer;

	/* Retransmit last segment. */
	if (WARN_ON(!skb))
		goto rearm_timer;

	/* Prior transmit may still sit in a qdisc/driver queue; if so,
	 * retransmitting now would be pointless.
	 */
	if (skb_still_in_host_queue(sk, skb))
		goto rearm_timer;

	pcount = tcp_skb_pcount(skb);
	if (WARN_ON(!pcount))
		goto rearm_timer;

	/* Probe with the last MSS-sized chunk only: split off the tail
	 * if the skb covers more than one segment.
	 */
	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
					  GFP_ATOMIC)))
			goto rearm_timer;
		skb = tcp_write_queue_next(sk, skb);
	}

	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
		goto rearm_timer;

	if (__tcp_retransmit_skb(sk, skb, 1))
		goto rearm_timer;

	/* Record snd_nxt for loss detection. */
	tp->tlp_high_seq = tp->snd_nxt;

probe_sent:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
	/* Reset s.t. tcp_rearm_rto will restart timer from now */
	inet_csk(sk)->icsk_pending = 0;
rearm_timer:
	tcp_rearm_rto(sk);
}
a762a9800   David S. Miller   [TCP]: Kill extra...
2382
2383
2384
2385
/* Push out any pending frames which were held back due to
 * TCP_CORK or attempt at coalescing tiny packets.
 * The socket must be locked by the caller.
 */
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
			       int nonagle)
{
	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and
	 * all will be happy.
	 */
	if (unlikely(sk->sk_state == TCP_CLOSE))
		return;

	/* tcp_write_xmit() returns true when data is queued but nothing
	 * could be sent (e.g. SWS); arm the zero-window probe timer then.
	 */
	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
			   sk_gfp_mask(sk, GFP_ATOMIC)))
		tcp_check_probe_timer(sk);
}
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
2399
2400
2401
2402
2403
  /* Send _single_ skb sitting at the send head. This function requires
   * true push pending frames to setup probe timer etc.
   */
  void tcp_push_one(struct sock *sk, unsigned int mss_now)
  {
fe067e8ab   David S. Miller   [TCP]: Abstract o...
2404
  	struct sk_buff *skb = tcp_send_head(sk);
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
2405
2406
  
  	BUG_ON(!skb || skb->len < mss_now);
d5dd9175b   Ilpo Järvinen   tcp: use tcp_writ...
2407
  	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
2408
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2409
2410
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = icsk->icsk_ack.rcv_mss;
	int free_space = tcp_space(sk);
	int allowed_space = tcp_full_space(sk);
	int full_space = min_t(int, tp->window_clamp, allowed_space);
	int window;

	/* Clamp mss so that the division below cannot yield a window
	 * larger than full_space; bail out if no space at all.
	 */
	if (unlikely(mss > full_space)) {
		mss = full_space;
		if (mss <= 0)
			return 0;
	}
	if (free_space < (full_space >> 1)) {
		icsk->icsk_ack.quick = 0;

		if (tcp_under_memory_pressure(sk))
			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
					       4U * tp->advmss);

		/* free_space might become our new window, make sure we don't
		 * increase it due to wscale.
		 */
		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);

		/* if free space is less than mss estimate, or is below 1/16th
		 * of the maximum allowed, try to move to zero-window, else
		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
		 * new incoming data is dropped due to memory limits.
		 * With large window, mss test triggers way too late in order
		 * to announce zero window in time before rmem limit kicks in.
		 */
		if (free_space < (allowed_space >> 4) || free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	if (tp->rx_opt.rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 */
		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
	} else {
		window = tp->rcv_wnd;
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = rounddown(free_space, mss);
		else if (mss == full_space &&
			 free_space > window + (full_space >> 1))
			window = free_space;
	}

	return window;
}
cfea5a688   Martin KaFai Lau   tcp: Merge tx_fla...
2537
2538
  void tcp_skb_collapse_tstamp(struct sk_buff *skb,
  			     const struct sk_buff *next_skb)
082ac2d51   Martin KaFai Lau   tcp: Merge tx_fla...
2539
  {
0a2cf20c3   Soheil Hassas Yeganeh   tcp: remove SKBTX...
2540
2541
2542
  	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
  		const struct skb_shared_info *next_shinfo =
  			skb_shinfo(next_skb);
082ac2d51   Martin KaFai Lau   tcp: Merge tx_fla...
2543
  		struct skb_shared_info *shinfo = skb_shinfo(skb);
0a2cf20c3   Soheil Hassas Yeganeh   tcp: remove SKBTX...
2544
  		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
082ac2d51   Martin KaFai Lau   tcp: Merge tx_fla...
2545
  		shinfo->tskey = next_shinfo->tskey;
2de8023e7   Martin KaFai Lau   tcp: Merge txstam...
2546
2547
  		TCP_SKB_CB(skb)->txstamp_ack |=
  			TCP_SKB_CB(next_skb)->txstamp_ack;
082ac2d51   Martin KaFai Lau   tcp: Merge tx_fla...
2548
2549
  	}
  }
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2550
/* Collapses two adjacent SKB's during retransmission.
 * Merges the skb following @skb into @skb and unlinks it from the
 * write queue.  Returns false if the payload could not be moved
 * (no room and skb_shift() failed), leaving both skbs untouched.
 */
static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
	int skb_size, next_skb_size;

	skb_size = skb->len;
	next_skb_size = next_skb->len;

	/* Caller (tcp_retrans_try_collapse) only passes single-segment skbs */
	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);

	if (next_skb_size) {
		/* Copy into tailroom when it fits, else shift page frags */
		if (next_skb_size <= skb_availroom(skb))
			skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
				      next_skb_size);
		else if (!skb_shift(skb, next_skb, next_skb_size))
			return false;
	}
	tcp_highest_sack_replace(sk, next_skb, skb);

	tcp_unlink_write_queue(next_skb, sk);

	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
		skb->ip_summed = CHECKSUM_PARTIAL;

	/* For non-offloaded checksums, fold next_skb's csum into skb's */
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

	/* Update sequence range on original skb. */
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

	/* Merge over control information. This moves PSH/FIN etc. over */
	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;

	/* All done, get rid of second SKB and account for it so
	 * packet counting does not break.
	 */
	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;

	/* changed transmit queue under us so clear hints */
	tcp_clear_retrans_hints_partial(tp);
	if (next_skb == tp->retransmit_skb_hint)
		tp->retransmit_skb_hint = skb;

	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));

	tcp_skb_collapse_tstamp(skb, next_skb);

	sk_wmem_free_skb(sk, next_skb);
	return true;
}
67edfef78   Andi Kleen   TCP: Add comments...
2601
  /* Check if coalescing SKBs is legal. */
a2a385d62   Eric Dumazet   tcp: bool convers...
2602
  static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2603
2604
  {
  	if (tcp_skb_pcount(skb) > 1)
a2a385d62   Eric Dumazet   tcp: bool convers...
2605
  		return false;
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2606
  	if (skb_cloned(skb))
a2a385d62   Eric Dumazet   tcp: bool convers...
2607
  		return false;
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2608
  	if (skb == tcp_send_head(sk))
a2a385d62   Eric Dumazet   tcp: bool convers...
2609
  		return false;
2331ccc5b   Eric Dumazet   tcp: enhance tcp ...
2610
  	/* Some heuristics for collapsing over SACK'd could be invented */
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2611
  	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
a2a385d62   Eric Dumazet   tcp: bool convers...
2612
  		return false;
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2613

a2a385d62   Eric Dumazet   tcp: bool convers...
2614
  	return true;
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2615
  }
67edfef78   Andi Kleen   TCP: Add comments...
2616
2617
2618
/* Collapse packets in the retransmit queue to create fewer packets
 * on the wire. This is only done on retransmission.
 * @to is the skb being retransmitted; up to @space bytes of the skbs
 * following it are merged into it via tcp_collapse_retrans().
 */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
				     int space)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = to, *tmp;
	bool first = true;

	if (!sysctl_tcp_retrans_collapse)
		return;
	/* Never merge anything into a SYN segment */
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		return;

	tcp_for_write_queue_from_safe(skb, tmp, sk) {
		if (!tcp_can_collapse(sk, skb))
			break;

		if (!tcp_skb_can_collapse_to(to))
			break;

		space -= skb->len;

		/* First iteration is @to itself: nothing to merge yet,
		 * just account its length against @space.
		 */
		if (first) {
			first = false;
			continue;
		}

		if (space < 0)
			break;

		/* Stop if the merged skb would exceed the send window */
		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
			break;

		if (!tcp_collapse_retrans(sk, to))
			break;
	}
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2652
2653
2654
2655
  /* This retransmits one SKB.  Policy decisions and retransmit queue
   * state updates are done by the caller.  Returns non-zero if an
   * error occurred which prevented the send.
   */
10d3be569   Eric Dumazet   tcp-tso: do not s...
2656
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cur_mss;
	int diff, len, err;

	/* Inconclusive MTU probe: the probed segment is being retransmitted,
	 * so we cannot conclude the probe succeeded.
	 */
	if (icsk->icsk_mtup.probe_size)
		icsk->icsk_mtup.probe_size = 0;

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (refcount_read(&sk->sk_wmem_alloc) >
	    min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
		  sk->sk_sndbuf))
		return -EAGAIN;

	/* skb has not left the host qdisc/NIC yet; retransmitting a clone
	 * of it would be pointless.
	 */
	if (skb_still_in_host_queue(sk, skb))
		return -EBUSY;

	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		/* end_seq before snd_una would mean a fully acked skb is
		 * still sitting on the retransmit path - should not happen.
		 */
		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		/* Drop the already-acked prefix of this skb. */
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	cur_mss = tcp_current_mss(sk);

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
	    TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	/* Fragment down to at most @segs segments worth of payload if the
	 * skb exceeds what the caller allows us to send now.
	 */
	len = cur_mss * segs;
	if (skb->len > len) {
		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
			return -ENOMEM; /* We'll try again later. */
	} else {
		if (skb_unclone(skb, GFP_ATOMIC))
			return -ENOMEM;

		/* MSS may have changed since the original transmit; recompute
		 * the TSO segment count and fix up in-flight accounting.
		 */
		diff = tcp_skb_pcount(skb);
		tcp_set_skb_tso_segs(skb, cur_mss);
		diff -= tcp_skb_pcount(skb);
		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
		/* Sub-MSS skb: try merging it with following queued skbs. */
		if (skb->len < cur_mss)
			tcp_retrans_try_collapse(sk, skb, cur_mss);
	}

	/* RFC3168, section 6.1.1.1. ECN fallback */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
		tcp_ecn_clear_syn(sk, skb);

	/* Update global and local TCP statistics. */
	segs = tcp_skb_pcount(skb);
	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
	tp->total_retrans += segs;

	/* make sure skb->data is aligned on arches that require it
	 * and check if ack-trimming & collapsing extended the headroom
	 * beyond what csum_start can cover.
	 */
	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
		     skb_headroom(skb) >= 0xFFFF)) {
		struct sk_buff *nskb;

		nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
		err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
			     -ENOBUFS;
		if (!err) {
			/* The copy was sent, not the original: refresh the
			 * original skb's timestamp and rate sample manually.
			 */
			skb->skb_mstamp = tp->tcp_mstamp;
			tcp_rate_skb_sent(sk, skb);
		}
	} else {
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	}

	if (likely(!err)) {
		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
	} else if (err != -EBUSY) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
	}
	return err;
}
10d3be569   Eric Dumazet   tcp-tso: do not s...
2746
  int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
93b174ad7   Yuchung Cheng   tcp: bug fix Fast...
2747
2748
  {
  	struct tcp_sock *tp = tcp_sk(sk);
10d3be569   Eric Dumazet   tcp-tso: do not s...
2749
  	int err = __tcp_retransmit_skb(sk, skb, segs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2750
2751
  
  	if (err == 0) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2752
  #if FASTRETRANS_DEBUG > 0
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
2753
  		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
e87cc4728   Joe Perches   net: Convert net_...
2754
2755
  			net_dbg_ratelimited("retrans_out leaked
  ");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2756
2757
2758
2759
2760
2761
2762
  		}
  #endif
  		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
  		tp->retrans_out += tcp_skb_pcount(skb);
  
  		/* Save stamp of the first retransmit. */
  		if (!tp->retrans_stamp)
7faee5c0d   Eric Dumazet   tcp: remove TCP_S...
2763
  			tp->retrans_stamp = tcp_skb_timestamp(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2764

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2765
  	}
6e08d5e3c   Yuchung Cheng   tcp: fix false un...
2766
2767
2768
2769
  
  	if (tp->undo_retrans < 0)
  		tp->undo_retrans = 0;
  	tp->undo_retrans += tcp_skb_pcount(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
  	return err;
  }
  
  /* This gets called after a retransmit timeout, and the initially
   * retransmitted data is acknowledged.  It tries to continue
   * resending the rest of the retransmit queue, until either
   * we've sent it all or the congestion window limit is reached.
   * If doing SACK, the first ACK which comes back for a timeout
   * based retransmit packet might feed us FACK information again.
   * If so, we use it to avoid unnecessarily retransmissions.
   */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	struct sk_buff *hole = NULL;	/* first not-lost, not-sacked skb seen */
	u32 max_segs;
	int mib_idx;

	if (!tp->packets_out)
		return;

	/* Resume from the cached hint if we have one, otherwise start from
	 * the head of the write queue.
	 */
	if (tp->retransmit_skb_hint) {
		skb = tp->retransmit_skb_hint;
	} else {
		skb = tcp_write_queue_head(sk);
	}

	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
	tcp_for_write_queue_from(skb, sk) {
		__u8 sacked;
		int segs;

		/* Stop at the first never-transmitted skb. */
		if (skb == tcp_send_head(sk))
			break;

		/* Pacing timer pending: defer further retransmits. */
		if (tcp_pacing_check(sk))
			break;
		/* we could do better than to assign each time */
		if (!hole)
			tp->retransmit_skb_hint = skb;

		/* Remaining cwnd budget, in segments. */
		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
		if (segs <= 0)
			return;
		sacked = TCP_SKB_CB(skb)->sacked;
		/* In case tcp_shift_skb_data() have aggregated large skbs,
		 * we need to make sure not sending too bigs TSO packets
		 */
		segs = min_t(int, segs, max_segs);

		if (tp->retrans_out >= tp->lost_out) {
			/* Everything marked lost has been retransmitted. */
			break;
		} else if (!(sacked & TCPCB_LOST)) {
			/* Remember the first skb we skipped, so the hint is
			 * not advanced past data that may become lost later.
			 */
			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
				hole = skb;
			continue;

		} else {
			if (icsk->icsk_ca_state != TCP_CA_Loss)
				mib_idx = LINUX_MIB_TCPFASTRETRANS;
			else
				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
		}

		/* Already sacked or already retransmitted: skip. */
		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
			continue;

		if (tcp_small_queue_check(sk, skb, 1))
			return;

		if (tcp_retransmit_skb(sk, skb, segs))
			return;

		NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));

		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += tcp_skb_pcount(skb);

		/* Re-arm the RTO when the head of the queue was (re)sent,
		 * unless a reordering timer is already pending.
		 */
		if (skb == tcp_write_queue_head(sk) &&
		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  inet_csk(sk)->icsk_rto,
						  TCP_RTO_MAX);
	}
}
d83769a58   Eric Dumazet   tcp: fix possible...
2853
2854
  /* We allow to exceed memory limits for FIN packets to expedite
   * connection tear down and (memory) recovery.
845704a53   Eric Dumazet   tcp: avoid loopin...
2855
2856
   * Otherwise tcp_send_fin() could be tempted to either delay FIN
   * or even be forced to close flow without any FIN.
a6c5ea4cc   Eric Dumazet   tcp: rename sk_fo...
2857
2858
   * In general, we want to allow one skb per socket to avoid hangs
   * with edge trigger epoll()
d83769a58   Eric Dumazet   tcp: fix possible...
2859
   */
a6c5ea4cc   Eric Dumazet   tcp: rename sk_fo...
2860
  void sk_forced_mem_schedule(struct sock *sk, int size)
d83769a58   Eric Dumazet   tcp: fix possible...
2861
  {
e805605c7   Johannes Weiner   net: tcp_memcontr...
2862
  	int amt;
d83769a58   Eric Dumazet   tcp: fix possible...
2863
2864
2865
2866
2867
  
  	if (size <= sk->sk_forward_alloc)
  		return;
  	amt = sk_mem_pages(size);
  	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
e805605c7   Johannes Weiner   net: tcp_memcontr...
2868
  	sk_memory_allocated_add(sk, amt);
baac50bbc   Johannes Weiner   net: tcp_memcontr...
2869
2870
  	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
  		mem_cgroup_charge_skmem(sk->sk_memcg, amt);
d83769a58   Eric Dumazet   tcp: fix possible...
2871
  }
845704a53   Eric Dumazet   tcp: avoid loopin...
2872
2873
  /* Send a FIN. The caller locks the socket for us.
   * We should try to send a FIN packet really hard, but eventually give up.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2874
2875
2876
   */
void tcp_send_fin(struct sock *sk)
{
	struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Optimization, tack on the FIN if we have one skb in write queue and
	 * this skb was not yet sent, or we are under memory pressure.
	 * Note: in the latter case, FIN packet will be sent after a timeout,
	 * as TCP stack thinks it has already been transmitted.
	 */
	if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
coalesce:
		/* FIN consumes one sequence number. */
		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
		TCP_SKB_CB(tskb)->end_seq++;
		tp->write_seq++;
		if (!tcp_send_head(sk)) {
			/* This means tskb was already sent.
			 * Pretend we included the FIN on previous transmit.
			 * We need to set tp->snd_nxt to the value it would have
			 * if FIN had been sent. This is because retransmit path
			 * does not change tp->snd_nxt.
			 */
			tp->snd_nxt++;
			return;
		}
	} else {
		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
		if (unlikely(!skb)) {
			/* Allocation failed: fall back to coalescing onto the
			 * tail skb if one exists, else give up silently.
			 */
			if (tskb)
				goto coalesce;
			return;
		}
		skb_reserve(skb, MAX_TCP_HEADER);
		/* Force-charge the skb: FIN must not be blocked by memory
		 * accounting (see sk_forced_mem_schedule comment above).
		 */
		sk_forced_mem_schedule(sk, skb->truesize);
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		tcp_init_nondata_skb(skb, tp->write_seq,
				     TCPHDR_ACK | TCPHDR_FIN);
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
  
  /* We get here when a process closes a file descriptor (either due to
   * an explicit close() or as a byproduct of exit()'ing) and there
   * was unread data in the receive queue.  This behavior is recommended
65bb723c9   Gerrit Renker   [TCP]: Update ref...
2920
   * by RFC 2525, section 2.17.  -DaveM
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2921
   */
dd0fc66fb   Al Viro   [PATCH] gfp flags...
2922
  void tcp_send_active_reset(struct sock *sk, gfp_t priority)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2923
  {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2924
  	struct sk_buff *skb;
7cc2b043b   Gao Feng   net: tcp: Increas...
2925
  	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2926
2927
2928
  	/* NOTE: No TCP options attached and we never retransmit this. */
  	skb = alloc_skb(MAX_TCP_HEADER, priority);
  	if (!skb) {
4e6734447   Pavel Emelyanov   mib: add net to N...
2929
  		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2930
2931
2932
2933
2934
  		return;
  	}
  
  	/* Reserve space for headers and prepare control bits. */
  	skb_reserve(skb, MAX_TCP_HEADER);
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
2935
  	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
a3433f35a   Changli Gao   tcp: unify tcp fl...
2936
  			     TCPHDR_ACK | TCPHDR_RST);
9a568de48   Eric Dumazet   tcp: switch TCP T...
2937
  	tcp_mstamp_refresh(tcp_sk(sk));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2938
  	/* Send it off. */
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
2939
  	if (tcp_transmit_skb(sk, skb, 0, priority))
4e6734447   Pavel Emelyanov   mib: add net to N...
2940
  		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2941
  }
67edfef78   Andi Kleen   TCP: Add comments...
2942
2943
  /* Send a crossed SYN-ACK during socket establishment.
   * WARNING: This routine must only be called when we have already sent
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2944
2945
2946
2947
2948
2949
   * a SYN packet that crossed the incoming SYN that caused this routine
   * to get called. If this assumption fails then the initial rcv_wnd
   * and rcv_wscale values will not be correct.
   */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff *skb;

	/* The SYN we already sent must be at the head of the write queue. */
	skb = tcp_write_queue_head(sk);
	if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
		pr_debug("%s: wrong queue state\n", __func__);
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
		if (skb_cloned(skb)) {
			/* skb is shared (e.g. with a retransmit clone): we
			 * must not modify it in place.  Replace it in the
			 * queue with a private copy.
			 */
			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
			if (!nskb)
				return -ENOMEM;
			tcp_unlink_write_queue(skb, sk);
			__skb_header_release(nskb);
			__tcp_add_write_queue_head(sk, nskb);
			/* Transfer the socket memory accounting to nskb. */
			sk_wmem_free_skb(sk, skb);
			sk->sk_wmem_queued += nskb->truesize;
			sk_mem_charge(sk, nskb->truesize);
			skb = nskb;
		}
		/* Upgrade the queued SYN to a SYN-ACK. */
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
		tcp_ecn_send_synack(sk, skb);
	}
	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
4aea39c11   Eric Dumazet   tcp: tcp_make_syn...
2976
2977
2978
2979
2980
  /**
   * tcp_make_synack - Prepare a SYN-ACK.
   * sk: listener socket
   * dst: dst entry attached to the SYNACK
   * req: request_sock pointer
4aea39c11   Eric Dumazet   tcp: tcp_make_syn...
2981
2982
2983
2984
   *
   * Allocate one skb and build a SYNACK packet.
   * @dst is consumed : Caller should not use it again.
   */
5d062de7f   Eric Dumazet   tcp: constify tcp...
2985
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
				struct request_sock *req,
				struct tcp_fastopen_cookie *foc,
				enum tcp_synack_type synack_type)
{
	struct inet_request_sock *ireq = inet_rsk(req);
	const struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_md5sig_key *md5 = NULL;
	struct tcp_out_options opts;
	struct sk_buff *skb;
	int tcp_header_size;
	struct tcphdr *th;
	int mss;

	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
	if (unlikely(!skb)) {
		/* @dst is consumed even on failure (see function contract). */
		dst_release(dst);
		return NULL;
	}
	/* Reserve space for headers. */
	skb_reserve(skb, MAX_TCP_HEADER);

	/* Ownership of the skb depends on how this SYNACK is generated. */
	switch (synack_type) {
	case TCP_SYNACK_NORMAL:
		skb_set_owner_w(skb, req_to_sk(req));
		break;
	case TCP_SYNACK_COOKIE:
		/* Under synflood, we do not attach skb to a socket,
		 * to avoid false sharing.
		 */
		break;
	case TCP_SYNACK_FASTOPEN:
		/* sk is a const pointer, because we want to express multiple
		 * cpu might call us concurrently.
		 * sk->sk_wmem_alloc in an atomic, we can promote to rw.
		 */
		skb_set_owner_w(skb, (struct sock *)sk);
		break;
	}
	skb_dst_set(skb, dst);

	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));

	memset(&opts, 0, sizeof(opts));
#ifdef CONFIG_SYN_COOKIES
	/* Syncookies encode a timestamp; use it instead of the clock. */
	if (unlikely(req->cookie_ts))
		skb->skb_mstamp = cookie_init_timestamp(req);
	else
#endif
		skb->skb_mstamp = tcp_clock_us();

#ifdef CONFIG_TCP_MD5SIG
	/* rcu_read_lock() held until the hash is computed below. */
	rcu_read_lock();
	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
	tcp_header_size = tcp_synack_options(req, mss, skb, &opts, md5, foc) +
			  sizeof(*th);

	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	/* Build the TCP header. */
	th = (struct tcphdr *)skb->data;
	memset(th, 0, sizeof(struct tcphdr));
	th->syn = 1;
	th->ack = 1;
	tcp_ecn_make_synack(req, th);
	th->source = htons(ireq->ir_num);
	th->dest = ireq->ir_rmt_port;
	skb->mark = ireq->ir_mark;
	skb->ip_summed = CHECKSUM_PARTIAL;
	th->seq = htonl(tcp_rsk(req)->snt_isn);
	/* XXX data is queued and acked as is. No buffer/window check */
	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);

	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
	th->doff = (tcp_header_size >> 2);
	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);

#ifdef CONFIG_TCP_MD5SIG
	/* Okay, we have all we need - do the md5 hash if needed */
	if (md5)
		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
					       md5, req_to_sk(req), skb);
	rcu_read_unlock();
#endif

	/* Do not fool tcpdump (if any), clean our debris */
	skb->tstamp = 0;
	return skb;
}
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
3076
  EXPORT_SYMBOL(tcp_make_synack);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3077

81164413a   Daniel Borkmann   net: tcp: add per...
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
  static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
  {
  	struct inet_connection_sock *icsk = inet_csk(sk);
  	const struct tcp_congestion_ops *ca;
  	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);
  
  	if (ca_key == TCP_CA_UNSPEC)
  		return;
  
  	rcu_read_lock();
  	ca = tcp_ca_find_key(ca_key);
  	if (likely(ca && try_module_get(ca->owner))) {
  		module_put(icsk->icsk_ca_ops->owner);
  		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
  		icsk->icsk_ca_ops = ca;
  	}
  	rcu_read_unlock();
  }
67edfef78   Andi Kleen   TCP: Add comments...
3096
  /* Do all connect socket setups that can be done AF independent. */
f7e56a76a   stephen hemminger   tcp: make local f...
3097
static void tcp_connect_init(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale;
	u32 rcv_wnd;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr);
	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;

#ifdef CONFIG_TCP_MD5SIG
	/* Reserve header room for the MD5 option if a key is configured. */
	if (tp->af_specific->md5_lookup(sk, sk))
		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->rx_opt.user_mss)
		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
	tp->max_window = 0;
	tcp_mtup_init(sk);
	tcp_sync_mss(sk, dst_mtu(dst));
	tcp_ca_dst_init(sk, dst);

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(sk);

	/* limit the window selection if the user enforce a smaller rx buffer */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
		tp->window_clamp = tcp_full_space(sk);

	/* BPF may supply an initial receive window; fall back to the route
	 * metric when it returns 0.
	 */
	rcv_wnd = tcp_rwnd_init_bpf(sk);
	if (rcv_wnd == 0)
		rcv_wnd = dst_metric(dst, RTAX_INITRWND);

	tcp_select_initial_window(tcp_full_space(sk),
				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
				  &rcv_wscale,
				  rcv_wnd);

	tp->rx_opt.rcv_wscale = rcv_wscale;
	tp->rcv_ssthresh = tp->rcv_wnd;

	/* Reset connection-level send state. */
	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tcp_init_wl(tp, 0);
	tcp_write_queue_purge(sk);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->snd_up = tp->write_seq;
	tp->snd_nxt = tp->write_seq;

	/* In repair mode (TCP_REPAIR) rcv_nxt was restored by the user;
	 * do not clobber it.
	 */
	if (likely(!tp->repair))
		tp->rcv_nxt = 0;
	else
		tp->rcv_tstamp = tcp_jiffies32;
	tp->rcv_wup = tp->rcv_nxt;
	tp->copied_seq = tp->rcv_nxt;

	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
	inet_csk(sk)->icsk_retransmits = 0;
	tcp_clear_retrans(tp);
}
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
3167
3168
3169
3170
3171
3172
  static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
  
  	tcb->end_seq += skb->len;
f4a775d14   Eric Dumazet   net: introduce __...
3173
  	__skb_header_release(skb);
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
  	__tcp_add_write_queue_tail(sk, skb);
  	sk->sk_wmem_queued += skb->truesize;
  	sk_mem_charge(sk, skb->truesize);
  	tp->write_seq = tcb->end_seq;
  	tp->packets_out += tcp_skb_pcount(skb);
  }
  
/* Build and send a SYN with data and (cached) Fast Open cookie. However,
 * queue a data-only packet after the regular SYN, such that regular SYNs
 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
 * only the SYN sequence, the data are retransmitted in the first ACK.
 * If cookie is not cached or other error occurs, falls back to send a
 * regular SYN with Fast Open cookie request option.
 */
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_fastopen_request *fo = tp->fastopen_req;
	int space, err = 0;
	struct sk_buff *syn_data;

	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
	/* No usable cached cookie -> cannot send data in the SYN. */
	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
		goto fallback;

	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
	 * user-MSS. Reserve maximum option space for middleboxes that add
	 * private TCP options. The cost is reduced data space in SYN :(
	 */
	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
		MAX_TCP_OPTION_SPACE;
	/* Never send more than the user queued for the fastopen request. */
	space = min_t(size_t, space, fo->size);

	/* limit to order-0 allocations */
	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));

	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
	if (!syn_data)
		goto fallback;
	syn_data->ip_summed = CHECKSUM_PARTIAL;
	/* Inherit the SYN's control block (seq/flags) for the data skb. */
	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
	if (space) {
		int copied = copy_from_iter(skb_put(syn_data, space), space,
					    &fo->data->msg_iter);
		if (unlikely(!copied)) {
			/* Nothing copied: drop the skb and send a plain SYN. */
			kfree_skb(syn_data);
			goto fallback;
		}
		if (copied != space) {
			/* Partial copy: shrink the skb to what we really have. */
			skb_trim(syn_data, copied);
			space = copied;
		}
	}
	/* No more data pending in inet_wait_for_connect() */
	if (space == fo->size)
		fo->data = NULL;
	fo->copied = space;

	tcp_connect_queue_skb(sk, syn_data);
	if (syn_data->len)
		tcp_chrono_start(sk, TCP_CHRONO_BUSY);

	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);

	/* Give the retransmitted SYN the timestamp of the SYN+data we sent. */
	syn->skb_mstamp = syn_data->skb_mstamp;

	/* Now full SYN+DATA was cloned and sent (or not),
	 * remove the SYN from the original skb (syn_data)
	 * we keep in write queue in case of a retransmit, as we
	 * also have the SYN packet (with no data) in the same queue.
	 */
	TCP_SKB_CB(syn_data)->seq++;
	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
	if (!err) {
		tp->syn_data = (fo->copied > 0);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
		goto done;
	}

	/* data was not sent, this is our new send_head */
	sk->sk_send_head = syn_data;
	tp->packets_out -= tcp_skb_pcount(syn_data);

fallback:
	/* Send a regular SYN with Fast Open cookie request option */
	if (fo->cookie.len > 0)
		fo->cookie.len = 0;
	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
	if (err)
		tp->syn_fastopen = 0;
done:
	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
	return err;
}
67edfef78   Andi Kleen   TCP: Add comments...
3267
/* Build a SYN and send it off.
 * Returns 0 on success; -EHOSTUNREACH on routing failure, -ENOBUFS on
 * allocation failure, or -ECONNREFUSED propagated from transmit.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	/* Let BPF programs observe the active-open attempt. */
	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB);

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	tcp_connect_init(sk);

	/* Repair mode (TCP_REPAIR) restores a connection without a handshake. */
	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	/* SYN consumes one sequence number. */
	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tcp_mstamp_refresh(tp);
	tp->retrans_stamp = tcp_time_stamp(tp);
	tcp_connect_queue_skb(sk, buff);
	tcp_ecn_send_syn(sk, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;
	/* If the fastopen data was not sent, the send head still points at
	 * it; rewind snd_nxt/pushed_seq to its sequence so it goes out next.
	 */
	buff = tcp_send_head(sk);
	if (unlikely(buff)) {
		tp->snd_nxt	= TCP_SKB_CB(buff)->seq;
		tp->pushed_seq	= TCP_SKB_CB(buff)->seq;
	}
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
EXPORT_SYMBOL(tcp_connect);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3316
3317
3318
3319
3320
3321
3322
  
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	int ato = icsk->icsk_ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		const struct tcp_sock *tp = tcp_sk(sk);
		int max_ato = HZ / 2;

		/* Interactive (pingpong) sessions or a pushed-pending ACK
		 * tolerate the full delayed-ack maximum.
		 */
		if (icsk->icsk_ack.pingpong ||
		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt_us) {
			/* srtt_us is stored <<3; convert to jiffies, floor at
			 * the minimum delayed-ack interval.
			 */
			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
					TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't a older one earlier. */
	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (icsk->icsk_ack.blocked ||
		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
			tcp_send_ack(sk);
			return;
		}

		/* Keep the earlier deadline if it fires sooner than ours. */
		if (!time_before(timeout, icsk->icsk_ack.timeout))
			timeout = icsk->icsk_ack.timeout;
	}
	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
	icsk->icsk_ack.timeout = timeout;
	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
  
/* This routine sends an ack and also updates the window.
 * @rcv_nxt: the receive-next value to acknowledge (callers may ack a
 * value other than tp->rcv_nxt, e.g. for challenge/keepalive ACKs).
 */
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
{
	struct sk_buff *buff;

	/* If we have been reset, we may not send again. */
	if (sk->sk_state == TCP_CLOSE)
		return;

	/* We are not putting this on the write queue, so
	 * tcp_transmit_skb() will set the ownership to this
	 * sock.
	 */
	buff = alloc_skb(MAX_TCP_HEADER,
			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (unlikely(!buff)) {
		/* Allocation failed: fall back to the delayed-ack timer so
		 * the ACK is retried rather than lost.
		 */
		inet_csk_schedule_ack(sk);
		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
					  TCP_DELACK_MAX, TCP_RTO_MAX);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(buff, MAX_TCP_HEADER);
	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
	 * too much.
	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
	 */
	skb_set_tcp_pure_ack(buff);

	/* Send it off, this clears delayed acks for us. */
	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
}
EXPORT_SYMBOL_GPL(__tcp_send_ack);
f7f24b369   Yuchung Cheng   tcp: helpers to s...
3409
3410
3411
3412
  
  void tcp_send_ack(struct sock *sk)
  {
  	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
  }
  
  /* This routine sends a packet with an out of date sequence
   * number. It assumes the other end will try to ack it.
   *
   * Question: what should we make while urgent mode?
   * 4.4BSD forces sending single byte of data. We cannot send
   * out of window data, because we have SND.NXT==SND.MAX...
   *
   * Current solution: to send TWO zero-length segments in urgent mode:
   * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
   * out-of-date with SND.UNA-1 to probe window.
   */
e520af48c   Eric Dumazet   tcp: add TCPWinPr...
3426
  static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3427
3428
3429
3430
3431
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct sk_buff *skb;
  
  	/* We don't queue it, tcp_transmit_skb() sets ownership. */
7450aaf61   Eric Dumazet   tcp: suppress too...
3432
3433
  	skb = alloc_skb(MAX_TCP_HEADER,
  			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
51456b291   Ian Morris   ipv4: coding styl...
3434
  	if (!skb)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3435
3436
3437
3438
  		return -1;
  
  	/* Reserve space for headers and set control bits. */
  	skb_reserve(skb, MAX_TCP_HEADER);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3439
3440
3441
3442
  	/* Use a previous sequence.  This should cause the other
  	 * end to send an ack.  Don't queue or clone SKB, just
  	 * send it.
  	 */
a3433f35a   Changli Gao   tcp: unify tcp fl...
3443
  	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
e2e8009ff   Renato Westphal   tcp: remove impro...
3444
  	NET_INC_STATS(sock_net(sk), mib);
7450aaf61   Eric Dumazet   tcp: suppress too...
3445
  	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3446
  }
385e20706   Eric Dumazet   tcp: use tp->tcp_...
3447
  /* Called from setsockopt( ... TCP_REPAIR ) */
ee9952831   Pavel Emelyanov   tcp: Initial repa...
3448
3449
3450
3451
  void tcp_send_window_probe(struct sock *sk)
  {
  	if (sk->sk_state == TCP_ESTABLISHED) {
  		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
9a568de48   Eric Dumazet   tcp: switch TCP T...
3452
  		tcp_mstamp_refresh(tcp_sk(sk));
e520af48c   Eric Dumazet   tcp: add TCPWinPr...
3453
  		tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
ee9952831   Pavel Emelyanov   tcp: Initial repa...
3454
3455
  	}
  }
67edfef78   Andi Kleen   TCP: Add comments...
3456
/* Initiate keepalive or window probe from timer.
 * Returns the transmit result (0 on success), or -1 if the socket is
 * closed or fragmentation fails.
 */
int tcp_write_wakeup(struct sock *sk, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (sk->sk_state == TCP_CLOSE)
		return -1;

	skb = tcp_send_head(sk);
	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
		/* There is queued data that fits (at least partially) in the
		 * peer's window: send real data as the probe.
		 */
		int err;
		unsigned int mss = tcp_current_mss(sk);
		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

		/* We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS avoidance ( sender )
		 */
		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
		    skb->len > mss) {
			/* skb does not fit in the window (or exceeds MSS):
			 * split off a sendable chunk.
			 */
			seg_size = min(seg_size, mss);
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
				return -1;
		} else if (!tcp_skb_pcount(skb))
			/* Ensure GSO segment accounting is initialized. */
			tcp_set_skb_tso_segs(skb, mss);

		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
		if (!err)
			tcp_event_new_data_sent(sk, skb);
		return err;
	} else {
		/* No sendable data: emit zero-length probe(s). In urgent mode
		 * send an extra segment at SND.UNA to deliver the urgent
		 * pointer (see tcp_xmit_probe_skb comment).
		 */
		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
			tcp_xmit_probe_skb(sk, 1, mib);
		return tcp_xmit_probe_skb(sk, 0, mib);
	}
}
  
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	unsigned long probe_max;
	int err;

	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);

	if (tp->packets_out || !tcp_send_head(sk)) {
		/* Cancel probe timer, if it is not required. */
		icsk->icsk_probes_out = 0;
		icsk->icsk_backoff = 0;
		return;
	}

	if (err <= 0) {
		/* Probe was sent (or socket closed): back off exponentially,
		 * capped by the retries2 sysctl.
		 */
		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
			icsk->icsk_backoff++;
		icsk->icsk_probes_out++;
		probe_max = TCP_RTO_MAX;
	} else {
		/* If packet was not sent due to local congestion,
		 * do not backoff and do not remember icsk_probes_out.
		 * Let local senders to fight for local resources.
		 *
		 * Use accumulated backoff yet.
		 */
		if (!icsk->icsk_probes_out)
			icsk->icsk_probes_out = 1;
		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
	}
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
				  tcp_probe0_when(sk, probe_max),
				  TCP_RTO_MAX);
}
5db92c994   Octavian Purdila   tcp: unify tcp_v4...
3537

ea3bea3a1   Eric Dumazet   tcp/dccp: constif...
3538
/* Retransmit a SYN-ACK for @req via the address-family specific
 * send_synack() hook, updating retransmission statistics on success.
 * Returns 0 on success, otherwise the send_synack() error.
 */
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
	struct flowi fl;
	int res;

	/* Pick a fresh tx hash so the retransmit may take a different path. */
	tcp_rsk(req)->txhash = net_tx_rndhash();
	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
	if (!res) {
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
		/* For passive Fast Open the child socket already exists;
		 * account the retransmit against it.
		 */
		if (unlikely(tcp_passive_fastopen(sk)))
			tcp_sk(sk)->total_retrans++;
	}
	return res;
}
EXPORT_SYMBOL(tcp_rtx_synack);