Blame view

net/ipv4/tcp_output.c 111 KB
457c89965   Thomas Gleixner   treewide: Add SPD...
1
  // SPDX-License-Identifier: GPL-2.0-only
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2
3
4
5
6
7
8
  /*
   * INET		An implementation of the TCP/IP protocol suite for the LINUX
   *		operating system.  INET is implemented using the  BSD Socket
   *		interface as the means of communication with the user level.
   *
   *		Implementation of the Transmission Control Protocol(TCP).
   *
02c30a84e   Jesper Juhl   [PATCH] update Ro...
9
   * Authors:	Ross Biro
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
   *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
   *		Mark Evans, <evansmp@uhura.aston.ac.uk>
   *		Corey Minyard <wf-rch!minyard@relay.EU.net>
   *		Florian La Roche, <flla@stud.uni-sb.de>
   *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
   *		Linus Torvalds, <torvalds@cs.helsinki.fi>
   *		Alan Cox, <gw4pts@gw4pts.ampr.org>
   *		Matthew Dillon, <dillon@apollo.west.oic.com>
   *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
   *		Jorge Cwik, <jorge@laser.satlink.net>
   */
  
  /*
   * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
   *				:	Fragmentation on mtu decrease
   *				:	Segment collapse on retransmit
   *				:	AF independence
   *
   *		Linus Torvalds	:	send_delayed_ack
   *		David S. Miller	:	Charge memory using the right skb
   *					during syn/ack processing.
   *		David S. Miller :	Output engine completely rewritten.
   *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
   *		Cacophonix Gaul :	draft-minshall-nagle-01
   *		J Hadi Salim	:	ECN support
   *
   */
91df42bed   Joe Perches   net: ipv4 and ipv...
37
  #define pr_fmt(fmt) "TCP: " fmt
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
38
39
40
  #include <net/tcp.h>
  
  #include <linux/compiler.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
41
  #include <linux/gfp.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
42
  #include <linux/module.h>
60e2a7780   Ursula Braun   tcp: TCP experime...
43
  #include <linux/static_key.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
44

e086101b1   Cong Wang   tcp: add a tracep...
45
  #include <trace/events/tcp.h>
35089bb20   David S. Miller   [TCP]: Add tcp_sl...
46

9799ccb0e   Eric Dumazet   tcp: add tcp_wsta...
47
48
49
50
51
52
/* Refresh clocks of a TCP socket,
 * ensuring monotonically increasing values.
 */
void tcp_mstamp_refresh(struct tcp_sock *tp)
{
	u64 val = tcp_clock_ns();

	/* Cache the raw ns clock and derive the usec timestamp from the
	 * SAME reading so both fields stay mutually consistent.
	 */
	tp->tcp_clock_cache = val;
	tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
}
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
56
57
  static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
  			   int push_one, gfp_t gfp);
519855c50   William Allen Simpson   TCPCT part 1c: sy...
58

67edfef78   Andi Kleen   TCP: Add comments...
59
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	/* Snapshot taken BEFORE packets_out is bumped below: used to detect
	 * the first packet put in flight.
	 */
	unsigned int prior_packets = tp->packets_out;

	WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);

	/* The skb moves from the write queue into the retransmit rbtree. */
	__skb_unlink(skb, &sk->sk_write_queue);
	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);

	if (tp->highest_sack == NULL)
		tp->highest_sack = skb;

	tp->packets_out += tcp_skb_pcount(skb);
	/* Re-arm the RTO when this is the first packet in flight, or when a
	 * tail-loss-probe timer is currently pending.
	 */
	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
		tcp_rearm_rto(sk);

	NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
		      tcp_skb_pcount(skb));
}
a4ecb15a2   Cui, Cheng   tcp: accommodate ...
79
80
  /* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
   * window scaling factor due to loss of precision.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
81
82
83
84
85
   * If window has been shrunk, what should we make? It is not clear at all.
   * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
   * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
   * invalid. OK, let's make this for now:
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
86
  static inline __u32 tcp_acceptable_seq(const struct sock *sk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
87
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
88
  	const struct tcp_sock *tp = tcp_sk(sk);
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
89

a4ecb15a2   Cui, Cheng   tcp: accommodate ...
90
91
92
  	if (!before(tcp_wnd_end(tp), tp->snd_nxt) ||
  	    (tp->rx_opt.wscale_ok &&
  	     ((tp->snd_nxt - tcp_wnd_end(tp)) < (1 << tp->rx_opt.rcv_wscale))))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
93
94
  		return tp->snd_nxt;
  	else
90840defa   Ilpo Järvinen   [TCP]: Introduce ...
95
  		return tcp_wnd_end(tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
  }
  
  /* Calculate mss to advertise in SYN segment.
   * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
   *
   * 1. It is independent of path mtu.
   * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
   * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
   *    attached devices, because some buggy hosts are confused by
   *    large MSS.
   * 4. We do not make 3, we advertise MSS, calculated from first
   *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
   *    This may be overridden via information stored in routing table.
   * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
   *    probably even Jumbo".
   */
  static __u16 tcp_advertise_mss(struct sock *sk)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
cf533ea53   Eric Dumazet   tcp: add const qu...
115
  	const struct dst_entry *dst = __sk_dst_get(sk);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
116
  	int mss = tp->advmss;
0dbaee3b3   David S. Miller   net: Abstract def...
117
118
119
120
121
122
123
  	if (dst) {
  		unsigned int metric = dst_metric_advmss(dst);
  
  		if (metric < mss) {
  			mss = metric;
  			tp->advmss = mss;
  		}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
124
125
126
127
128
129
  	}
  
  	return (__u16)mss;
  }
  
  /* RFC2861. Reset CWND after idle period longer RTO to "restart window".
6f021c62d   Eric Dumazet   tcp: fix slow sta...
130
131
132
   * This is the first part of cwnd validation mechanism.
   */
void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	/* Never restart above the current cwnd. */
	restart_cwnd = min(restart_cwnd, cwnd);

	/* Halve cwnd once for every full RTO contained in the idle time
	 * (delta), but never drop below the restart window.
	 */
	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_jiffies32;
	tp->snd_cwnd_used = 0;
}
67edfef78   Andi Kleen   TCP: Add comments...
147
/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_jiffies32;

	/* Nothing was in flight: tell the congestion module that
	 * transmission is (re)starting.
	 */
	if (tcp_packets_in_flight(tp) == 0)
		tcp_ca_event(sk, CA_EVENT_TX_START);

	/* If this is the first data packet sent in response to the
	 * previous received data,
	 * and it is a reply for ato after last received packet,
	 * increase pingpong count.
	 */
	if (before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
	    (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		inet_csk_inc_pingpong_cnt(sk);

	tp->lsndtime = now;
}
67edfef78   Andi Kleen   TCP: Add comments...
167
/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
				      u32 rcv_nxt)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* This ACK covers any ACKs the compression timer was still holding:
	 * account them, reset the counter, and cancel the pending hrtimer
	 * (dropping its socket reference if the cancel won).
	 */
	if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
			      tp->compressed_ack - TCP_FASTRETRANS_THRESH);
		tp->compressed_ack = TCP_FASTRETRANS_THRESH;
		if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
			__sock_put(sk);
	}

	if (unlikely(rcv_nxt != tp->rcv_nxt))
		return;  /* Special ACK sent by DCTCP to reflect ECN */
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
  
  /* Determine a window scaling and initial window to offer.
   * Based on the assumption that the given amount of space
   * will be offered. Store the results in the tp structure.
   * NOTE: for smooth operation initial space offering should
   * be a multiple of mss if possible. We assume here that mss >= 1.
   * This MUST be enforced by all callers.
   */
ceef9ab6b   Eric Dumazet   tcp: Namespace-if...
193
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale,
			       __u32 init_rcv_wnd)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = rounddown(space, mss);

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = min_t(u32, space, U16_MAX);

	/* Honor an explicitly requested initial window (in mss units). */
	if (init_rcv_wnd)
		*rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);

	*rcv_wscale = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window */
		space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
		space = max_t(u32, space, sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		/* Smallest scale factor that can represent `space`. */
		*rcv_wscale = clamp_t(int, ilog2(space) - 15,
				      0, TCP_MAX_WSCALE);
	}
	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
}
EXPORT_SYMBOL(tcp_select_initial_window);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
238
239
240
241
242
243
  
  /* Chose a new window to advertise, update state in tcp_sock for the
   * socket, and return result with RFC1323 scaling applied.  The return
   * value can be stuffed directly into th->window for an outgoing
   * frame.
   */
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
244
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 old_win = tp->rcv_wnd;
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		if (new_win == 0)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPWANTZEROWINDOWADV);
		/* Round the current window up to a whole scaling quantum. */
		new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale &&
	    sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0) {
		tp->pred_flags = 0;
		if (old_win)
			NET_INC_STATS(sock_net(sk),
				      LINUX_MIB_TCPTOZEROWINDOWADV);
	} else if (old_win == 0) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
	}

	return new_win;
}
67edfef78   Andi Kleen   TCP: Add comments...
291
  /* Packet ECN state for a SYN-ACK */
735d38311   Florian Westphal   tcp: change TCP_E...
292
  static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
293
  {
30e502a34   Daniel Borkmann   net: tcp: add fla...
294
  	const struct tcp_sock *tp = tcp_sk(sk);
4de075e04   Eric Dumazet   tcp: rename tcp_s...
295
  	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
056834d9f   Ilpo Järvinen   [TCP]: cleanup tc...
296
  	if (!(tp->ecn_flags & TCP_ECN_OK))
4de075e04   Eric Dumazet   tcp: rename tcp_s...
297
  		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
91b5b21c7   Lawrence Brakmo   bpf: Add support ...
298
299
  	else if (tcp_ca_needs_ecn(sk) ||
  		 tcp_bpf_ca_needs_ecn(sk))
30e502a34   Daniel Borkmann   net: tcp: add fla...
300
  		INET_ECN_xmit(sk);
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
301
  }
67edfef78   Andi Kleen   TCP: Add comments...
302
  /* Packet ECN state for a SYN.  */
735d38311   Florian Westphal   tcp: change TCP_E...
303
  static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
304
305
  {
  	struct tcp_sock *tp = tcp_sk(sk);
91b5b21c7   Lawrence Brakmo   bpf: Add support ...
306
  	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
f7b3bec6f   Florian Westphal   net: allow settin...
307
  	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
91b5b21c7   Lawrence Brakmo   bpf: Add support ...
308
  		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
f7b3bec6f   Florian Westphal   net: allow settin...
309
310
311
312
313
314
315
  
  	if (!use_ecn) {
  		const struct dst_entry *dst = __sk_dst_get(sk);
  
  		if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
  			use_ecn = true;
  	}
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
316
317
  
  	tp->ecn_flags = 0;
f7b3bec6f   Florian Westphal   net: allow settin...
318
319
  
  	if (use_ecn) {
4de075e04   Eric Dumazet   tcp: rename tcp_s...
320
  		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
321
  		tp->ecn_flags = TCP_ECN_OK;
91b5b21c7   Lawrence Brakmo   bpf: Add support ...
322
  		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
30e502a34   Daniel Borkmann   net: tcp: add fla...
323
  			INET_ECN_xmit(sk);
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
324
325
  	}
  }
492135557   Daniel Borkmann   tcp: add rfc3168,...
326
327
328
329
330
331
332
333
  static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
  {
  	if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
  		/* tp->ecn_flags are cleared at a later point in time when
  		 * SYN ACK is ultimatively being received.
  		 */
  		TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
  }
735d38311   Florian Westphal   tcp: change TCP_E...
334
  static void
6ac705b18   Eric Dumazet   tcp: remove tcp_e...
335
  tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
336
  {
6ac705b18   Eric Dumazet   tcp: remove tcp_e...
337
  	if (inet_rsk(req)->ecn_ok)
bdf1ee5d3   Ilpo Järvinen   [TCP]: Move code ...
338
339
  		th->ece = 1;
  }
67edfef78   Andi Kleen   TCP: Add comments...
340
341
342
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
 * be sent.
 */
static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
			 struct tcphdr *th, int tcp_header_len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->ecn_flags & TCP_ECN_OK) {
		/* Not-retransmitted data segment: set ECT and inject CWR. */
		if (skb->len != tcp_header_len &&
		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
			INET_ECN_xmit(sk);
			/* A queued CWR is consumed by exactly one segment. */
			if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) {
				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				th->cwr = 1;
				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
			}
		} else if (!tcp_ca_needs_ecn(sk)) {
			/* ACK or retransmitted segment: clear ECT|CE */
			INET_ECN_dontxmit(sk);
		}
		/* Echo congestion back to the peer until it sends CWR. */
		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
			th->ece = 1;
	}
}
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
366
367
368
369
370
  /* Constructs common control bits of non-data skb. If SYN/FIN is present,
   * auto increment end seqno.
   */
  static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
  {
2e8e18ef5   David S. Miller   tcp: Set CHECKSUM...
371
  	skb->ip_summed = CHECKSUM_PARTIAL;
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
372

4de075e04   Eric Dumazet   tcp: rename tcp_s...
373
  	TCP_SKB_CB(skb)->tcp_flags = flags;
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
374
  	TCP_SKB_CB(skb)->sacked = 0;
cd7d8498c   Eric Dumazet   tcp: change tcp_s...
375
  	tcp_skb_pcount_set(skb, 1);
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
376
377
  
  	TCP_SKB_CB(skb)->seq = seq;
a3433f35a   Changli Gao   tcp: unify tcp fl...
378
  	if (flags & (TCPHDR_SYN | TCPHDR_FIN))
e870a8efc   Ilpo Järvinen   [TCP]: Perform se...
379
380
381
  		seq++;
  	TCP_SKB_CB(skb)->end_seq = seq;
  }
a2a385d62   Eric Dumazet   tcp: bool convers...
382
  static inline bool tcp_urg_mode(const struct tcp_sock *tp)
33f5f57ee   Ilpo Järvinen   tcp: kill pointle...
383
384
385
  {
  	return tp->snd_una != tp->snd_up;
  }
33ad798c9   Adam Langley   tcp: options clea...
386
387
388
  #define OPTION_SACK_ADVERTISE	(1 << 0)
  #define OPTION_TS		(1 << 1)
  #define OPTION_MD5		(1 << 2)
89e95a613   Ori Finkelman   IPv4 TCP fails to...
389
  #define OPTION_WSCALE		(1 << 3)
2100c8d2d   Yuchung Cheng   net-tcp: Fast Ope...
390
  #define OPTION_FAST_OPEN_COOKIE	(1 << 8)
60e2a7780   Ursula Braun   tcp: TCP experime...
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
  #define OPTION_SMC		(1 << 9)
  
  static void smc_options_write(__be32 *ptr, u16 *options)
  {
  #if IS_ENABLED(CONFIG_SMC)
  	if (static_branch_unlikely(&tcp_have_smc)) {
  		if (unlikely(OPTION_SMC & *options)) {
  			*ptr++ = htonl((TCPOPT_NOP  << 24) |
  				       (TCPOPT_NOP  << 16) |
  				       (TCPOPT_EXP <<  8) |
  				       (TCPOLEN_EXP_SMC_BASE));
  			*ptr++ = htonl(TCPOPT_SMC_MAGIC);
  		}
  	}
  #endif
  }
33ad798c9   Adam Langley   tcp: options clea...
407
408
  
/* Options selected for one outgoing segment, prior to wire encoding
 * by tcp_options_write().
 */
struct tcp_out_options {
	u16 options;		/* bit field of OPTION_* */
	u16 mss;		/* 0 to disable */
	u8 ws;			/* window scale, 0 to disable */
	u8 num_sack_blocks;	/* number of SACK blocks to include */
	u8 hash_size;		/* bytes in hash_location */
	__u8 *hash_location;	/* temporary pointer, overloaded */
	__u32 tsval, tsecr;	/* need to include OPTION_TS */
	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
};
67edfef78   Andi Kleen   TCP: Add comments...
418
419
420
  /* Write previously computed TCP options to the packet.
   *
   * Beware: Something in the Internet is very sensitive to the ordering of
fd6149d33   Ilpo Järvinen   tcp: Restore orde...
421
422
   * TCP options, we learned this through the hard way, so be careful here.
   * Luckily we can at least blame others for their non-compliance but from
8e3bff96a   stephen hemminger   net: more spellin...
423
   * inter-operability perspective it seems that we're somewhat stuck with
fd6149d33   Ilpo Järvinen   tcp: Restore orde...
424
425
426
427
428
429
430
   * the ordering which we have been using if we want to keep working with
   * those broken things (not that it currently hurts anybody as there isn't
   * particular reason why the ordering would need to be changed).
   *
   * At least SACK_PERM as the first option is known to lead to a disaster
   * (but it may well be that other scenarios fail similarly).
   */
33ad798c9   Adam Langley   tcp: options clea...
431
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
			      struct tcp_out_options *opts)
{
	u16 options = opts->options;	/* mungable copy */

	if (unlikely(OPTION_MD5 & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
		/* overload cookie hash location */
		opts->hash_location = (__u8 *)ptr;
		/* Reserve 16 bytes; the digest is filled in later. */
		ptr += 4;
	}

	if (unlikely(opts->mss)) {
		*ptr++ = htonl((TCPOPT_MSS << 24) |
			       (TCPOLEN_MSS << 16) |
			       opts->mss);
	}

	if (likely(OPTION_TS & options)) {
		/* SACK_PERM piggybacks in the same word as the timestamp
		 * kind/length when both are present.
		 */
		if (unlikely(OPTION_SACK_ADVERTISE & options)) {
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
			options &= ~OPTION_SACK_ADVERTISE;
		} else {
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		}
		*ptr++ = htonl(opts->tsval);
		*ptr++ = htonl(opts->tsecr);
	}

	if (unlikely(OPTION_SACK_ADVERTISE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);
	}

	if (unlikely(OPTION_WSCALE & options)) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       opts->ws);
	}

	if (unlikely(opts->num_sack_blocks)) {
		/* A pending D-SACK takes precedence over regular blocks. */
		struct tcp_sack_block *sp = tp->rx_opt.dsack ?
			tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP  << 24) |
			       (TCPOPT_NOP  << 16) |
			       (TCPOPT_SACK <<  8) |
			       (TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
						     TCPOLEN_SACK_PERBLOCK)));

		for (this_sack = 0; this_sack < opts->num_sack_blocks;
		     ++this_sack) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}

		/* The D-SACK has been emitted once; don't repeat it. */
		tp->rx_opt.dsack = 0;
	}

	if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
		struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
		u8 *p = (u8 *)ptr;
		u32 len; /* Fast Open option length */

		if (foc->exp) {
			/* Experimental (RFC6994-style) encoding with magic. */
			len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
			*ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
				     TCPOPT_FASTOPEN_MAGIC);
			p += TCPOLEN_EXP_FASTOPEN_BASE;
		} else {
			/* Standard RFC7413 encoding. */
			len = TCPOLEN_FASTOPEN_BASE + foc->len;
			*p++ = TCPOPT_FASTOPEN;
			*p++ = len;
		}

		memcpy(p, foc->val, foc->len);
		/* Pad to a 32-bit boundary with NOPs when needed. */
		if ((len & 3) == 2) {
			p[foc->len] = TCPOPT_NOP;
			p[foc->len + 1] = TCPOPT_NOP;
		}
		ptr += (len + 3) >> 2;
	}

	smc_options_write(ptr, &options);
}
  
  static void smc_set_option(const struct tcp_sock *tp,
  			   struct tcp_out_options *opts,
  			   unsigned int *remaining)
  {
  #if IS_ENABLED(CONFIG_SMC)
  	if (static_branch_unlikely(&tcp_have_smc)) {
  		if (tp->syn_smc) {
  			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
  				opts->options |= OPTION_SMC;
  				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
  			}
  		}
  	}
  #endif
  }
  
  static void smc_set_option_cond(const struct tcp_sock *tp,
  				const struct inet_request_sock *ireq,
  				struct tcp_out_options *opts,
  				unsigned int *remaining)
  {
  #if IS_ENABLED(CONFIG_SMC)
  	if (static_branch_unlikely(&tcp_have_smc)) {
  		if (tp->syn_smc && ireq->smc_ok) {
  			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
  				opts->options |= OPTION_SMC;
  				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
  			}
  		}
  	}
  #endif
33ad798c9   Adam Langley   tcp: options clea...
556
  }
67edfef78   Andi Kleen   TCP: Add comments...
557
558
559
  /* Compute TCP options for SYN packets. This is not the final
   * network wire format yet.
   */
95c961747   Eric Dumazet   net: cleanup unsi...
560
  static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
33ad798c9   Adam Langley   tcp: options clea...
561
  				struct tcp_out_options *opts,
cf533ea53   Eric Dumazet   tcp: add const qu...
562
563
  				struct tcp_md5sig_key **md5)
  {
33ad798c9   Adam Langley   tcp: options clea...
564
  	struct tcp_sock *tp = tcp_sk(sk);
95c961747   Eric Dumazet   net: cleanup unsi...
565
  	unsigned int remaining = MAX_TCP_OPTION_SPACE;
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
566
  	struct tcp_fastopen_request *fastopen = tp->fastopen_req;
33ad798c9   Adam Langley   tcp: options clea...
567

8c2320e84   Eric Dumazet   tcp: md5: only ca...
568
  	*md5 = NULL;
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
569
  #ifdef CONFIG_TCP_MD5SIG
921f9a0f2   Eric Dumazet   tcp: convert tcp_...
570
  	if (static_branch_unlikely(&tcp_md5_needed) &&
6015c71e6   Eric Dumazet   tcp: md5: add tcp...
571
  	    rcu_access_pointer(tp->md5sig_info)) {
8c2320e84   Eric Dumazet   tcp: md5: only ca...
572
573
574
575
576
  		*md5 = tp->af_specific->md5_lookup(sk, sk);
  		if (*md5) {
  			opts->options |= OPTION_MD5;
  			remaining -= TCPOLEN_MD5SIG_ALIGNED;
  		}
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
577
578
  	}
  #endif
33ad798c9   Adam Langley   tcp: options clea...
579
580
581
582
583
584
585
586
587
588
589
  
  	/* We always get an MSS option.  The option bytes which will be seen in
  	 * normal data packets should timestamps be used, must be in the MSS
  	 * advertised.  But we subtract them from tp->mss_cache so that
  	 * calculations in tcp_sendmsg are simpler etc.  So account for this
  	 * fact here if necessary.  If we don't do this correctly, as a
  	 * receiver we won't recognize data packets as being full sized when we
  	 * should, and thus we won't abide by the delayed ACK rules correctly.
  	 * SACKs don't matter, we never delay an ACK when we have any of those
  	 * going out.  */
  	opts->mss = tcp_advertise_mss(sk);
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
590
  	remaining -= TCPOLEN_MSS_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
591

5d2ed0521   Eric Dumazet   tcp: Namespaceify...
592
  	if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
33ad798c9   Adam Langley   tcp: options clea...
593
  		opts->options |= OPTION_TS;
7faee5c0d   Eric Dumazet   tcp: remove TCP_S...
594
  		opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
33ad798c9   Adam Langley   tcp: options clea...
595
  		opts->tsecr = tp->rx_opt.ts_recent;
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
596
  		remaining -= TCPOLEN_TSTAMP_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
597
  	}
9bb37ef00   Eric Dumazet   tcp: Namespaceify...
598
  	if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
33ad798c9   Adam Langley   tcp: options clea...
599
  		opts->ws = tp->rx_opt.rcv_wscale;
89e95a613   Ori Finkelman   IPv4 TCP fails to...
600
  		opts->options |= OPTION_WSCALE;
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
601
  		remaining -= TCPOLEN_WSCALE_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
602
  	}
f93010342   Eric Dumazet   tcp: Namespaceify...
603
  	if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
33ad798c9   Adam Langley   tcp: options clea...
604
  		opts->options |= OPTION_SACK_ADVERTISE;
b32d13102   David S. Miller   tcp: Fix bitmask ...
605
  		if (unlikely(!(OPTION_TS & opts->options)))
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
606
  			remaining -= TCPOLEN_SACKPERM_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
607
  	}
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
608
  	if (fastopen && fastopen->cookie.len >= 0) {
2646c831c   Daniel Lee   tcp: RFC7413 opti...
609
610
611
612
  		u32 need = fastopen->cookie.len;
  
  		need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
  					       TCPOLEN_FASTOPEN_BASE;
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
613
614
615
616
617
618
  		need = (need + 3) & ~3U;  /* Align to 32 bits */
  		if (remaining >= need) {
  			opts->options |= OPTION_FAST_OPEN_COOKIE;
  			opts->fastopen_cookie = &fastopen->cookie;
  			remaining -= need;
  			tp->syn_fastopen = 1;
2646c831c   Daniel Lee   tcp: RFC7413 opti...
619
  			tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
620
621
  		}
  	}
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
622

60e2a7780   Ursula Braun   tcp: TCP experime...
623
  	smc_set_option(tp, opts, &remaining);
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
624
  	return MAX_TCP_OPTION_SPACE - remaining;
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
625
  }
67edfef78   Andi Kleen   TCP: Add comments...
626
  /* Set up TCP options for SYN-ACKs. */
60e2a7780   Ursula Braun   tcp: TCP experime...
627
628
  static unsigned int tcp_synack_options(const struct sock *sk,
  				       struct request_sock *req,
37bfbdda0   Eric Dumazet   tcp: remove tcp_s...
629
630
631
632
  				       unsigned int mss, struct sk_buff *skb,
  				       struct tcp_out_options *opts,
  				       const struct tcp_md5sig_key *md5,
  				       struct tcp_fastopen_cookie *foc)
4957faade   William Allen Simpson   TCPCT part 1g: Re...
633
  {
33ad798c9   Adam Langley   tcp: options clea...
634
  	struct inet_request_sock *ireq = inet_rsk(req);
95c961747   Eric Dumazet   net: cleanup unsi...
635
  	unsigned int remaining = MAX_TCP_OPTION_SPACE;
33ad798c9   Adam Langley   tcp: options clea...
636

cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
637
  #ifdef CONFIG_TCP_MD5SIG
80f03e27a   Eric Dumazet   tcp: md5: fix rcu...
638
  	if (md5) {
33ad798c9   Adam Langley   tcp: options clea...
639
  		opts->options |= OPTION_MD5;
4957faade   William Allen Simpson   TCPCT part 1g: Re...
640
641
642
643
644
645
646
  		remaining -= TCPOLEN_MD5SIG_ALIGNED;
  
  		/* We can't fit any SACK blocks in a packet with MD5 + TS
  		 * options. There was discussion about disabling SACK
  		 * rather than TS in order to fit in better with old,
  		 * buggy kernels, but that was deemed to be unnecessary.
  		 */
de213e5ee   Eric Dumazet   tcp: tcp_synack_o...
647
  		ireq->tstamp_ok &= !ireq->sack_ok;
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
648
649
  	}
  #endif
33ad798c9   Adam Langley   tcp: options clea...
650

4957faade   William Allen Simpson   TCPCT part 1g: Re...
651
  	/* We always send an MSS option. */
33ad798c9   Adam Langley   tcp: options clea...
652
  	opts->mss = mss;
4957faade   William Allen Simpson   TCPCT part 1g: Re...
653
  	remaining -= TCPOLEN_MSS_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
654
655
656
  
  	if (likely(ireq->wscale_ok)) {
  		opts->ws = ireq->rcv_wscale;
89e95a613   Ori Finkelman   IPv4 TCP fails to...
657
  		opts->options |= OPTION_WSCALE;
4957faade   William Allen Simpson   TCPCT part 1g: Re...
658
  		remaining -= TCPOLEN_WSCALE_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
659
  	}
de213e5ee   Eric Dumazet   tcp: tcp_synack_o...
660
  	if (likely(ireq->tstamp_ok)) {
33ad798c9   Adam Langley   tcp: options clea...
661
  		opts->options |= OPTION_TS;
95a22caee   Florian Westphal   tcp: randomize tc...
662
  		opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
33ad798c9   Adam Langley   tcp: options clea...
663
  		opts->tsecr = req->ts_recent;
4957faade   William Allen Simpson   TCPCT part 1g: Re...
664
  		remaining -= TCPOLEN_TSTAMP_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
665
666
667
  	}
  	if (likely(ireq->sack_ok)) {
  		opts->options |= OPTION_SACK_ADVERTISE;
de213e5ee   Eric Dumazet   tcp: tcp_synack_o...
668
  		if (unlikely(!ireq->tstamp_ok))
4957faade   William Allen Simpson   TCPCT part 1g: Re...
669
  			remaining -= TCPOLEN_SACKPERM_ALIGNED;
33ad798c9   Adam Langley   tcp: options clea...
670
  	}
7f9b838b7   Daniel Lee   tcp: RFC7413 opti...
671
672
673
674
675
  	if (foc != NULL && foc->len >= 0) {
  		u32 need = foc->len;
  
  		need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
  				   TCPOLEN_FASTOPEN_BASE;
8336886f7   Jerry Chu   tcp: TCP Fast Ope...
676
677
678
679
680
681
682
  		need = (need + 3) & ~3U;  /* Align to 32 bits */
  		if (remaining >= need) {
  			opts->options |= OPTION_FAST_OPEN_COOKIE;
  			opts->fastopen_cookie = foc;
  			remaining -= need;
  		}
  	}
1a2c6181c   Christoph Paasch   tcp: Remove TCPCT
683

60e2a7780   Ursula Braun   tcp: TCP experime...
684
  	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
4957faade   William Allen Simpson   TCPCT part 1g: Re...
685
  	return MAX_TCP_OPTION_SPACE - remaining;
33ad798c9   Adam Langley   tcp: options clea...
686
  }
67edfef78   Andi Kleen   TCP: Add comments...
687
688
689
  /* Compute TCP options for ESTABLISHED sockets. This is not the
   * final wire format yet.
   */
95c961747   Eric Dumazet   net: cleanup unsi...
690
  static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
33ad798c9   Adam Langley   tcp: options clea...
691
  					struct tcp_out_options *opts,
cf533ea53   Eric Dumazet   tcp: add const qu...
692
693
  					struct tcp_md5sig_key **md5)
  {
33ad798c9   Adam Langley   tcp: options clea...
694
  	struct tcp_sock *tp = tcp_sk(sk);
95c961747   Eric Dumazet   net: cleanup unsi...
695
  	unsigned int size = 0;
cabeccbd1   Ilpo Järvinen   tcp: kill eff_sac...
696
  	unsigned int eff_sacks;
33ad798c9   Adam Langley   tcp: options clea...
697

5843ef421   Andi Kleen   tcp: Always set o...
698
  	opts->options = 0;
8c2320e84   Eric Dumazet   tcp: md5: only ca...
699
  	*md5 = NULL;
33ad798c9   Adam Langley   tcp: options clea...
700
  #ifdef CONFIG_TCP_MD5SIG
921f9a0f2   Eric Dumazet   tcp: convert tcp_...
701
  	if (static_branch_unlikely(&tcp_md5_needed) &&
6015c71e6   Eric Dumazet   tcp: md5: add tcp...
702
  	    rcu_access_pointer(tp->md5sig_info)) {
8c2320e84   Eric Dumazet   tcp: md5: only ca...
703
704
705
706
707
  		*md5 = tp->af_specific->md5_lookup(sk, sk);
  		if (*md5) {
  			opts->options |= OPTION_MD5;
  			size += TCPOLEN_MD5SIG_ALIGNED;
  		}
33ad798c9   Adam Langley   tcp: options clea...
708
  	}
33ad798c9   Adam Langley   tcp: options clea...
709
710
711
712
  #endif
  
  	if (likely(tp->rx_opt.tstamp_ok)) {
  		opts->options |= OPTION_TS;
7faee5c0d   Eric Dumazet   tcp: remove TCP_S...
713
  		opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
33ad798c9   Adam Langley   tcp: options clea...
714
715
716
  		opts->tsecr = tp->rx_opt.ts_recent;
  		size += TCPOLEN_TSTAMP_ALIGNED;
  	}
cabeccbd1   Ilpo Järvinen   tcp: kill eff_sac...
717
718
  	eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
  	if (unlikely(eff_sacks)) {
95c961747   Eric Dumazet   net: cleanup unsi...
719
  		const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
33ad798c9   Adam Langley   tcp: options clea...
720
  		opts->num_sack_blocks =
95c961747   Eric Dumazet   net: cleanup unsi...
721
  			min_t(unsigned int, eff_sacks,
33ad798c9   Adam Langley   tcp: options clea...
722
723
  			      (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
  			      TCPOLEN_SACK_PERBLOCK);
2fc7d173e   Eric Dumazet   tcp: md5: fix pot...
724
725
726
  		if (likely(opts->num_sack_blocks))
  			size += TCPOLEN_SACK_BASE_ALIGNED +
  				opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
33ad798c9   Adam Langley   tcp: options clea...
727
728
729
  	}
  
  	return size;
40efc6fa1   Stephen Hemminger   [TCP]: less inline's
730
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
731

46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
732
733
734
735
736
737
738
739
740
  
/* TCP SMALL QUEUES (TSQ)
 *
 * TSQ goal is to keep small amount of skbs per tcp flow in tx queues (qdisc+dev)
 * to reduce RTT and bufferbloat.
 * We do this using a special skb destructor (tcp_wfree).
 *
 * Its important tcp_wfree() can be replaced by sock_wfree() in the event skb
 * needs to be reallocated in a driver.
 * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
 *
 * Since transmit from skb destructor is forbidden, we use a tasklet
 * to process all sockets that eventually need to send more skbs.
 * We use one tasklet per cpu, with its own queue of sockets.
 */
struct tsq_tasklet {
	struct tasklet_struct	tasklet;
	struct list_head	head; /* queue of tcp sockets */
};
/* One TSQ tasklet (and socket queue) per possible CPU */
static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
73a6bab5a   Eric Dumazet   tcp: switch pacin...
752
  static void tcp_tsq_write(struct sock *sk)
6f458dfb4   Eric Dumazet   tcp: improve late...
753
754
755
  {
  	if ((1 << sk->sk_state) &
  	    (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING |
f9616c35a   Eric Dumazet   tcp: implement TS...
756
757
758
759
  	     TCPF_CLOSE_WAIT  | TCPF_LAST_ACK)) {
  		struct tcp_sock *tp = tcp_sk(sk);
  
  		if (tp->lost_out > tp->retrans_out &&
3a91d29f2   Koichiro Den   tcp: do tcp_mstam...
760
761
  		    tp->snd_cwnd > tcp_packets_in_flight(tp)) {
  			tcp_mstamp_refresh(tp);
f9616c35a   Eric Dumazet   tcp: implement TS...
762
  			tcp_xmit_retransmit_queue(sk);
3a91d29f2   Koichiro Den   tcp: do tcp_mstam...
763
  		}
f9616c35a   Eric Dumazet   tcp: implement TS...
764
765
  
  		tcp_write_xmit(sk, tcp_current_mss(sk), tp->nonagle,
bf06200e7   John Ogness   tcp: tsq: fix non...
766
  			       0, GFP_ATOMIC);
f9616c35a   Eric Dumazet   tcp: implement TS...
767
  	}
6f458dfb4   Eric Dumazet   tcp: improve late...
768
  }
73a6bab5a   Eric Dumazet   tcp: switch pacin...
769
770
771
772
773
774
775
776
777
778
  
  static void tcp_tsq_handler(struct sock *sk)
  {
  	bh_lock_sock(sk);
  	if (!sock_owned_by_user(sk))
  		tcp_tsq_write(sk);
  	else if (!test_and_set_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
  		sock_hold(sk);
  	bh_unlock_sock(sk);
  }
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
779
  /*
8e3bff96a   stephen hemminger   net: more spellin...
780
   * One tasklet per cpu tries to send more skbs.
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
781
   * We run in tasklet context but need to disable irqs when
8e3bff96a   stephen hemminger   net: more spellin...
782
   * transferring tsq->head because tcp_wfree() might
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
   * interrupt us (non NAPI drivers)
   */
  static void tcp_tasklet_func(unsigned long data)
  {
  	struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
  	LIST_HEAD(list);
  	unsigned long flags;
  	struct list_head *q, *n;
  	struct tcp_sock *tp;
  	struct sock *sk;
  
  	local_irq_save(flags);
  	list_splice_init(&tsq->head, &list);
  	local_irq_restore(flags);
  
  	list_for_each_safe(q, n, &list) {
  		tp = list_entry(q, struct tcp_sock, tsq_node);
  		list_del(&tp->tsq_node);
  
  		sk = (struct sock *)tp;
0a9648f12   Eric Dumazet   tcp: add a missin...
803
  		smp_mb__before_atomic();
7aa5470c2   Eric Dumazet   tcp: tsq: move ts...
804
  		clear_bit(TSQ_QUEUED, &sk->sk_tsq_flags);
73a6bab5a   Eric Dumazet   tcp: switch pacin...
805
  		tcp_tsq_handler(sk);
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
806
807
808
  		sk_free(sk);
  	}
  }
40fc3423b   Eric Dumazet   tcp: tsq: add tsq...
809
810
811
812
  #define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED |		\
  			  TCPF_WRITE_TIMER_DEFERRED |	\
  			  TCPF_DELACK_TIMER_DEFERRED |	\
  			  TCPF_MTU_REDUCED_DEFERRED)
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
813
814
815
816
817
818
819
820
821
  /**
   * tcp_release_cb - tcp release_sock() callback
   * @sk: socket
   *
   * called from release_sock() to perform protocol dependent
   * actions before socket release.
   */
  void tcp_release_cb(struct sock *sk)
  {
6f458dfb4   Eric Dumazet   tcp: improve late...
822
  	unsigned long flags, nflags;
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
823

6f458dfb4   Eric Dumazet   tcp: improve late...
824
825
  	/* perform an atomic operation only if at least one flag is set */
  	do {
7aa5470c2   Eric Dumazet   tcp: tsq: move ts...
826
  		flags = sk->sk_tsq_flags;
6f458dfb4   Eric Dumazet   tcp: improve late...
827
828
829
  		if (!(flags & TCP_DEFERRED_ALL))
  			return;
  		nflags = flags & ~TCP_DEFERRED_ALL;
7aa5470c2   Eric Dumazet   tcp: tsq: move ts...
830
  	} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
6f458dfb4   Eric Dumazet   tcp: improve late...
831

73a6bab5a   Eric Dumazet   tcp: switch pacin...
832
833
834
835
  	if (flags & TCPF_TSQ_DEFERRED) {
  		tcp_tsq_write(sk);
  		__sock_put(sk);
  	}
c3f9b0184   Eric Dumazet   tcp: tcp_release_...
836
837
838
839
840
841
842
843
844
845
  	/* Here begins the tricky part :
  	 * We are called from release_sock() with :
  	 * 1) BH disabled
  	 * 2) sk_lock.slock spinlock held
  	 * 3) socket owned by us (sk->sk_lock.owned == 1)
  	 *
  	 * But following code is meant to be called from BH handlers,
  	 * so we should keep BH disabled, but early release socket ownership
  	 */
  	sock_release_ownership(sk);
40fc3423b   Eric Dumazet   tcp: tsq: add tsq...
846
  	if (flags & TCPF_WRITE_TIMER_DEFERRED) {
6f458dfb4   Eric Dumazet   tcp: improve late...
847
  		tcp_write_timer_handler(sk);
144d56e91   Eric Dumazet   tcp: fix possible...
848
849
  		__sock_put(sk);
  	}
40fc3423b   Eric Dumazet   tcp: tsq: add tsq...
850
  	if (flags & TCPF_DELACK_TIMER_DEFERRED) {
6f458dfb4   Eric Dumazet   tcp: improve late...
851
  		tcp_delack_timer_handler(sk);
144d56e91   Eric Dumazet   tcp: fix possible...
852
853
  		__sock_put(sk);
  	}
40fc3423b   Eric Dumazet   tcp: tsq: add tsq...
854
  	if (flags & TCPF_MTU_REDUCED_DEFERRED) {
4fab90719   Neal Cardwell   tcp: fix tcp_rele...
855
  		inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
144d56e91   Eric Dumazet   tcp: fix possible...
856
857
  		__sock_put(sk);
  	}
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
  }
  EXPORT_SYMBOL(tcp_release_cb);
  
  void __init tcp_tasklet_init(void)
  {
  	int i;
  
  	for_each_possible_cpu(i) {
  		struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
  
  		INIT_LIST_HEAD(&tsq->head);
  		tasklet_init(&tsq->tasklet,
  			     tcp_tasklet_func,
  			     (unsigned long)tsq);
  	}
  }
  
  /*
   * Write buffer destructor automatically called from kfree_skb.
8e3bff96a   stephen hemminger   net: more spellin...
877
   * We can't xmit new skbs from this context, as we might already
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
878
879
   * hold qdisc lock.
   */
d6a4a1041   Eric Dumazet   tcp: GSO should b...
880
  void tcp_wfree(struct sk_buff *skb)
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
881
882
883
  {
  	struct sock *sk = skb->sk;
  	struct tcp_sock *tp = tcp_sk(sk);
408f0a6c2   Eric Dumazet   tcp: tsq: remove ...
884
  	unsigned long flags, nval, oval;
9b462d02d   Eric Dumazet   tcp: TCP Small Qu...
885
886
887
888
  
  	/* Keep one reference on sk_wmem_alloc.
  	 * Will be released by sk_free() from here or tcp_tasklet_func()
  	 */
14afee4b6   Reshetova, Elena   net: convert sock...
889
  	WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
9b462d02d   Eric Dumazet   tcp: TCP Small Qu...
890
891
892
893
894
895
896
897
  
  	/* If this softirq is serviced by ksoftirqd, we are likely under stress.
  	 * Wait until our queues (qdisc + devices) are drained.
  	 * This gives :
  	 * - less callbacks to tcp_write_xmit(), reducing stress (batches)
  	 * - chance for incoming ACK (processed by another cpu maybe)
  	 *   to migrate this flow (skb->ooo_okay will be eventually set)
  	 */
14afee4b6   Reshetova, Elena   net: convert sock...
898
  	if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
9b462d02d   Eric Dumazet   tcp: TCP Small Qu...
899
  		goto out;
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
900

7aa5470c2   Eric Dumazet   tcp: tsq: move ts...
901
  	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
902
  		struct tsq_tasklet *tsq;
a9b204d15   Eric Dumazet   tcp: tsq: avoid o...
903
  		bool empty;
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
904

408f0a6c2   Eric Dumazet   tcp: tsq: remove ...
905
906
  		if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
  			goto out;
73a6bab5a   Eric Dumazet   tcp: switch pacin...
907
  		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
7aa5470c2   Eric Dumazet   tcp: tsq: move ts...
908
  		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
408f0a6c2   Eric Dumazet   tcp: tsq: remove ...
909
910
  		if (nval != oval)
  			continue;
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
911
912
  		/* queue this socket to tasklet queue */
  		local_irq_save(flags);
903ceff7c   Christoph Lameter   net: Replace get_...
913
  		tsq = this_cpu_ptr(&tsq_tasklet);
a9b204d15   Eric Dumazet   tcp: tsq: avoid o...
914
  		empty = list_empty(&tsq->head);
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
915
  		list_add(&tp->tsq_node, &tsq->head);
a9b204d15   Eric Dumazet   tcp: tsq: avoid o...
916
917
  		if (empty)
  			tasklet_schedule(&tsq->tasklet);
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
918
  		local_irq_restore(flags);
9b462d02d   Eric Dumazet   tcp: TCP Small Qu...
919
  		return;
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
920
  	}
9b462d02d   Eric Dumazet   tcp: TCP Small Qu...
921
922
  out:
  	sk_free(sk);
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
923
  }
73a6bab5a   Eric Dumazet   tcp: switch pacin...
924
925
  /* Note: Called under soft irq.
   * We can call TCP stack right away, unless socket is owned by user.
218af599f   Eric Dumazet   tcp: internal imp...
926
927
928
929
930
   */
  enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
  {
  	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
  	struct sock *sk = (struct sock *)tp;
218af599f   Eric Dumazet   tcp: internal imp...
931

73a6bab5a   Eric Dumazet   tcp: switch pacin...
932
933
  	tcp_tsq_handler(sk);
  	sock_put(sk);
218af599f   Eric Dumazet   tcp: internal imp...
934

218af599f   Eric Dumazet   tcp: internal imp...
935
936
  	return HRTIMER_NORESTART;
  }
a7a256306   Eric Dumazet   tcp: mitigate sch...
937
938
  static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
  				      u64 prior_wstamp)
e2080072e   Eric Dumazet   tcp: new list for...
939
  {
ab408b6dc   Eric Dumazet   tcp: switch tcp a...
940
  	struct tcp_sock *tp = tcp_sk(sk);
ab408b6dc   Eric Dumazet   tcp: switch tcp a...
941
  	if (sk->sk_pacing_status != SK_PACING_NONE) {
76a9ebe81   Eric Dumazet   net: extend sk_pa...
942
  		unsigned long rate = sk->sk_pacing_rate;
ab408b6dc   Eric Dumazet   tcp: switch tcp a...
943
944
945
946
947
  
  		/* Original sch_fq does not pace first 10 MSS
  		 * Note that tp->data_segs_out overflows after 2^32 packets,
  		 * this is a minor annoyance.
  		 */
76a9ebe81   Eric Dumazet   net: extend sk_pa...
948
  		if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
a7a256306   Eric Dumazet   tcp: mitigate sch...
949
950
951
952
953
954
  			u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
  			u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
  
  			/* take into account OS jitter */
  			len_ns -= min_t(u64, len_ns / 2, credit);
  			tp->tcp_wstamp_ns += len_ns;
ab408b6dc   Eric Dumazet   tcp: switch tcp a...
955
956
  		}
  	}
e2080072e   Eric Dumazet   tcp: new list for...
957
958
  	list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
959
960
961
962
963
964
965
966
967
968
969
  /* This routine actually transmits TCP packets queued in by
   * tcp_do_sendmsg().  This is used by both the initial
   * transmission and possible later retransmissions.
   * All SKB's seen here are completely headerless.  It is our
   * job to build the TCP header, and pass the packet down to
   * IP so it can do the same plus pass the packet off to the
   * device.
   *
   * We are working here with either a clone of the original
   * SKB, or a fresh unique copy made by the retransmit engine.
   */
2987babb6   Yuchung Cheng   tcp: helpers to s...
970
971
  static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
  			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
972
  {
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
973
974
975
976
  	const struct inet_connection_sock *icsk = inet_csk(sk);
  	struct inet_sock *inet;
  	struct tcp_sock *tp;
  	struct tcp_skb_cb *tcb;
33ad798c9   Adam Langley   tcp: options clea...
977
  	struct tcp_out_options opts;
95c961747   Eric Dumazet   net: cleanup unsi...
978
  	unsigned int tcp_options_size, tcp_header_size;
8c72c65b4   Eric Dumazet   tcp: update skb->...
979
  	struct sk_buff *oskb = NULL;
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
980
  	struct tcp_md5sig_key *md5;
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
981
  	struct tcphdr *th;
a7a256306   Eric Dumazet   tcp: mitigate sch...
982
  	u64 prior_wstamp;
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
983
984
985
  	int err;
  
  	BUG_ON(!skb || !tcp_skb_pcount(skb));
6f094b9ec   Lawrence Brakmo   tcp: add in_fligh...
986
  	tp = tcp_sk(sk);
7f12422c4   Yuchung Cheng   tcp: always times...
987
988
989
  	prior_wstamp = tp->tcp_wstamp_ns;
  	tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
  	skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
ccdbb6e96   Eric Dumazet   tcp: tcp_transmit...
990
  	if (clone_it) {
6f094b9ec   Lawrence Brakmo   tcp: add in_fligh...
991
992
  		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
  			- tp->snd_una;
8c72c65b4   Eric Dumazet   tcp: update skb->...
993
  		oskb = skb;
e2080072e   Eric Dumazet   tcp: new list for...
994
995
996
997
998
999
1000
  
  		tcp_skb_tsorted_save(oskb) {
  			if (unlikely(skb_cloned(oskb)))
  				skb = pskb_copy(oskb, gfp_mask);
  			else
  				skb = skb_clone(oskb, gfp_mask);
  		} tcp_skb_tsorted_restore(oskb);
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1001
1002
1003
  		if (unlikely(!skb))
  			return -ENOBUFS;
  	}
5f6188a80   Eric Dumazet   tcp: do not chang...
1004

dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1005
  	inet = inet_sk(sk);
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1006
  	tcb = TCP_SKB_CB(skb);
33ad798c9   Adam Langley   tcp: options clea...
1007
  	memset(&opts, 0, sizeof(opts));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1008

051ba6744   Eric Dumazet   tcp: force a PSH ...
1009
  	if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
33ad798c9   Adam Langley   tcp: options clea...
1010
  		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
051ba6744   Eric Dumazet   tcp: force a PSH ...
1011
  	} else {
33ad798c9   Adam Langley   tcp: options clea...
1012
1013
  		tcp_options_size = tcp_established_options(sk, skb, &opts,
  							   &md5);
051ba6744   Eric Dumazet   tcp: force a PSH ...
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
  		/* Force a PSH flag on all (GSO) packets to expedite GRO flush
  		 * at receiver : This slightly improve GRO performance.
  		 * Note that we do not force the PSH flag for non GSO packets,
  		 * because they might be sent under high congestion events,
  		 * and in this case it is better to delay the delivery of 1-MSS
  		 * packets and thus the corresponding ACK packet that would
  		 * release the following packet.
  		 */
  		if (tcp_skb_pcount(skb) > 1)
  			tcb->tcp_flags |= TCPHDR_PSH;
  	}
33ad798c9   Adam Langley   tcp: options clea...
1025
  	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
e905a9eda   YOSHIFUJI Hideaki   [NET] IPV4: Fix w...
1026

547669d48   Eric Dumazet   tcp: xps: fix reo...
1027
  	/* if no packet is in qdisc/device queue, then allow XPS to select
b2532eb9a   Eric Dumazet   tcp: fix ooo_okay...
1028
  	 * another queue. We can be called from tcp_tsq_handler()
73a6bab5a   Eric Dumazet   tcp: switch pacin...
1029
  	 * which holds one reference to sk.
b2532eb9a   Eric Dumazet   tcp: fix ooo_okay...
1030
1031
1032
  	 *
  	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
  	 * One way to get this would be to set skb->truesize = 2 on them.
547669d48   Eric Dumazet   tcp: xps: fix reo...
1033
  	 */
b2532eb9a   Eric Dumazet   tcp: fix ooo_okay...
1034
  	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1035

38ab52e8e   Eric Dumazet   tcp: clear pfmema...
1036
1037
1038
1039
1040
1041
  	/* If we had to use memory reserve to allocate this skb,
  	 * this might cause drops if packet is looped back :
  	 * Other socket might not have SOCK_MEMALLOC.
  	 * Packets not looped back do not care about pfmemalloc.
  	 */
  	skb->pfmemalloc = 0;
aa8223c7b   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
1042
1043
  	skb_push(skb, tcp_header_size);
  	skb_reset_transport_header(skb);
46d3ceabd   Eric Dumazet   tcp: TCP Small Qu...
1044
1045
1046
  
  	skb_orphan(skb);
  	skb->sk = sk;
1d2077ac0   Eric Dumazet   net: add __sock_w...
1047
  	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
b73c3d0e4   Tom Herbert   net: Save TX flow...
1048
  	skb_set_hash_from_sk(skb, sk);
14afee4b6   Reshetova, Elena   net: convert sock...
1049
  	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1050

c3a2e8370   Julian Anastasov   tcp: replace dst_...
1051
  	skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1052
  	/* Build TCP header and checksum it. */
ea1627c20   Eric Dumazet   tcp: minor optimi...
1053
  	th = (struct tcphdr *)skb->data;
c720c7e83   Eric Dumazet   inet: rename some...
1054
1055
  	th->source		= inet->inet_sport;
  	th->dest		= inet->inet_dport;
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1056
  	th->seq			= htonl(tcb->seq);
2987babb6   Yuchung Cheng   tcp: helpers to s...
1057
  	th->ack_seq		= htonl(rcv_nxt);
df7a3b07c   Al Viro   [TCP] net/ipv4/tc...
1058
  	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
4de075e04   Eric Dumazet   tcp: rename tcp_s...
1059
  					tcb->tcp_flags);
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1060

dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1061
1062
  	th->check		= 0;
  	th->urg_ptr		= 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1063

33f5f57ee   Ilpo Järvinen   tcp: kill pointle...
1064
  	/* The urg_mode check is necessary during a below snd_una win probe */
7691367d7   Herbert Xu   tcp: Always set u...
1065
1066
1067
1068
1069
  	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
  		if (before(tp->snd_up, tcb->seq + 0x10000)) {
  			th->urg_ptr = htons(tp->snd_up - tcb->seq);
  			th->urg = 1;
  		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
0eae88f31   Eric Dumazet   net: Fix various ...
1070
  			th->urg_ptr = htons(0xFFFF);
7691367d7   Herbert Xu   tcp: Always set u...
1071
1072
  			th->urg = 1;
  		}
dfb4b9dce   David S. Miller   [TCP] Vegas: time...
1073
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1074

bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
1075
  	tcp_options_write((__be32 *)(th + 1), tp, &opts);
51466a754   Eric Dumazet   tcp: fill shinfo-...
1076
  	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
ea1627c20   Eric Dumazet   tcp: minor optimi...
1077
1078
1079
1080
1081
1082
1083
1084
1085
  	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
  		th->window      = htons(tcp_select_window(sk));
  		tcp_ecn_send(sk, skb, th, tcp_header_size);
  	} else {
  		/* RFC1323: The window in SYN & SYN/ACK segments
  		 * is never scaled.
  		 */
  		th->window	= htons(min(tp->rcv_wnd, 65535U));
  	}
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
1086
1087
1088
  #ifdef CONFIG_TCP_MD5SIG
  	/* Calculate the MD5 hash, as we have all we need now */
  	if (md5) {
a465419b1   Eric Dumazet   net: Introduce sk...
1089
  		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
1090
  		tp->af_specific->calc_md5_hash(opts.hash_location,
39f8e58e5   Eric Dumazet   tcp: md5: remove ...
1091
  					       md5, sk, skb);
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
1092
1093
  	}
  #endif
bb2962461   Herbert Xu   inet: Remove unus...
1094
  	icsk->icsk_af_ops->send_check(sk, skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1095

4de075e04   Eric Dumazet   tcp: rename tcp_s...
1096
  	if (likely(tcb->tcp_flags & TCPHDR_ACK))
27cde44a2   Yuchung Cheng   tcp: do not cance...
1097
  		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1098

a44d6eacd   Martin KaFai Lau   tcp: Add RFC4898 ...
1099
  	if (skb->len != tcp_header_size) {
cf533ea53   Eric Dumazet   tcp: add const qu...
1100
  		tcp_event_data_sent(tp, sk);
a44d6eacd   Martin KaFai Lau   tcp: Add RFC4898 ...
1101
  		tp->data_segs_out += tcp_skb_pcount(skb);
ba113c3aa   Wei Wang   tcp: add data byt...
1102
  		tp->bytes_sent += skb->len - tcp_header_size;
a44d6eacd   Martin KaFai Lau   tcp: Add RFC4898 ...
1103
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1104

bd37a0885   Wei Yongjun   [TCP]: SNMPv2 tcp...
1105
  	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
aa2ea0586   Tom Herbert   tcp: fix outsegs ...
1106
1107
  		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
  			      tcp_skb_pcount(skb));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1108

2efd055c5   Marcelo Ricardo Leitner   tcp: add tcpi_seg...
1109
  	tp->segs_out += tcp_skb_pcount(skb);
f69ad292c   Eric Dumazet   tcp: fill shinfo-...
1110
  	/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
cd7d8498c   Eric Dumazet   tcp: change tcp_s...
1111
  	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
f69ad292c   Eric Dumazet   tcp: fill shinfo-...
1112
  	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
cd7d8498c   Eric Dumazet   tcp: change tcp_s...
1113

d3edd06ea   Eric Dumazet   tcp: provide earl...
1114
  	/* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
971f10eca   Eric Dumazet   tcp: better TCP_S...
1115
1116
1117
1118
  
  	/* Cleanup our debris for IP stacks */
  	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
  			       sizeof(struct inet6_skb_parm)));
a842fe142   Eric Dumazet   tcp: add optional...
1119
  	tcp_add_tx_delay(skb, tp);
b0270e910   Eric Dumazet   ipv4: add a sock ...
1120
  	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
7faee5c0d   Eric Dumazet   tcp: remove TCP_S...
1121

8c72c65b4   Eric Dumazet   tcp: update skb->...
1122
1123
1124
1125
  	if (unlikely(err > 0)) {
  		tcp_enter_cwr(sk);
  		err = net_xmit_eval(err);
  	}
fc2257991   Eric Dumazet   tcp: fix data del...
1126
  	if (!err && oskb) {
a7a256306   Eric Dumazet   tcp: mitigate sch...
1127
  		tcp_update_skb_after_send(sk, oskb, prior_wstamp);
fc2257991   Eric Dumazet   tcp: fix data del...
1128
1129
  		tcp_rate_skb_sent(sk, oskb);
  	}
8c72c65b4   Eric Dumazet   tcp: update skb->...
1130
  	return err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1131
  }
2987babb6   Yuchung Cheng   tcp: helpers to s...
1132
1133
1134
1135
1136
1137
/* Transmit @skb acknowledging the receiver's current rcv_nxt.
 * Thin convenience wrapper: all real work (header build, checksum,
 * queue_xmit) happens in __tcp_transmit_skb(), which uses the last
 * argument as the segment's ack_seq.
 */
static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
			    gfp_t gfp_mask)
{
	return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
				  tcp_sk(sk)->rcv_nxt);
}
67edfef78   Andi Kleen   TCP: Add comments...
1138
  /* This routine just queues the buffer for sending.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1139
1140
1141
1142
1143
1144
1145
1146
1147
   *
   * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
   * otherwise socket can stall.
   */
  static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	/* Advance write_seq and place onto the write_queue. */
0f3174645   Eric Dumazet   tcp: annotate tp-...
1148
  	WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
f4a775d14   Eric Dumazet   net: introduce __...
1149
  	__skb_header_release(skb);
fe067e8ab   David S. Miller   [TCP]: Abstract o...
1150
  	tcp_add_write_queue_tail(sk, skb);
ab4e846a8   Eric Dumazet   tcp: annotate sk-...
1151
  	sk_wmem_queued_add(sk, skb->truesize);
3ab224be6   Hideo Aoki   [NET] CORE: Intro...
1152
  	sk_mem_charge(sk, skb->truesize);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1153
  }
67edfef78   Andi Kleen   TCP: Add comments...
1154
  /* Initialize TSO segments for a packet. */
5bbb432c8   Eric Dumazet   tcp: tcp_set_skb_...
1155
  static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
f6302d1d7   David S. Miller   [TCP]: Move send ...
1156
  {
4a64fd6cc   Eric Dumazet   tcp: remove dead ...
1157
  	if (skb->len <= mss_now) {
f6302d1d7   David S. Miller   [TCP]: Move send ...
1158
1159
1160
  		/* Avoid the costly divide in the normal
  		 * non-TSO case.
  		 */
cd7d8498c   Eric Dumazet   tcp: change tcp_s...
1161
  		tcp_skb_pcount_set(skb, 1);
f69ad292c   Eric Dumazet   tcp: fill shinfo-...
1162
  		TCP_SKB_CB(skb)->tcp_gso_size = 0;
f6302d1d7   David S. Miller   [TCP]: Move send ...
1163
  	} else {
cd7d8498c   Eric Dumazet   tcp: change tcp_s...
1164
  		tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
f69ad292c   Eric Dumazet   tcp: fill shinfo-...
1165
  		TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1166
1167
  	}
  }
797108d13   Ilpo Järvinen   tcp: add helper f...
1168
1169
1170
/* Pcount in the middle of the write queue got changed, we need to do various
 * tweaks to fix counters
 *
 * @decr: number of packets skb's pcount shrank by; every in-flight
 * accounting bucket the skb currently belongs to (sacked/retrans/lost,
 * per its ->sacked flags) is reduced by the same amount.
 */
static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int decr)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->packets_out -= decr;

	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
		tp->sacked_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
		tp->retrans_out -= decr;
	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
		tp->lost_out -= decr;

	/* Reno case is special. Sigh... */
	if (tcp_is_reno(tp) && decr > 0)
		tp->sacked_out -= min_t(u32, tp->sacked_out, decr);

	/* Keep the lost-skb scan hint consistent: only adjust when the
	 * changed skb lies before the hint and was SACKed (i.e. it was
	 * counted in lost_cnt_hint).
	 */
	if (tp->lost_skb_hint &&
	    before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
		tp->lost_cnt_hint -= decr;

	tcp_verify_left_out(tp);
}
0a2cf20c3   Soheil Hassas Yeganeh   tcp: remove SKBTX...
1194
1195
1196
1197
1198
  static bool tcp_has_tx_tstamp(const struct sk_buff *skb)
  {
  	return TCP_SKB_CB(skb)->txstamp_ack ||
  		(skb_shinfo(skb)->tx_flags & SKBTX_ANY_TSTAMP);
  }
490cc7d03   Willem de Bruijn   net-timestamp: fi...
1199
1200
1201
/* When @skb is split and @skb2 takes the tail, move any tx timestamp
 * request over to @skb2 if the timestamped byte (shinfo->tskey) now
 * falls at or beyond skb2's starting sequence.
 */
static void tcp_fragment_tstamp(struct sk_buff *skb, struct sk_buff *skb2)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	if (unlikely(tcp_has_tx_tstamp(skb)) &&
	    !before(shinfo->tskey, TCP_SKB_CB(skb2)->seq)) {
		struct skb_shared_info *shinfo2 = skb_shinfo(skb2);
		u8 tsflags = shinfo->tx_flags & SKBTX_ANY_TSTAMP;

		/* Transfer the timestamp flags and key to the tail skb. */
		shinfo->tx_flags &= ~tsflags;
		shinfo2->tx_flags |= tsflags;
		swap(shinfo->tskey, shinfo2->tskey);
		/* The ACK-timestamp request follows the data it refers to. */
		TCP_SKB_CB(skb2)->txstamp_ack = TCP_SKB_CB(skb)->txstamp_ack;
		TCP_SKB_CB(skb)->txstamp_ack = 0;
	}
}
a166140e8   Martin KaFai Lau   tcp: Handle eor b...
1214
1215
1216
1217
1218
  static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
  {
  	TCP_SKB_CB(skb2)->eor = TCP_SKB_CB(skb)->eor;
  	TCP_SKB_CB(skb)->eor = 0;
  }
75c119afe   Eric Dumazet   tcp: implement rb...
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
  /* Insert buff after skb on the write or rtx queue of sk.  */
  static void tcp_insert_write_queue_after(struct sk_buff *skb,
  					 struct sk_buff *buff,
  					 struct sock *sk,
  					 enum tcp_queue tcp_queue)
  {
  	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
  		__skb_queue_after(&sk->sk_write_queue, skb, buff);
  	else
  		tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1230
1231
/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 *
 * @tcp_queue: which queue (write or rtx) @skb lives on; the new tail
 *             fragment is inserted right after @skb on the same queue.
 * @len: number of payload bytes to keep in @skb; the remainder moves
 *       to the newly allocated tail skb.
 * Returns 0, or a negative errno (-EINVAL, -ENOMEM) on failure.
 */
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
		 struct sk_buff *skb, u32 len,
		 unsigned int mss_now, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int nsize, old_factor;
	long limit;
	int nlen;
	u8 flags;

	if (WARN_ON(len > skb->len))
		return -EINVAL;

	/* Bytes of linear head that will belong to the tail fragment. */
	nsize = skb_headlen(skb) - len;
	if (nsize < 0)
		nsize = 0;

	/* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
	 * We need some allowance to not penalize applications setting small
	 * SO_SNDBUF values.
	 * Also allow first and last skb in retransmit queue to be split.
	 */
	limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_MAX_SIZE);
	if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
		     tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
		     skb != tcp_rtx_queue_head(sk) &&
		     skb != tcp_rtx_queue_tail(sk))) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
		return -ENOMEM;
	}

	if (skb_unclone(skb, gfp))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
	if (!buff)
		return -ENOMEM; /* We'll just try again later. */
	skb_copy_decrypted(buff, skb);

	sk_wmem_queued_add(sk, buff->truesize);
	sk_mem_charge(sk, buff->truesize);
	/* Move the truesize of the paged payload that changes hands. */
	nlen = skb->len - len - nsize;
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
	TCP_SKB_CB(buff)->tcp_flags = flags;
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
	tcp_skb_fragment_eor(skb, buff);

	skb_split(skb, buff, len);

	buff->ip_summed = CHECKSUM_PARTIAL;

	buff->tstamp = skb->tstamp;
	/* Hand the tx timestamp request to the tail if it refers there. */
	tcp_fragment_tstamp(skb, buff);

	old_factor = tcp_skb_pcount(skb);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(skb, mss_now);
	tcp_set_skb_tso_segs(buff, mss_now);

	/* Update delivered info for the new segment */
	TCP_SKB_CB(buff)->tx = TCP_SKB_CB(skb)->tx;

	/* If this packet has been sent out already, we must
	 * adjust the various packet counters.
	 */
	if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
		/* diff is normally 0; rounding in the pcount computation
		 * can make the sum of the halves differ from old_factor.
		 */
		int diff = old_factor - tcp_skb_pcount(skb) -
			tcp_skb_pcount(buff);

		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
	}

	/* Link BUFF into the send queue. */
	__skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
	if (tcp_queue == TCP_FRAG_IN_RTX_QUEUE)
		list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);

	return 0;
}
f4d016666   Eric Dumazet   tcp: remove unnec...
1325
1326
/* This is similar to __pskb_pull_tail(). The difference is that pulled
 * data is not copied, but immediately discarded.
 *
 * Removes @len bytes from the front of @skb: first from the linear
 * head, then by dropping/offsetting page frags.  Returns the number of
 * bytes removed from the frag area (0 if the linear head covered it
 * all) — the caller uses this as a truesize delta.
 */
static int __pskb_trim_head(struct sk_buff *skb, int len)
{
	struct skb_shared_info *shinfo;
	int i, k, eat;

	/* Eat as much as possible from the linear header area. */
	eat = min_t(int, len, skb_headlen(skb));
	if (eat) {
		__skb_pull(skb, eat);
		len -= eat;
		if (!len)
			return 0;
	}
	/* The rest must come out of the page frags. */
	eat = len;
	k = 0;
	shinfo = skb_shinfo(skb);
	for (i = 0; i < shinfo->nr_frags; i++) {
		int size = skb_frag_size(&shinfo->frags[i]);

		if (size <= eat) {
			/* Frag fully consumed: drop our page reference. */
			skb_frag_unref(skb, i);
			eat -= size;
		} else {
			/* Keep this frag (compacted to slot k), trimming
			 * its front by whatever remains to eat.
			 */
			shinfo->frags[k] = shinfo->frags[i];
			if (eat) {
				skb_frag_off_add(&shinfo->frags[k], eat);
				skb_frag_size_sub(&shinfo->frags[k], eat);
				eat = 0;
			}
			k++;
		}
	}
	shinfo->nr_frags = k;

	skb->data_len -= len;
	skb->len = skb->data_len;
	return len;
}
67edfef78   Andi Kleen   TCP: Add comments...
1364
/* Remove acked data from a packet in the transmit queue.
 *
 * Trims @len bytes from the front of @skb, advances its sequence
 * number, and releases the corresponding socket memory accounting.
 * Returns 0, or -ENOMEM if the skb could not be uncloned.
 */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
	u32 delta_truesize;

	if (skb_unclone(skb, GFP_ATOMIC))
		return -ENOMEM;

	delta_truesize = __pskb_trim_head(skb, len);

	TCP_SKB_CB(skb)->seq += len;
	skb->ip_summed = CHECKSUM_PARTIAL;

	/* Give back any memory freed from the frag area. */
	if (delta_truesize) {
		skb->truesize	   -= delta_truesize;
		sk_wmem_queued_add(sk, -delta_truesize);
		sk_mem_uncharge(sk, delta_truesize);
		sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
	}

	/* Any change of skb->len requires recalculation of tso factor. */
	if (tcp_skb_pcount(skb) > 1)
		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));

	return 0;
}
1b63edd6e   Yuchung Cheng   tcp: fix SYN-data...
1388
1389
/* Calculate MSS not accounting any TCP options.
 *
 * Derives the bare-header MSS from @pmtu by stripping network and TCP
 * header overhead, clamping to the negotiated mss_clamp, and enforcing
 * the sysctl minimum.
 */
static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* Calculate base mss without TCP options:
	   It is MMS_S - sizeof(tcphdr) of rfc1122
	 */
	mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);

	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mss_now -= icsk->icsk_af_ops->net_frag_header_len;
	}

	/* Clamp it (mss_clamp does not include tcp options) */
	if (mss_now > tp->rx_opt.mss_clamp)
		mss_now = tp->rx_opt.mss_clamp;

	/* Now subtract optional transport overhead */
	mss_now -= icsk->icsk_ext_hdr_len;

	/* Then reserve room for full set of TCP options and 8 bytes of data */
	mss_now = max(mss_now, sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss);
	return mss_now;
}
1b63edd6e   Yuchung Cheng   tcp: fix SYN-data...
1417
1418
1419
1420
1421
1422
1423
  /* Calculate MSS. Not accounting for SACKs here.  */
  int tcp_mtu_to_mss(struct sock *sk, int pmtu)
  {
  	/* Subtract TCP options size, not including SACKs */
  	return __tcp_mtu_to_mss(sk, pmtu) -
  	       (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
  }
5d424d5a6   John Heffner   [TCP]: MTU probing
1424
/* Inverse of above: rebuild the on-wire MTU that corresponds to @mss
 * by adding back TCP header (with cached options), extension headers
 * and the network header.
 */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int mtu;

	mtu = mss +
	      tp->tcp_header_len +
	      icsk->icsk_ext_hdr_len +
	      icsk->icsk_af_ops->net_header_len;

	/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
	if (icsk->icsk_af_ops->net_frag_header_len) {
		const struct dst_entry *dst = __sk_dst_get(sk);

		if (dst && dst_allfrag(dst))
			mtu += icsk->icsk_af_ops->net_frag_header_len;
	}
	return mtu;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);
5d424d5a6   John Heffner   [TCP]: MTU probing
1445

67edfef78   Andi Kleen   TCP: Add comments...
1446
/* MTU probing init per socket.
 *
 * Seeds the icsk_mtup search window: search_high from the negotiated
 * mss_clamp plus header overhead, search_low from the
 * tcp_base_mss sysctl.  Probing is enabled when tcp_mtu_probing > 1.
 */
void tcp_mtup_init(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct net *net = sock_net(sk);

	icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
	icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
			       icsk->icsk_af_ops->net_header_len;
	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
	icsk->icsk_mtup.probe_size = 0;
	if (icsk->icsk_mtup.enabled)
		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
EXPORT_SYMBOL(tcp_mtup_init);
5d424d5a6   John Heffner   [TCP]: MTU probing
1462

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1463
1464
1465
1466
1467
1468
/* This function synchronize snd mss to current pmtu/exthdr set.

   tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
   for TCP options, but includes only bare TCP header.

   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->rx_opt.mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
   are READ ONLY outside this function.		--ANK (980731)
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int mss_now;

	/* A smaller pmtu shrinks the MTU-probe search ceiling too. */
	if (icsk->icsk_mtup.search_high > pmtu)
		icsk->icsk_mtup.search_high = pmtu;

	mss_now = tcp_mtu_to_mss(sk, pmtu);
	mss_now = tcp_bound_to_half_wnd(tp, mss_now);

	/* And store cached results */
	icsk->icsk_pmtu_cookie = pmtu;
	/* While probing, never send larger than the current probe floor. */
	if (icsk->icsk_mtup.enabled)
		mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
	tp->mss_cache = mss_now;

	return mss_now;
}
EXPORT_SYMBOL(tcp_sync_mss);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1504
1505
1506
  
/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * Starts from the cached mss, re-syncs if the route's mtu changed,
 * then corrects for the actual option length of the next segment.
 */
unsigned int tcp_current_mss(struct sock *sk)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	const struct dst_entry *dst = __sk_dst_get(sk);
	u32 mss_now;
	unsigned int header_len;
	struct tcp_out_options opts;
	struct tcp_md5sig_key *md5;

	mss_now = tp->mss_cache;

	/* PMTU may have changed under us: refresh the cache. */
	if (dst) {
		u32 mtu = dst_mtu(dst);
		if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
			mss_now = tcp_sync_mss(sk, mtu);
	}

	header_len = tcp_established_options(sk, NULL, &opts, &md5) +
		     sizeof(struct tcphdr);
	/* The mss_cache is sized based on tp->tcp_header_len, which assumes
	 * some common options. If this is an odd packet (because we have SACK
	 * blocks etc) then our calculated header_len will be different, and
	 * we have to adjust mss_now correspondingly */
	if (header_len != tp->tcp_header_len) {
		int delta = (int) header_len - tp->tcp_header_len;
		mss_now -= delta;
	}

	return mss_now;
}
86fd14ad1   Weiping Pan   tcp: make tcp_cwn...
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
 * As additional protections, we do not touch cwnd in retransmission phases,
 * and if application hit its sndbuf limit recently.
 */
static void tcp_cwnd_application_limited(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		/* Limited by application or receiver window. */
		u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
		u32 win_used = max(tp->snd_cwnd_used, init_win);
		if (win_used < tp->snd_cwnd) {
			/* Decay cwnd toward what was actually used,
			 * remembering the old ssthresh.
			 */
			tp->snd_ssthresh = tcp_current_ssthresh(sk);
			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
		}
		tp->snd_cwnd_used = 0;
	}
	tp->snd_cwnd_stamp = tcp_jiffies32;
}
ca8a22634   Neal Cardwell   tcp: make cwnd-li...
1557
/* Congestion-window validation (RFC2861-style): track whether the
 * flow is using its cwnd, and decay cwnd when the application (not the
 * network) is the bottleneck.  @is_cwnd_limited reflects this send
 * attempt's limiting factor.
 */
static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
{
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	struct tcp_sock *tp = tcp_sk(sk);

	/* Track the maximum number of outstanding packets in each
	 * window, and remember whether we were cwnd-limited then.
	 */
	if (!before(tp->snd_una, tp->max_packets_seq) ||
	    tp->packets_out > tp->max_packets_out) {
		tp->max_packets_out = tp->packets_out;
		tp->max_packets_seq = tp->snd_nxt;
		tp->is_cwnd_limited = is_cwnd_limited;
	}

	if (tcp_is_cwnd_limited(sk)) {
		/* Network is feed fully. */
		tp->snd_cwnd_used = 0;
		tp->snd_cwnd_stamp = tcp_jiffies32;
	} else {
		/* Network starves. */
		if (tp->packets_out > tp->snd_cwnd_used)
			tp->snd_cwnd_used = tp->packets_out;

		/* Decay an idle cwnd, unless the CA fully owns cwnd
		 * (cong_control) or the sysctl disabled it.
		 */
		if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
		    (s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
		    !ca_ops->cong_control)
			tcp_cwnd_application_limited(sk);

		/* The following conditions together indicate the starvation
		 * is caused by insufficient sender buffer:
		 * 1) just sent some data (see tcp_write_xmit)
		 * 2) not cwnd limited (this else condition)
		 * 3) no more data to send (tcp_write_queue_empty())
		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
		 */
		if (tcp_write_queue_empty(sk) && sk->sk_socket &&
		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
	}
}
d4589926d   Eric Dumazet   tcp: refine TSO s...
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
  /* Minshall's variant of the Nagle send check. */
  static bool tcp_minshall_check(const struct tcp_sock *tp)
  {
  	return after(tp->snd_sml, tp->snd_una) &&
  		!after(tp->snd_sml, tp->snd_nxt);
  }
  
  /* Update snd_sml if this skb is under mss
   * Note that a TSO packet might end with a sub-mss segment
   * The test is really :
   * if ((skb->len % mss) != 0)
   *        tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
   * But we can avoid doing the divide again given we already have
   *  skb_pcount = skb->len / mss_now
0e3a4803a   Ilpo Järvinen   [TCP]: Force TSO ...
1612
   */
d4589926d   Eric Dumazet   tcp: refine TSO s...
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
  static void tcp_minshall_update(struct tcp_sock *tp, unsigned int mss_now,
  				const struct sk_buff *skb)
  {
  	if (skb->len < tcp_skb_pcount(skb) * mss_now)
  		tp->snd_sml = TCP_SKB_CB(skb)->end_seq;
  }
  
  /* Return false, if packet can be sent now without violation Nagle's rules:
   * 1. It is full sized. (provided by caller in %partial bool)
   * 2. Or it contains FIN. (already checked by caller)
   * 3. Or TCP_CORK is not set, and TCP_NODELAY is set.
   * 4. Or TCP_CORK is not set, and all sent packets are ACKed.
   *    With Minshall's modification: all sent small packets are ACKed.
   */
  static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
cc93fc51f   Peter Pan(潘卫平)   tcp: delete unuse...
1628
  			    int nonagle)
d4589926d   Eric Dumazet   tcp: refine TSO s...
1629
1630
1631
1632
1633
  {
  	return partial &&
  		((nonagle & TCP_NAGLE_CORK) ||
  		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
  }
605ad7f18   Eric Dumazet   tcp: refine TSO a...
1634
1635
1636
1637
  
/* Return how many segs we'd like on a TSO packet,
 * to send one TSO packet per ms
 */
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
			    int min_tso_segs)
{
	u32 bytes, segs;

	/* Byte budget for one TSO packet: what pacing allows
	 * (sk_pacing_rate in bytes/sec, scaled down by sk_pacing_shift),
	 * capped so the packet still fits in sk_gso_max_size after
	 * reserving MAX_TCP_HEADER.
	 */
	bytes = min_t(unsigned long,
		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
		      sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);

	/* Goal is to send at least one packet per ms,
	 * not one big TSO packet every 100 ms.
	 * This preserves ACK clocking and is consistent
	 * with tcp_tso_should_defer() heuristic.
	 */
	segs = max_t(u32, bytes / mss_now, min_tso_segs);

	return segs;
}
ed6e7268b   Neal Cardwell   tcp: allow conges...
1655
1656
1657
1658
1659
1660
  /* Return the number of segments we want in the skb we are transmitting.
   * See if congestion control module wants to decide; otherwise, autosize.
   */
  static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
  {
  	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
dcb8c9b43   Eric Dumazet   tcp_bbr: better d...
1661
  	u32 min_tso, tso_segs;
ed6e7268b   Neal Cardwell   tcp: allow conges...
1662

dcb8c9b43   Eric Dumazet   tcp_bbr: better d...
1663
1664
1665
1666
1667
  	min_tso = ca_ops->min_tso_segs ?
  			ca_ops->min_tso_segs(sk) :
  			sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
  
  	tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
350c9f484   Eric Dumazet   tcp_bbr: better d...
1668
  	return min_t(u32, tso_segs, sk->sk_gso_max_segs);
ed6e7268b   Neal Cardwell   tcp: allow conges...
1669
  }
d4589926d   Eric Dumazet   tcp: refine TSO s...
1670
1671
1672
1673
1674
1675
  /* Returns the portion of skb which can be sent right away */
  static unsigned int tcp_mss_split_point(const struct sock *sk,
  					const struct sk_buff *skb,
  					unsigned int mss_now,
  					unsigned int max_segs,
  					int nonagle)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1676
  {
cf533ea53   Eric Dumazet   tcp: add const qu...
1677
  	const struct tcp_sock *tp = tcp_sk(sk);
d4589926d   Eric Dumazet   tcp: refine TSO s...
1678
  	u32 partial, needed, window, max_len;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1679

90840defa   Ilpo Järvinen   [TCP]: Introduce ...
1680
  	window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1485348d2   Ben Hutchings   tcp: Apply device...
1681
  	max_len = mss_now * max_segs;
0e3a4803a   Ilpo Järvinen   [TCP]: Force TSO ...
1682

1485348d2   Ben Hutchings   tcp: Apply device...
1683
1684
  	if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
  		return max_len;
0e3a4803a   Ilpo Järvinen   [TCP]: Force TSO ...
1685

5ea3a7480   Ilpo Järvinen   [TCP]: Prevent se...
1686
  	needed = min(skb->len, window);
1485348d2   Ben Hutchings   tcp: Apply device...
1687
1688
  	if (max_len <= needed)
  		return max_len;
0e3a4803a   Ilpo Järvinen   [TCP]: Force TSO ...
1689

d4589926d   Eric Dumazet   tcp: refine TSO s...
1690
1691
1692
1693
1694
  	partial = needed % mss_now;
  	/* If last segment is not a full MSS, check if Nagle rules allow us
  	 * to include this last segment in this skb.
  	 * Otherwise, we'll split the skb at last MSS boundary
  	 */
cc93fc51f   Peter Pan(潘卫平)   tcp: delete unuse...
1695
  	if (tcp_nagle_check(partial != 0, tp, nonagle))
d4589926d   Eric Dumazet   tcp: refine TSO s...
1696
1697
1698
  		return needed - partial;
  
  	return needed;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1699
1700
1701
1702
1703
  }
  
  /* Can at least one segment of SKB be sent right now, according to the
   * congestion window rules?  If so, return how many segments are allowed.
   */
cf533ea53   Eric Dumazet   tcp: add const qu...
1704
1705
  static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
  					 const struct sk_buff *skb)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1706
  {
d649a7a81   Eric Dumazet   tcp: limit GSO pa...
1707
  	u32 in_flight, cwnd, halfcwnd;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1708
1709
  
  	/* Don't be strict about the congestion window for the final FIN.  */
4de075e04   Eric Dumazet   tcp: rename tcp_s...
1710
1711
  	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
  	    tcp_skb_pcount(skb) == 1)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1712
1713
1714
1715
  		return 1;
  
  	in_flight = tcp_packets_in_flight(tp);
  	cwnd = tp->snd_cwnd;
d649a7a81   Eric Dumazet   tcp: limit GSO pa...
1716
1717
  	if (in_flight >= cwnd)
  		return 0;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1718

d649a7a81   Eric Dumazet   tcp: limit GSO pa...
1719
1720
1721
1722
1723
  	/* For better scheduling, ensure we have at least
  	 * 2 GSO packets in flight.
  	 */
  	halfcwnd = max(cwnd >> 1, 1U);
  	return min(halfcwnd, cwnd - in_flight);
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1724
  }
b595076a1   Uwe Kleine-König   tree-wide: fix co...
1725
  /* Initialize TSO state of a skb.
67edfef78   Andi Kleen   TCP: Add comments...
1726
   * This must be invoked the first time we consider transmitting
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1727
1728
   * SKB onto the wire.
   */
5bbb432c8   Eric Dumazet   tcp: tcp_set_skb_...
1729
  static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1730
1731
  {
  	int tso_segs = tcp_skb_pcount(skb);
f8269a495   Ilpo Järvinen   tcp: make urg+gso...
1732
  	if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
5bbb432c8   Eric Dumazet   tcp: tcp_set_skb_...
1733
  		tcp_set_skb_tso_segs(skb, mss_now);
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1734
1735
1736
1737
  		tso_segs = tcp_skb_pcount(skb);
  	}
  	return tso_segs;
  }
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1738

a2a385d62   Eric Dumazet   tcp: bool convers...
1739
  /* Return true if the Nagle test allows this packet to be
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1740
1741
   * sent now.
   */
a2a385d62   Eric Dumazet   tcp: bool convers...
1742
1743
  static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buff *skb,
  				  unsigned int cur_mss, int nonagle)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1744
1745
1746
1747
1748
1749
1750
1751
  {
  	/* Nagle rule does not apply to frames, which sit in the middle of the
  	 * write_queue (they have no chances to get new data).
  	 *
  	 * This is implemented in the callers, where they modify the 'nonagle'
  	 * argument based upon the location of SKB in the send queue.
  	 */
  	if (nonagle & TCP_NAGLE_PUSH)
a2a385d62   Eric Dumazet   tcp: bool convers...
1752
  		return true;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1753

9b44190dc   Yuchung Cheng   tcp: refactor F-RTO
1754
1755
  	/* Don't use the nagle rule for urgent data (or for the final FIN). */
  	if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
a2a385d62   Eric Dumazet   tcp: bool convers...
1756
  		return true;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1757

cc93fc51f   Peter Pan(潘卫平)   tcp: delete unuse...
1758
  	if (!tcp_nagle_check(skb->len < cur_mss, tp, nonagle))
a2a385d62   Eric Dumazet   tcp: bool convers...
1759
  		return true;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1760

a2a385d62   Eric Dumazet   tcp: bool convers...
1761
  	return false;
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1762
1763
1764
  }
  
  /* Does at least the first segment of SKB fit into the send window? */
a2a385d62   Eric Dumazet   tcp: bool convers...
1765
1766
1767
  static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  			     const struct sk_buff *skb,
  			     unsigned int cur_mss)
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1768
1769
1770
1771
1772
  {
  	u32 end_seq = TCP_SKB_CB(skb)->end_seq;
  
  	if (skb->len > cur_mss)
  		end_seq = TCP_SKB_CB(skb)->seq + cur_mss;
90840defa   Ilpo Järvinen   [TCP]: Introduce ...
1773
  	return !after(end_seq, tcp_wnd_end(tp));
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1774
  }
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
1775
1776
1777
1778
1779
1780
1781
/* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
 * which is put after SKB on the list.  It is very much like
 * tcp_fragment() except that it may make several kinds of assumptions
 * in order to speed up the splitting operation.  In particular, we
 * know that all the data is in scatter-gather pages, and that the
 * packet has never been sent out before (and thus is not cloned).
 *
 * Returns 0 on success, -ENOMEM if the tail skb cannot be allocated.
 */
static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
			unsigned int mss_now, gfp_t gfp)
{
	int nlen = skb->len - len;	/* bytes moving to the new skb */
	struct sk_buff *buff;
	u8 flags;

	/* All of a TSO frame must be composed of paged data.  */
	if (skb->len != skb->data_len)
		return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
				    skb, len, mss_now, gfp);

	/* 0-byte linear area: the payload is carried by page frags only. */
	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
	if (unlikely(!buff))
		return -ENOMEM;
	skb_copy_decrypted(buff, skb);

	/* Account the new skb against the socket, then shift the weight
	 * of the moved payload bytes from skb to buff.
	 */
	sk_wmem_queued_add(sk, buff->truesize);
	sk_mem_charge(sk, buff->truesize);
	buff->truesize += nlen;
	skb->truesize -= nlen;

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->tcp_flags;
	TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
	TCP_SKB_CB(buff)->tcp_flags = flags;

	/* This packet was never sent out yet, so no SACK bits. */
	TCP_SKB_CB(buff)->sacked = 0;

	/* Keep EOR marking on the correct (last) half of the split. */
	tcp_skb_fragment_eor(skb, buff);

	buff->ip_summed = CHECKSUM_PARTIAL;
	skb_split(skb, buff, len);
	/* Move any tx timestamp request to the half that still covers it. */
	tcp_fragment_tstamp(skb, buff);

	/* Fix up tso_factor for both original and new SKB.  */
	tcp_set_skb_tso_segs(skb, mss_now);
	tcp_set_skb_tso_segs(buff, mss_now);

	/* Link BUFF into the send queue. */
	__skb_header_release(buff);
	tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);

	return 0;
}
  
/* Try to defer sending, if possible, in order to minimize the amount
 * of TSO splitting we do.  View it as a kind of TSO Nagle test.
 *
 * This algorithm is from John Heffner.
 *
 * Returns true to defer this skb; on a deferral caused by window
 * exhaustion *is_cwnd_limited or *is_rwnd_limited is set.
 * Returns false (via send_now) when the skb should go out immediately.
 */
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
				 bool *is_cwnd_limited,
				 bool *is_rwnd_limited,
				 u32 max_segs)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 send_win, cong_win, limit, in_flight;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *head;
	int win_divisor;
	s64 delta;

	/* Never defer while in loss recovery. */
	if (icsk->icsk_ca_state >= TCP_CA_Recovery)
		goto send_now;

	/* Avoid bursty behavior by allowing defer
	 * only if the last write was recent (1 ms).
	 * Note that tp->tcp_wstamp_ns can be in the future if we have
	 * packets waiting in a qdisc or device for EDT delivery.
	 */
	delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
	if (delta > 0)
		goto send_now;

	in_flight = tcp_packets_in_flight(tp);

	BUG_ON(tcp_skb_pcount(skb) <= 1);
	BUG_ON(tp->snd_cwnd <= in_flight);

	/* Bytes available under the receive window... */
	send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

	/* From in_flight test above, we know that cwnd > in_flight.  */
	cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;

	limit = min(send_win, cong_win);

	/* If a full-sized TSO skb can be sent, do it. */
	if (limit >= max_segs * tp->mss_cache)
		goto send_now;

	/* Middle in queue won't get any more data, full sendable already? */
	if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
		goto send_now;

	win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
	if (win_divisor) {
		u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);

		/* If at least some fraction of a window is available,
		 * just use it.
		 */
		chunk /= win_divisor;
		if (limit >= chunk)
			goto send_now;
	} else {
		/* Different approach, try not to defer past a single
		 * ACK.  Receiver should ACK every other full sized
		 * frame, so if we have space for more than 3 frames
		 * then send now.
		 */
		if (limit > tcp_max_tso_deferred_mss(tp) * tp->mss_cache)
			goto send_now;
	}

	/* TODO : use tsorted_sent_queue ? */
	head = tcp_rtx_queue_head(sk);
	if (!head)
		goto send_now;
	delta = tp->tcp_clock_cache - head->tstamp;
	/* If next ACK is likely to come too late (half srtt), do not defer */
	if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0)
		goto send_now;

	/* Ok, it looks like it is advisable to defer.
	 * Three cases are tracked :
	 * 1) We are cwnd-limited
	 * 2) We are rwnd-limited
	 * 3) We are application limited.
	 */
	if (cong_win < send_win) {
		if (cong_win <= skb->len) {
			*is_cwnd_limited = true;
			return true;
		}
	} else {
		if (send_win <= skb->len) {
			*is_rwnd_limited = true;
			return true;
		}
	}

	/* If this packet won't get more data, do not wait. */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
	    TCP_SKB_CB(skb)->eor)
		goto send_now;

	return true;

send_now:
	return false;
}
05cbc0db0   Fan Du   ipv4: Create prob...
1932
1933
1934
1935
1936
1937
1938
1939
1940
  static inline void tcp_mtu_check_reprobe(struct sock *sk)
  {
  	struct inet_connection_sock *icsk = inet_csk(sk);
  	struct tcp_sock *tp = tcp_sk(sk);
  	struct net *net = sock_net(sk);
  	u32 interval;
  	s32 delta;
  
  	interval = net->ipv4.sysctl_tcp_probe_interval;
c74df29a8   Eric Dumazet   tcp: use tcp_jiff...
1941
  	delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
05cbc0db0   Fan Du   ipv4: Create prob...
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
  	if (unlikely(delta >= interval * HZ)) {
  		int mss = tcp_current_mss(sk);
  
  		/* Update current search range */
  		icsk->icsk_mtup.probe_size = 0;
  		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
  			sizeof(struct tcphdr) +
  			icsk->icsk_af_ops->net_header_len;
  		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
  
  		/* Update probe time stamp */
c74df29a8   Eric Dumazet   tcp: use tcp_jiff...
1953
  		icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
05cbc0db0   Fan Du   ipv4: Create prob...
1954
1955
  	}
  }
808cf9e38   Ilya Lesokhin   tcp: Honor the eo...
1956
1957
1958
1959
1960
1961
1962
1963
  static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
  {
  	struct sk_buff *skb, *next;
  
  	skb = tcp_send_head(sk);
  	tcp_for_write_queue_from_safe(skb, next, sk) {
  		if (len <= skb->len)
  			break;
888a5c53c   Willem de Bruijn   tcp: inherit time...
1964
  		if (unlikely(TCP_SKB_CB(skb)->eor) || tcp_has_tx_tstamp(skb))
808cf9e38   Ilya Lesokhin   tcp: Honor the eo...
1965
1966
1967
1968
1969
1970
1971
  			return false;
  
  		len -= skb->len;
  	}
  
  	return true;
  }
5d424d5a6   John Heffner   [TCP]: MTU probing
1972
/* Create a new MTU probe if we are ready.
 * MTU probe is regularly attempting to increase the path MTU by
 * deliberately sending larger packets.  This discovers routing
 * changes resulting in larger path MTUs.
 *
 * Returns 0 if we should wait to probe (no cwnd available),
 *         1 if a probe was sent,
 *         -1 otherwise
 */
static int tcp_mtu_probe(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb, *nskb, *next;
	struct net *net = sock_net(sk);
	int probe_size;
	int size_needed;
	int copy, len;
	int mss_now;
	int interval;

	/* Not currently probing/verifying,
	 * not in recovery,
	 * have enough cwnd, and
	 * not SACKing (the variable headers throw things off)
	 */
	if (likely(!icsk->icsk_mtup.enabled ||
		   icsk->icsk_mtup.probe_size ||
		   inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
		   tp->snd_cwnd < 11 ||
		   tp->rx_opt.num_sacks || tp->rx_opt.dsack))
		return -1;

	/* Use binary search for probe_size between tcp_mss_base,
	 * and current mss_clamp. if (search_high - search_low)
	 * smaller than a threshold, backoff from probing.
	 */
	mss_now = tcp_current_mss(sk);
	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
				    icsk->icsk_mtup.search_low) >> 1);
	/* Reserve room beyond the probe so a lost probe can be detected
	 * by reordering-tolerant logic (reordering + 1 extra segments).
	 */
	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
	/* When misfortune happens, we are reprobing actively,
	 * and then reprobe timer has expired. We stick with current
	 * probing process by not resetting search range to its orignal.
	 */
	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
		interval < net->ipv4.sysctl_tcp_probe_threshold) {
		/* Check whether enough time has elaplased for
		 * another round of probing.
		 */
		tcp_mtu_check_reprobe(sk);
		return -1;
	}

	/* Have enough data in the send queue to probe? */
	if (tp->write_seq - tp->snd_nxt < size_needed)
		return -1;

	/* Enough room in the peer's receive window? */
	if (tp->snd_wnd < size_needed)
		return -1;
	if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp)))
		return 0;

	/* Do we need to wait to drain cwnd? With none in flight, don't stall */
	if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
		if (!tcp_packets_in_flight(tp))
			return -1;
		else
			return 0;
	}

	/* Refuse to merge skbs carrying EOR or tx-timestamp marks. */
	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
		return -1;

	/* We're allowed to probe.  Build it now. */
	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
	if (!nskb)
		return -1;
	sk_wmem_queued_add(sk, nskb->truesize);
	sk_mem_charge(sk, nskb->truesize);

	skb = tcp_send_head(sk);
	skb_copy_decrypted(nskb, skb);

	/* The probe covers probe_size bytes starting at the head skb. */
	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
	TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
	TCP_SKB_CB(nskb)->sacked = 0;
	nskb->csum = 0;
	nskb->ip_summed = CHECKSUM_PARTIAL;

	tcp_insert_write_queue_before(nskb, skb, sk);
	tcp_highest_sack_replace(sk, skb, nskb);

	/* Absorb queued skbs into nskb until probe_size bytes are copied. */
	len = 0;
	tcp_for_write_queue_from_safe(skb, next, sk) {
		copy = min_t(int, skb->len, probe_size - len);
		skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);

		if (skb->len <= copy) {
			/* We've eaten all the data from this skb.
			 * Throw it away. */
			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
			/* If this is the last SKB we copy and eor is set
			 * we need to propagate it to the new skb.
			 */
			TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
			tcp_skb_collapse_tstamp(nskb, skb);
			tcp_unlink_write_queue(skb, sk);
			sk_wmem_free_skb(sk, skb);
		} else {
			/* Partially consumed: trim the copied prefix and
			 * keep FIN/PSH on the remaining original skb.
			 */
			TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
						   ~(TCPHDR_FIN|TCPHDR_PSH);
			if (!skb_shinfo(skb)->nr_frags) {
				skb_pull(skb, copy);
			} else {
				__pskb_trim_head(skb, copy);
				tcp_set_skb_tso_segs(skb, mss_now);
			}
			TCP_SKB_CB(skb)->seq += copy;
		}

		len += copy;

		if (len >= probe_size)
			break;
	}
	tcp_init_tso_segs(nskb, nskb->len);

	/* We're ready to send.  If this fails, the probe will
	 * be resegmented into mss-sized pieces by tcp_write_xmit().
	 */
	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
		/* Decrement cwnd here because we are sending
		 * effectively two packets. */
		tp->snd_cwnd--;
		tcp_event_new_data_sent(sk, nskb);

		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;

		return 1;
	}

	return -1;
}
864e5c090   Eric Dumazet   tcp: optimize tcp...
2116
/* Decide whether transmission must pause for internal pacing.
 * Returns true when the next EDT timestamp (tcp_wstamp_ns) is still in
 * the future, arming the pacing hrtimer (if not already queued) to
 * resume transmission at that time; false means sending may proceed.
 */
static bool tcp_pacing_check(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!tcp_needs_internal_pacing(sk))
		return false;

	/* Release time already reached: no need to pause. */
	if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
		return false;

	if (!hrtimer_is_queued(&tp->pacing_timer)) {
		hrtimer_start(&tp->pacing_timer,
			      ns_to_ktime(tp->tcp_wstamp_ns),
			      HRTIMER_MODE_ABS_PINNED_SOFT);
		/* Hold a socket reference for the armed timer. */
		sock_hold(sk);
	}
	return true;
}
f9616c35a   Eric Dumazet   tcp: implement TS...
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
/* TCP Small Queues :
 * Control number of packets in qdisc/devices to two packets / or ~1 ms.
 * (These limits are doubled for retransmits)
 * This allows for :
 *  - better RTT estimation and ACK scheduling
 *  - faster recovery
 *  - high rates
 * Alas, some drivers / subsystems require a fair amount
 * of queued bytes to ensure line rate.
 * One example is wifi aggregation (802.11 AMPDU)
 *
 * Returns true when the caller must throttle (too many bytes already
 * queued below us); false when @skb may be transmitted now.
 * @factor: extra left-shift applied to the byte limit (callers pass a
 *	larger factor for retransmits, doubling the allowance).
 */
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
				  unsigned int factor)
{
	unsigned long limit;

	/* Base limit: at least two skbs worth of truesize, or roughly
	 * the amount the pacing rate would send in the configured window
	 * (sk_pacing_rate >> sk_pacing_shift).
	 */
	limit = max_t(unsigned long,
		      2 * skb->truesize,
		      sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift));
	/* Without pacing, also cap by the tcp_limit_output_bytes sysctl. */
	if (sk->sk_pacing_status == SK_PACING_NONE)
		limit = min_t(unsigned long, limit,
			      sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
	limit <<= factor;

	/* If a deliberate tx delay is configured, allow enough extra
	 * queued bytes to cover that delay at the current pacing rate.
	 */
	if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
	    tcp_sk(sk)->tcp_tx_delay) {
		u64 extra_bytes = (u64)sk->sk_pacing_rate * tcp_sk(sk)->tcp_tx_delay;

		/* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
		 * approximate our needs assuming an ~100% skb->truesize overhead.
		 * USEC_PER_SEC is approximated by 2^20.
		 * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
		 */
		extra_bytes >>= (20 - 1);
		limit += extra_bytes;
	}

	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
		/* Always send skb if rtx queue is empty.
		 * No need to wait for TX completion to call us back,
		 * after softirq/tasklet schedule.
		 * This helps when TX completions are delayed too much.
		 */
		if (tcp_rtx_queue_empty(sk))
			return false;

		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
		/* It is possible TX completion already happened
		 * before we set TSQ_THROTTLED, so we must
		 * test again the condition.
		 */
		smp_mb__after_atomic();
		if (refcount_read(&sk->sk_wmem_alloc) > limit)
			return true;
	}
	return false;
}
05b055e89   Francis Yan   tcp: instrument t...
2188
2189
  static void tcp_chrono_set(struct tcp_sock *tp, const enum tcp_chrono new)
  {
628174ccc   Eric Dumazet   tcp: uses jiffies...
2190
  	const u32 now = tcp_jiffies32;
efe967cde   Arnd Bergmann   tcp: avoid bogus ...
2191
  	enum tcp_chrono old = tp->chrono_type;
05b055e89   Francis Yan   tcp: instrument t...
2192

efe967cde   Arnd Bergmann   tcp: avoid bogus ...
2193
2194
  	if (old > TCP_CHRONO_UNSPEC)
  		tp->chrono_stat[old - 1] += now - tp->chrono_start;
05b055e89   Francis Yan   tcp: instrument t...
2195
2196
2197
2198
2199
2200
2201
2202
2203
  	tp->chrono_start = now;
  	tp->chrono_type = new;
  }
  
  void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
  
  	/* If there are multiple conditions worthy of tracking in a
0f87230d1   Francis Yan   tcp: instrument h...
2204
2205
  	 * chronograph then the highest priority enum takes precedence
  	 * over the other conditions. So that if something "more interesting"
05b055e89   Francis Yan   tcp: instrument t...
2206
2207
2208
2209
2210
2211
2212
2213
2214
  	 * starts happening, stop the previous chrono and start a new one.
  	 */
  	if (type > tp->chrono_type)
  		tcp_chrono_set(tp, type);
  }
  
  void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
  {
  	struct tcp_sock *tp = tcp_sk(sk);
0f87230d1   Francis Yan   tcp: instrument h...
2215
2216
2217
2218
2219
2220
2221
2222
  
  	/* There are multiple conditions worthy of tracking in a
  	 * chronograph, so that the highest priority enum takes
  	 * precedence over the other conditions (see tcp_chrono_start).
  	 * If a condition stops, we only stop chrono tracking if
  	 * it's the "most interesting" or current chrono we are
  	 * tracking and starts busy chrono if we have pending data.
  	 */
75c119afe   Eric Dumazet   tcp: implement rb...
2223
  	if (tcp_rtx_and_write_queues_empty(sk))
0f87230d1   Francis Yan   tcp: instrument h...
2224
2225
2226
  		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
  	else if (type == tp->chrono_type)
  		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
05b055e89   Francis Yan   tcp: instrument t...
2227
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2228
2229
2230
2231
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * LARGESEND note: !tcp_urg_mode is overkill, only frames between
 * snd_up-64k-mss .. snd_up cannot be large. However, taking into
 * account rare use of URG, this is not a big flaw.
 *
 * Send at most one packet when push_one > 0. Temporarily ignore
 * cwnd limit to force at most one packet out when push_one == 2.
 *
 * Returns true, if no segments are in flight and we have queued segments,
 * but cannot send anything now because of SWS or another problem.
 */
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
			   int push_one, gfp_t gfp)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int tso_segs, sent_pkts;
	int cwnd_quota;
	int result;
	bool is_cwnd_limited = false, is_rwnd_limited = false;
	u32 max_segs;

	sent_pkts = 0;

	/* Refresh the cached timestamp used throughout this pass. */
	tcp_mstamp_refresh(tp);
	if (!push_one) {
		/* Do MTU probing. */
		result = tcp_mtu_probe(sk);
		if (!result) {
			/* Probe in flight: do not send anything else now. */
			return false;
		} else if (result > 0) {
			/* Probe was sent; count it so the tail accounting runs. */
			sent_pkts = 1;
		}
	}

	max_segs = tcp_tso_segs(sk, mss_now);
	while ((skb = tcp_send_head(sk))) {
		unsigned int limit;

		if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
			/* "skb_mstamp_ns" is used as a start point for the retransmit timer */
			skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache;
			list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
			tcp_init_tso_segs(skb, mss_now);
			goto repair; /* Skip network transmission */
		}

		/* Internal pacing says the next send time is in the future. */
		if (tcp_pacing_check(sk))
			break;

		tso_segs = tcp_init_tso_segs(skb, mss_now);
		BUG_ON(!tso_segs);

		/* How many segments does the congestion window still allow? */
		cwnd_quota = tcp_cwnd_test(tp, skb);
		if (!cwnd_quota) {
			if (push_one == 2)
				/* Force out a loss probe pkt. */
				cwnd_quota = 1;
			else
				break;
		}

		/* Receiver window does not cover this skb: stop, and note
		 * that we are receive-window limited for chrono accounting.
		 */
		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
			is_rwnd_limited = true;
			break;
		}

		if (tso_segs == 1) {
			/* Single-segment skb: apply Nagle, forcing PUSH
			 * semantics unless this is the last queued skb.
			 */
			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,
						     (tcp_skb_is_last(sk, skb) ?
						      nonagle : TCP_NAGLE_PUSH))))
				break;
		} else {
			/* Multi-segment skb: possibly defer to grow a
			 * larger TSO burst, except when pushing one packet.
			 */
			if (!push_one &&
			    tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
						 &is_rwnd_limited, max_segs))
				break;
		}

		limit = mss_now;
		if (tso_segs > 1 && !tcp_urg_mode(tp))
			limit = tcp_mss_split_point(sk, skb, mss_now,
						    min_t(unsigned int,
							  cwnd_quota,
							  max_segs),
						    nonagle);

		/* skb exceeds what we may send now: split it in the queue. */
		if (skb->len > limit &&
		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
			break;

		/* TCP Small Queues throttling (see tcp_small_queue_check). */
		if (tcp_small_queue_check(sk, skb, 0))
			break;

		/* Argh, we hit an empty skb(), presumably a thread
		 * is sleeping in sendmsg()/sk_stream_wait_memory().
		 * We do not want to send a pure-ack packet and have
		 * a strange looking rtx queue with empty packet(s).
		 */
		if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
			break;

		if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
			break;

repair:
		/* Advance the send_head.  This one is sent out.
		 * This call will increment packets_out.
		 */
		tcp_event_new_data_sent(sk, skb);

		tcp_minshall_update(tp, mss_now, skb);
		sent_pkts += tcp_skb_pcount(skb);

		if (push_one)
			break;
	}

	/* Maintain the receive-window-limited chronograph. */
	if (is_rwnd_limited)
		tcp_chrono_start(sk, TCP_CHRONO_RWND_LIMITED);
	else
		tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);

	if (likely(sent_pkts)) {
		/* Account sent packets for PRR while in CWND reduction. */
		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += sent_pkts;

		/* Send one loss probe per tail loss episode. */
		if (push_one != 2)
			tcp_schedule_loss_probe(sk, false);
		is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
		tcp_cwnd_validate(sk, is_cwnd_limited);
		return false;
	}
	/* Nothing sent: report SWS-style stall only when nothing is in
	 * flight and data remains queued.
	 */
	return !tp->packets_out && !tcp_write_queue_empty(sk);
}
ed66dfaf2   Neal Cardwell   tcp: when schedul...
2359
/* Arm the Tail Loss Probe (TLP) timer (ICSK_TIME_LOSS_PROBE).
 * @advancing_rto: when true, use the full icsk_rto as the RTO distance
 *	instead of the remaining time computed by tcp_rto_delta_us().
 * Returns true if the probe timer was armed, false if TLP is not
 * applicable to this connection right now.
 */
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	u32 timeout, rto_delta_us;
	int early_retrans;

	/* Don't do any loss probe on a Fast Open connection before 3WHS
	 * finishes.
	 */
	if (rcu_access_pointer(tp->fastopen_rsk))
		return false;

	early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
	/* Schedule a loss probe in 2*RTT for SACK capable connections
	 * not in loss recovery, that are either limited by cwnd or application.
	 * (sysctl_tcp_early_retrans must be 3 or 4 to enable TLP.)
	 */
	if ((early_retrans != 3 && early_retrans != 4) ||
	    !tp->packets_out || !tcp_is_sack(tp) ||
	    (icsk->icsk_ca_state != TCP_CA_Open &&
	     icsk->icsk_ca_state != TCP_CA_CWR))
		return false;

	/* Probe timeout is 2*rtt. Add minimum RTO to account
	 * for delayed ack when there's one outstanding packet. If no RTT
	 * sample is available then probe after TCP_TIMEOUT_INIT.
	 * Note: srtt_us is stored left-shifted by 3, so >> 2 yields 2*srtt.
	 */
	if (tp->srtt_us) {
		timeout = usecs_to_jiffies(tp->srtt_us >> 2);
		if (tp->packets_out == 1)
			timeout += TCP_RTO_MIN;
		else
			timeout += TCP_TIMEOUT_MIN;
	} else {
		timeout = TCP_TIMEOUT_INIT;
	}

	/* If the RTO formula yields an earlier time, then use that time. */
	rto_delta_us = advancing_rto ?
			jiffies_to_usecs(inet_csk(sk)->icsk_rto) :
			tcp_rto_delta_us(sk);  /* How far in future is RTO? */
	if (rto_delta_us > 0)
		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));

	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
			     TCP_RTO_MAX, NULL);
	return true;
}
1f3279ae0   Eric Dumazet   tcp: avoid retran...
2405
2406
2407
  /* Thanks to skb fast clones, we can detect if a prior transmit of
   * a packet is still in a qdisc or driver queue.
   * In this case, there is very little point doing a retransmit !
1f3279ae0   Eric Dumazet   tcp: avoid retran...
2408
2409
2410
2411
   */
  static bool skb_still_in_host_queue(const struct sock *sk,
  				    const struct sk_buff *skb)
  {
39bb5e628   Eric Dumazet   net: skb_fclone_b...
2412
  	if (unlikely(skb_fclone_busy(sk, skb))) {
c10d9310e   Eric Dumazet   tcp: do not assum...
2413
2414
  		NET_INC_STATS(sock_net(sk),
  			      LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
1f3279ae0   Eric Dumazet   tcp: avoid retran...
2415
2416
2417
2418
  		return true;
  	}
  	return false;
  }
b340b2645   Yuchung Cheng   tcp: TLP retransm...
2419
/* When probe timeout (PTO) fires, try send a new segment if possible, else
 * retransmit the last segment.
 */
void tcp_send_loss_probe(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int pcount;
	int mss = tcp_current_mss(sk);

	/* Prefer sending new data if the send head fits the window:
	 * push_one == 2 lets tcp_write_xmit() ignore the cwnd limit.
	 */
	skb = tcp_send_head(sk);
	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
		pcount = tp->packets_out;
		tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
		if (tp->packets_out > pcount)
			goto probe_sent;
		goto rearm_timer;
	}
	/* Otherwise retransmit the last (highest-sequence) rtx-queue skb. */
	skb = skb_rb_last(&sk->tcp_rtx_queue);
	if (unlikely(!skb)) {
		WARN_ONCE(tp->packets_out,
			  "invalid inflight: %u state %u cwnd %u mss %d\n",
			  tp->packets_out, sk->sk_state, tp->snd_cwnd, mss);
		inet_csk(sk)->icsk_pending = 0;
		return;
	}

	/* At most one outstanding TLP retransmission. */
	if (tp->tlp_high_seq)
		goto rearm_timer;

	/* Pointless to retransmit if a prior copy is still queued below us. */
	if (skb_still_in_host_queue(sk, skb))
		goto rearm_timer;

	pcount = tcp_skb_pcount(skb);
	if (WARN_ON(!pcount))
		goto rearm_timer;

	/* Probe with only the last MSS: split off the tail segment. */
	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
		if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
					  (pcount - 1) * mss, mss,
					  GFP_ATOMIC)))
			goto rearm_timer;
		skb = skb_rb_next(skb);
	}

	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
		goto rearm_timer;

	if (__tcp_retransmit_skb(sk, skb, 1))
		goto rearm_timer;

	/* Record snd_nxt for loss detection. */
	tp->tlp_high_seq = tp->snd_nxt;

probe_sent:
	NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
	/* Reset s.t. tcp_rearm_rto will restart timer from now */
	inet_csk(sk)->icsk_pending = 0;
rearm_timer:
	tcp_rearm_rto(sk);
}
a762a9800   David S. Miller   [TCP]: Kill extra...
2479
2480
2481
2482
  /* Push out any pending frames which were held back due to
   * TCP_CORK or attempt at coalescing tiny packets.
   * The socket must be locked by the caller.
   */
9e412ba76   Ilpo Järvinen   [TCP]: Sed magic ...
2483
2484
  void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
  			       int nonagle)
a762a9800   David S. Miller   [TCP]: Kill extra...
2485
  {
726e07a8a   Ilpo Järvinen   tcp: move some pa...
2486
2487
2488
2489
2490
2491
  	/* If we are closed, the bytes will have to remain here.
  	 * In time closedown will finish, we empty the write queue and
  	 * all will be happy.
  	 */
  	if (unlikely(sk->sk_state == TCP_CLOSE))
  		return;
99a1dec70   Mel Gorman   net: introduce sk...
2492
  	if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
7450aaf61   Eric Dumazet   tcp: suppress too...
2493
  			   sk_gfp_mask(sk, GFP_ATOMIC)))
726e07a8a   Ilpo Järvinen   tcp: move some pa...
2494
  		tcp_check_probe_timer(sk);
a762a9800   David S. Miller   [TCP]: Kill extra...
2495
  }
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
2496
2497
2498
2499
2500
  /* Send _single_ skb sitting at the send head. This function requires
   * true push pending frames to setup probe timer etc.
   */
  void tcp_push_one(struct sock *sk, unsigned int mss_now)
  {
fe067e8ab   David S. Miller   [TCP]: Abstract o...
2501
  	struct sk_buff *skb = tcp_send_head(sk);
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
2502
2503
  
  	BUG_ON(!skb || skb->len < mss_now);
d5dd9175b   Ilpo Järvinen   tcp: use tcp_writ...
2504
  	tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation);
c1b4a7e69   David S. Miller   [TCP]: Move to ne...
2505
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2506
2507
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria. The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = icsk->icsk_ack.rcv_mss;
	int free_space = tcp_space(sk);
	int allowed_space = tcp_full_space(sk);
	/* Usable window is bounded by both the clamp and the buffer size. */
	int full_space = min_t(int, tp->window_clamp, allowed_space);
	int window;

	/* Cap the MSS estimate by the total window; a non-positive
	 * result means we cannot announce anything.
	 */
	if (unlikely(mss > full_space)) {
		mss = full_space;
		if (mss <= 0)
			return 0;
	}
	if (free_space < (full_space >> 1)) {
		/* Less than half the space is free: disable quick acks. */
		icsk->icsk_ack.quick = 0;

		if (tcp_under_memory_pressure(sk))
			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
					       4U * tp->advmss);

		/* free_space might become our new window, make sure we don't
		 * increase it due to wscale.
		 */
		free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);

		/* if free space is less than mss estimate, or is below 1/16th
		 * of the maximum allowed, try to move to zero-window, else
		 * tcp_clamp_window() will grow rcv buf up to tcp_rmem[2], and
		 * new incoming data is dropped due to memory limits.
		 * With large window, mss test triggers way too late in order
		 * to announce zero window in time before rmem limit kicks in.
		 */
		if (free_space < (allowed_space >> 4) || free_space < mss)
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	if (tp->rx_opt.rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Import case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 */
		window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
	} else {
		window = tp->rcv_wnd;
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = rounddown(free_space, mss);
		else if (mss == full_space &&
			 free_space > window + (full_space >> 1))
			window = free_space;
	}

	return window;
}
cfea5a688   Martin KaFai Lau   tcp: Merge tx_fla...
2634
2635
  void tcp_skb_collapse_tstamp(struct sk_buff *skb,
  			     const struct sk_buff *next_skb)
082ac2d51   Martin KaFai Lau   tcp: Merge tx_fla...
2636
  {
0a2cf20c3   Soheil Hassas Yeganeh   tcp: remove SKBTX...
2637
2638
2639
  	if (unlikely(tcp_has_tx_tstamp(next_skb))) {
  		const struct skb_shared_info *next_shinfo =
  			skb_shinfo(next_skb);
082ac2d51   Martin KaFai Lau   tcp: Merge tx_fla...
2640
  		struct skb_shared_info *shinfo = skb_shinfo(skb);
0a2cf20c3   Soheil Hassas Yeganeh   tcp: remove SKBTX...
2641
  		shinfo->tx_flags |= next_shinfo->tx_flags & SKBTX_ANY_TSTAMP;
082ac2d51   Martin KaFai Lau   tcp: Merge tx_fla...
2642
  		shinfo->tskey = next_shinfo->tskey;
2de8023e7   Martin KaFai Lau   tcp: Merge txstam...
2643
2644
  		TCP_SKB_CB(skb)->txstamp_ack |=
  			TCP_SKB_CB(next_skb)->txstamp_ack;
082ac2d51   Martin KaFai Lau   tcp: Merge tx_fla...
2645
2646
  	}
  }
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2647
/* Collapses two adjacent SKB's during retransmission.
 * Merges the rb-tree successor of @skb into @skb and frees the successor.
 * Returns false (leaving both skbs untouched) if the payload cannot be
 * moved into @skb.
 * NOTE(review): callers must guarantee a successor exists — next_skb is
 * dereferenced unconditionally below.
 */
static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *next_skb = skb_rb_next(skb);
	int next_skb_size;

	next_skb_size = next_skb->len;

	/* Collapsing is only attempted on single-segment skbs. */
	BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);

	if (next_skb_size) {
		/* Prefer a plain copy into @skb's tailroom; otherwise try to
		 * shift the page frags over. If neither works, give up.
		 */
		if (next_skb_size <= skb_availroom(skb))
			skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
				      next_skb_size);
		else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size))
			return false;
	}
	/* If next_skb was the highest SACKed skb, the merged skb takes over. */
	tcp_highest_sack_replace(sk, next_skb, skb);

	/* Update sequence range on original skb. */
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

	/* Merge over control information. This moves PSH/FIN etc. over */
	TCP_SKB_CB(skb)->tcp_flags |= TCP_SKB_CB(next_skb)->tcp_flags;

	/* All done, get rid of second SKB and account for it so
	 * packet counting does not break.
	 */
	TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
	TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;

	/* changed transmit queue under us so clear hints */
	tcp_clear_retrans_hints_partial(tp);
	if (next_skb == tp->retransmit_skb_hint)
		tp->retransmit_skb_hint = skb;

	tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));

	/* Preserve any tx timestamp request attached to next_skb. */
	tcp_skb_collapse_tstamp(skb, next_skb);

	tcp_rtx_queue_unlink_and_free(next_skb, sk);

	return true;
}
67edfef78   Andi Kleen   TCP: Add comments...
2689
  /* Check if coalescing SKBs is legal. */
a2a385d62   Eric Dumazet   tcp: bool convers...
2690
  static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2691
2692
  {
  	if (tcp_skb_pcount(skb) > 1)
a2a385d62   Eric Dumazet   tcp: bool convers...
2693
  		return false;
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2694
  	if (skb_cloned(skb))
a2a385d62   Eric Dumazet   tcp: bool convers...
2695
  		return false;
2331ccc5b   Eric Dumazet   tcp: enhance tcp ...
2696
  	/* Some heuristics for collapsing over SACK'd could be invented */
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2697
  	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
a2a385d62   Eric Dumazet   tcp: bool convers...
2698
  		return false;
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2699

a2a385d62   Eric Dumazet   tcp: bool convers...
2700
  	return true;
4a17fc3ad   Ilpo Järvinen   tcp: collapse mor...
2701
  }
67edfef78   Andi Kleen   TCP: Add comments...
2702
2703
2704
/* Collapse packets in the retransmit queue to create fewer packets on
 * the wire. This is only done on retransmission.
 * Walks the rtx rb-tree starting at @to and repeatedly merges followers
 * into @to while at most @space bytes are consumed and merging stays legal.
 */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
				     int space)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb = to, *tmp;
	bool first = true;

	if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
		return;
	/* Never collapse a SYN segment. */
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		return;

	skb_rbtree_walk_from_safe(skb, tmp) {
		if (!tcp_can_collapse(sk, skb))
			break;

		if (!tcp_skb_can_collapse_to(to))
			break;

		space -= skb->len;

		/* First iteration only accounts @to itself; nothing to merge yet. */
		if (first) {
			first = false;
			continue;
		}

		if (space < 0)
			break;

		/* The merged skb must not extend past the send window. */
		if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp)))
			break;

		if (!tcp_collapse_retrans(sk, to))
			break;
	}
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2737
2738
2739
2740
/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 * @segs caps how many MSS-sized segments this retransmit may carry.
 */
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	unsigned int cur_mss;
	int diff, len, err;

	/* Inconclusive MTU probe */
	if (icsk->icsk_mtup.probe_size)
		icsk->icsk_mtup.probe_size = 0;

	/* Do not send more than we queued. 1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (refcount_read(&sk->sk_wmem_alloc) >
	    min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
		  sk->sk_sndbuf))
		return -EAGAIN;

	/* Still owned by a lower (qdisc/NIC) queue; resending now is futile. */
	if (skb_still_in_host_queue(sk, skb))
		return -EBUSY;

	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
		/* Fully ACKed skb should never reach the retransmit path. */
		if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
			WARN_ON_ONCE(1);
			return -EINVAL;
		}
		/* Drop the already-acknowledged prefix of the payload. */
		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
			return -ENOMEM;
	}

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	cur_mss = tcp_current_mss(sk);

	/* If receiver has shrunk his window, and skb is out of
	 * new window, do not retransmit it. The exception is the
	 * case, when window is shrunk to zero. In this case
	 * our retransmit serves as a zero window probe.
	 */
	if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
	    TCP_SKB_CB(skb)->seq != tp->snd_una)
		return -EAGAIN;

	len = cur_mss * segs;
	if (skb->len > len) {
		/* Too big for the allowed budget: split off the first @len bytes. */
		if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
				 cur_mss, GFP_ATOMIC))
			return -ENOMEM; /* We'll try again later. */
	} else {
		if (skb_unclone(skb, GFP_ATOMIC))
			return -ENOMEM;

		/* MSS may have changed; re-segment and fix pcount accounting. */
		diff = tcp_skb_pcount(skb);
		tcp_set_skb_tso_segs(skb, cur_mss);
		diff -= tcp_skb_pcount(skb);
		if (diff)
			tcp_adjust_pcount(sk, skb, diff);
		/* Sub-MSS skb: try to merge followers to fill the segment. */
		if (skb->len < cur_mss)
			tcp_retrans_try_collapse(sk, skb, cur_mss);
	}

	/* RFC3168, section 6.1.1.1. ECN fallback */
	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
		tcp_ecn_clear_syn(sk, skb);

	/* Update global and local TCP statistics. */
	segs = tcp_skb_pcount(skb);
	TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
	tp->total_retrans += segs;
	tp->bytes_retrans += skb->len;

	/* make sure skb->data is aligned on arches that require it
	 * and check if ack-trimming & collapsing extended the headroom
	 * beyond what csum_start can cover.
	 */
	if (unlikely((NET_IP_ALIGN && ((unsigned long)skb->data & 3)) ||
		     skb_headroom(skb) >= 0xFFFF)) {
		struct sk_buff *nskb;

		/* Transmit a fresh copy; the tsorted anchor of @skb must be
		 * saved/restored around the copy.
		 */
		tcp_skb_tsorted_save(skb) {
			nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
			err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
				     -ENOBUFS;
		} tcp_skb_tsorted_restore(skb);

		if (!err) {
			/* The original skb carries the timing/rate state. */
			tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
			tcp_rate_skb_sent(sk, skb);
		}
	} else {
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
	}

	/* To avoid taking spuriously low RTT samples based on a timestamp
	 * for a transmit that never happened, always mark EVER_RETRANS
	 */
	TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;

	/* Let an attached BPF sock_ops program observe the retransmit. */
	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG))
		tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
				  TCP_SKB_CB(skb)->seq, segs, err);

	if (likely(!err)) {
		trace_tcp_retransmit_skb(sk, skb);
	} else if (err != -EBUSY) {
		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
	}
	return err;
}
10d3be569   Eric Dumazet   tcp-tso: do not s...
2843
/* Retransmit @skb and, on success, update retrans_out / RETRANS marking.
 * The retrans_stamp and undo_retrans bookkeeping runs regardless of the
 * outcome, so even a failed attempt is recorded for RTO/undo logic.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int err = __tcp_retransmit_skb(sk, skb, segs);

	if (err == 0) {
#if FASTRETRANS_DEBUG > 0
		/* SACKED_RETRANS should have been cleared before resending. */
		if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
			net_dbg_ratelimited("retrans_out leaked\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out += tcp_skb_pcount(skb);
	}

	/* Save stamp of the first (attempted) retransmit. */
	if (!tp->retrans_stamp)
		tp->retrans_stamp = tcp_skb_timestamp(skb);

	if (tp->undo_retrans < 0)
		tp->undo_retrans = 0;
	tp->undo_retrans += tcp_skb_pcount(skb);
	return err;
}
  
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct sk_buff *skb, *rtx_head, *hole = NULL;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 max_segs;
	int mib_idx;

	if (!tp->packets_out)
		return;

	/* Resume at the cached hint if we have one, else at queue head. */
	rtx_head = tcp_rtx_queue_head(sk);
	skb = tp->retransmit_skb_hint ?: rtx_head;
	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
	skb_rbtree_walk_from(skb) {
		__u8 sacked;
		int segs;

		if (tcp_pacing_check(sk))
			break;

		/* we could do better than to assign each time */
		if (!hole)
			tp->retransmit_skb_hint = skb;

		/* Remaining cwnd budget, in segments. */
		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
		if (segs <= 0)
			return;
		sacked = TCP_SKB_CB(skb)->sacked;
		/* In case tcp_shift_skb_data() have aggregated large skbs,
		 * we need to make sure not sending too bigs TSO packets
		 */
		segs = min_t(int, segs, max_segs);

		if (tp->retrans_out >= tp->lost_out) {
			break;
		} else if (!(sacked & TCPCB_LOST)) {
			/* Remember the first non-lost, non-(re)acked skb so
			 * the hint is not advanced past it.
			 */
			if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
				hole = skb;
			continue;

		} else {
			if (icsk->icsk_ca_state != TCP_CA_Loss)
				mib_idx = LINUX_MIB_TCPFASTRETRANS;
			else
				mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS;
		}

		/* Already SACKed or already retransmitted: skip. */
		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
			continue;

		if (tcp_small_queue_check(sk, skb, 1))
			return;

		if (tcp_retransmit_skb(sk, skb, segs))
			return;

		NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));

		if (tcp_in_cwnd_reduction(sk))
			tp->prr_out += tcp_skb_pcount(skb);

		/* Re-arm RTO when the head of the rtx queue was resent. */
		if (skb == rtx_head &&
		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
			tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					     inet_csk(sk)->icsk_rto,
					     TCP_RTO_MAX,
					     skb);
	}
}
d83769a58   Eric Dumazet   tcp: fix possible...
2938
2939
/* We allow to exceed memory limits for FIN packets to expedite
 * connection tear down and (memory) recovery.
 * Otherwise tcp_send_fin() could be tempted to either delay FIN
 * or even be forced to close flow without any FIN.
 * In general, we want to allow one skb per socket to avoid hangs
 * with edge trigger epoll()
 */
void sk_forced_mem_schedule(struct sock *sk, int size)
{
	int amt;

	if (size <= sk->sk_forward_alloc)
		return;
	/* Unconditionally charge the pages, bypassing the usual limits. */
	amt = sk_mem_pages(size);
	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
	sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_charge_skmem(sk->sk_memcg, amt);
}
845704a53   Eric Dumazet   tcp: avoid loopin...
2957
2958
/* Send a FIN. The caller locks the socket for us.
 * We should try to send a FIN packet really hard, but eventually give up.
 */
void tcp_send_fin(struct sock *sk)
{
	struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	/* Optimization, tack on the FIN if we have one skb in write queue and
	 * this skb was not yet sent, or we are under memory pressure.
	 * Note: in the latter case, FIN packet will be sent after a timeout,
	 * as TCP stack thinks it has already been transmitted.
	 */
	if (!tskb && tcp_under_memory_pressure(sk))
		tskb = skb_rb_last(&sk->tcp_rtx_queue);

	if (tskb) {
		/* Piggyback FIN on the tail skb: one more sequence byte. */
		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
		TCP_SKB_CB(tskb)->end_seq++;
		tp->write_seq++;
		if (tcp_write_queue_empty(sk)) {
			/* This means tskb was already sent.
			 * Pretend we included the FIN on previous transmit.
			 * We need to set tp->snd_nxt to the value it would have
			 * if FIN had been sent. This is because retransmit path
			 * does not change tp->snd_nxt.
			 */
			WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
			return;
		}
	} else {
		/* No skb to piggyback on: allocate a dedicated FIN skb. */
		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
		if (unlikely(!skb))
			return;

		INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
		skb_reserve(skb, MAX_TCP_HEADER);
		/* FIN must go out even under memory pressure; force the charge. */
		sk_forced_mem_schedule(sk, skb->truesize);
		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
		tcp_init_nondata_skb(skb, tp->write_seq,
				     TCPHDR_ACK | TCPHDR_FIN);
		tcp_queue_skb(sk, skb);
	}
	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
}
  
/* We get here when a process closes a file descriptor (either due to
 * an explicit close() or as a byproduct of exit()'ing) and there
 * was unread data in the receive queue.  This behavior is recommended
 * by RFC 2525, section 2.17.  -DaveM
 * @priority: GFP flags for the skb allocation.
 */
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
	struct sk_buff *skb;

	TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);

	/* NOTE: No TCP options attached and we never retransmit this. */
	skb = alloc_skb(MAX_TCP_HEADER, priority);
	if (!skb) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
			     TCPHDR_ACK | TCPHDR_RST);
	tcp_mstamp_refresh(tcp_sk(sk));
	/* Send it off. */
	if (tcp_transmit_skb(sk, skb, 0, priority))
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED);

	/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
	 * skb here is different to the troublesome skb, so use NULL
	 */
	trace_tcp_send_reset(sk, NULL);
}
67edfef78   Andi Kleen   TCP: Add comments...
3033
3034
/* Send a crossed SYN-ACK during socket establishment.
 * WARNING: This routine must only be called when we have already sent
 * a SYN packet that crossed the incoming SYN that caused this routine
 * to get called. If this assumption fails then the initial rcv_wnd
 * and rcv_wscale values will not be correct.
 */
int tcp_send_synack(struct sock *sk)
{
	struct sk_buff *skb;

	skb = tcp_rtx_queue_head(sk);
	if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
		pr_err("%s: wrong queue state\n", __func__);
		return -EFAULT;
	}
	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
		if (skb_cloned(skb)) {
			/* Cannot modify a cloned skb in place: replace it in
			 * the rtx queue with a private copy.
			 */
			struct sk_buff *nskb;

			tcp_skb_tsorted_save(skb) {
				nskb = skb_copy(skb, GFP_ATOMIC);
			} tcp_skb_tsorted_restore(skb);
			if (!nskb)
				return -ENOMEM;
			INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
			tcp_highest_sack_replace(sk, skb, nskb);
			tcp_rtx_queue_unlink_and_free(skb, sk);
			__skb_header_release(nskb);
			tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
			/* Account the replacement against the socket. */
			sk_wmem_queued_add(sk, nskb->truesize);
			sk_mem_charge(sk, nskb->truesize);
			skb = nskb;
		}
		/* Upgrade the queued SYN into a SYN-ACK. */
		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
		tcp_ecn_send_synack(sk, skb);
	}
	return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
4aea39c11   Eric Dumazet   tcp: tcp_make_syn...
3072
3073
3074
3075
3076
  /**
   * tcp_make_synack - Prepare a SYN-ACK.
   * sk: listener socket
   * dst: dst entry attached to the SYNACK
   * req: request_sock pointer
4aea39c11   Eric Dumazet   tcp: tcp_make_syn...
3077
3078
3079
3080
   *
   * Allocate one skb and build a SYNACK packet.
   * @dst is consumed : Caller should not use it again.
   */
5d062de7f   Eric Dumazet   tcp: constify tcp...
3081
  struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
e6b4d1136   William Allen Simpson   TCPCT part 1a: ad...
3082
  				struct request_sock *req,
ca6fb0651   Eric Dumazet   tcp: attach SYNAC...
3083
  				struct tcp_fastopen_cookie *foc,
b3d051477   Eric Dumazet   tcp: do not mess ...
3084
  				enum tcp_synack_type synack_type)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3085
  {
2e6599cb8   Arnaldo Carvalho de Melo   [NET] Generalise ...
3086
  	struct inet_request_sock *ireq = inet_rsk(req);
5d062de7f   Eric Dumazet   tcp: constify tcp...
3087
  	const struct tcp_sock *tp = tcp_sk(sk);
80f03e27a   Eric Dumazet   tcp: md5: fix rcu...
3088
  	struct tcp_md5sig_key *md5 = NULL;
5d062de7f   Eric Dumazet   tcp: constify tcp...
3089
3090
  	struct tcp_out_options opts;
  	struct sk_buff *skb;
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
3091
  	int tcp_header_size;
5d062de7f   Eric Dumazet   tcp: constify tcp...
3092
  	struct tcphdr *th;
f5fff5dc8   Tom Quetchenbach   tcp: advertise MS...
3093
  	int mss;
a842fe142   Eric Dumazet   tcp: add optional...
3094
  	u64 now;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3095

ca6fb0651   Eric Dumazet   tcp: attach SYNAC...
3096
  	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
4aea39c11   Eric Dumazet   tcp: tcp_make_syn...
3097
3098
  	if (unlikely(!skb)) {
  		dst_release(dst);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3099
  		return NULL;
4aea39c11   Eric Dumazet   tcp: tcp_make_syn...
3100
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3101
3102
  	/* Reserve space for headers. */
  	skb_reserve(skb, MAX_TCP_HEADER);
b3d051477   Eric Dumazet   tcp: do not mess ...
3103
3104
  	switch (synack_type) {
  	case TCP_SYNACK_NORMAL:
9e17f8a47   Eric Dumazet   net: make skb_set...
3105
  		skb_set_owner_w(skb, req_to_sk(req));
b3d051477   Eric Dumazet   tcp: do not mess ...
3106
3107
3108
3109
3110
3111
3112
  		break;
  	case TCP_SYNACK_COOKIE:
  		/* Under synflood, we do not attach skb to a socket,
  		 * to avoid false sharing.
  		 */
  		break;
  	case TCP_SYNACK_FASTOPEN:
ca6fb0651   Eric Dumazet   tcp: attach SYNAC...
3113
3114
3115
3116
3117
  		/* sk is a const pointer, because we want to express multiple
  		 * cpu might call us concurrently.
  		 * sk->sk_wmem_alloc in an atomic, we can promote to rw.
  		 */
  		skb_set_owner_w(skb, (struct sock *)sk);
b3d051477   Eric Dumazet   tcp: do not mess ...
3118
  		break;
ca6fb0651   Eric Dumazet   tcp: attach SYNAC...
3119
  	}
4aea39c11   Eric Dumazet   tcp: tcp_make_syn...
3120
  	skb_dst_set(skb, dst);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3121

3541f9e8b   Eric Dumazet   tcp: add tcp_mss_...
3122
  	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
f5fff5dc8   Tom Quetchenbach   tcp: advertise MS...
3123

33ad798c9   Adam Langley   tcp: options clea...
3124
  	memset(&opts, 0, sizeof(opts));
a842fe142   Eric Dumazet   tcp: add optional...
3125
  	now = tcp_clock_ns();
8b5f12d04   Florian Westphal   syncookies: fix i...
3126
3127
  #ifdef CONFIG_SYN_COOKIES
  	if (unlikely(req->cookie_ts))
d3edd06ea   Eric Dumazet   tcp: provide earl...
3128
  		skb->skb_mstamp_ns = cookie_init_timestamp(req);
8b5f12d04   Florian Westphal   syncookies: fix i...
3129
3130
  	else
  #endif
9e450c1ec   Yuchung Cheng   tcp: better SYNAC...
3131
  	{
a842fe142   Eric Dumazet   tcp: add optional...
3132
  		skb->skb_mstamp_ns = now;
9e450c1ec   Yuchung Cheng   tcp: better SYNAC...
3133
3134
3135
  		if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
  			tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
  	}
80f03e27a   Eric Dumazet   tcp: md5: fix rcu...
3136
3137
3138
  
  #ifdef CONFIG_TCP_MD5SIG
  	rcu_read_lock();
fd3a154a0   Eric Dumazet   tcp: md5: get rid...
3139
  	md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
80f03e27a   Eric Dumazet   tcp: md5: fix rcu...
3140
  #endif
58d607d3e   Eric Dumazet   tcp: provide skb-...
3141
  	skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
60e2a7780   Ursula Braun   tcp: TCP experime...
3142
3143
  	tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
  					     foc) + sizeof(*th);
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
3144

aa8223c7b   Arnaldo Carvalho de Melo   [SK_BUFF]: Introd...
3145
3146
  	skb_push(skb, tcp_header_size);
  	skb_reset_transport_header(skb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3147

ea1627c20   Eric Dumazet   tcp: minor optimi...
3148
  	th = (struct tcphdr *)skb->data;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3149
3150
3151
  	memset(th, 0, sizeof(struct tcphdr));
  	th->syn = 1;
  	th->ack = 1;
6ac705b18   Eric Dumazet   tcp: remove tcp_e...
3152
  	tcp_ecn_make_synack(req, th);
b44084c2c   Eric Dumazet   inet: rename ir_l...
3153
  	th->source = htons(ireq->ir_num);
634fb979e   Eric Dumazet   inet: includes a ...
3154
  	th->dest = ireq->ir_rmt_port;
e05a90ec9   Jamal Hadi Salim   net: reflect mark...
3155
  	skb->mark = ireq->ir_mark;
3b1177503   Eric Dumazet   tcp: do not mangl...
3156
3157
  	skb->ip_summed = CHECKSUM_PARTIAL;
  	th->seq = htonl(tcp_rsk(req)->snt_isn);
8336886f7   Jerry Chu   tcp: TCP Fast Ope...
3158
3159
  	/* XXX data is queued and acked as is. No buffer/window check */
  	th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3160
3161
  
  	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
ed53d0ab7   Eric Dumazet   net: shrink struc...
3162
  	th->window = htons(min(req->rsk_rcv_wnd, 65535U));
5d062de7f   Eric Dumazet   tcp: constify tcp...
3163
  	tcp_options_write((__be32 *)(th + 1), NULL, &opts);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3164
  	th->doff = (tcp_header_size >> 2);
90bbcc608   Eric Dumazet   net: tcp: rename ...
3165
  	__TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
3166
3167
3168
  
  #ifdef CONFIG_TCP_MD5SIG
  	/* Okay, we have all we need - do the md5 hash if needed */
80f03e27a   Eric Dumazet   tcp: md5: fix rcu...
3169
  	if (md5)
bd0388ae7   William Allen Simpson   TCPCT part 1f: In...
3170
  		tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
39f8e58e5   Eric Dumazet   tcp: md5: remove ...
3171
  					       md5, req_to_sk(req), skb);
80f03e27a   Eric Dumazet   tcp: md5: fix rcu...
3172
  	rcu_read_unlock();
cfb6eeb4c   YOSHIFUJI Hideaki   [TCP]: MD5 Signat...
3173
  #endif
a842fe142   Eric Dumazet   tcp: add optional...
3174
3175
  	skb->skb_mstamp_ns = now;
  	tcp_add_tx_delay(skb, tp);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3176
3177
  	return skb;
  }
4bc2f18ba   Eric Dumazet   net/ipv4: EXPORT_...
3178
  EXPORT_SYMBOL(tcp_make_synack);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3179

81164413a   Daniel Borkmann   net: tcp: add per...
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
/* Switch this socket's congestion control ops to the algorithm referenced
 * by the route's RTAX_CC_ALGO metric, if one is configured.  A module
 * reference is taken on the new ops before the old ops' reference is
 * dropped, so icsk_ca_ops never points at an unloadable module.
 */
static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcp_congestion_ops *ca;
	u32 ca_key = dst_metric(dst, RTAX_CC_ALGO);

	/* No per-route congestion control configured: keep current ops. */
	if (ca_key == TCP_CA_UNSPEC)
		return;

	rcu_read_lock();
	ca = tcp_ca_find_key(ca_key);
	/* Only switch if the new ops' module ref can be pinned. */
	if (likely(ca && try_module_get(ca->owner))) {
		module_put(icsk->icsk_ca_ops->owner);
		icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
		icsk->icsk_ca_ops = ca;
	}
	rcu_read_unlock();
}
67edfef78   Andi Kleen   TCP: Add comments...
3198
/* Do all connect socket setups that can be done AF independent.
 * Called from tcp_connect() before the SYN is built: computes the
 * advertised header length/MSS, picks the initial receive window and
 * window scale, and resets the send/receive sequence state.
 */
static void tcp_connect_init(struct sock *sk)
{
	const struct dst_entry *dst = __sk_dst_get(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__u8 rcv_wscale;
	u32 rcv_wnd;

	/* We'll fix this up when we get a response from the other end.
	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
	 */
	tp->tcp_header_len = sizeof(struct tcphdr);
	if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
		tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;

#ifdef CONFIG_TCP_MD5SIG
	/* Reserve option space for MD5 if a key exists for this socket. */
	if (tp->af_specific->md5_lookup(sk, sk))
		tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif

	/* If user gave his TCP_MAXSEG, record it to clamp */
	if (tp->rx_opt.user_mss)
		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
	tp->max_window = 0;
	tcp_mtup_init(sk);
	tcp_sync_mss(sk, dst_mtu(dst));
	/* Honor a per-route congestion control algorithm, if any. */
	tcp_ca_dst_init(sk, dst);

	if (!tp->window_clamp)
		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
	tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));

	tcp_initialize_rcv_mss(sk);

	/* limit the window selection if the user enforce a smaller rx buffer */
	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
		tp->window_clamp = tcp_full_space(sk);

	/* Let a BPF program override the initial receive window;
	 * fall back to the route metric when it returns 0.
	 */
	rcv_wnd = tcp_rwnd_init_bpf(sk);
	if (rcv_wnd == 0)
		rcv_wnd = dst_metric(dst, RTAX_INITRWND);

	tcp_select_initial_window(sk, tcp_full_space(sk),
				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
				  &tp->rcv_wnd,
				  &tp->window_clamp,
				  sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
				  &rcv_wscale,
				  rcv_wnd);

	tp->rx_opt.rcv_wscale = rcv_wscale;
	tp->rcv_ssthresh = tp->rcv_wnd;

	/* Reset connection state for a fresh (re)connect. */
	sk->sk_err = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->snd_wnd = 0;
	tcp_init_wl(tp, 0);
	tcp_write_queue_purge(sk);
	tp->snd_una = tp->write_seq;
	tp->snd_sml = tp->write_seq;
	tp->snd_up = tp->write_seq;
	WRITE_ONCE(tp->snd_nxt, tp->write_seq);

	/* In repair mode rcv_nxt was restored by the user; keep it and
	 * only refresh the receive timestamp.
	 */
	if (likely(!tp->repair))
		tp->rcv_nxt = 0;
	else
		tp->rcv_tstamp = tcp_jiffies32;
	tp->rcv_wup = tp->rcv_nxt;
	WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);

	inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
	inet_csk(sk)->icsk_retransmits = 0;
	tcp_clear_retrans(tp);
}
783237e8d   Yuchung Cheng   net-tcp: Fast Ope...
3269
3270
3271
3272
3273
3274
/* Account @skb against the socket as outgoing SYN (or SYN-data) payload:
 * charge its memory to the socket and advance write_seq/packets_out.
 * The skb itself is queued by the caller (write queue or rtx rbtree).
 */
static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	tcb->end_seq += skb->len;
	__skb_header_release(skb);
	sk_wmem_queued_add(sk, skb->truesize);
	sk_mem_charge(sk, skb->truesize);
	/* WRITE_ONCE pairs with lockless readers of tp->write_seq. */
	WRITE_ONCE(tp->write_seq, tcb->end_seq);
	tp->packets_out += tcp_skb_pcount(skb);
}
  
/* Build and send a SYN with data and (cached) Fast Open cookie. However,
 * queue a data-only packet after the regular SYN, such that regular SYNs
 * are retransmitted on timeouts. Also if the remote SYN-ACK acknowledges
 * only the SYN sequence, the data are retransmitted in the first ACK.
 * If cookie is not cached or other error occurs, falls back to send a
 * regular SYN with Fast Open cookie request option.
 *
 * @sk:  connecting socket (owns tp->fastopen_req)
 * @syn: the already-built bare SYN skb, used as the fallback transmit
 * Returns the tcp_transmit_skb() result of whichever packet was sent.
 */
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_fastopen_request *fo = tp->fastopen_req;
	int space, err = 0;
	struct sk_buff *syn_data;

	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
	/* Without a cached cookie we cannot send data in the SYN. */
	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
		goto fallback;

	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
	 * user-MSS. Reserve maximum option space for middleboxes that add
	 * private TCP options. The cost is reduced data space in SYN :(
	 */
	tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
	space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
		MAX_TCP_OPTION_SPACE;

	space = min_t(size_t, space, fo->size);

	/* limit to order-0 allocations */
	space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));

	syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
	if (!syn_data)
		goto fallback;
	syn_data->ip_summed = CHECKSUM_PARTIAL;
	memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
	if (space) {
		/* Copy user data from the pending sendmsg into the SYN. */
		int copied = copy_from_iter(skb_put(syn_data, space), space,
					    &fo->data->msg_iter);
		if (unlikely(!copied)) {
			tcp_skb_tsorted_anchor_cleanup(syn_data);
			kfree_skb(syn_data);
			goto fallback;
		}
		/* Partial copy: shrink the skb to what was actually copied. */
		if (copied != space) {
			skb_trim(syn_data, copied);
			space = copied;
		}
		skb_zcopy_set(syn_data, fo->uarg, NULL);
	}
	/* No more data pending in inet_wait_for_connect() */
	if (space == fo->size)
		fo->data = NULL;
	fo->copied = space;

	tcp_connect_queue_skb(sk, syn_data);
	if (syn_data->len)
		tcp_chrono_start(sk, TCP_CHRONO_BUSY);

	err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);

	/* Keep the bare SYN's timestamp in sync with what was sent. */
	syn->skb_mstamp_ns = syn_data->skb_mstamp_ns;

	/* Now full SYN+DATA was cloned and sent (or not),
	 * remove the SYN from the original skb (syn_data)
	 * we keep in write queue in case of a retransmit, as we
	 * also have the SYN packet (with no data) in the same queue.
	 */
	TCP_SKB_CB(syn_data)->seq++;
	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
	if (!err) {
		tp->syn_data = (fo->copied > 0);
		tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
		goto done;
	}

	/* data was not sent, put it in write_queue */
	__skb_queue_tail(&sk->sk_write_queue, syn_data);
	/* Undo the packets_out accounting done in tcp_connect_queue_skb(). */
	tp->packets_out -= tcp_skb_pcount(syn_data);

fallback:
	/* Send a regular SYN with Fast Open cookie request option */
	if (fo->cookie.len > 0)
		fo->cookie.len = 0;
	err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
	if (err)
		tp->syn_fastopen = 0;
done:
	fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
	return err;
}
67edfef78   Andi Kleen   TCP: Add comments...
3371
/* Build a SYN and send it off.
 * Performs all AF-independent connect setup, builds the SYN skb (with
 * data when Fast Open is requested), transmits it, and arms the
 * retransmit timer.  Returns 0 on success or a negative errno.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	tcp_connect_init(sk);

	/* TCP_REPAIR sockets skip the handshake entirely. */
	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tcp_mstamp_refresh(tp);
	tp->retrans_stamp = tcp_time_stamp(tp);
	tcp_connect_queue_skb(sk, buff);
	tcp_ecn_send_syn(sk, buff);
	/* SYN goes on the retransmit rbtree, not the write queue. */
	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	WRITE_ONCE(tp->snd_nxt, tp->write_seq);
	tp->pushed_seq = tp->write_seq;
	/* If Fast Open data could not be sent, it sits on the write queue:
	 * rewind snd_nxt/pushed_seq to its start so it is (re)sent later.
	 */
	buff = tcp_send_head(sk);
	if (unlikely(buff)) {
		WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
		tp->pushed_seq	= TCP_SKB_CB(buff)->seq;
	}
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
EXPORT_SYMBOL(tcp_connect);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3421
3422
3423
3424
3425
3426
3427
  
/* Send out a delayed ack, the caller does the policy checking
 * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
 * for details.
 *
 * Computes an ACK timeout bounded by the delayed-ACK parameters and the
 * measured RTT, then either sends the ACK immediately (if the delack
 * timer is blocked or about to fire) or (re)arms the delack timer.
 */
void tcp_send_delayed_ack(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	int ato = icsk->icsk_ack.ato;
	unsigned long timeout;

	if (ato > TCP_DELACK_MIN) {
		const struct tcp_sock *tp = tcp_sk(sk);
		int max_ato = HZ / 2;

		/* Interactive (pingpong) sessions or a pushed ACK allow a
		 * larger upper bound on the delay.
		 */
		if (inet_csk_in_pingpong_mode(sk) ||
		    (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
			max_ato = TCP_DELACK_MAX;

		/* Slow path, intersegment interval is "high". */

		/* If some rtt estimate is known, use it to bound delayed ack.
		 * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
		 * directly.
		 */
		if (tp->srtt_us) {
			/* srtt_us is stored <<3; convert to jiffies and
			 * clamp below at TCP_DELACK_MIN.
			 */
			int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
					TCP_DELACK_MIN);

			if (rtt < max_ato)
				max_ato = rtt;
		}

		ato = min(ato, max_ato);
	}

	/* Stay within the limit we were given */
	timeout = jiffies + ato;

	/* Use new timeout only if there wasn't a older one earlier. */
	if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
		/* If delack timer was blocked or is about to expire,
		 * send ACK now.
		 */
		if (icsk->icsk_ack.blocked ||
		    time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
			tcp_send_ack(sk);
			return;
		}

		if (!time_before(timeout, icsk->icsk_ack.timeout))
			timeout = icsk->icsk_ack.timeout;
	}
	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
	icsk->icsk_ack.timeout = timeout;
	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
  
/* This routine sends an ack and also updates the window.
 * @rcv_nxt is the ack sequence to advertise (tcp_send_ack() passes
 * tp->rcv_nxt).  On allocation failure, falls back to (re)arming the
 * delayed-ACK timer so the ACK is retried later.
 */
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
{
	struct sk_buff *buff;

	/* If we have been reset, we may not send again. */
	if (sk->sk_state == TCP_CLOSE)
		return;

	/* We are not putting this on the write queue, so
	 * tcp_transmit_skb() will set the ownership to this
	 * sock.
	 */
	buff = alloc_skb(MAX_TCP_HEADER,
			 sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (unlikely(!buff)) {
		/* No memory: schedule the ACK via the delack timer instead. */
		inet_csk_schedule_ack(sk);
		inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
					  TCP_DELACK_MAX, TCP_RTO_MAX);
		return;
	}

	/* Reserve space for headers and prepare control bits. */
	skb_reserve(buff, MAX_TCP_HEADER);
	tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);

	/* We do not want pure acks influencing TCP Small Queues or fq/pacing
	 * too much.
	 * SKB_TRUESIZE(max(1 .. 66, MAX_TCP_HEADER)) is unfortunately ~784
	 */
	skb_set_tcp_pure_ack(buff);

	/* Send it off, this clears delayed acks for us. */
	__tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
}
EXPORT_SYMBOL_GPL(__tcp_send_ack);
2987babb6   Yuchung Cheng   tcp: helpers to s...
3514
3515
3516
3517
  
  void tcp_send_ack(struct sock *sk)
  {
  	__tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
  }
  
/* This routine sends a packet with an out of date sequence
 * number. It assumes the other end will try to ack it.
 *
 * Question: what should we make while urgent mode?
 * 4.4BSD forces sending single byte of data. We cannot send
 * out of window data, because we have SND.NXT==SND.MAX...
 *
 * Current solution: to send TWO zero-length segments in urgent mode:
 * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
 * out-of-date with SND.UNA-1 to probe window.
 *
 * @urgent: non-zero selects SEG.SEQ=SND.UNA, zero selects SND.UNA-1.
 * @mib: SNMP counter to bump for this probe.
 * Returns the tcp_transmit_skb() result, or -1 on allocation failure.
 */
static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	/* We don't queue it, tcp_transmit_skb() sets ownership. */
	skb = alloc_skb(MAX_TCP_HEADER,
			sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
	if (!skb)
		return -1;

	/* Reserve space for headers and set control bits. */
	skb_reserve(skb, MAX_TCP_HEADER);
	/* Use a previous sequence.  This should cause the other
	 * end to send an ack.  Don't queue or clone SKB, just
	 * send it.
	 */
	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
	NET_INC_STATS(sock_net(sk), mib);
	return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
385e20706   Eric Dumazet   tcp: use tp->tcp_...
3552
  /* Called from setsockopt( ... TCP_REPAIR ) */
ee9952831   Pavel Emelyanov   tcp: Initial repa...
3553
3554
3555
3556
  void tcp_send_window_probe(struct sock *sk)
  {
  	if (sk->sk_state == TCP_ESTABLISHED) {
  		tcp_sk(sk)->snd_wl1 = tcp_sk(sk)->rcv_nxt - 1;
9a568de48   Eric Dumazet   tcp: switch TCP T...
3557
  		tcp_mstamp_refresh(tcp_sk(sk));
e520af48c   Eric Dumazet   tcp: add TCPWinPr...
3558
  		tcp_xmit_probe_skb(sk, 0, LINUX_MIB_TCPWINPROBE);
ee9952831   Pavel Emelyanov   tcp: Initial repa...
3559
3560
  	}
  }
67edfef78   Andi Kleen   TCP: Add comments...
3561
/* Initiate keepalive or window probe from timer.
 * If a queued segment fits (at least partially) in the announced window,
 * send real data (fragmenting down to the window/MSS if necessary);
 * otherwise fall back to a zero-length probe via tcp_xmit_probe_skb().
 * Returns the transmit result, or -1 if the socket is closed or
 * fragmentation fails.
 */
int tcp_write_wakeup(struct sock *sk, int mib)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;

	if (sk->sk_state == TCP_CLOSE)
		return -1;

	skb = tcp_send_head(sk);
	if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
		int err;
		unsigned int mss = tcp_current_mss(sk);
		/* Bytes of this skb that fit within the send window. */
		unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;

		if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
			tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;

		/* We are probing the opening of a window
		 * but the window size is != 0
		 * must have been a result SWS avoidance ( sender )
		 */
		if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
		    skb->len > mss) {
			seg_size = min(seg_size, mss);
			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
			if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
					 skb, seg_size, mss, GFP_ATOMIC))
				return -1;
		} else if (!tcp_skb_pcount(skb))
			/* Ensure GSO segment accounting is initialized. */
			tcp_set_skb_tso_segs(skb, mss);

		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
		err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
		if (!err)
			tcp_event_new_data_sent(sk, skb);
		return err;
	} else {
		/* No sendable data: in urgent mode, first probe with
		 * SND.UNA to deliver the urgent pointer (see
		 * tcp_xmit_probe_skb() comment), then the regular
		 * SND.UNA-1 window probe.
		 */
		if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF))
			tcp_xmit_probe_skb(sk, 1, mib);
		return tcp_xmit_probe_skb(sk, 0, mib);
	}
}
  
/* A window probe timeout has occurred.  If window is not closed send
 * a partial packet else a zero probe.
 *
 * Cancels further probing once packets are in flight (or the write
 * queue is empty); otherwise counts the probe and rearms the probe0
 * timer with exponential backoff capped by sysctl_tcp_retries2.
 */
void tcp_send_probe0(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	unsigned long timeout;
	int err;

	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);

	if (tp->packets_out || tcp_write_queue_empty(sk)) {
		/* Cancel probe timer, if it is not required. */
		icsk->icsk_probes_out = 0;
		icsk->icsk_backoff = 0;
		return;
	}

	icsk->icsk_probes_out++;
	if (err <= 0) {
		/* Probe was sent (or send failed hard): back off. */
		if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
			icsk->icsk_backoff++;
		timeout = tcp_probe0_when(sk, TCP_RTO_MAX);
	} else {
		/* If packet was not sent due to local congestion,
		 * Let senders fight for local resources conservatively.
		 */
		timeout = TCP_RESOURCE_PROBE_INTERVAL;
	}
	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL);
}
5db92c994   Octavian Purdila   tcp: unify tcp_v4...
3635

ea3bea3a1   Eric Dumazet   tcp/dccp: constif...
3636
/* Retransmit a SYNACK for request @req via the AF-specific send_synack()
 * handler, updating retransmit SNMP counters on success.
 * Returns the send_synack() result (0 on success).
 */
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
{
	const struct tcp_request_sock_ops *af_ops = tcp_rsk(req)->af_specific;
	struct flowi fl;
	int res;

	/* Pick a fresh tx hash so the retransmit may take another path. */
	tcp_rsk(req)->txhash = net_tx_rndhash();
	res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
	if (!res) {
		__TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
		__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
		if (unlikely(tcp_passive_fastopen(sk)))
			tcp_sk(sk)->total_retrans++;
		trace_tcp_retransmit_synack(sk, req);
	}
	return res;
}
EXPORT_SYMBOL(tcp_rtx_synack);