Commit e6c022a4fa2d2d9ca9d0a7ac3b05ad988f39fc30

Authored by Eric Dumazet
Committed by David S. Miller
1 parent 25b1e67921

tcp: better retrans tracking for defer-accept

For passive TCP connections using the TCP_DEFER_ACCEPT facility,
we incorrectly increment req->retrans each time the timeout triggers,
even though no SYNACK is sent.

SYNACKs are not sent for TCP_DEFER_ACCEPT requests that are already
established (for which we received the ACK from the client). Only the
last SYNACK is sent, so that we can receive the client's ACK again and
move the req into the accept queue. We plan to change this later to
avoid the useless retransmit (and the potential problem of this SYNACK
being lost).
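
For context (not part of this change), an application opts into
defer-accept on the listening socket with a plain setsockopt() call;
a minimal userspace sketch:

	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	/* Illustrative only: wake accept() only once data arrives, waiting
	 * up to ~5 seconds after the handshake completes. Requests parked
	 * in this deferred state are exactly the ones whose timeouts fire
	 * without any SYNACK being (re)sent.
	 */
	static int enable_defer_accept(int listen_fd)
	{
		int secs = 5;

		return setsockopt(listen_fd, IPPROTO_TCP, TCP_DEFER_ACCEPT,
				  &secs, sizeof(secs));
	}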

TCP_INFO later gives wrong information to userspace, claiming
imaginary retransmits.
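
The miscount is visible to applications because tcp_get_info() copies
tp->total_retrans into tcpi_total_retrans; a minimal sketch of such a
query (illustrative, not part of the patch):

	#include <stdio.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>

	/* Illustrative only: tcpi_total_retrans is seeded from the request
	 * sock when the connection is accepted, so the bogus req->retrans
	 * count showed up here even though no segment was ever retransmitted.
	 */
	static void dump_retrans(int fd)
	{
		struct tcp_info info;
		socklen_t len = sizeof(info);

		if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
			printf("total_retrans: %u\n", info.tcpi_total_retrans);
	}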

Split the req->retrans field into two independent fields:

num_retrans : number of SYNACK retransmits
num_timeout : number of timeouts

num_timeout is the counter that is incremented at each timeout,
regardless of whether a SYNACK is actually sent, and is used to
compute the exponential timeout.
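
A simplified, userspace restatement of that backoff rule (the real
computation lives in the timer paths changed below, e.g.
inet_csk_reqsk_queue_prune() and the Fast Open SYNACK timer):

	/* Sketch only: the retransmit timer doubles the delay on every
	 * timeout, capped at max_rto, keyed off num_timeout rather than
	 * the number of SYNACKs actually sent.
	 */
	static unsigned long synack_backoff(unsigned long base_timeout,
					    unsigned int num_timeout,
					    unsigned long max_rto)
	{
		unsigned long timeo = base_timeout << num_timeout;

		return timeo < max_rto ? timeo : max_rto;
	}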

Introduce an inet_rtx_syn_ack() helper that increments num_retrans
only if ->rtx_syn_ack() succeeded.

Use inet_rtx_syn_ack() from tcp_check_req() to increment num_retrans
when we re-send a SYNACK in answer to a (retransmitted) SYN.
Prior to this patch, we were not counting these retransmits.

Change tcp_v[46]_rtx_synack() to increment TCP_MIB_RETRANSSEGS
only if a SYNACK packet was successfully queued.

Reported-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Elliott Hughes <enh@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

11 changed files with 56 additions and 35 deletions

include/net/request_sock.h
... ... @@ -49,13 +49,16 @@
49 49 struct request_sock *req);
50 50 };
51 51  
  52 +extern int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req);
  53 +
52 54 /* struct request_sock - mini sock to represent a connection request
53 55 */
54 56 struct request_sock {
55 57 struct request_sock *dl_next; /* Must be first member! */
56 58 u16 mss;
57   - u8 retrans;
58   - u8 cookie_ts; /* syncookie: encode tcpopts in timestamp */
  59 + u8 num_retrans; /* number of retransmits */
  60 + u8 cookie_ts:1; /* syncookie: encode tcpopts in timestamp */
  61 + u8 num_timeout:7; /* number of timeouts */
59 62 /* The following two fields can be easily recomputed I think -AK */
60 63 u32 window_clamp; /* window clamp at creation time */
61 64 u32 rcv_wnd; /* rcv_wnd offered first time */
... ... @@ -231,7 +234,7 @@
231 234 {
232 235 struct listen_sock *lopt = queue->listen_opt;
233 236  
234   - if (req->retrans == 0)
  237 + if (req->num_timeout == 0)
235 238 --lopt->qlen_young;
236 239  
237 240 return --lopt->qlen;
... ... @@ -269,7 +272,8 @@
269 272 struct listen_sock *lopt = queue->listen_opt;
270 273  
271 274 req->expires = jiffies + timeout;
272   - req->retrans = 0;
  275 + req->num_retrans = 0;
  276 + req->num_timeout = 0;
273 277 req->sk = NULL;
274 278 req->dl_next = lopt->syn_table[hash];
275 279  
net/dccp/minisocks.c
... ... @@ -174,8 +174,7 @@
174 174 * To protect against Request floods, increment retrans
175 175 * counter (backoff, monitored by dccp_response_timer).
176 176 */
177   - req->retrans++;
178   - req->rsk_ops->rtx_syn_ack(sk, req, NULL);
  177 + inet_rtx_syn_ack(sk, req);
179 178 }
180 179 /* Network Duplicate, discard packet */
181 180 return NULL;
net/ipv4/inet_connection_sock.c
... ... @@ -521,21 +521,31 @@
521 521 int *expire, int *resend)
522 522 {
523 523 if (!rskq_defer_accept) {
524   - *expire = req->retrans >= thresh;
  524 + *expire = req->num_timeout >= thresh;
525 525 *resend = 1;
526 526 return;
527 527 }
528   - *expire = req->retrans >= thresh &&
529   - (!inet_rsk(req)->acked || req->retrans >= max_retries);
  528 + *expire = req->num_timeout >= thresh &&
  529 + (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
530 530 /*
531 531 * Do not resend while waiting for data after ACK,
532 532 * start to resend on end of deferring period to give
533 533 * last chance for data or ACK to create established socket.
534 534 */
535 535 *resend = !inet_rsk(req)->acked ||
536   - req->retrans >= rskq_defer_accept - 1;
  536 + req->num_timeout >= rskq_defer_accept - 1;
537 537 }
538 538  
  539 +int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req)
  540 +{
  541 + int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL);
  542 +
  543 + if (!err)
  544 + req->num_retrans++;
  545 + return err;
  546 +}
  547 +EXPORT_SYMBOL(inet_rtx_syn_ack);
  548 +
539 549 void inet_csk_reqsk_queue_prune(struct sock *parent,
540 550 const unsigned long interval,
541 551 const unsigned long timeout,
542 552  
543 553  
... ... @@ -599,13 +609,14 @@
599 609 req->rsk_ops->syn_ack_timeout(parent, req);
600 610 if (!expire &&
601 611 (!resend ||
602   - !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
  612 + !inet_rtx_syn_ack(parent, req) ||
603 613 inet_rsk(req)->acked)) {
604 614 unsigned long timeo;
605 615  
606   - if (req->retrans++ == 0)
  616 + if (req->num_timeout++ == 0)
607 617 lopt->qlen_young--;
608   - timeo = min((timeout << req->retrans), max_rto);
  618 + timeo = min(timeout << req->num_timeout,
  619 + max_rto);
609 620 req->expires = now + timeo;
610 621 reqp = &req->dl_next;
611 622 continue;
net/ipv4/inet_diag.c
... ... @@ -620,7 +620,7 @@
620 620 r->idiag_family = sk->sk_family;
621 621 r->idiag_state = TCP_SYN_RECV;
622 622 r->idiag_timer = 1;
623   - r->idiag_retrans = req->retrans;
  623 + r->idiag_retrans = req->num_retrans;
624 624  
625 625 r->id.idiag_if = sk->sk_bound_dev_if;
626 626 sock_diag_save_cookie(req, r->id.idiag_cookie);
net/ipv4/syncookies.c
... ... @@ -340,7 +340,7 @@
340 340 }
341 341  
342 342 req->expires = 0UL;
343   - req->retrans = 0;
  343 + req->num_retrans = 0;
344 344  
345 345 /*
346 346 * We need to lookup the route here to get at the correct
net/ipv4/tcp_input.c
... ... @@ -5991,7 +5991,7 @@
5991 5991 */
5992 5992 if (req) {
5993 5993 tcp_synack_rtt_meas(sk, req);
5994   - tp->total_retrans = req->retrans;
  5994 + tp->total_retrans = req->num_retrans;
5995 5995  
5996 5996 reqsk_fastopen_remove(sk, req, false);
5997 5997 } else {
net/ipv4/tcp_ipv4.c
... ... @@ -877,10 +877,13 @@
877 877 }
878 878  
879 879 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
880   - struct request_values *rvp)
  880 + struct request_values *rvp)
881 881 {
882   - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
883   - return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
  882 + int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
  883 +
  884 + if (!res)
  885 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
  886 + return res;
884 887 }
885 888  
886 889 /*
... ... @@ -1386,7 +1389,8 @@
1386 1389 struct sock *child;
1387 1390 int err;
1388 1391  
1389   - req->retrans = 0;
  1392 + req->num_retrans = 0;
  1393 + req->num_timeout = 0;
1390 1394 req->sk = NULL;
1391 1395  
1392 1396 child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
... ... @@ -1740,7 +1744,7 @@
1740 1744  
1741 1745 tcp_initialize_rcv_mss(newsk);
1742 1746 tcp_synack_rtt_meas(newsk, req);
1743   - newtp->total_retrans = req->retrans;
  1747 + newtp->total_retrans = req->num_retrans;
1744 1748  
1745 1749 #ifdef CONFIG_TCP_MD5SIG
1746 1750 /* Copy over the MD5 key from the original socket */
... ... @@ -2638,7 +2642,7 @@
2638 2642 0, 0, /* could print option size, but that is af dependent. */
2639 2643 1, /* timers active (only the expire timer) */
2640 2644 jiffies_delta_to_clock_t(delta),
2641   - req->retrans,
  2645 + req->num_timeout,
2642 2646 from_kuid_munged(seq_user_ns(f), uid),
2643 2647 0, /* non standard timer */
2644 2648 0, /* open_requests have no inode */
net/ipv4/tcp_minisocks.c
... ... @@ -552,7 +552,7 @@
552 552 * it can be estimated (approximately)
553 553 * from another data.
554 554 */
555   - tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
  555 + tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
556 556 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
557 557 }
558 558 }
... ... @@ -581,7 +581,7 @@
581 581 * Note that even if there is new data in the SYN packet
582 582 * they will be thrown away too.
583 583 */
584   - req->rsk_ops->rtx_syn_ack(sk, req, NULL);
  584 + inet_rtx_syn_ack(sk, req);
585 585 return NULL;
586 586 }
587 587  
... ... @@ -695,7 +695,7 @@
695 695 /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */
696 696 if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr)
697 697 tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr;
698   - else if (req->retrans) /* don't take RTT sample if retrans && ~TS */
  698 + else if (req->num_retrans) /* don't take RTT sample if retrans && ~TS */
699 699 tcp_rsk(req)->snt_synack = 0;
700 700  
701 701 /* For Fast Open no more processing is needed (sk is the
... ... @@ -705,7 +705,7 @@
705 705 return sk;
706 706  
707 707 /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
708   - if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
  708 + if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
709 709 TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
710 710 inet_rsk(req)->acked = 1;
711 711 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
net/ipv4/tcp_timer.c
... ... @@ -318,7 +318,7 @@
318 318 req = tcp_sk(sk)->fastopen_rsk;
319 319 req->rsk_ops->syn_ack_timeout(sk, req);
320 320  
321   - if (req->retrans >= max_retries) {
  321 + if (req->num_timeout >= max_retries) {
322 322 tcp_write_err(sk);
323 323 return;
324 324 }
325 325  
... ... @@ -327,10 +327,10 @@
327 327 * regular retransmit because if the child socket has been accepted
328 328 * it's not good to give up too easily.
329 329 */
330   - req->rsk_ops->rtx_syn_ack(sk, req, NULL);
331   - req->retrans++;
  330 + inet_rtx_syn_ack(sk, req);
  331 + req->num_timeout++;
332 332 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
333   - TCP_TIMEOUT_INIT << req->retrans, TCP_RTO_MAX);
  333 + TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
334 334 }
335 335  
336 336 /*
net/ipv6/syncookies.c
... ... @@ -214,7 +214,7 @@
214 214 ireq6->iif = inet6_iif(skb);
215 215  
216 216 req->expires = 0UL;
217   - req->retrans = 0;
  217 + req->num_retrans = 0;
218 218 ireq->ecn_ok = ecn_ok;
219 219 ireq->snd_wscale = tcp_opt.snd_wscale;
220 220 ireq->sack_ok = tcp_opt.sack_ok;
net/ipv6/tcp_ipv6.c
... ... @@ -495,9 +495,12 @@
495 495 struct request_values *rvp)
496 496 {
497 497 struct flowi6 fl6;
  498 + int res;
498 499  
499   - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
500   - return tcp_v6_send_synack(sk, NULL, &fl6, req, rvp, 0);
  500 + res = tcp_v6_send_synack(sk, NULL, &fl6, req, rvp, 0);
  501 + if (!res)
  502 + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
  503 + return res;
501 504 }
502 505  
503 506 static void tcp_v6_reqsk_destructor(struct request_sock *req)
... ... @@ -1364,7 +1367,7 @@
1364 1367  
1365 1368 tcp_initialize_rcv_mss(newsk);
1366 1369 tcp_synack_rtt_meas(newsk, req);
1367   - newtp->total_retrans = req->retrans;
  1370 + newtp->total_retrans = req->num_retrans;
1368 1371  
1369 1372 newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
1370 1373 newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
... ... @@ -1866,7 +1869,7 @@
1866 1869 0,0, /* could print option size, but that is af dependent. */
1867 1870 1, /* timers active (only the expire timer) */
1868 1871 jiffies_to_clock_t(ttd),
1869   - req->retrans,
  1872 + req->num_timeout,
1870 1873 from_kuid_munged(seq_user_ns(seq), uid),
1871 1874 0, /* non standard timer */
1872 1875 0, /* open_requests have no inode */