Commit 563d34d05786263893ba4a1042eb9b9374127cf5
Committed by
David S. Miller
1 parent
c3def943c7
Exists in
smarc-l5.0.0_1.0.0-ga
and in
5 other branches
tcp: dont drop MTU reduction indications
ICMP messages generated in the output path, when the frame length is bigger than the MTU, are actually lost because the socket is owned by the user (doing the xmit). One example is ipgre_tunnel_xmit() calling icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));. We had a similar case fixed in commit a34a101e1e6 (ipv6: disable GSO on sockets hitting dst_allfrag). The problem with such a fix is that it relied on retransmit timers, so short TCP sessions paid too big a latency-increase price. This patch uses the tcp_release_cb() infrastructure so that MTU reduction messages (ICMP messages) are not lost, and no extra delay is added in TCP transmits. Reported-by: Maciej Żenczykowski <maze@google.com> Diagnosed-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Nandita Dukkipati <nanditad@google.com> Cc: Tom Herbert <therbert@google.com> Cc: Tore Anderson <tore@fud.no> Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 5 changed files with 51 additions and 21 deletions Side-by-side Diff
include/linux/tcp.h
... | ... | @@ -493,6 +493,9 @@ |
493 | 493 | u32 probe_seq_start; |
494 | 494 | u32 probe_seq_end; |
495 | 495 | } mtu_probe; |
496 | + u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG | |
497 | + * while socket was owned by user. | |
498 | + */ | |
496 | 499 | |
497 | 500 | #ifdef CONFIG_TCP_MD5SIG |
498 | 501 | /* TCP AF-Specific parts; only used by MD5 Signature support so far */ |
... | ... | @@ -518,6 +521,9 @@ |
518 | 521 | TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ |
519 | 522 | TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ |
520 | 523 | TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ |
524 | + TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call | |
525 | + * tcp_v{4|6}_mtu_reduced() | |
526 | + */ | |
521 | 527 | }; |
522 | 528 | |
523 | 529 | static inline struct tcp_sock *tcp_sk(const struct sock *sk) |
include/net/sock.h
net/ipv4/tcp_ipv4.c
... | ... | @@ -275,12 +275,15 @@ |
275 | 275 | EXPORT_SYMBOL(tcp_v4_connect); |
276 | 276 | |
277 | 277 | /* |
278 | - * This routine does path mtu discovery as defined in RFC1191. | |
278 | + * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. | |
279 | + * It can be called through tcp_release_cb() if socket was owned by user | |
280 | + * at the time tcp_v4_err() was called to handle ICMP message. | |
279 | 281 | */ |
280 | -static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu) | |
282 | +static void tcp_v4_mtu_reduced(struct sock *sk) | |
281 | 283 | { |
282 | 284 | struct dst_entry *dst; |
283 | 285 | struct inet_sock *inet = inet_sk(sk); |
286 | + u32 mtu = tcp_sk(sk)->mtu_info; | |
284 | 287 | |
285 | 288 | /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs |
286 | 289 | * send out by Linux are always <576bytes so they should go through |
287 | 290 | |
... | ... | @@ -373,8 +376,12 @@ |
373 | 376 | bh_lock_sock(sk); |
374 | 377 | /* If too many ICMPs get dropped on busy |
375 | 378 | * servers this needs to be solved differently. |
379 | + * We do take care of PMTU discovery (RFC1191) special case : | |
380 | + * we can receive locally generated ICMP messages while socket is held. | |
376 | 381 | */ |
377 | - if (sock_owned_by_user(sk)) | |
382 | + if (sock_owned_by_user(sk) && | |
383 | + type != ICMP_DEST_UNREACH && | |
384 | + code != ICMP_FRAG_NEEDED) | |
378 | 385 | NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); |
379 | 386 | |
380 | 387 | if (sk->sk_state == TCP_CLOSE) |
381 | 388 | |
... | ... | @@ -409,8 +416,11 @@ |
409 | 416 | goto out; |
410 | 417 | |
411 | 418 | if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ |
419 | + tp->mtu_info = info; | |
412 | 420 | if (!sock_owned_by_user(sk)) |
413 | - do_pmtu_discovery(sk, iph, info); | |
421 | + tcp_v4_mtu_reduced(sk); | |
422 | + else | |
423 | + set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); | |
414 | 424 | goto out; |
415 | 425 | } |
416 | 426 | |
... | ... | @@ -2596,6 +2606,7 @@ |
2596 | 2606 | .sendpage = tcp_sendpage, |
2597 | 2607 | .backlog_rcv = tcp_v4_do_rcv, |
2598 | 2608 | .release_cb = tcp_release_cb, |
2609 | + .mtu_reduced = tcp_v4_mtu_reduced, | |
2599 | 2610 | .hash = inet_hash, |
2600 | 2611 | .unhash = inet_unhash, |
2601 | 2612 | .get_port = inet_csk_get_port, |
net/ipv4/tcp_output.c
... | ... | @@ -885,7 +885,8 @@ |
885 | 885 | |
886 | 886 | #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) | \ |
887 | 887 | (1UL << TCP_WRITE_TIMER_DEFERRED) | \ |
888 | - (1UL << TCP_DELACK_TIMER_DEFERRED)) | |
888 | + (1UL << TCP_DELACK_TIMER_DEFERRED) | \ | |
889 | + (1UL << TCP_MTU_REDUCED_DEFERRED)) | |
889 | 890 | /** |
890 | 891 | * tcp_release_cb - tcp release_sock() callback |
891 | 892 | * @sk: socket |
... | ... | @@ -914,6 +915,9 @@ |
914 | 915 | |
915 | 916 | if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) |
916 | 917 | tcp_delack_timer_handler(sk); |
918 | + | |
919 | + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) | |
920 | + sk->sk_prot->mtu_reduced(sk); | |
917 | 921 | } |
918 | 922 | EXPORT_SYMBOL(tcp_release_cb); |
919 | 923 |
net/ipv6/tcp_ipv6.c
... | ... | @@ -315,6 +315,23 @@ |
315 | 315 | return err; |
316 | 316 | } |
317 | 317 | |
318 | +static void tcp_v6_mtu_reduced(struct sock *sk) | |
319 | +{ | |
320 | + struct dst_entry *dst; | |
321 | + | |
322 | + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) | |
323 | + return; | |
324 | + | |
325 | + dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info); | |
326 | + if (!dst) | |
327 | + return; | |
328 | + | |
329 | + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { | |
330 | + tcp_sync_mss(sk, dst_mtu(dst)); | |
331 | + tcp_simple_retransmit(sk); | |
332 | + } | |
333 | +} | |
334 | + | |
318 | 335 | static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, |
319 | 336 | u8 type, u8 code, int offset, __be32 info) |
320 | 337 | { |
... | ... | @@ -342,7 +359,7 @@ |
342 | 359 | } |
343 | 360 | |
344 | 361 | bh_lock_sock(sk); |
345 | - if (sock_owned_by_user(sk)) | |
362 | + if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) | |
346 | 363 | NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); |
347 | 364 | |
348 | 365 | if (sk->sk_state == TCP_CLOSE) |
... | ... | @@ -371,21 +388,11 @@ |
371 | 388 | } |
372 | 389 | |
373 | 390 | if (type == ICMPV6_PKT_TOOBIG) { |
374 | - struct dst_entry *dst; | |
375 | - | |
376 | - if (sock_owned_by_user(sk)) | |
377 | - goto out; | |
378 | - if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) | |
379 | - goto out; | |
380 | - | |
381 | - dst = inet6_csk_update_pmtu(sk, ntohl(info)); | |
382 | - if (!dst) | |
383 | - goto out; | |
384 | - | |
385 | - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { | |
386 | - tcp_sync_mss(sk, dst_mtu(dst)); | |
387 | - tcp_simple_retransmit(sk); | |
388 | - } | |
391 | + tp->mtu_info = ntohl(info); | |
392 | + if (!sock_owned_by_user(sk)) | |
393 | + tcp_v6_mtu_reduced(sk); | |
394 | + else | |
395 | + set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); | |
389 | 396 | goto out; |
390 | 397 | } |
391 | 398 | |
... | ... | @@ -1949,6 +1956,7 @@ |
1949 | 1956 | .sendpage = tcp_sendpage, |
1950 | 1957 | .backlog_rcv = tcp_v6_do_rcv, |
1951 | 1958 | .release_cb = tcp_release_cb, |
1959 | + .mtu_reduced = tcp_v6_mtu_reduced, | |
1952 | 1960 | .hash = tcp_v6_hash, |
1953 | 1961 | .unhash = inet_unhash, |
1954 | 1962 | .get_port = inet_csk_get_port, |