Commit 563d34d05786263893ba4a1042eb9b9374127cf5

Authored by Eric Dumazet
Committed by David S. Miller
1 parent c3def943c7

tcp: dont drop MTU reduction indications

ICMP messages generated in the output path when the frame length is bigger
than the MTU are currently lost, because the socket is owned by the user
doing the xmit.

One example is ipgre_tunnel_xmit() calling
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));

A similar case was fixed in commit a34a101e1e6 ("ipv6: disable GSO on
sockets hitting dst_allfrag").

The problem with that fix is that it relied on retransmit timers, so short
TCP sessions paid too high a latency price.

This patch uses the tcp_release_cb() infrastructure so that MTU reduction
indications (ICMP messages) are not lost and no extra delay is added to
TCP transmits.

Reported-by: Maciej Żenczykowski <maze@google.com>
Diagnosed-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Nandita Dukkipati <nanditad@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Tore Anderson <tore@fud.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
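
In outline, the fix reuses the deferral pattern of the existing tsq_flags
bits: the ICMP error handler runs in softirq context with the socket
bh-locked, stashes the reduced MTU, and either applies it immediately or
flags it as deferred work for tcp_release_cb(). A condensed sketch of that
pattern, using the names introduced by the diff below (the wrapper function
here is illustrative, not part of the patch):

/* Sketch only: condensed from the tcp_v4_err()/tcp_v6_err() changes below.
 * The indirect call stands in for tcp_v{4|6}_mtu_reduced().
 */
static void note_mtu_reduction(struct sock *sk, u32 mtu)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->mtu_info = mtu;		/* remember the reduced path MTU */
	if (!sock_owned_by_user(sk))
		sk->sk_prot->mtu_reduced(sk);	/* safe to act right away */
	else	/* user holds the lock: defer to tcp_release_cb() */
		set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
}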

5 changed files with 51 additions and 21 deletions

include/linux/tcp.h
@@ -493,6 +493,9 @@
 		u32		  probe_seq_start;
 		u32		  probe_seq_end;
 	} mtu_probe;
+	u32	mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
+			   * while socket was owned by user.
+			   */
 
 #ifdef CONFIG_TCP_MD5SIG
 /* TCP AF-Specific parts; only used by MD5 Signature support so far */
@@ -518,6 +521,9 @@
 	TCP_TSQ_DEFERRED,	   /* tcp_tasklet_func() found socket was owned */
 	TCP_WRITE_TIMER_DEFERRED,  /* tcp_write_timer() found socket was owned */
 	TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */
+	TCP_MTU_REDUCED_DEFERRED,  /* tcp_v{4|6}_err() could not call
+				    * tcp_v{4|6}_mtu_reduced()
+				    */
 };
 
 static inline struct tcp_sock *tcp_sk(const struct sock *sk)
include/net/sock.h
@@ -859,6 +859,7 @@
						struct sk_buff *skb);
 
	void			(*release_cb)(struct sock *sk);
+	void			(*mtu_reduced)(struct sock *sk);
 
	/* Keeping track of sk's, looking them up, and port selection methods. */
	void			(*hash)(struct sock *sk);
net/ipv4/tcp_ipv4.c
@@ -275,12 +275,15 @@
 EXPORT_SYMBOL(tcp_v4_connect);
 
 /*
- * This routine does path mtu discovery as defined in RFC1191.
+ * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
+ * It can be called through tcp_release_cb() if socket was owned by user
+ * at the time tcp_v4_err() was called to handle ICMP message.
  */
-static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
+static void tcp_v4_mtu_reduced(struct sock *sk)
 {
	struct dst_entry *dst;
	struct inet_sock *inet = inet_sk(sk);
+	u32 mtu = tcp_sk(sk)->mtu_info;
 
	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
	 * send out by Linux are always <576bytes so they should go through
@@ -373,8 +376,12 @@
	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
+	 * We do take care of PMTU discovery (RFC1191) special case :
+	 * we can receive locally generated ICMP messages while socket is held.
	 */
-	if (sock_owned_by_user(sk))
+	if (sock_owned_by_user(sk) &&
+	    type != ICMP_DEST_UNREACH &&
+	    code != ICMP_FRAG_NEEDED)
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 
	if (sk->sk_state == TCP_CLOSE)
@@ -409,8 +416,11 @@
		goto out;
 
	if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+		tp->mtu_info = info;
		if (!sock_owned_by_user(sk))
-			do_pmtu_discovery(sk, iph, info);
+			tcp_v4_mtu_reduced(sk);
+		else
+			set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
		goto out;
	}
 
@@ -2596,6 +2606,7 @@
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
+	.mtu_reduced		= tcp_v4_mtu_reduced,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
net/ipv4/tcp_output.c
@@ -885,7 +885,8 @@
 
 #define TCP_DEFERRED_ALL ((1UL << TCP_TSQ_DEFERRED) |		\
			  (1UL << TCP_WRITE_TIMER_DEFERRED) |	\
-			  (1UL << TCP_DELACK_TIMER_DEFERRED))
+			  (1UL << TCP_DELACK_TIMER_DEFERRED) |	\
+			  (1UL << TCP_MTU_REDUCED_DEFERRED))
 /**
  * tcp_release_cb - tcp release_sock() callback
  * @sk: socket
@@ -914,6 +915,9 @@
 
	if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED))
		tcp_delack_timer_handler(sk);
+
+	if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED))
+		sk->sk_prot->mtu_reduced(sk);
 }
 EXPORT_SYMBOL(tcp_release_cb);
 
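
The hunk above shows only the tail of tcp_release_cb(). For context, its
head (unchanged by this patch and therefore absent from the diff) claims
all deferred bits atomically before any handler runs, roughly as follows
in this kernel series:

/* Claim-and-clear every deferred bit in one atomic step, so each
 * deferred event is handled exactly once even if softirq handlers
 * keep setting new bits concurrently.
 */
unsigned long flags, nflags;

do {
	flags = tp->tsq_flags;
	if (!(flags & TCP_DEFERRED_ALL))
		return;
	nflags = flags & ~TCP_DEFERRED_ALL;
} while (cmpxchg(&tp->tsq_flags, flags, nflags) != flags);

The new TCP_MTU_REDUCED_DEFERRED bit simply rides on this existing
mechanism; the callback runs while the releasing task still holds the
socket lock, so the handler can safely read tp->mtu_info.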
net/ipv6/tcp_ipv6.c
@@ -315,6 +315,23 @@
	return err;
 }
 
+static void tcp_v6_mtu_reduced(struct sock *sk)
+{
+	struct dst_entry *dst;
+
+	if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+		return;
+
+	dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
+	if (!dst)
+		return;
+
+	if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+		tcp_sync_mss(sk, dst_mtu(dst));
+		tcp_simple_retransmit(sk);
+	}
+}
+
 static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
		u8 type, u8 code, int offset, __be32 info)
 {
@@ -342,7 +359,7 @@
	}
 
	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk))
+	if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 
	if (sk->sk_state == TCP_CLOSE)
@@ -371,21 +388,11 @@
	}
 
	if (type == ICMPV6_PKT_TOOBIG) {
-		struct dst_entry *dst;
-
-		if (sock_owned_by_user(sk))
-			goto out;
-		if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
-			goto out;
-
-		dst = inet6_csk_update_pmtu(sk, ntohl(info));
-		if (!dst)
-			goto out;
-
-		if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
-			tcp_sync_mss(sk, dst_mtu(dst));
-			tcp_simple_retransmit(sk);
-		}
+		tp->mtu_info = ntohl(info);
+		if (!sock_owned_by_user(sk))
+			tcp_v6_mtu_reduced(sk);
+		else
+			set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags);
		goto out;
	}
 
@@ -1949,6 +1956,7 @@
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v6_do_rcv,
	.release_cb		= tcp_release_cb,
+	.mtu_reduced		= tcp_v6_mtu_reduced,
	.hash			= tcp_v6_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
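
Finally, the reason the deferral adds no latency to short sessions (unlike
the retransmit-timer approach of commit a34a101e1e6): release_sock()
invokes the callback the moment the user path drops the socket lock, right
after the backlog is drained. Abridged from net/core/sock.c of the same
era:

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);		/* drain deferred packets */

	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);	/* -> tcp_release_cb() */

	sk->sk_lock.owned = 0;
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}

So an MTU indication that arrives mid-transmit is applied as soon as the
transmit path finishes with the socket, instead of waiting for a
retransmit timer to fire.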