Commit 5d424d5a674f782d0659a3b66d951f412901faee
Committed by David S. Miller
1 parent 1d60290f27
[TCP]: MTU probing
Implementation of packetization layer path MTU discovery for TCP, based on the internet-draft currently found at <http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 9 changed files with 326 additions and 37 deletions
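Before the per-file hunks, it helps to see the state machine the new icsk_mtup fields drive: search_low and search_high bound the range of MTUs still in play, a probe is built at roughly double the current MSS, a fully acknowledged probe raises search_low to the probed size, and a lost probe pulls search_high below it. What follows is a minimal userspace sketch of that loop, assuming IPv4 with timestamps (52 bytes of total header) and an invented path_mtu; none of it is kernel API.

#include <stdio.h>

int main(void)
{
	int search_low = 564;      /* tcp_mss_to_mtu() of the 512-byte base MSS */
	int search_high = 9000;    /* a jumbo-frame interface MTU */
	const int path_mtu = 4000; /* the bottleneck the probes discover */
	int mss, probe;

	for (;;) {
		mss = search_low - 52;           /* rough tcp_mtu_to_mss() */
		probe = 2 * mss + 52;            /* double the MSS, as tcp_mtu_probe() does */
		if (probe > search_high)
			break;                   /* search has converged */
		if (probe <= path_mtu)
			search_low = probe;      /* tcp_mtup_probe_success() */
		else
			search_high = probe - 1; /* tcp_mtup_probe_failed() */
		printf("window now [%d, %d]\n", search_low, search_high);
	}
	printf("stopped with window [%d, %d]\n", search_low, search_high);
	return 0;
}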
include/linux/sysctl.h
include/net/inet_connection_sock.h
... | ... | @@ -72,6 +72,7 @@ |
72 | 72 | * @icsk_probes_out: unanswered 0 window probes |
73 | 73 | * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) |
74 | 74 | * @icsk_ack: Delayed ACK control data |
75 | + * @icsk_mtup: MTU probing control data | |
75 | 76 | */ |
76 | 77 | struct inet_connection_sock { |
77 | 78 | /* inet_sock has to be the first member! */ |
... | ... | @@ -104,6 +105,18 @@ |
104 | 105 | __u16 last_seg_size; /* Size of last incoming segment */ |
105 | 106 | __u16 rcv_mss; /* MSS used for delayed ACK decisions */ |
106 | 107 | } icsk_ack; |
108 | + struct { | |
109 | + int enabled; | |
110 | + | |
111 | + /* Range of MTUs to search */ | |
112 | + int search_high; | |
113 | + int search_low; | |
114 | + | |
115 | + /* Information on the current probe. */ | |
116 | + int probe_size; | |
117 | + __u32 probe_seq_start; | |
118 | + __u32 probe_seq_end; | |
119 | + } icsk_mtup; | |
107 | 120 | u32 icsk_ca_priv[16]; |
108 | 121 | #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32)) |
109 | 122 | }; |
include/net/tcp.h
... | ... | @@ -60,6 +60,9 @@ |
60 | 60 | /* Minimal RCV_MSS. */ |
61 | 61 | #define TCP_MIN_RCVMSS 536U |
62 | 62 | |
63 | +/* The least MTU to use for probing */ | |
64 | +#define TCP_BASE_MSS 512 | |
65 | + | |
63 | 66 | /* After receiving this amount of duplicate ACKs fast retransmit starts. */ |
64 | 67 | #define TCP_FASTRETRANS_THRESH 3 |
65 | 68 | |
... | ... | @@ -219,6 +222,8 @@ |
219 | 222 | extern int sysctl_tcp_moderate_rcvbuf; |
220 | 223 | extern int sysctl_tcp_tso_win_divisor; |
221 | 224 | extern int sysctl_tcp_abc; |
225 | +extern int sysctl_tcp_mtu_probing; | |
226 | +extern int sysctl_tcp_base_mss; | |
222 | 227 | |
223 | 228 | extern atomic_t tcp_memory_allocated; |
224 | 229 | extern atomic_t tcp_sockets_allocated; |
... | ... | @@ -446,6 +451,10 @@ |
446 | 451 | sk_read_actor_t recv_actor); |
447 | 452 | |
448 | 453 | extern void tcp_initialize_rcv_mss(struct sock *sk); |
454 | + | |
455 | +extern int tcp_mtu_to_mss(struct sock *sk, int pmtu); | |
456 | +extern int tcp_mss_to_mtu(struct sock *sk, int mss); | |
457 | +extern void tcp_mtup_init(struct sock *sk); | |
449 | 458 | |
450 | 459 | static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) |
451 | 460 | { |
net/ipv4/sysctl_net_ipv4.c
... | ... | @@ -664,6 +664,22 @@ |
664 | 664 | .mode = 0644, |
665 | 665 | .proc_handler = &proc_dointvec, |
666 | 666 | }, |
667 | + { | |
668 | + .ctl_name = NET_TCP_MTU_PROBING, | |
669 | + .procname = "tcp_mtu_probing", | |
670 | + .data = &sysctl_tcp_mtu_probing, | |
671 | + .maxlen = sizeof(int), | |
672 | + .mode = 0644, | |
673 | + .proc_handler = &proc_dointvec, | |
674 | + }, | |
675 | + { | |
676 | + .ctl_name = NET_TCP_BASE_MSS, | |
677 | + .procname = "tcp_base_mss", | |
678 | + .data = &sysctl_tcp_base_mss, | |
679 | + .maxlen = sizeof(int), | |
680 | + .mode = 0644, | |
681 | + .proc_handler = &proc_dointvec, | |
682 | + }, | |
667 | 683 | |
668 | 684 | { .ctl_name = 0 } |
669 | 685 | }; |
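The two knobs registered here surface as /proc/sys/net/ipv4/tcp_mtu_probing and /proc/sys/net/ipv4/tcp_base_mss. Going by tcp_mtup_init() and the tcp_write_timeout() hunk below, 0 leaves probing off, 1 turns it on only after a suspected ICMP black hole, and 2 enables it from connection setup. A minimal sketch of flipping them from userspace; error handling is kept to the bare minimum.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_mtu_probing", "w");
	if (!f) {
		perror("tcp_mtu_probing");
		return 1;
	}
	fputs("2\n", f);   /* enable probing unconditionally */
	fclose(f);

	f = fopen("/proc/sys/net/ipv4/tcp_base_mss", "w");
	if (f) {
		fputs("512\n", f);  /* floor MSS the search falls back to */
		fclose(f);
	}
	return 0;
}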
net/ipv4/tcp_input.c
... | ... | @@ -1891,6 +1891,34 @@ |
1891 | 1891 | } |
1892 | 1892 | } |
1893 | 1893 | |
1894 | +static void tcp_mtup_probe_failed(struct sock *sk) | |
1895 | +{ | |
1896 | + struct inet_connection_sock *icsk = inet_csk(sk); | |
1897 | + | |
1898 | + icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; | |
1899 | + icsk->icsk_mtup.probe_size = 0; | |
1900 | +} | |
1901 | + | |
1902 | +static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) | |
1903 | +{ | |
1904 | + struct tcp_sock *tp = tcp_sk(sk); | |
1905 | + struct inet_connection_sock *icsk = inet_csk(sk); | |
1906 | + | |
1907 | + /* FIXME: breaks with very large cwnd */ | |
1908 | + tp->prior_ssthresh = tcp_current_ssthresh(sk); | |
1909 | + tp->snd_cwnd = tp->snd_cwnd * | |
1910 | + tcp_mss_to_mtu(sk, tp->mss_cache) / | |
1911 | + icsk->icsk_mtup.probe_size; | |
1912 | + tp->snd_cwnd_cnt = 0; | |
1913 | + tp->snd_cwnd_stamp = tcp_time_stamp; | |
1914 | + tp->rcv_ssthresh = tcp_current_ssthresh(sk); | |
1915 | + | |
1916 | + icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; | |
1917 | + icsk->icsk_mtup.probe_size = 0; | |
1918 | + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | |
1919 | +} | |
1920 | + | |
1921 | + | |
1894 | 1922 | /* Process an event, which can update packets-in-flight not trivially. |
1895 | 1923 | * Main goal of this function is to calculate new estimate for left_out, |
1896 | 1924 | * taking into account both packets sitting in receiver's buffer and |
... | ... | @@ -2023,6 +2051,17 @@ |
2023 | 2051 | return; |
2024 | 2052 | } |
2025 | 2053 | |
2054 | + /* MTU probe failure: don't reduce cwnd */ | |
2055 | + if (icsk->icsk_ca_state < TCP_CA_CWR && | |
2056 | + icsk->icsk_mtup.probe_size && | |
2057 | + tp->snd_una == icsk->icsk_mtup.probe_seq_start) { | |
2058 | + tcp_mtup_probe_failed(sk); | |
2059 | + /* Restores the reduction we did in tcp_mtu_probe() */ | |
2060 | + tp->snd_cwnd++; | |
2061 | + tcp_simple_retransmit(sk); | |
2062 | + return; | |
2063 | + } | |
2064 | + | |
2026 | 2065 | /* Otherwise enter Recovery state */ |
2027 | 2066 | |
2028 | 2067 | if (IsReno(tp)) |
... | ... | @@ -2243,6 +2282,13 @@ |
2243 | 2282 | tp->retrans_stamp = 0; |
2244 | 2283 | } |
2245 | 2284 | |
2285 | + /* MTU probing checks */ | |
2286 | + if (icsk->icsk_mtup.probe_size) { | |
2287 | + if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { | |
2288 | + tcp_mtup_probe_success(sk, skb); | |
2289 | + } | |
2290 | + } | |
2291 | + | |
2246 | 2292 | if (sacked) { |
2247 | 2293 | if (sacked & TCPCB_RETRANS) { |
2248 | 2294 | if(sacked & TCPCB_SACKED_RETRANS) |
... | ... | @@ -4101,6 +4147,7 @@ |
4101 | 4147 | if (tp->rx_opt.sack_ok && sysctl_tcp_fack) |
4102 | 4148 | tp->rx_opt.sack_ok |= 2; |
4103 | 4149 | |
4150 | + tcp_mtup_init(sk); | |
4104 | 4151 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
4105 | 4152 | tcp_initialize_rcv_mss(sk); |
4106 | 4153 | |
... | ... | @@ -4211,6 +4258,7 @@ |
4211 | 4258 | if (tp->ecn_flags&TCP_ECN_OK) |
4212 | 4259 | sock_set_flag(sk, SOCK_NO_LARGESEND); |
4213 | 4260 | |
4261 | + tcp_mtup_init(sk); | |
4214 | 4262 | tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); |
4215 | 4263 | tcp_initialize_rcv_mss(sk); |
4216 | 4264 | |
... | ... | @@ -4399,6 +4447,7 @@ |
4399 | 4447 | */ |
4400 | 4448 | tp->lsndtime = tcp_time_stamp; |
4401 | 4449 | |
4450 | + tcp_mtup_init(sk); | |
4402 | 4451 | tcp_initialize_rcv_mss(sk); |
4403 | 4452 | tcp_init_buffer_space(sk); |
4404 | 4453 | tcp_fast_path_on(tp); |
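On a verified probe, tcp_mtup_probe_success() rescales snd_cwnd from old-MSS units to new-MSS units so the window stays roughly constant in bytes rather than doubling overnight. A worked example with made-up numbers; the multiply-before-divide ordering is presumably what the FIXME about very large cwnd refers to, since the product can overflow first.

#include <stdio.h>

int main(void)
{
	int snd_cwnd = 20;
	int old_mtu = 1500;    /* tcp_mss_to_mtu(sk, tp->mss_cache) */
	int probe_size = 3000; /* icsk_mtup.probe_size, the MTU just verified */

	/* Same arithmetic as the hunk above: 20 * 1500 / 3000 = 10 segments,
	 * i.e. the same 30000-byte window at twice the segment size. */
	snd_cwnd = snd_cwnd * old_mtu / probe_size;
	printf("cwnd after successful probe: %d segments\n", snd_cwnd);
	return 0;
}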
net/ipv4/tcp_ipv4.c
... | ... | @@ -900,6 +900,7 @@ |
900 | 900 | inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; |
901 | 901 | newinet->id = newtp->write_seq ^ jiffies; |
902 | 902 | |
903 | + tcp_mtup_init(newsk); | |
903 | 904 | tcp_sync_mss(newsk, dst_mtu(dst)); |
904 | 905 | newtp->advmss = dst_metric(dst, RTAX_ADVMSS); |
905 | 906 | tcp_initialize_rcv_mss(newsk); |
net/ipv4/tcp_output.c
... | ... | @@ -51,6 +51,12 @@ |
51 | 51 | */ |
52 | 52 | int sysctl_tcp_tso_win_divisor = 3; |
53 | 53 | |
54 | +int sysctl_tcp_mtu_probing = 0; | |
55 | +int sysctl_tcp_base_mss = 512; | |
56 | + | |
57 | +EXPORT_SYMBOL(sysctl_tcp_mtu_probing); | |
58 | +EXPORT_SYMBOL(sysctl_tcp_base_mss); | |
59 | + | |
54 | 60 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, |
55 | 61 | struct sk_buff *skb) |
56 | 62 | { |
... | ... | @@ -681,6 +687,62 @@ |
681 | 687 | return 0; |
682 | 688 | } |
683 | 689 | |
690 | +/* Not accounting for SACKs here. */ | |
691 | +int tcp_mtu_to_mss(struct sock *sk, int pmtu) | |
692 | +{ | |
693 | + struct tcp_sock *tp = tcp_sk(sk); | |
694 | + struct inet_connection_sock *icsk = inet_csk(sk); | |
695 | + int mss_now; | |
696 | + | |
697 | + /* Calculate base mss without TCP options: | |
698 | + It is MMS_S - sizeof(tcphdr) of rfc1122 | |
699 | + */ | |
700 | + mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr); | |
701 | + | |
702 | + /* Clamp it (mss_clamp does not include tcp options) */ | |
703 | + if (mss_now > tp->rx_opt.mss_clamp) | |
704 | + mss_now = tp->rx_opt.mss_clamp; | |
705 | + | |
706 | + /* Now subtract optional transport overhead */ | |
707 | + mss_now -= icsk->icsk_ext_hdr_len; | |
708 | + | |
709 | + /* Then reserve room for full set of TCP options and 8 bytes of data */ | |
710 | + if (mss_now < 48) | |
711 | + mss_now = 48; | |
712 | + | |
713 | + /* Now subtract TCP options size, not including SACKs */ | |
714 | + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); | |
715 | + | |
716 | + return mss_now; | |
717 | +} | |
718 | + | |
719 | +/* Inverse of above */ | |
720 | +int tcp_mss_to_mtu(struct sock *sk, int mss) | |
721 | +{ | |
722 | + struct tcp_sock *tp = tcp_sk(sk); | |
723 | + struct inet_connection_sock *icsk = inet_csk(sk); | |
724 | + int mtu; | |
725 | + | |
726 | + mtu = mss + | |
727 | + tp->tcp_header_len + | |
728 | + icsk->icsk_ext_hdr_len + | |
729 | + icsk->icsk_af_ops->net_header_len; | |
730 | + | |
731 | + return mtu; | |
732 | +} | |
733 | + | |
734 | +void tcp_mtup_init(struct sock *sk) | |
735 | +{ | |
736 | + struct tcp_sock *tp = tcp_sk(sk); | |
737 | + struct inet_connection_sock *icsk = inet_csk(sk); | |
738 | + | |
739 | + icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1; | |
740 | + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) + | |
741 | + icsk->icsk_af_ops->net_header_len; | |
742 | + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss); | |
743 | + icsk->icsk_mtup.probe_size = 0; | |
744 | +} | |
745 | + | |
684 | 746 | /* This function synchronizes snd mss to current pmtu/exthdr set. |
685 | 747 | |
686 | 748 | tp->rx_opt.user_mss is mss set by user via TCP_MAXSEG. It does NOT count |
687 | 749 | for TCP options, but includes only bare TCP header. |
688 | 750 | |
689 | 751 | |
690 | 752 | |
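Over the region where neither mss_clamp nor the 48-byte floor bites, the two helpers above are exact inverses. A small standalone check with typical IPv4-plus-timestamps numbers; mss_clamp and extension headers are dropped for brevity, so this only mirrors the kernel arithmetic rather than copying it.

#include <stdio.h>

#define NET_HDR 20 /* icsk_af_ops->net_header_len for IPv4 */
#define TCP_HDR 20 /* sizeof(struct tcphdr) */
#define TS_OPTS 12 /* tcp_header_len - sizeof(struct tcphdr) with timestamps */

static int mtu_to_mss(int pmtu)
{
	int mss = pmtu - NET_HDR - TCP_HDR; /* MMS_S of RFC 1122 */
	if (mss < 48)
		mss = 48;  /* room for a full option set plus 8 data bytes */
	return mss - TS_OPTS;
}

static int mss_to_mtu(int mss)
{
	return mss + TCP_HDR + TS_OPTS + NET_HDR;
}

int main(void)
{
	printf("mtu 1500 -> mss %d\n", mtu_to_mss(1500)); /* 1448 */
	printf("mss 1448 -> mtu %d\n", mss_to_mtu(1448)); /* 1500 */
	return 0;
}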
... | ... | @@ -708,32 +770,21 @@ |
708 | 770 | { |
709 | 771 | struct tcp_sock *tp = tcp_sk(sk); |
710 | 772 | struct inet_connection_sock *icsk = inet_csk(sk); |
711 | - /* Calculate base mss without TCP options: | |
712 | - It is MMS_S - sizeof(tcphdr) of rfc1122 | |
713 | - */ | |
714 | - int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - | |
715 | - sizeof(struct tcphdr)); | |
773 | + int mss_now; | |
716 | 774 | |
717 | - /* Clamp it (mss_clamp does not include tcp options) */ | |
718 | - if (mss_now > tp->rx_opt.mss_clamp) | |
719 | - mss_now = tp->rx_opt.mss_clamp; | |
775 | + if (icsk->icsk_mtup.search_high > pmtu) | |
776 | + icsk->icsk_mtup.search_high = pmtu; | |
720 | 777 | |
721 | - /* Now subtract optional transport overhead */ | |
722 | - mss_now -= icsk->icsk_ext_hdr_len; | |
778 | + mss_now = tcp_mtu_to_mss(sk, pmtu); | |
723 | 779 | |
724 | - /* Then reserve room for full set of TCP options and 8 bytes of data */ | |
725 | - if (mss_now < 48) | |
726 | - mss_now = 48; | |
727 | - | |
728 | - /* Now subtract TCP options size, not including SACKs */ | |
729 | - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); | |
730 | - | |
731 | 780 | /* Bound mss with half of window */ |
732 | 781 | if (tp->max_window && mss_now > (tp->max_window>>1)) |
733 | 782 | mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); |
734 | 783 | |
735 | 784 | /* And store cached results */ |
736 | 785 | icsk->icsk_pmtu_cookie = pmtu; |
786 | + if (icsk->icsk_mtup.enabled) | |
787 | + mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)); | |
737 | 788 | tp->mss_cache = mss_now; |
738 | 789 | |
739 | 790 | return mss_now; |
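The behavioral change in this hunk is the final min(): while probing is enabled, the cached MSS never trusts more than the already-verified search_low, whatever the route's pmtu claims. A tiny standalone illustration using the same simplified header arithmetic as the previous sketch.

#include <stdio.h>

static int mtu_to_mss(int pmtu)
{
	return pmtu - 20 - 20 - 12; /* IPv4 + TCP header + timestamps */
}

int main(void)
{
	int pmtu = 1500, search_low = 564, mtup_enabled = 1;
	int mss_now = mtu_to_mss(pmtu);

	if (mtup_enabled && mtu_to_mss(search_low) < mss_now)
		mss_now = mtu_to_mss(search_low); /* the new min() in tcp_sync_mss() */
	printf("effective mss: %d\n", mss_now);   /* 512, not 1448 */
	return 0;
}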
... | ... | @@ -1063,6 +1114,140 @@ |
1063 | 1114 | return 1; |
1064 | 1115 | } |
1065 | 1116 | |
1117 | +/* Create a new MTU probe if we are ready. | |
1118 | + * Returns 0 if we should wait to probe (no cwnd available), | |
1119 | + * 1 if a probe was sent, | |
1120 | + * -1 otherwise */ | |
1121 | +static int tcp_mtu_probe(struct sock *sk) | |
1122 | +{ | |
1123 | + struct tcp_sock *tp = tcp_sk(sk); | |
1124 | + struct inet_connection_sock *icsk = inet_csk(sk); | |
1125 | + struct sk_buff *skb, *nskb, *next; | |
1126 | + int len; | |
1127 | + int probe_size; | |
1128 | + unsigned int pif; | |
1129 | + int copy; | |
1130 | + int mss_now; | |
1131 | + | |
1132 | + /* Not currently probing/verifying, | |
1133 | + * not in recovery, | |
1134 | + * have enough cwnd, and | |
1135 | + * not SACKing (the variable headers throw things off) */ | |
1136 | + if (!icsk->icsk_mtup.enabled || | |
1137 | + icsk->icsk_mtup.probe_size || | |
1138 | + inet_csk(sk)->icsk_ca_state != TCP_CA_Open || | |
1139 | + tp->snd_cwnd < 11 || | |
1140 | + tp->rx_opt.eff_sacks) | |
1141 | + return -1; | |
1142 | + | |
1143 | + /* Very simple search strategy: just double the MSS. */ | |
1144 | + mss_now = tcp_current_mss(sk, 0); | |
1145 | + probe_size = 2*tp->mss_cache; | |
1146 | + if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { | |
1147 | + /* TODO: set timer for probe_converge_event */ | |
1148 | + return -1; | |
1149 | + } | |
1150 | + | |
1151 | + /* Have enough data in the send queue to probe? */ | |
1152 | + len = 0; | |
1153 | + if ((skb = sk->sk_send_head) == NULL) | |
1154 | + return -1; | |
1155 | + while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb)) | |
1156 | + skb = skb->next; | |
1157 | + if (len < probe_size) | |
1158 | + return -1; | |
1159 | + | |
1160 | + /* Receive window check. */ | |
1161 | + if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) { | |
1162 | + if (tp->snd_wnd < probe_size) | |
1163 | + return -1; | |
1164 | + else | |
1165 | + return 0; | |
1166 | + } | |
1167 | + | |
1168 | + /* Do we need to wait to drain cwnd? */ | |
1169 | + pif = tcp_packets_in_flight(tp); | |
1170 | + if (pif + 2 > tp->snd_cwnd) { | |
1171 | + /* With no packets in flight, don't stall. */ | |
1172 | + if (pif == 0) | |
1173 | + return -1; | |
1174 | + else | |
1175 | + return 0; | |
1176 | + } | |
1177 | + | |
1178 | + /* We're allowed to probe. Build it now. */ | |
1179 | + if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) | |
1180 | + return -1; | |
1181 | + sk_charge_skb(sk, nskb); | |
1182 | + | |
1183 | + skb = sk->sk_send_head; | |
1184 | + __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue); | |
1185 | + sk->sk_send_head = nskb; | |
1186 | + | |
1187 | + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; | |
1188 | + TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; | |
1189 | + TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK; | |
1190 | + TCP_SKB_CB(nskb)->sacked = 0; | |
1191 | + nskb->csum = 0; | |
1192 | + if (skb->ip_summed == CHECKSUM_HW) | |
1193 | + nskb->ip_summed = CHECKSUM_HW; | |
1194 | + | |
1195 | + len = 0; | |
1196 | + while (len < probe_size) { | |
1197 | + next = skb->next; | |
1198 | + | |
1199 | + copy = min_t(int, skb->len, probe_size - len); | |
1200 | + if (nskb->ip_summed) | |
1201 | + skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); | |
1202 | + else | |
1203 | + nskb->csum = skb_copy_and_csum_bits(skb, 0, | |
1204 | + skb_put(nskb, copy), copy, nskb->csum); | |
1205 | + | |
1206 | + if (skb->len <= copy) { | |
1207 | + /* We've eaten all the data from this skb. | |
1208 | + * Throw it away. */ | |
1209 | + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; | |
1210 | + __skb_unlink(skb, &sk->sk_write_queue); | |
1211 | + sk_stream_free_skb(sk, skb); | |
1212 | + } else { | |
1213 | + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & | |
1214 | + ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); | |
1215 | + if (!skb_shinfo(skb)->nr_frags) { | |
1216 | + skb_pull(skb, copy); | |
1217 | + if (skb->ip_summed != CHECKSUM_HW) | |
1218 | + skb->csum = csum_partial(skb->data, skb->len, 0); | |
1219 | + } else { | |
1220 | + __pskb_trim_head(skb, copy); | |
1221 | + tcp_set_skb_tso_segs(sk, skb, mss_now); | |
1222 | + } | |
1223 | + TCP_SKB_CB(skb)->seq += copy; | |
1224 | + } | |
1225 | + | |
1226 | + len += copy; | |
1227 | + skb = next; | |
1228 | + } | |
1229 | + tcp_init_tso_segs(sk, nskb, nskb->len); | |
1230 | + | |
1231 | + /* We're ready to send. If this fails, the probe will | |
1232 | + * be resegmented into mss-sized pieces by tcp_write_xmit(). */ | |
1233 | + TCP_SKB_CB(nskb)->when = tcp_time_stamp; | |
1234 | + if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { | |
1235 | + /* Decrement cwnd here because we are sending | |
1236 | + * effectively two packets. */ | |
1237 | + tp->snd_cwnd--; | |
1238 | + update_send_head(sk, tp, nskb); | |
1239 | + | |
1240 | + icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); | |
1241 | + icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq; | |
1242 | + icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq; | |
1243 | + | |
1244 | + return 1; | |
1245 | + } | |
1246 | + | |
1247 | + return -1; | |
1248 | +} | |
1249 | + | |
1250 | + | |
1066 | 1251 | /* This routine writes packets to the network. It advances the |
1067 | 1252 | * send_head. This happens as incoming acks open up the remote |
1068 | 1253 | * window for us. |
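tcp_write_xmit() (two hunks down) reads tcp_mtu_probe()'s three-way result as: 0 — hold off and let cwnd drain, 1 — a probe went out and counts as a sent packet, -1 — carry on with normal transmission. Below is a compact sketch of just the admission checks, with the send-queue and receive-window tests omitted and all socket state faked as plain ints.

#include <stdio.h>

#define TCP_CA_Open 0

static int mtu_probe_check(int enabled, int probe_size, int ca_state,
			   int snd_cwnd, int eff_sacks, int packets_in_flight)
{
	/* Not already probing, connection in the open state, cwnd large
	 * enough, and no SACK blocks inflating the header. */
	if (!enabled || probe_size || ca_state != TCP_CA_Open ||
	    snd_cwnd < 11 || eff_sacks)
		return -1;
	/* The probe effectively occupies two cwnd slots. */
	if (packets_in_flight + 2 > snd_cwnd)
		return packets_in_flight ? 0 : -1; /* never stall an idle pipe */
	return 1; /* build and transmit the probe */
}

int main(void)
{
	printf("%d\n", mtu_probe_check(1, 0, TCP_CA_Open, 12, 0, 3));  /* 1: probe */
	printf("%d\n", mtu_probe_check(1, 0, TCP_CA_Open, 12, 0, 11)); /* 0: wait */
	printf("%d\n", mtu_probe_check(1, 0, TCP_CA_Open, 8, 0, 0));   /* -1: skip */
	return 0;
}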
... | ... | @@ -1076,6 +1261,7 @@ |
1076 | 1261 | struct sk_buff *skb; |
1077 | 1262 | unsigned int tso_segs, sent_pkts; |
1078 | 1263 | int cwnd_quota; |
1264 | + int result; | |
1079 | 1265 | |
1080 | 1266 | /* If we are closed, the bytes will have to remain here. |
1081 | 1267 | * In time closedown will finish, we empty the write queue and all |
... | ... | @@ -1085,6 +1271,14 @@ |
1085 | 1271 | return 0; |
1086 | 1272 | |
1087 | 1273 | sent_pkts = 0; |
1274 | + | |
1275 | + /* Do MTU probing. */ | |
1276 | + if ((result = tcp_mtu_probe(sk)) == 0) { | |
1277 | + return 0; | |
1278 | + } else if (result > 0) { | |
1279 | + sent_pkts = 1; | |
1280 | + } | |
1281 | + | |
1088 | 1282 | while ((skb = sk->sk_send_head)) { |
1089 | 1283 | unsigned int limit; |
1090 | 1284 | |
1091 | 1285 | |
... | ... | @@ -1455,9 +1649,15 @@ |
1455 | 1649 | int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) |
1456 | 1650 | { |
1457 | 1651 | struct tcp_sock *tp = tcp_sk(sk); |
1652 | + struct inet_connection_sock *icsk = inet_csk(sk); | |
1458 | 1653 | unsigned int cur_mss = tcp_current_mss(sk, 0); |
1459 | 1654 | int err; |
1460 | 1655 | |
1656 | + /* Inconclusive MTU probe */ | |
1657 | + if (icsk->icsk_mtup.probe_size) { | |
1658 | + icsk->icsk_mtup.probe_size = 0; | |
1659 | + } | |
1660 | + | |
1461 | 1661 | /* Do not send more than we queued. 1/4 is reserved for possible |
1462 | 1662 | * copying overhead: fragmentation, tunneling, mangling etc. |
1463 | 1663 | */ |
... | ... | @@ -1883,6 +2083,7 @@ |
1883 | 2083 | if (tp->rx_opt.user_mss) |
1884 | 2084 | tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; |
1885 | 2085 | tp->max_window = 0; |
2086 | + tcp_mtup_init(sk); | |
1886 | 2087 | tcp_sync_mss(sk, dst_mtu(dst)); |
1887 | 2088 | |
1888 | 2089 | if (!tp->window_clamp) |
... | ... | @@ -2180,4 +2381,5 @@ |
2180 | 2381 | EXPORT_SYMBOL(tcp_simple_retransmit); |
2181 | 2382 | EXPORT_SYMBOL(tcp_sync_mss); |
2182 | 2383 | EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); |
2384 | +EXPORT_SYMBOL(tcp_mtup_init); |
net/ipv4/tcp_timer.c
... | ... | @@ -119,8 +119,10 @@ |
119 | 119 | /* A write timeout has occurred. Process the after effects. */ |
120 | 120 | static int tcp_write_timeout(struct sock *sk) |
121 | 121 | { |
122 | - const struct inet_connection_sock *icsk = inet_csk(sk); | |
122 | + struct inet_connection_sock *icsk = inet_csk(sk); | |
123 | + struct tcp_sock *tp = tcp_sk(sk); | |
123 | 124 | int retry_until; |
125 | + int mss; | |
124 | 126 | |
125 | 127 | if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { |
126 | 128 | if (icsk->icsk_retransmits) |
... | ... | @@ -128,25 +130,19 @@ |
128 | 130 | retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; |
129 | 131 | } else { |
130 | 132 | if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { |
131 | - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black | |
132 | - hole detection. :-( | |
133 | - | |
134 | - It is place to make it. It is not made. I do not want | |
135 | - to make it. It is disgusting. It does not work in any | |
136 | - case. Let me to cite the same draft, which requires for | |
137 | - us to implement this: | |
138 | - | |
139 | - "The one security concern raised by this memo is that ICMP black holes | |
140 | - are often caused by over-zealous security administrators who block | |
141 | - all ICMP messages. It is vitally important that those who design and | |
142 | - deploy security systems understand the impact of strict filtering on | |
143 | - upper-layer protocols. The safest web site in the world is worthless | |
144 | - if most TCP implementations cannot transfer data from it. It would | |
145 | - be far nicer to have all of the black holes fixed rather than fixing | |
146 | - all of the TCP implementations." | |
147 | - | |
148 | - Golden words :-). | |
149 | - */ | |
133 | + /* Black hole detection */ | |
134 | + if (sysctl_tcp_mtu_probing) { | |
135 | + if (!icsk->icsk_mtup.enabled) { | |
136 | + icsk->icsk_mtup.enabled = 1; | |
137 | + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | |
138 | + } else { | |
139 | + mss = min(sysctl_tcp_base_mss, | |
140 | + tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2); | |
141 | + mss = max(mss, 68 - tp->tcp_header_len); | |
142 | + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss); | |
143 | + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | |
144 | + } | |
145 | + } | |
150 | 146 | |
151 | 147 | dst_negative_advice(&sk->sk_dst_cache); |
152 | 148 | } |
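Once probing is already on, each further write timeout halves the working MSS, clamped between 68 - tcp_header_len and tcp_base_mss, then rebuilds search_low from it, so segments shrink until they fit through the filtering path. A standalone trace of that fallback, again with the simplified IPv4-with-timestamps arithmetic.

#include <stdio.h>

static int mtu_to_mss(int mtu) { return mtu - 20 - 32; } /* IPv4 + TCP w/ timestamps */
static int mss_to_mtu(int mss) { return mss + 32 + 20; }

int main(void)
{
	const int tcp_base_mss = 512, tcp_header_len = 32;
	int search_low = 1500; /* start from the full route MTU */
	int i, mss;

	for (i = 1; i <= 5; i++) {
		mss = mtu_to_mss(search_low) / 2;
		if (mss > tcp_base_mss)
			mss = tcp_base_mss;        /* the min() in the hunk */
		if (mss < 68 - tcp_header_len)
			mss = 68 - tcp_header_len; /* the max(): never below 36 */
		search_low = mss_to_mtu(mss);
		printf("timeout %d: mss %d (search_low %d)\n", i, mss, search_low);
	}
	return 0;
}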
net/ipv6/tcp_ipv6.c