Commit 5d424d5a674f782d0659a3b66d951f412901faee

Authored by John Heffner
Committed by David S. Miller
1 parent 1d60290f27

[TCP]: MTU probing

Implementation of packetization layer path mtu discovery for TCP, based on
the internet-draft currently found at
<http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>.

Signed-off-by: John Heffner <jheffner@psc.edu>
Signed-off-by: David S. Miller <davem@davemloft.net>
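
The search strategy this implements is deliberately simple: each connection keeps
an MTU interval [search_low, search_high], probes at twice the current MSS,
raises search_low when a probe is acknowledged and lowers search_high when one
is lost. A toy model of that loop (plain C, not kernel code; the 4000-byte true
path MTU, the 9000-byte local MTU and the header sizes are assumptions for
illustration):

    /* Toy model of the MTU search implemented below (not kernel code). */
    #include <stdio.h>

    #define HDRS (20 + 20 + 12)  /* IPv4 hdr + TCP hdr + timestamp option */

    int main(void)
    {
        int true_pmtu   = 4000;        /* what the path really passes (assumed) */
        int search_low  = 512 + HDRS;  /* tcp_mss_to_mtu() of tcp_base_mss      */
        int search_high = 9000;        /* local interface/route MTU (assumed)   */
        int mss         = search_low - HDRS;

        /* Same stop condition as tcp_mtu_probe(): give up doubling once
         * the probe would exceed what search_high allows. */
        while (2 * mss <= search_high - HDRS) {
            int probe_mtu = 2 * mss + HDRS;
            if (probe_mtu <= true_pmtu) {
                search_low = probe_mtu;          /* tcp_mtup_probe_success() */
                mss = probe_mtu - HDRS;
            } else {
                search_high = probe_mtu - 1;     /* tcp_mtup_probe_failed()  */
            }
            printf("probe mtu %4d -> search range [%d, %d]\n",
                   probe_mtu, search_low, search_high);
        }
        return 0;
    }

Note that this version of the search only ever doubles; it never bisects the
remaining interval (the TODO about probe_converge_event in tcp_mtu_probe()
marks where a finer strategy would go).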

Showing 9 changed files with 326 additions and 37 deletions

include/linux/sysctl.h
... ... @@ -397,6 +397,8 @@
397 397 NET_TCP_CONG_CONTROL=110,
398 398 NET_TCP_ABC=111,
399 399 NET_IPV4_IPFRAG_MAX_DIST=112,
  400 + NET_TCP_MTU_PROBING=113,
  401 + NET_TCP_BASE_MSS=114,
400 402 };
401 403  
402 404 enum {
include/net/inet_connection_sock.h
... ... @@ -72,6 +72,7 @@
72 72 * @icsk_probes_out: unanswered 0 window probes
73 73 * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options)
74 74 * @icsk_ack: Delayed ACK control data
  75 + * @icsk_mtup: MTU probing control data
75 76 */
76 77 struct inet_connection_sock {
77 78 /* inet_sock has to be the first member! */
... ... @@ -104,6 +105,18 @@
104 105 __u16 last_seg_size; /* Size of last incoming segment */
105 106 __u16 rcv_mss; /* MSS used for delayed ACK decisions */
106 107 } icsk_ack;
  108 + struct {
  109 + int enabled;
  110 +
  111 + /* Range of MTUs to search */
  112 + int search_high;
  113 + int search_low;
  114 +
  115 + /* Information on the current probe. */
  116 + int probe_size;
  117 + __u32 probe_seq_start;
  118 + __u32 probe_seq_end;
  119 + } icsk_mtup;
107 120 u32 icsk_ca_priv[16];
108 121 #define ICSK_CA_PRIV_SIZE (16 * sizeof(u32))
109 122 };
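
A note on units before the first users of this struct: search_low, search_high
and probe_size all hold MTU values (network header included), not MSS values;
tcp_mtup_init() converts the tcp_base_mss sysctl with tcp_mss_to_mtu() before
storing it. A nonzero probe_size doubles as the "probe in flight" flag, which
is why both the success and failure handlers below clear it to zero.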
include/net/tcp.h
... ... @@ -60,6 +60,9 @@
60 60 /* Minimal RCV_MSS. */
61 61 #define TCP_MIN_RCVMSS 536U
62 62  
  63 +/* The least MSS to use for probing */
  64 +#define TCP_BASE_MSS 512
  65 +
63 66 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
64 67 #define TCP_FASTRETRANS_THRESH 3
65 68  
... ... @@ -219,6 +222,8 @@
219 222 extern int sysctl_tcp_moderate_rcvbuf;
220 223 extern int sysctl_tcp_tso_win_divisor;
221 224 extern int sysctl_tcp_abc;
  225 +extern int sysctl_tcp_mtu_probing;
  226 +extern int sysctl_tcp_base_mss;
222 227  
223 228 extern atomic_t tcp_memory_allocated;
224 229 extern atomic_t tcp_sockets_allocated;
... ... @@ -446,6 +451,10 @@
446 451 sk_read_actor_t recv_actor);
447 452  
448 453 extern void tcp_initialize_rcv_mss(struct sock *sk);
  454 +
  455 +extern int tcp_mtu_to_mss(struct sock *sk, int pmtu);
  456 +extern int tcp_mss_to_mtu(struct sock *sk, int mss);
  457 +extern void tcp_mtup_init(struct sock *sk);
449 458  
450 459 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
451 460 {
net/ipv4/sysctl_net_ipv4.c
... ... @@ -664,6 +664,22 @@
664 664 .mode = 0644,
665 665 .proc_handler = &proc_dointvec,
666 666 },
  667 + {
  668 + .ctl_name = NET_TCP_MTU_PROBING,
  669 + .procname = "tcp_mtu_probing",
  670 + .data = &sysctl_tcp_mtu_probing,
  671 + .maxlen = sizeof(int),
  672 + .mode = 0644,
  673 + .proc_handler = &proc_dointvec,
  674 + },
  675 + {
  676 + .ctl_name = NET_TCP_BASE_MSS,
  677 + .procname = "tcp_base_mss",
  678 + .data = &sysctl_tcp_base_mss,
  679 + .maxlen = sizeof(int),
  680 + .mode = 0644,
  681 + .proc_handler = &proc_dointvec,
  682 + },
667 683  
668 684 { .ctl_name = 0 }
669 685 };
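
The two knobs interact: tcp_mtu_probing takes 0 (disabled), 1 (probing stays
off until tcp_write_timeout() suspects a blackhole, see tcp_timer.c below) or 2
(enabled from connection setup, since tcp_mtup_init() tests for a value greater
than 1). tcp_base_mss is the MSS from which the lower end of the search range
is derived, so e.g. "echo 2 > /proc/sys/net/ipv4/tcp_mtu_probing" turns probing
on unconditionally while leaving the floor at its default.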
net/ipv4/tcp_input.c
... ... @@ -1891,6 +1891,34 @@
1891 1891 }
1892 1892 }
1893 1893  
  1894 +static void tcp_mtup_probe_failed(struct sock *sk)
  1895 +{
  1896 + struct inet_connection_sock *icsk = inet_csk(sk);
  1897 +
  1898 + icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
  1899 + icsk->icsk_mtup.probe_size = 0;
  1900 +}
  1901 +
  1902 +static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
  1903 +{
  1904 + struct tcp_sock *tp = tcp_sk(sk);
  1905 + struct inet_connection_sock *icsk = inet_csk(sk);
  1906 +
  1907 + /* FIXME: breaks with very large cwnd */
  1908 + tp->prior_ssthresh = tcp_current_ssthresh(sk);
  1909 + tp->snd_cwnd = tp->snd_cwnd *
  1910 + tcp_mss_to_mtu(sk, tp->mss_cache) /
  1911 + icsk->icsk_mtup.probe_size;
  1912 + tp->snd_cwnd_cnt = 0;
  1913 + tp->snd_cwnd_stamp = tcp_time_stamp;
  1914 + tp->rcv_ssthresh = tcp_current_ssthresh(sk);
  1915 +
  1916 + icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
  1917 + icsk->icsk_mtup.probe_size = 0;
  1918 + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
  1919 +}
  1920 +
  1921 +
1894 1922 /* Process an event, which can update packets-in-flight not trivially.
1895 1923 * Main goal of this function is to calculate new estimate for left_out,
1896 1924 * taking into account both packets sitting in receiver's buffer and
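
The cwnd rescaling in tcp_mtup_probe_success() keeps the number of bytes in
flight roughly constant when the unit of accounting (the MSS) grows. A
standalone sketch of the arithmetic, with assumed values (an old mss_cache of
1024, i.e. MTU 1076 with IPv4 plus timestamps, and a successful 2152-byte
probe):

    /* Sketch of the snd_cwnd rescaling above (not kernel code);
     * the concrete numbers are assumptions for illustration. */
    #include <stdio.h>

    int main(void)
    {
        int snd_cwnd   = 20;    /* packets, counted in old-MSS units */
        int old_mtu    = 1076;  /* tcp_mss_to_mtu(sk, tp->mss_cache) */
        int probe_size = 2152;  /* the MTU the probe just validated  */

        /* Same bytes in flight, counted in the new, larger segments. */
        snd_cwnd = snd_cwnd * old_mtu / probe_size;  /* 20*1076/2152 = 10 */
        printf("rescaled snd_cwnd = %d\n", snd_cwnd);
        return 0;
    }

The FIXME presumably refers to the intermediate product snd_cwnd * old_mtu,
which can overflow for very large windows.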
... ... @@ -2023,6 +2051,17 @@
2023 2051 return;
2024 2052 }
2025 2053  
  2054 + /* MTU probe failure: don't reduce cwnd */
  2055 + if (icsk->icsk_ca_state < TCP_CA_CWR &&
  2056 + icsk->icsk_mtup.probe_size &&
  2057 + tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
  2058 + tcp_mtup_probe_failed(sk);
  2059 + /* Restores the reduction we did in tcp_mtu_probe() */
  2060 + tp->snd_cwnd++;
  2061 + tcp_simple_retransmit(sk);
  2062 + return;
  2063 + }
  2064 +
2026 2065 /* Otherwise enter Recovery state */
2027 2066  
2028 2067 if (IsReno(tp))
... ... @@ -2243,6 +2282,13 @@
2243 2282 tp->retrans_stamp = 0;
2244 2283 }
2245 2284  
  2285 + /* MTU probing checks */
  2286 + if (icsk->icsk_mtup.probe_size) {
  2287 + if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
  2288 + tcp_mtup_probe_success(sk, skb);
  2289 + }
  2290 + }
  2291 +
2246 2292 if (sacked) {
2247 2293 if (sacked & TCPCB_RETRANS) {
2248 2294 if(sacked & TCPCB_SACKED_RETRANS)
... ... @@ -4101,6 +4147,7 @@
4101 4147 if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
4102 4148 tp->rx_opt.sack_ok |= 2;
4103 4149  
  4150 + tcp_mtup_init(sk);
4104 4151 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
4105 4152 tcp_initialize_rcv_mss(sk);
4106 4153  
... ... @@ -4211,6 +4258,7 @@
4211 4258 if (tp->ecn_flags&TCP_ECN_OK)
4212 4259 sock_set_flag(sk, SOCK_NO_LARGESEND);
4213 4260  
  4261 + tcp_mtup_init(sk);
4214 4262 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
4215 4263 tcp_initialize_rcv_mss(sk);
4216 4264  
... ... @@ -4399,6 +4447,7 @@
4399 4447 */
4400 4448 tp->lsndtime = tcp_time_stamp;
4401 4449  
  4450 + tcp_mtup_init(sk);
4402 4451 tcp_initialize_rcv_mss(sk);
4403 4452 tcp_init_buffer_space(sk);
4404 4453 tcp_fast_path_on(tp);
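
tcp_mtup_init() is inserted ahead of each tcp_sync_mss() call site (active
open, passive open and the handshake paths here) because tcp_sync_mss() now
reads icsk_mtup.search_high and, when probing is enabled, clamps mss_cache
against search_low; running it against uninitialized probe state would produce
garbage.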
net/ipv4/tcp_ipv4.c
... ... @@ -900,6 +900,7 @@
900 900 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
901 901 newinet->id = newtp->write_seq ^ jiffies;
902 902  
  903 + tcp_mtup_init(newsk);
903 904 tcp_sync_mss(newsk, dst_mtu(dst));
904 905 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
905 906 tcp_initialize_rcv_mss(newsk);
net/ipv4/tcp_output.c
... ... @@ -51,6 +51,12 @@
51 51 */
52 52 int sysctl_tcp_tso_win_divisor = 3;
53 53  
  54 +int sysctl_tcp_mtu_probing = 0;
  55 +int sysctl_tcp_base_mss = 512;
  56 +
  57 +EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
  58 +EXPORT_SYMBOL(sysctl_tcp_base_mss);
  59 +
54 60 static void update_send_head(struct sock *sk, struct tcp_sock *tp,
55 61 struct sk_buff *skb)
56 62 {
... ... @@ -681,6 +687,62 @@
681 687 return 0;
682 688 }
683 689  
  690 +/* Not accounting for SACKs here. */
  691 +int tcp_mtu_to_mss(struct sock *sk, int pmtu)
  692 +{
  693 + struct tcp_sock *tp = tcp_sk(sk);
  694 + struct inet_connection_sock *icsk = inet_csk(sk);
  695 + int mss_now;
  696 +
  697 + /* Calculate base mss without TCP options:
  698 + It is MMS_S - sizeof(tcphdr) of rfc1122
  699 + */
  700 + mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
  701 +
  702 + /* Clamp it (mss_clamp does not include tcp options) */
  703 + if (mss_now > tp->rx_opt.mss_clamp)
  704 + mss_now = tp->rx_opt.mss_clamp;
  705 +
  706 + /* Now subtract optional transport overhead */
  707 + mss_now -= icsk->icsk_ext_hdr_len;
  708 +
  709 + /* Then reserve room for full set of TCP options and 8 bytes of data */
  710 + if (mss_now < 48)
  711 + mss_now = 48;
  712 +
  713 + /* Now subtract TCP options size, not including SACKs */
  714 + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
  715 +
  716 + return mss_now;
  717 +}
  718 +
  719 +/* Inverse of above */
  720 +int tcp_mss_to_mtu(struct sock *sk, int mss)
  721 +{
  722 + struct tcp_sock *tp = tcp_sk(sk);
  723 + struct inet_connection_sock *icsk = inet_csk(sk);
  724 + int mtu;
  725 +
  726 + mtu = mss +
  727 + tp->tcp_header_len +
  728 + icsk->icsk_ext_hdr_len +
  729 + icsk->icsk_af_ops->net_header_len;
  730 +
  731 + return mtu;
  732 +}
  733 +
  734 +void tcp_mtup_init(struct sock *sk)
  735 +{
  736 + struct tcp_sock *tp = tcp_sk(sk);
  737 + struct inet_connection_sock *icsk = inet_csk(sk);
  738 +
  739 + icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
  740 + icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
  741 + icsk->icsk_af_ops->net_header_len;
  742 + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
  743 + icsk->icsk_mtup.probe_size = 0;
  744 +}
  745 +
684 746 /* This function synchronizes snd mss to current pmtu/exthdr set.
685 747 
686 748 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count
687 749 for TCP options, but includes only bare TCP header.
688 750  
689 751  
690 752  
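
tcp_mtu_to_mss() and tcp_mss_to_mtu() are exact inverses only while none of the
clamps fire. A standalone sketch of the round trip for plain IPv4 with
timestamps (the header sizes are assumptions for illustration):

    /* Round trip of the MSS<->MTU arithmetic (not kernel code), assuming
     * a 20-byte IPv4 header, no extension headers, and a 32-byte TCP
     * header (20 bytes base + 12 bytes timestamp option). */
    #include <assert.h>
    #include <stdio.h>

    #define IP_HDR   20
    #define TCP_HDR  20
    #define TCP_OPTS 12

    static int mtu_to_mss(int pmtu) { return pmtu - IP_HDR - TCP_HDR - TCP_OPTS; }
    static int mss_to_mtu(int mss)  { return mss + IP_HDR + TCP_HDR + TCP_OPTS; }

    int main(void)
    {
        int pmtu = 1500;
        int mss  = mtu_to_mss(pmtu);        /* 1500 - 52 = 1448 */

        printf("pmtu %d -> mss %d\n", pmtu, mss);
        assert(mss_to_mtu(mss) == pmtu);    /* inverse while no clamp fires */
        return 0;
    }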
... ... @@ -708,32 +770,21 @@
708 770 {
709 771 struct tcp_sock *tp = tcp_sk(sk);
710 772 struct inet_connection_sock *icsk = inet_csk(sk);
711   - /* Calculate base mss without TCP options:
712   - It is MMS_S - sizeof(tcphdr) of rfc1122
713   - */
714   - int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
715   - sizeof(struct tcphdr));
  773 + int mss_now;
716 774  
717   - /* Clamp it (mss_clamp does not include tcp options) */
718   - if (mss_now > tp->rx_opt.mss_clamp)
719   - mss_now = tp->rx_opt.mss_clamp;
  775 + if (icsk->icsk_mtup.search_high > pmtu)
  776 + icsk->icsk_mtup.search_high = pmtu;
720 777  
721   - /* Now subtract optional transport overhead */
722   - mss_now -= icsk->icsk_ext_hdr_len;
  778 + mss_now = tcp_mtu_to_mss(sk, pmtu);
723 779  
724   - /* Then reserve room for full set of TCP options and 8 bytes of data */
725   - if (mss_now < 48)
726   - mss_now = 48;
727   -
728   - /* Now subtract TCP options size, not including SACKs */
729   - mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
730   -
731 780 /* Bound mss with half of window */
732 781 if (tp->max_window && mss_now > (tp->max_window>>1))
733 782 mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
734 783  
735 784 /* And store cached results */
736 785 icsk->icsk_pmtu_cookie = pmtu;
  786 + if (icsk->icsk_mtup.enabled)
  787 + mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
737 788 tp->mss_cache = mss_now;
738 789  
739 790 return mss_now;
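
The net effect of the tcp_sync_mss() changes: while probing is enabled,
mss_cache never rises above the verified floor (search_low). Routing feedback
can still lower search_high and the pmtu cookie, but a larger segment size is
only adopted once a probe of that size has actually been acknowledged.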
... ... @@ -1063,6 +1114,140 @@
1063 1114 return 1;
1064 1115 }
1065 1116  
  1117 +/* Create a new MTU probe if we are ready.
  1118 + * Returns 0 if we should wait to probe (no cwnd available),
  1119 + * 1 if a probe was sent,
  1120 + * -1 otherwise */
  1121 +static int tcp_mtu_probe(struct sock *sk)
  1122 +{
  1123 + struct tcp_sock *tp = tcp_sk(sk);
  1124 + struct inet_connection_sock *icsk = inet_csk(sk);
  1125 + struct sk_buff *skb, *nskb, *next;
  1126 + int len;
  1127 + int probe_size;
  1128 + unsigned int pif;
  1129 + int copy;
  1130 + int mss_now;
  1131 +
  1132 + /* Not currently probing/verifying,
  1133 + * not in recovery,
  1134 + * have enough cwnd, and
  1135 + * not SACKing (the variable headers throw things off) */
  1136 + if (!icsk->icsk_mtup.enabled ||
  1137 + icsk->icsk_mtup.probe_size ||
  1138 + inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
  1139 + tp->snd_cwnd < 11 ||
  1140 + tp->rx_opt.eff_sacks)
  1141 + return -1;
  1142 +
  1143 + /* Very simple search strategy: just double the MSS. */
  1144 + mss_now = tcp_current_mss(sk, 0);
  1145 + probe_size = 2*tp->mss_cache;
  1146 + if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
  1147 + /* TODO: set timer for probe_converge_event */
  1148 + return -1;
  1149 + }
  1150 +
  1151 + /* Have enough data in the send queue to probe? */
  1152 + len = 0;
  1153 + if ((skb = sk->sk_send_head) == NULL)
  1154 + return -1;
  1155 + while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
  1156 + skb = skb->next;
  1157 + if (len < probe_size)
  1158 + return -1;
  1159 +
  1160 + /* Receive window check. */
  1161 + if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
  1162 + if (tp->snd_wnd < probe_size)
  1163 + return -1;
  1164 + else
  1165 + return 0;
  1166 + }
  1167 +
  1168 + /* Do we need to wait to drain cwnd? */
  1169 + pif = tcp_packets_in_flight(tp);
  1170 + if (pif + 2 > tp->snd_cwnd) {
  1171 + /* With no packets in flight, don't stall. */
  1172 + if (pif == 0)
  1173 + return -1;
  1174 + else
  1175 + return 0;
  1176 + }
  1177 +
  1178 + /* We're allowed to probe. Build it now. */
  1179 + if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
  1180 + return -1;
  1181 + sk_charge_skb(sk, nskb);
  1182 +
  1183 + skb = sk->sk_send_head;
  1184 + __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
  1185 + sk->sk_send_head = nskb;
  1186 +
  1187 + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
  1188 + TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
  1189 + TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
  1190 + TCP_SKB_CB(nskb)->sacked = 0;
  1191 + nskb->csum = 0;
  1192 + if (skb->ip_summed == CHECKSUM_HW)
  1193 + nskb->ip_summed = CHECKSUM_HW;
  1194 +
  1195 + len = 0;
  1196 + while (len < probe_size) {
  1197 + next = skb->next;
  1198 +
  1199 + copy = min_t(int, skb->len, probe_size - len);
  1200 + if (nskb->ip_summed)
  1201 + skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
  1202 + else
  1203 + nskb->csum = skb_copy_and_csum_bits(skb, 0,
  1204 + skb_put(nskb, copy), copy, nskb->csum);
  1205 +
  1206 + if (skb->len <= copy) {
  1207 + /* We've eaten all the data from this skb.
  1208 + * Throw it away. */
  1209 + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
  1210 + __skb_unlink(skb, &sk->sk_write_queue);
  1211 + sk_stream_free_skb(sk, skb);
  1212 + } else {
  1213 + TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
  1214 + ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
  1215 + if (!skb_shinfo(skb)->nr_frags) {
  1216 + skb_pull(skb, copy);
  1217 + if (skb->ip_summed != CHECKSUM_HW)
  1218 + skb->csum = csum_partial(skb->data, skb->len, 0);
  1219 + } else {
  1220 + __pskb_trim_head(skb, copy);
  1221 + tcp_set_skb_tso_segs(sk, skb, mss_now);
  1222 + }
  1223 + TCP_SKB_CB(skb)->seq += copy;
  1224 + }
  1225 +
  1226 + len += copy;
  1227 + skb = next;
  1228 + }
  1229 + tcp_init_tso_segs(sk, nskb, nskb->len);
  1230 +
  1231 + /* We're ready to send. If this fails, the probe will
  1232 + * be resegmented into mss-sized pieces by tcp_write_xmit(). */
  1233 + TCP_SKB_CB(nskb)->when = tcp_time_stamp;
  1234 + if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
  1235 + /* Decrement cwnd here because we are sending
  1236 + * effectively two packets. */
  1237 + tp->snd_cwnd--;
  1238 + update_send_head(sk, tp, nskb);
  1239 +
  1240 + icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
  1241 + icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
  1242 + icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
  1243 +
  1244 + return 1;
  1245 + }
  1246 +
  1247 + return -1;
  1248 +}
  1249 +
  1250 +
1066 1251 /* This routine writes packets to the network. It advances the
1067 1252 * send_head. This happens as incoming acks open up the remote
1068 1253 * window for us.
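
Two details of the gating in tcp_mtu_probe() are easy to miss. First, the probe
carries two MSS worth of data in a single skb, so a successful transmit costs
two packets of cwnd; that is why snd_cwnd is decremented here and restored by
the snd_cwnd++ in the failure path in tcp_input.c, and why the snd_cwnd >= 11
and pif + 2 checks demand headroom before probing. Second, returning 0 (rather
than -1) makes tcp_write_xmit() below send nothing at all this pass, so
ordinary segments do not consume the window the pending probe is waiting for.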
... ... @@ -1076,6 +1261,7 @@
1076 1261 struct sk_buff *skb;
1077 1262 unsigned int tso_segs, sent_pkts;
1078 1263 int cwnd_quota;
  1264 + int result;
1079 1265  
1080 1266 /* If we are closed, the bytes will have to remain here.
1081 1267 * In time closedown will finish, we empty the write queue and all
... ... @@ -1085,6 +1271,14 @@
1085 1271 return 0;
1086 1272  
1087 1273 sent_pkts = 0;
  1274 +
  1275 + /* Do MTU probing. */
  1276 + if ((result = tcp_mtu_probe(sk)) == 0) {
  1277 + return 0;
  1278 + } else if (result > 0) {
  1279 + sent_pkts = 1;
  1280 + }
  1281 +
1088 1282 while ((skb = sk->sk_send_head)) {
1089 1283 unsigned int limit;
1090 1284  
1091 1285  
... ... @@ -1455,9 +1649,15 @@
1455 1649 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1456 1650 {
1457 1651 struct tcp_sock *tp = tcp_sk(sk);
  1652 + struct inet_connection_sock *icsk = inet_csk(sk);
1458 1653 unsigned int cur_mss = tcp_current_mss(sk, 0);
1459 1654 int err;
1460 1655  
  1656 + /* Inconclusive MTU probe */
  1657 + if (icsk->icsk_mtup.probe_size) {
  1658 + icsk->icsk_mtup.probe_size = 0;
  1659 + }
  1660 +
1461 1661 /* Do not send more than we queued. 1/4 is reserved for possible
1462 1662 * copying overhead: fragmentation, tunneling, mangling etc.
1463 1663 */
... ... @@ -1883,6 +2083,7 @@
1883 2083 if (tp->rx_opt.user_mss)
1884 2084 tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
1885 2085 tp->max_window = 0;
  2086 + tcp_mtup_init(sk);
1886 2087 tcp_sync_mss(sk, dst_mtu(dst));
1887 2088  
1888 2089 if (!tp->window_clamp)
... ... @@ -2180,4 +2381,5 @@
2180 2381 EXPORT_SYMBOL(tcp_simple_retransmit);
2181 2382 EXPORT_SYMBOL(tcp_sync_mss);
2182 2383 EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
  2384 +EXPORT_SYMBOL(tcp_mtup_init);
net/ipv4/tcp_timer.c
... ... @@ -119,8 +119,10 @@
119 119 /* A write timeout has occurred. Process the after effects. */
120 120 static int tcp_write_timeout(struct sock *sk)
121 121 {
122   - const struct inet_connection_sock *icsk = inet_csk(sk);
  122 + struct inet_connection_sock *icsk = inet_csk(sk);
  123 + struct tcp_sock *tp = tcp_sk(sk);
123 124 int retry_until;
  125 + int mss;
124 126  
125 127 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
126 128 if (icsk->icsk_retransmits)
... ... @@ -128,25 +130,19 @@
128 130 retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
129 131 } else {
130 132 if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
131   - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
132   - hole detection. :-(
133   -
134   - It is place to make it. It is not made. I do not want
135   - to make it. It is disgusting. It does not work in any
136   - case. Let me to cite the same draft, which requires for
137   - us to implement this:
138   -
139   - "The one security concern raised by this memo is that ICMP black holes
140   - are often caused by over-zealous security administrators who block
141   - all ICMP messages. It is vitally important that those who design and
142   - deploy security systems understand the impact of strict filtering on
143   - upper-layer protocols. The safest web site in the world is worthless
144   - if most TCP implementations cannot transfer data from it. It would
145   - be far nicer to have all of the black holes fixed rather than fixing
146   - all of the TCP implementations."
147   -
148   - Golden words :-).
149   - */
  133 + /* Black hole detection */
  134 + if (sysctl_tcp_mtu_probing) {
  135 + if (!icsk->icsk_mtup.enabled) {
  136 + icsk->icsk_mtup.enabled = 1;
  137 + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
  138 + } else {
  139 + mss = min(sysctl_tcp_base_mss,
  140 + tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
  141 + mss = max(mss, 68 - tp->tcp_header_len);
  142 + icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
  143 + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
  144 + }
  145 + }
150 146  
151 147 dst_negative_advice(&sk->sk_dst_cache);
152 148 }
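
When retransmissions keep timing out with probing allowed, the search floor is
pushed down geometrically instead of probing upward. A standalone sketch of the
back-off, assuming tcp_base_mss = 512, a 32-byte TCP header, and a starting MSS
of 1460:

    /* Sketch of the blackhole back-off in tcp_write_timeout() above
     * (not kernel code); the starting MSS is an assumed value. */
    #include <stdio.h>

    static int imin(int a, int b) { return a < b ? a : b; }
    static int imax(int a, int b) { return a > b ? a : b; }

    int main(void)
    {
        int tcp_base_mss   = 512;
        int tcp_header_len = 32;   /* base header + timestamp option */
        int mss            = 1460; /* tcp_mtu_to_mss(sk, search_low) */

        for (int retry = 1; retry <= 6; retry++) {
            mss = imin(tcp_base_mss, mss / 2);
            mss = imax(mss, 68 - tcp_header_len);  /* 68 = minimal IPv4 MTU */
            printf("timeout %d: new search_low mss = %d\n", retry, mss);
        }
        return 0;
    }

The sequence converges (512, 256, 128, 64, 36, 36 here), and each step calls
tcp_sync_mss() so the smaller MSS takes effect immediately.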
net/ipv6/tcp_ipv6.c
... ... @@ -987,6 +987,7 @@
987 987 inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
988 988 newnp->opt->opt_flen);
989 989  
  990 + tcp_mtup_init(newsk);
990 991 tcp_sync_mss(newsk, dst_mtu(dst));
991 992 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
992 993 tcp_initialize_rcv_mss(newsk);