Commit 30e502a34b8b21fae2c789da102bd9f6e99fef83
Committed by
David S. Miller
1 parent
55d8694fa8
Exists in
ti-lsk-linux-4.1.y
and in
10 other branches
net: tcp: add flag for ca to indicate that ECN is required
This patch adds a flag to TCP congestion algorithms that allows for requesting to mark IPv4/IPv6 sockets with transport as ECN capable, that is, ECT(0), when required by a congestion algorithm.

It is currently used and needed in DataCenter TCP (DCTCP), as it requires both peers to assert ECT on all IP packets sent - it uses ECN feedback (i.e. CE, Congestion Encountered information) from switches inside the data center to derive feedback to the end hosts.

Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's algorithm/behaviour slightly diverges from RFC3168, therefore this is only (!) enabled iff the assigned congestion control ops module has requested this. By that, we can tightly couple this logic really only to the provided congestion control ops.

Joint work with Florian Westphal and Glenn Judd.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 3 changed files with 63 additions and 25 deletions Side-by-side Diff
include/net/tcp.h
... | ... | @@ -733,23 +733,6 @@ |
733 | 733 | |
734 | 734 | #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) |
735 | 735 | |
736 | -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set | |
737 | - * | |
738 | - * If we receive a SYN packet with these bits set, it means a network is | |
739 | - * playing bad games with TOS bits. In order to avoid possible false congestion | |
740 | - * notifications, we disable TCP ECN negociation. | |
741 | - */ | |
742 | -static inline void | |
743 | -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, | |
744 | - struct net *net) | |
745 | -{ | |
746 | - const struct tcphdr *th = tcp_hdr(skb); | |
747 | - | |
748 | - if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr && | |
749 | - INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield)) | |
750 | - inet_rsk(req)->ecn_ok = 1; | |
751 | -} | |
752 | - | |
753 | 736 | /* Due to TSO, an SKB can be composed of multiple actual |
754 | 737 | * packets. To keep these tracked properly, we use this. |
755 | 738 | */ |
756 | 739 | |
... | ... | @@ -791,7 +774,10 @@ |
791 | 774 | #define TCP_CA_MAX 128 |
792 | 775 | #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) |
793 | 776 | |
777 | +/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ | |
794 | 778 | #define TCP_CONG_NON_RESTRICTED 0x1 |
779 | +/* Requires ECN/ECT set on all packets */ | |
780 | +#define TCP_CONG_NEEDS_ECN 0x2 | |
795 | 781 | |
796 | 782 | struct tcp_congestion_ops { |
797 | 783 | struct list_head list; |
... | ... | @@ -840,6 +826,13 @@ |
840 | 826 | void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); |
841 | 827 | extern struct tcp_congestion_ops tcp_reno; |
842 | 828 | |
829 | +static inline bool tcp_ca_needs_ecn(const struct sock *sk) | |
830 | +{ | |
831 | + const struct inet_connection_sock *icsk = inet_csk(sk); | |
832 | + | |
833 | + return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; | |
834 | +} | |
835 | + | |
843 | 836 | static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) |
844 | 837 | { |
845 | 838 | struct inet_connection_sock *icsk = inet_csk(sk); |
... | ... | @@ -855,6 +848,40 @@ |
855 | 848 | |
856 | 849 | if (icsk->icsk_ca_ops->cwnd_event) |
857 | 850 | icsk->icsk_ca_ops->cwnd_event(sk, event); |
851 | +} | |
852 | + | |
853 | +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set | |
854 | + * | |
855 | + * If we receive a SYN packet with these bits set, it means a | |
856 | + * network is playing bad games with TOS bits. In order to | |
857 | + * avoid possible false congestion notifications, we disable | |
858 | + * TCP ECN negotiation. | |
859 | + * | |
860 | + * Exception: tcp_ca wants ECN. This is required for DCTCP | |
861 | + * congestion control; it requires setting ECT on all packets, | |
862 | + * including SYN. We invert the test in this case: If our | |
863 | + * local socket wants ECN, but peer only set ece/cwr (but not | |
864 | + * ECT in IP header) it's probably a non-DCTCP aware sender. | |
865 | + */ | |
866 | +static inline void | |
867 | +TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb, | |
868 | + const struct sock *listen_sk) | |
869 | +{ | |
870 | + const struct tcphdr *th = tcp_hdr(skb); | |
871 | + const struct net *net = sock_net(listen_sk); | |
872 | + bool th_ecn = th->ece && th->cwr; | |
873 | + bool ect, need_ecn; | |
874 | + | |
875 | + if (!th_ecn) | |
876 | + return; | |
877 | + | |
878 | + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); | |
879 | + need_ecn = tcp_ca_needs_ecn(listen_sk); | |
880 | + | |
881 | + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) | |
882 | + inet_rsk(req)->ecn_ok = 1; | |
883 | + else if (ect && need_ecn) | |
884 | + inet_rsk(req)->ecn_ok = 1; | |
858 | 885 | } |
859 | 886 | |
860 | 887 | /* These functions determine how the current flow behaves in respect of SACK |
net/ipv4/tcp_input.c
... | ... | @@ -5944,7 +5944,7 @@ |
5944 | 5944 | goto drop_and_free; |
5945 | 5945 | |
5946 | 5946 | if (!want_cookie || tmp_opt.tstamp_ok) |
5947 | - TCP_ECN_create_request(req, skb, sock_net(sk)); | |
5947 | + TCP_ECN_create_request(req, skb, sk); | |
5948 | 5948 | |
5949 | 5949 | if (want_cookie) { |
5950 | 5950 | isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); |
net/ipv4/tcp_output.c
... | ... | @@ -318,11 +318,15 @@ |
318 | 318 | } |
319 | 319 | |
320 | 320 | /* Packet ECN state for a SYN-ACK */ |
321 | -static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb) | |
321 | +static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb) | |
322 | 322 | { |
323 | + const struct tcp_sock *tp = tcp_sk(sk); | |
324 | + | |
323 | 325 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; |
324 | 326 | if (!(tp->ecn_flags & TCP_ECN_OK)) |
325 | 327 | TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; |
328 | + else if (tcp_ca_needs_ecn(sk)) | |
329 | + INET_ECN_xmit(sk); | |
326 | 330 | } |
327 | 331 | |
328 | 332 | /* Packet ECN state for a SYN. */ |
329 | 333 | |
330 | 334 | |
331 | 335 | |
332 | 336 | |
... | ... | @@ -331,17 +335,24 @@ |
331 | 335 | struct tcp_sock *tp = tcp_sk(sk); |
332 | 336 | |
333 | 337 | tp->ecn_flags = 0; |
334 | - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) { | |
338 | + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || | |
339 | + tcp_ca_needs_ecn(sk)) { | |
335 | 340 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; |
336 | 341 | tp->ecn_flags = TCP_ECN_OK; |
342 | + if (tcp_ca_needs_ecn(sk)) | |
343 | + INET_ECN_xmit(sk); | |
337 | 344 | } |
338 | 345 | } |
339 | 346 | |
340 | 347 | static __inline__ void |
341 | -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th) | |
348 | +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th, | |
349 | + struct sock *sk) | |
342 | 350 | { |
343 | - if (inet_rsk(req)->ecn_ok) | |
351 | + if (inet_rsk(req)->ecn_ok) { | |
344 | 352 | th->ece = 1; |
353 | + if (tcp_ca_needs_ecn(sk)) | |
354 | + INET_ECN_xmit(sk); | |
355 | + } | |
345 | 356 | } |
346 | 357 | |
347 | 358 | /* Set up ECN state for a packet on a ESTABLISHED socket that is about to |
... | ... | @@ -362,7 +373,7 @@ |
362 | 373 | tcp_hdr(skb)->cwr = 1; |
363 | 374 | skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; |
364 | 375 | } |
365 | - } else { | |
376 | + } else if (!tcp_ca_needs_ecn(sk)) { | |
366 | 377 | /* ACK or retransmitted segment: clear ECT|CE */ |
367 | 378 | INET_ECN_dontxmit(sk); |
368 | 379 | } |
... | ... | @@ -2789,7 +2800,7 @@ |
2789 | 2800 | } |
2790 | 2801 | |
2791 | 2802 | TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK; |
2792 | - TCP_ECN_send_synack(tcp_sk(sk), skb); | |
2803 | + TCP_ECN_send_synack(sk, skb); | |
2793 | 2804 | } |
2794 | 2805 | return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); |
2795 | 2806 | } |
... | ... | @@ -2848,7 +2859,7 @@ |
2848 | 2859 | memset(th, 0, sizeof(struct tcphdr)); |
2849 | 2860 | th->syn = 1; |
2850 | 2861 | th->ack = 1; |
2851 | - TCP_ECN_make_synack(req, th); | |
2862 | + TCP_ECN_make_synack(req, th, sk); | |
2852 | 2863 | th->source = htons(ireq->ir_num); |
2853 | 2864 | th->dest = ireq->ir_rmt_port; |
2854 | 2865 | /* Setting of flags are superfluous here for callers (and ECE is |