Commit 30e502a34b8b21fae2c789da102bd9f6e99fef83

Authored by Daniel Borkmann
Committed by David S. Miller
1 parent 55d8694fa8

net: tcp: add flag for ca to indicate that ECN is required

This patch adds a flag to TCP congestion algorithms that lets an
algorithm request that IPv4/IPv6 sockets be marked as ECN-capable
transport, that is, ECT(0), when the algorithm requires it.

It is currently used and needed in DataCenter TCP (DCTCP), as it
requires both peers to assert ECT on all IP packets sent - it
uses ECN feedback (i.e. CE, Congestion Encountered information)
from switches inside the data center to derive feedback to the
end hosts.

Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's
algorithm/behaviour slightly diverges from RFC3168, therefore this
is only (!) enabled if the assigned congestion control ops module
has requested it. That way, this logic stays tightly coupled to
the provided congestion control ops.

Joint work with Florian Westphal and Glenn Judd.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Glenn Judd <glenn.judd@morganstanley.com>
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 3 changed files with 63 additions and 25 deletions Side-by-side Diff

... ... @@ -733,23 +733,6 @@
733 733  
734 734 #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
735 735  
736   -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
737   - *
738   - * If we receive a SYN packet with these bits set, it means a network is
739   - * playing bad games with TOS bits. In order to avoid possible false congestion
740   - * notifications, we disable TCP ECN negociation.
741   - */
742   -static inline void
743   -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
744   - struct net *net)
745   -{
746   - const struct tcphdr *th = tcp_hdr(skb);
747   -
748   - if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
749   - INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
750   - inet_rsk(req)->ecn_ok = 1;
751   -}
752   -
753 736 /* Due to TSO, an SKB can be composed of multiple actual
754 737 * packets. To keep these tracked properly, we use this.
755 738 */
756 739  
... ... @@ -791,7 +774,10 @@
791 774 #define TCP_CA_MAX 128
792 775 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX)
793 776  
  777 +/* Algorithm can be set on socket without CAP_NET_ADMIN privileges */
794 778 #define TCP_CONG_NON_RESTRICTED 0x1
  779 +/* Requires ECN/ECT set on all packets */
  780 +#define TCP_CONG_NEEDS_ECN 0x2
795 781  
796 782 struct tcp_congestion_ops {
797 783 struct list_head list;
... ... @@ -840,6 +826,13 @@
840 826 void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
841 827 extern struct tcp_congestion_ops tcp_reno;
842 828  
  829 +static inline bool tcp_ca_needs_ecn(const struct sock *sk)
  830 +{
  831 + const struct inet_connection_sock *icsk = inet_csk(sk);
  832 +
  833 + return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
  834 +}
  835 +
843 836 static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
844 837 {
845 838 struct inet_connection_sock *icsk = inet_csk(sk);
... ... @@ -855,6 +848,40 @@
855 848  
856 849 if (icsk->icsk_ca_ops->cwnd_event)
857 850 icsk->icsk_ca_ops->cwnd_event(sk, event);
  851 +}
  852 +
  853 +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
  854 + *
  855 + * If we receive a SYN packet with these bits set, it means a
  856 + * network is playing bad games with TOS bits. In order to
  857 + * avoid possible false congestion notifications, we disable
  858 + * TCP ECN negotiation.
  859 + *
  860 + * Exception: tcp_ca wants ECN. This is required for DCTCP
  861 + * congestion control; it requires setting ECT on all packets,
  862 + * including SYN. We invert the test in this case: If our
  863 + * local socket wants ECN, but peer only set ece/cwr (but not
  864 + * ECT in IP header) it's probably a non-DCTCP aware sender.
  865 + */
  866 +static inline void
  867 +TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
  868 + const struct sock *listen_sk)
  869 +{
  870 + const struct tcphdr *th = tcp_hdr(skb);
  871 + const struct net *net = sock_net(listen_sk);
  872 + bool th_ecn = th->ece && th->cwr;
  873 + bool ect, need_ecn;
  874 +
  875 + if (!th_ecn)
  876 + return;
  877 +
  878 + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
  879 + need_ecn = tcp_ca_needs_ecn(listen_sk);
  880 +
  881 + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn)
  882 + inet_rsk(req)->ecn_ok = 1;
  883 + else if (ect && need_ecn)
  884 + inet_rsk(req)->ecn_ok = 1;
858 885 }
859 886  
860 887 /* These functions determine how the current flow behaves in respect of SACK
net/ipv4/tcp_input.c
... ... @@ -5944,7 +5944,7 @@
5944 5944 goto drop_and_free;
5945 5945  
5946 5946 if (!want_cookie || tmp_opt.tstamp_ok)
5947   - TCP_ECN_create_request(req, skb, sock_net(sk));
  5947 + TCP_ECN_create_request(req, skb, sk);
5948 5948  
5949 5949 if (want_cookie) {
5950 5950 isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
net/ipv4/tcp_output.c
... ... @@ -318,11 +318,15 @@
318 318 }
319 319  
320 320 /* Packet ECN state for a SYN-ACK */
321   -static inline void TCP_ECN_send_synack(const struct tcp_sock *tp, struct sk_buff *skb)
  321 +static inline void TCP_ECN_send_synack(struct sock *sk, struct sk_buff *skb)
322 322 {
  323 + const struct tcp_sock *tp = tcp_sk(sk);
  324 +
323 325 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
324 326 if (!(tp->ecn_flags & TCP_ECN_OK))
325 327 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
  328 + else if (tcp_ca_needs_ecn(sk))
  329 + INET_ECN_xmit(sk);
326 330 }
327 331  
328 332 /* Packet ECN state for a SYN. */
329 333  
330 334  
331 335  
332 336  
... ... @@ -331,17 +335,24 @@
331 335 struct tcp_sock *tp = tcp_sk(sk);
332 336  
333 337 tp->ecn_flags = 0;
334   - if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
  338 + if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
  339 + tcp_ca_needs_ecn(sk)) {
335 340 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
336 341 tp->ecn_flags = TCP_ECN_OK;
  342 + if (tcp_ca_needs_ecn(sk))
  343 + INET_ECN_xmit(sk);
337 344 }
338 345 }
339 346  
340 347 static __inline__ void
341   -TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th)
  348 +TCP_ECN_make_synack(const struct request_sock *req, struct tcphdr *th,
  349 + struct sock *sk)
342 350 {
343   - if (inet_rsk(req)->ecn_ok)
  351 + if (inet_rsk(req)->ecn_ok) {
344 352 th->ece = 1;
  353 + if (tcp_ca_needs_ecn(sk))
  354 + INET_ECN_xmit(sk);
  355 + }
345 356 }
346 357  
347 358 /* Set up ECN state for a packet on a ESTABLISHED socket that is about to
... ... @@ -362,7 +373,7 @@
362 373 tcp_hdr(skb)->cwr = 1;
363 374 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
364 375 }
365   - } else {
  376 + } else if (!tcp_ca_needs_ecn(sk)) {
366 377 /* ACK or retransmitted segment: clear ECT|CE */
367 378 INET_ECN_dontxmit(sk);
368 379 }
... ... @@ -2789,7 +2800,7 @@
2789 2800 }
2790 2801  
2791 2802 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ACK;
2792   - TCP_ECN_send_synack(tcp_sk(sk), skb);
  2803 + TCP_ECN_send_synack(sk, skb);
2793 2804 }
2794 2805 return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
2795 2806 }
... ... @@ -2848,7 +2859,7 @@
2848 2859 memset(th, 0, sizeof(struct tcphdr));
2849 2860 th->syn = 1;
2850 2861 th->ack = 1;
2851   - TCP_ECN_make_synack(req, th);
  2862 + TCP_ECN_make_synack(req, th, sk);
2852 2863 th->source = htons(ireq->ir_num);
2853 2864 th->dest = ireq->ir_rmt_port;
2854 2865 /* Setting of flags are superfluous here for callers (and ECE is