Commit 66b13d99d96a1a69f47a6bc3dc47f45955967377

Authored by Eric Dumazet
Committed by David S. Miller
1 parent 318cf7aaa0

ipv4: tcp: fix TOS value in ACK messages sent from TIME_WAIT

There is a long-standing bug in the Linux TCP stack concerning ACK messages
sent on behalf of TIME_WAIT sockets.

In the IP header of the ACK message, we choose to reflect the TOS field of
the incoming message, and this might break some setups.

Example of things that were broken :
  - Routing using TOS as a selector
  - Firewalls
  - Traffic classification / shaping

We now remember the inet tos field in the timewait structure and use it for
ACK generation and route lookup.

Notes :
 - We still reflect incoming TOS in RST messages.
 - We could extend MuraliRaja Muniraju's patch to report the TOS value in
netlink messages for TIME_WAIT sockets.
 - A patch is needed for IPv6

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 5 changed files with 15 additions and 9 deletions Side-by-side Diff

include/net/inet_timewait_sock.h
... ... @@ -126,7 +126,8 @@
126 126 /* And these are ours. */
127 127 unsigned int tw_ipv6only : 1,
128 128 tw_transparent : 1,
129   - tw_pad : 14, /* 14 bits hole */
  129 + tw_pad : 6, /* 6 bits hole */
  130 + tw_tos : 8,
130 131 tw_ipv6_offset : 16;
131 132 kmemcheck_bitfield_end(flags);
132 133 unsigned long tw_ttd;
include/net/ip.h
... ... @@ -165,6 +165,7 @@
165 165 int csumoffset; /* u16 offset of csum in iov[0].iov_base */
166 166 /* -1 if not needed */
167 167 int bound_dev_if;
  168 + u8 tos;
168 169 };
169 170  
170 171 #define IP_REPLY_ARG_NOSRCCHECK 1
... ... @@ -175,7 +176,7 @@
175 176 }
176 177  
177 178 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
178   - struct ip_reply_arg *arg, unsigned int len);
  179 + const struct ip_reply_arg *arg, unsigned int len);
179 180  
180 181 struct ipv4_config {
181 182 int log_martians;
net/ipv4/inet_timewait_sock.c
... ... @@ -183,6 +183,7 @@
183 183 tw->tw_daddr = inet->inet_daddr;
184 184 tw->tw_rcv_saddr = inet->inet_rcv_saddr;
185 185 tw->tw_bound_dev_if = sk->sk_bound_dev_if;
  186 + tw->tw_tos = inet->tos;
186 187 tw->tw_num = inet->inet_num;
187 188 tw->tw_state = TCP_TIME_WAIT;
188 189 tw->tw_substate = state;
net/ipv4/ip_output.c
... ... @@ -1466,7 +1466,7 @@
1466 1466 * structure to pass arguments.
1467 1467 */
1468 1468 void ip_send_reply(struct sock *sk, struct sk_buff *skb, __be32 daddr,
1469   - struct ip_reply_arg *arg, unsigned int len)
  1469 + const struct ip_reply_arg *arg, unsigned int len)
1470 1470 {
1471 1471 struct inet_sock *inet = inet_sk(sk);
1472 1472 struct ip_options_data replyopts;
... ... @@ -1489,7 +1489,7 @@
1489 1489 }
1490 1490  
1491 1491 flowi4_init_output(&fl4, arg->bound_dev_if, 0,
1492   - RT_TOS(ip_hdr(skb)->tos),
  1492 + RT_TOS(arg->tos),
1493 1493 RT_SCOPE_UNIVERSE, sk->sk_protocol,
1494 1494 ip_reply_arg_flowi_flags(arg),
1495 1495 daddr, rt->rt_spec_dst,
... ... @@ -1506,7 +1506,7 @@
1506 1506 with locally disabled BH and that sk cannot be already spinlocked.
1507 1507 */
1508 1508 bh_lock_sock(sk);
1509   - inet->tos = ip_hdr(skb)->tos;
  1509 + inet->tos = arg->tos;
1510 1510 sk->sk_priority = skb->priority;
1511 1511 sk->sk_protocol = ip_hdr(skb)->protocol;
1512 1512 sk->sk_bound_dev_if = arg->bound_dev_if;
net/ipv4/tcp_ipv4.c
... ... @@ -652,6 +652,7 @@
652 652 arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
653 653  
654 654 net = dev_net(skb_dst(skb)->dev);
  655 + arg.tos = ip_hdr(skb)->tos;
655 656 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
656 657 &arg, arg.iov[0].iov_len);
657 658  
... ... @@ -666,7 +667,7 @@
666 667 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
667 668 u32 win, u32 ts, int oif,
668 669 struct tcp_md5sig_key *key,
669   - int reply_flags)
  670 + int reply_flags, u8 tos)
670 671 {
671 672 const struct tcphdr *th = tcp_hdr(skb);
672 673 struct {
... ... @@ -726,7 +727,7 @@
726 727 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
727 728 if (oif)
728 729 arg.bound_dev_if = oif;
729   -
  730 + arg.tos = tos;
730 731 ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
731 732 &arg, arg.iov[0].iov_len);
732 733  
... ... @@ -743,7 +744,8 @@
743 744 tcptw->tw_ts_recent,
744 745 tw->tw_bound_dev_if,
745 746 tcp_twsk_md5_key(tcptw),
746   - tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
  747 + tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
  748 + tw->tw_tos
747 749 );
748 750  
749 751 inet_twsk_put(tw);
... ... @@ -757,7 +759,8 @@
757 759 req->ts_recent,
758 760 0,
759 761 tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
760   - inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
  762 + inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
  763 + ip_hdr(skb)->tos);
761 764 }
762 765  
763 766 /*