Commit 41063e9dd11956f2d285e12e4342e1d232ba0ea2
1 parent
f9242b6b28
Exists in
smarc-l5.0.0_1.0.0-ga
and in
5 other branches
ipv4: Early TCP socket demux.
Input packet processing for local sockets involves two major demuxes. One for the route and one for the socket. But we can optimize this down to one demux for certain kinds of local sockets. Currently we only do this for established TCP sockets, but it could at least in theory be expanded to other kinds of connections. If a TCP socket is established then its identity is fully specified. This means that whatever input route was used during the three-way handshake must work equally well for the rest of the connection since the keys will not change. Once we move to established state, we cache the receive packet's input route to use later. Like the existing cached route in sk->sk_dst_cache used for output packets, we have to check for route invalidations using dst->obsolete and dst->ops->check(). Early demux occurs outside of a socket locked section, so when a route invalidation occurs we defer the fixup of sk->sk_rx_dst until we are actually inside of established state packet processing and thus have the socket locked. Signed-off-by: David S. Miller <davem@davemloft.net>
Showing 10 changed files with 110 additions and 24 deletions Side-by-side Diff
include/net/inet_hashtables.h
... | ... | @@ -379,10 +379,10 @@ |
379 | 379 | const __be16 sport, |
380 | 380 | const __be16 dport) |
381 | 381 | { |
382 | - struct sock *sk; | |
382 | + struct sock *sk = skb_steal_sock(skb); | |
383 | 383 | const struct iphdr *iph = ip_hdr(skb); |
384 | 384 | |
385 | - if (unlikely(sk = skb_steal_sock(skb))) | |
385 | + if (sk) | |
386 | 386 | return sk; |
387 | 387 | else |
388 | 388 | return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo, |
include/net/protocol.h
... | ... | @@ -37,6 +37,7 @@ |
37 | 37 | |
38 | 38 | /* This is used to register protocols. */ |
39 | 39 | struct net_protocol { |
40 | + int (*early_demux)(struct sk_buff *skb); | |
40 | 41 | int (*handler)(struct sk_buff *skb); |
41 | 42 | void (*err_handler)(struct sk_buff *skb, u32 info); |
42 | 43 | int (*gso_send_check)(struct sk_buff *skb); |
include/net/sock.h
... | ... | @@ -319,6 +319,7 @@ |
319 | 319 | unsigned long sk_flags; |
320 | 320 | struct dst_entry *sk_dst_cache; |
321 | 321 | spinlock_t sk_dst_lock; |
322 | + struct dst_entry *sk_rx_dst; | |
322 | 323 | atomic_t sk_wmem_alloc; |
323 | 324 | atomic_t sk_omem_alloc; |
324 | 325 | int sk_sndbuf; |
... | ... | @@ -1426,6 +1427,7 @@ |
1426 | 1427 | gfp_t priority); |
1427 | 1428 | extern void sock_wfree(struct sk_buff *skb); |
1428 | 1429 | extern void sock_rfree(struct sk_buff *skb); |
1430 | +extern void sock_edemux(struct sk_buff *skb); | |
1429 | 1431 | |
1430 | 1432 | extern int sock_setsockopt(struct socket *sock, int level, |
1431 | 1433 | int op, char __user *optval, |
include/net/tcp.h
net/core/sock.c
net/ipv4/af_inet.c
... | ... | @@ -157,6 +157,7 @@ |
157 | 157 | |
158 | 158 | kfree(rcu_dereference_protected(inet->inet_opt, 1)); |
159 | 159 | dst_release(rcu_dereference_check(sk->sk_dst_cache, 1)); |
160 | + dst_release(sk->sk_rx_dst); | |
160 | 161 | sk_refcnt_debug_dec(sk); |
161 | 162 | } |
162 | 163 | EXPORT_SYMBOL(inet_sock_destruct); |
... | ... | @@ -1518,14 +1519,15 @@ |
1518 | 1519 | #endif |
1519 | 1520 | |
1520 | 1521 | static const struct net_protocol tcp_protocol = { |
1521 | - .handler = tcp_v4_rcv, | |
1522 | - .err_handler = tcp_v4_err, | |
1523 | - .gso_send_check = tcp_v4_gso_send_check, | |
1524 | - .gso_segment = tcp_tso_segment, | |
1525 | - .gro_receive = tcp4_gro_receive, | |
1526 | - .gro_complete = tcp4_gro_complete, | |
1527 | - .no_policy = 1, | |
1528 | - .netns_ok = 1, | |
1522 | + .early_demux = tcp_v4_early_demux, | |
1523 | + .handler = tcp_v4_rcv, | |
1524 | + .err_handler = tcp_v4_err, | |
1525 | + .gso_send_check = tcp_v4_gso_send_check, | |
1526 | + .gso_segment = tcp_tso_segment, | |
1527 | + .gro_receive = tcp4_gro_receive, | |
1528 | + .gro_complete = tcp4_gro_complete, | |
1529 | + .no_policy = 1, | |
1530 | + .netns_ok = 1, | |
1529 | 1531 | }; |
1530 | 1532 | |
1531 | 1533 | static const struct net_protocol udp_protocol = { |
net/ipv4/ip_input.c
... | ... | @@ -323,19 +323,32 @@ |
323 | 323 | * how the packet travels inside Linux networking. |
324 | 324 | */ |
325 | 325 | if (skb_dst(skb) == NULL) { |
326 | - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, | |
327 | - iph->tos, skb->dev); | |
328 | - if (unlikely(err)) { | |
329 | - if (err == -EHOSTUNREACH) | |
330 | - IP_INC_STATS_BH(dev_net(skb->dev), | |
331 | - IPSTATS_MIB_INADDRERRORS); | |
332 | - else if (err == -ENETUNREACH) | |
333 | - IP_INC_STATS_BH(dev_net(skb->dev), | |
334 | - IPSTATS_MIB_INNOROUTES); | |
335 | - else if (err == -EXDEV) | |
336 | - NET_INC_STATS_BH(dev_net(skb->dev), | |
337 | - LINUX_MIB_IPRPFILTER); | |
338 | - goto drop; | |
326 | + const struct net_protocol *ipprot; | |
327 | + int protocol = iph->protocol; | |
328 | + int err; | |
329 | + | |
330 | + rcu_read_lock(); | |
331 | + ipprot = rcu_dereference(inet_protos[protocol]); | |
332 | + err = -ENOENT; | |
333 | + if (ipprot && ipprot->early_demux) | |
334 | + err = ipprot->early_demux(skb); | |
335 | + rcu_read_unlock(); | |
336 | + | |
337 | + if (err) { | |
338 | + err = ip_route_input_noref(skb, iph->daddr, iph->saddr, | |
339 | + iph->tos, skb->dev); | |
340 | + if (unlikely(err)) { | |
341 | + if (err == -EHOSTUNREACH) | |
342 | + IP_INC_STATS_BH(dev_net(skb->dev), | |
343 | + IPSTATS_MIB_INADDRERRORS); | |
344 | + else if (err == -ENETUNREACH) | |
345 | + IP_INC_STATS_BH(dev_net(skb->dev), | |
346 | + IPSTATS_MIB_INNOROUTES); | |
347 | + else if (err == -EXDEV) | |
348 | + NET_INC_STATS_BH(dev_net(skb->dev), | |
349 | + LINUX_MIB_IPRPFILTER); | |
350 | + goto drop; | |
351 | + } | |
339 | 352 | } |
340 | 353 | } |
341 | 354 |
net/ipv4/tcp_input.c
... | ... | @@ -5518,6 +5518,18 @@ |
5518 | 5518 | struct tcp_sock *tp = tcp_sk(sk); |
5519 | 5519 | int res; |
5520 | 5520 | |
5521 | + if (sk->sk_rx_dst) { | |
5522 | + struct dst_entry *dst = sk->sk_rx_dst; | |
5523 | + if (unlikely(dst->obsolete)) { | |
5524 | + if (dst->ops->check(dst, 0) == NULL) { | |
5525 | + dst_release(dst); | |
5526 | + sk->sk_rx_dst = NULL; | |
5527 | + } | |
5528 | + } | |
5529 | + } | |
5530 | + if (unlikely(sk->sk_rx_dst == NULL)) | |
5531 | + sk->sk_rx_dst = dst_clone(skb_dst(skb)); | |
5532 | + | |
5521 | 5533 | /* |
5522 | 5534 | * Header prediction. |
5523 | 5535 | * The code loosely follows the one in the famous |
5524 | 5536 | |
... | ... | @@ -5729,8 +5741,10 @@ |
5729 | 5741 | |
5730 | 5742 | tcp_set_state(sk, TCP_ESTABLISHED); |
5731 | 5743 | |
5732 | - if (skb != NULL) | |
5744 | + if (skb != NULL) { | |
5745 | + sk->sk_rx_dst = dst_clone(skb_dst(skb)); | |
5733 | 5746 | security_inet_conn_established(sk, skb); |
5747 | + } | |
5734 | 5748 | |
5735 | 5749 | /* Make sure socket is routed, for correct metrics. */ |
5736 | 5750 | icsk->icsk_af_ops->rebuild_header(sk); |
net/ipv4/tcp_ipv4.c
... | ... | @@ -1671,6 +1671,52 @@ |
1671 | 1671 | } |
1672 | 1672 | EXPORT_SYMBOL(tcp_v4_do_rcv); |
1673 | 1673 | |
1674 | +int tcp_v4_early_demux(struct sk_buff *skb) | |
1675 | +{ | |
1676 | + struct net *net = dev_net(skb->dev); | |
1677 | + const struct iphdr *iph; | |
1678 | + const struct tcphdr *th; | |
1679 | + struct sock *sk; | |
1680 | + int err; | |
1681 | + | |
1682 | + err = -ENOENT; | |
1683 | + if (skb->pkt_type != PACKET_HOST) | |
1684 | + goto out_err; | |
1685 | + | |
1686 | + if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr))) | |
1687 | + goto out_err; | |
1688 | + | |
1689 | + iph = ip_hdr(skb); | |
1690 | + th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb)); | |
1691 | + | |
1692 | + if (th->doff < sizeof(struct tcphdr) / 4) | |
1693 | + goto out_err; | |
1694 | + | |
1695 | + if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4)) | |
1696 | + goto out_err; | |
1697 | + | |
1698 | + sk = __inet_lookup_established(net, &tcp_hashinfo, | |
1699 | + iph->saddr, th->source, | |
1700 | + iph->daddr, th->dest, | |
1701 | + skb->dev->ifindex); | |
1702 | + if (sk) { | |
1703 | + skb->sk = sk; | |
1704 | + skb->destructor = sock_edemux; | |
1705 | + if (sk->sk_state != TCP_TIME_WAIT) { | |
1706 | + struct dst_entry *dst = sk->sk_rx_dst; | |
1707 | + if (dst) | |
1708 | + dst = dst_check(dst, 0); | |
1709 | + if (dst) { | |
1710 | + skb_dst_set_noref(skb, dst); | |
1711 | + err = 0; | |
1712 | + } | |
1713 | + } | |
1714 | + } | |
1715 | + | |
1716 | +out_err: | |
1717 | + return err; | |
1718 | +} | |
1719 | + | |
1674 | 1720 | /* |
1675 | 1721 | * From tcp_input.c |
1676 | 1722 | */ |
net/ipv4/tcp_minisocks.c
... | ... | @@ -445,6 +445,8 @@ |
445 | 445 | struct tcp_sock *oldtp = tcp_sk(sk); |
446 | 446 | struct tcp_cookie_values *oldcvp = oldtp->cookie_values; |
447 | 447 | |
448 | + newsk->sk_rx_dst = dst_clone(skb_dst(skb)); | |
449 | + | |
448 | 450 | /* TCP Cookie Transactions require space for the cookie pair, |
449 | 451 | * as it differs for each connection. There is no need to |
450 | 452 | * copy any s_data_payload stored at the original socket. |