Commit 41063e9dd11956f2d285e12e4342e1d232ba0ea2

Authored by David S. Miller
1 parent f9242b6b28

ipv4: Early TCP socket demux.

Input packet processing for local sockets involves two major demuxes.
One for the route and one for the socket.

But we can optimize this down to one demux for certain kinds of local
sockets.

Currently we only do this for established TCP sockets, but it could
at least in theory be expanded to other kinds of connections.

If a TCP socket is established, then its identity is fully specified.

This means that whatever input route was used during the three-way
handshake must work equally well for the rest of the connection since
the keys will not change.

Once we move to established state, we cache the receive packet's input
route to use later.

Like the existing cached route in sk->sk_dst_cache used for output
packets, we have to check for route invalidations using dst->obsolete
and dst->ops->check().

Early demux occurs outside of a socket locked section, so when a route
invalidation occurs we defer the fixup of sk->sk_rx_dst until we are
actually inside of established state packet processing and thus have
the socket locked.

Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 10 changed files with 110 additions and 24 deletions Side-by-side Diff

include/net/inet_hashtables.h
... ... @@ -379,10 +379,10 @@
379 379 const __be16 sport,
380 380 const __be16 dport)
381 381 {
382   - struct sock *sk;
  382 + struct sock *sk = skb_steal_sock(skb);
383 383 const struct iphdr *iph = ip_hdr(skb);
384 384  
385   - if (unlikely(sk = skb_steal_sock(skb)))
  385 + if (sk)
386 386 return sk;
387 387 else
388 388 return __inet_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
include/net/protocol.h
... ... @@ -37,6 +37,7 @@
37 37  
38 38 /* This is used to register protocols. */
39 39 struct net_protocol {
  40 + int (*early_demux)(struct sk_buff *skb);
40 41 int (*handler)(struct sk_buff *skb);
41 42 void (*err_handler)(struct sk_buff *skb, u32 info);
42 43 int (*gso_send_check)(struct sk_buff *skb);
... ... @@ -319,6 +319,7 @@
319 319 unsigned long sk_flags;
320 320 struct dst_entry *sk_dst_cache;
321 321 spinlock_t sk_dst_lock;
  322 + struct dst_entry *sk_rx_dst;
322 323 atomic_t sk_wmem_alloc;
323 324 atomic_t sk_omem_alloc;
324 325 int sk_sndbuf;
... ... @@ -1426,6 +1427,7 @@
1426 1427 gfp_t priority);
1427 1428 extern void sock_wfree(struct sk_buff *skb);
1428 1429 extern void sock_rfree(struct sk_buff *skb);
  1430 +extern void sock_edemux(struct sk_buff *skb);
1429 1431  
1430 1432 extern int sock_setsockopt(struct socket *sock, int level,
1431 1433 int op, char __user *optval,
... ... @@ -325,6 +325,7 @@
325 325  
326 326 extern void tcp_shutdown (struct sock *sk, int how);
327 327  
  328 +extern int tcp_v4_early_demux(struct sk_buff *skb);
328 329 extern int tcp_v4_rcv(struct sk_buff *skb);
329 330  
330 331 extern struct inet_peer *tcp_v4_get_peer(struct sock *sk);
... ... @@ -1465,6 +1465,11 @@
1465 1465 }
1466 1466 EXPORT_SYMBOL(sock_rfree);
1467 1467  
  1468 +void sock_edemux(struct sk_buff *skb)
  1469 +{
  1470 + sock_put(skb->sk);
  1471 +}
  1472 +EXPORT_SYMBOL(sock_edemux);
1468 1473  
1469 1474 int sock_i_uid(struct sock *sk)
1470 1475 {
... ... @@ -157,6 +157,7 @@
157 157  
158 158 kfree(rcu_dereference_protected(inet->inet_opt, 1));
159 159 dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
  160 + dst_release(sk->sk_rx_dst);
160 161 sk_refcnt_debug_dec(sk);
161 162 }
162 163 EXPORT_SYMBOL(inet_sock_destruct);
... ... @@ -1518,14 +1519,15 @@
1518 1519 #endif
1519 1520  
1520 1521 static const struct net_protocol tcp_protocol = {
1521   - .handler = tcp_v4_rcv,
1522   - .err_handler = tcp_v4_err,
1523   - .gso_send_check = tcp_v4_gso_send_check,
1524   - .gso_segment = tcp_tso_segment,
1525   - .gro_receive = tcp4_gro_receive,
1526   - .gro_complete = tcp4_gro_complete,
1527   - .no_policy = 1,
1528   - .netns_ok = 1,
  1522 + .early_demux = tcp_v4_early_demux,
  1523 + .handler = tcp_v4_rcv,
  1524 + .err_handler = tcp_v4_err,
  1525 + .gso_send_check = tcp_v4_gso_send_check,
  1526 + .gso_segment = tcp_tso_segment,
  1527 + .gro_receive = tcp4_gro_receive,
  1528 + .gro_complete = tcp4_gro_complete,
  1529 + .no_policy = 1,
  1530 + .netns_ok = 1,
1529 1531 };
1530 1532  
1531 1533 static const struct net_protocol udp_protocol = {
... ... @@ -323,19 +323,32 @@
323 323 * how the packet travels inside Linux networking.
324 324 */
325 325 if (skb_dst(skb) == NULL) {
326   - int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
327   - iph->tos, skb->dev);
328   - if (unlikely(err)) {
329   - if (err == -EHOSTUNREACH)
330   - IP_INC_STATS_BH(dev_net(skb->dev),
331   - IPSTATS_MIB_INADDRERRORS);
332   - else if (err == -ENETUNREACH)
333   - IP_INC_STATS_BH(dev_net(skb->dev),
334   - IPSTATS_MIB_INNOROUTES);
335   - else if (err == -EXDEV)
336   - NET_INC_STATS_BH(dev_net(skb->dev),
337   - LINUX_MIB_IPRPFILTER);
338   - goto drop;
  326 + const struct net_protocol *ipprot;
  327 + int protocol = iph->protocol;
  328 + int err;
  329 +
  330 + rcu_read_lock();
  331 + ipprot = rcu_dereference(inet_protos[protocol]);
  332 + err = -ENOENT;
  333 + if (ipprot && ipprot->early_demux)
  334 + err = ipprot->early_demux(skb);
  335 + rcu_read_unlock();
  336 +
  337 + if (err) {
  338 + err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
  339 + iph->tos, skb->dev);
  340 + if (unlikely(err)) {
  341 + if (err == -EHOSTUNREACH)
  342 + IP_INC_STATS_BH(dev_net(skb->dev),
  343 + IPSTATS_MIB_INADDRERRORS);
  344 + else if (err == -ENETUNREACH)
  345 + IP_INC_STATS_BH(dev_net(skb->dev),
  346 + IPSTATS_MIB_INNOROUTES);
  347 + else if (err == -EXDEV)
  348 + NET_INC_STATS_BH(dev_net(skb->dev),
  349 + LINUX_MIB_IPRPFILTER);
  350 + goto drop;
  351 + }
339 352 }
340 353 }
341 354  
net/ipv4/tcp_input.c
... ... @@ -5518,6 +5518,18 @@
5518 5518 struct tcp_sock *tp = tcp_sk(sk);
5519 5519 int res;
5520 5520  
  5521 + if (sk->sk_rx_dst) {
  5522 + struct dst_entry *dst = sk->sk_rx_dst;
  5523 + if (unlikely(dst->obsolete)) {
  5524 + if (dst->ops->check(dst, 0) == NULL) {
  5525 + dst_release(dst);
  5526 + sk->sk_rx_dst = NULL;
  5527 + }
  5528 + }
  5529 + }
  5530 + if (unlikely(sk->sk_rx_dst == NULL))
  5531 + sk->sk_rx_dst = dst_clone(skb_dst(skb));
  5532 +
5521 5533 /*
5522 5534 * Header prediction.
5523 5535 * The code loosely follows the one in the famous
5524 5536  
... ... @@ -5729,8 +5741,10 @@
5729 5741  
5730 5742 tcp_set_state(sk, TCP_ESTABLISHED);
5731 5743  
5732   - if (skb != NULL)
  5744 + if (skb != NULL) {
  5745 + sk->sk_rx_dst = dst_clone(skb_dst(skb));
5733 5746 security_inet_conn_established(sk, skb);
  5747 + }
5734 5748  
5735 5749 /* Make sure socket is routed, for correct metrics. */
5736 5750 icsk->icsk_af_ops->rebuild_header(sk);
... ... @@ -1671,6 +1671,52 @@
1671 1671 }
1672 1672 EXPORT_SYMBOL(tcp_v4_do_rcv);
1673 1673  
  1674 +int tcp_v4_early_demux(struct sk_buff *skb)
  1675 +{
  1676 + struct net *net = dev_net(skb->dev);
  1677 + const struct iphdr *iph;
  1678 + const struct tcphdr *th;
  1679 + struct sock *sk;
  1680 + int err;
  1681 +
  1682 + err = -ENOENT;
  1683 + if (skb->pkt_type != PACKET_HOST)
  1684 + goto out_err;
  1685 +
  1686 + if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
  1687 + goto out_err;
  1688 +
  1689 + iph = ip_hdr(skb);
  1690 + th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
  1691 +
  1692 + if (th->doff < sizeof(struct tcphdr) / 4)
  1693 + goto out_err;
  1694 +
  1695 + if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
  1696 + goto out_err;
  1697 +
  1698 + sk = __inet_lookup_established(net, &tcp_hashinfo,
  1699 + iph->saddr, th->source,
  1700 + iph->daddr, th->dest,
  1701 + skb->dev->ifindex);
  1702 + if (sk) {
  1703 + skb->sk = sk;
  1704 + skb->destructor = sock_edemux;
  1705 + if (sk->sk_state != TCP_TIME_WAIT) {
  1706 + struct dst_entry *dst = sk->sk_rx_dst;
  1707 + if (dst)
  1708 + dst = dst_check(dst, 0);
  1709 + if (dst) {
  1710 + skb_dst_set_noref(skb, dst);
  1711 + err = 0;
  1712 + }
  1713 + }
  1714 + }
  1715 +
  1716 +out_err:
  1717 + return err;
  1718 +}
  1719 +
1674 1720 /*
1675 1721 * From tcp_input.c
1676 1722 */
net/ipv4/tcp_minisocks.c
... ... @@ -445,6 +445,8 @@
445 445 struct tcp_sock *oldtp = tcp_sk(sk);
446 446 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
447 447  
  448 + newsk->sk_rx_dst = dst_clone(skb_dst(skb));
  449 +
448 450 /* TCP Cookie Transactions require space for the cookie pair,
449 451 * as it differs for each connection. There is no need to
450 452 * copy any s_data_payload stored at the original socket.