Commit d346a3fae3ff1d99f5d0c819bf86edf9094a26a1

Authored by Daniel Borkmann
Committed by David S. Miller
1 parent 4262e5ccbb

packet: introduce PACKET_QDISC_BYPASS socket option

This patch introduces a PACKET_QDISC_BYPASS socket option that
allows using an xmit() function similar to the one in pktgen instead
of taking the dev_queue_xmit() path. This can be very useful when
PF_PACKET applications need to be used in a scenario similar to
pktgen, but with a full, flexible packet payload that has to be
provided by the application, for example.

By default, nothing changes in behaviour for normal PF_PACKET
TX users, so everything stays as is for applications. New users,
however, can now set PACKET_QDISC_BYPASS if needed, which
i) prevents their own packets from reentering packet_rcv() and
ii) pushes frames directly to the driver.
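
A minimal sketch of how a new user would opt in (this assumes uapi
headers that already carry PACKET_QDISC_BYPASS; error handling is
trimmed and the helper name is illustrative):

  #include <sys/socket.h>
  #include <linux/if_packet.h>

  static int open_bypass_socket(void)
  {
          /* TX-only RAW packet socket, see the protocol 0 trick below. */
          int fd = socket(PF_PACKET, SOCK_RAW, 0);
          int one = 1;

          if (fd < 0)
                  return -1;
          /* Hand frames straight to the driver, skipping the qdisc layer. */
          setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
          return fd;
  }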

In doing so, we can increase pps (here with 64 byte packets) for
PF_PACKET a bit:

  # CPUs -- QDISC_BYPASS   -- qdisc path -- qdisc path[**]
  1 CPU  ==  1,509,628 pps --  1,208,708 --  1,247,436
  2 CPUs ==  3,198,659 pps --  2,536,012 --  1,605,779
  3 CPUs ==  4,787,992 pps --  3,788,740 --  1,735,610
  4 CPUs ==  6,173,956 pps --  4,907,799 --  1,909,114
  5 CPUs ==  7,495,676 pps --  5,956,499 --  2,014,422
  6 CPUs ==  9,001,496 pps --  7,145,064 --  2,155,261
  7 CPUs == 10,229,776 pps --  8,190,596 --  2,220,619
  8 CPUs == 11,040,732 pps --  9,188,544 --  2,241,879
  9 CPUs == 12,009,076 pps -- 10,275,936 --  2,068,447
 10 CPUs == 11,380,052 pps -- 11,265,337 --  1,578,689
 11 CPUs == 11,672,676 pps -- 11,845,344 --  1,297,412
 [...]
 20 CPUs == 11,363,192 pps -- 11,014,933 --  1,245,081

 [**]: qdisc path with packet_rcv(), which is probably how most
       people use it (hopefully not anymore when it is not needed)

The test was done using a modified trafgen, sending a simple
static 64 byte packet on all CPUs. The trick in the fast
"qdisc path" case is to avoid reentering packet_rcv() by
setting the RAW socket protocol to zero, like:
socket(PF_PACKET, SOCK_RAW, 0);
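
For reference, a minimal sketch of such a TX-only socket setup (the
interface name lookup via if_nametoindex() and the helper name are
illustrative, not part of the patch):

  #include <net/if.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/socket.h>
  #include <linux/if_packet.h>

  static int open_tx_only_socket(const char *ifname)
  {
          struct sockaddr_ll sll;
          /* Protocol 0: no RX protocol hook gets registered, so frames
           * we transmit are not looped back into packet_rcv(). */
          int fd = socket(PF_PACKET, SOCK_RAW, 0);

          if (fd < 0)
                  return -1;

          memset(&sll, 0, sizeof(sll));
          sll.sll_family   = AF_PACKET;
          sll.sll_protocol = 0;
          sll.sll_ifindex  = if_nametoindex(ifname);

          if (bind(fd, (struct sockaddr *)&sll, sizeof(sll)) < 0) {
                  close(fd);
                  return -1;
          }
          return fd;
  }

Transmission then happens via sendto() or the TX ring as usual, e.g.
on a socket obtained from open_tx_only_socket("eth0").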

The tradeoffs are documented in this patch as well: clearly, if
queues are busy, we will drop more packets, tc disciplines are
ignored, and these packets are no longer visible to taps. For
a pktgen-like scenario, we argue that this is acceptable.

The pointer to the xmit function has been placed in a packet
socket structure hole between cached_dev and prot_hook, which is
hot anyway as we're working on cached_dev in each send path.

Done in joint work with Jesper Dangaard Brouer.

Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 4 changed files with 102 additions and 12 deletions

Documentation/networking/packet_mmap.txt
... ... @@ -953,6 +953,27 @@
953 953 }
954 954  
955 955 -------------------------------------------------------------------------------
  956 ++ PACKET_QDISC_BYPASS
  957 +-------------------------------------------------------------------------------
  958 +
  959 +If there is a requirement to load the network with many packets in a similar
  960 +fashion to what pktgen does, you might set the following option after socket
  961 +creation:
  962 +
  963 + int one = 1;
  964 + setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, &one, sizeof(one));
  965 +
  966 +This has the side effect that packets sent through PF_PACKET will bypass the
  967 +kernel's qdisc layer and are pushed to the driver directly. This means that
  968 +packets are not buffered, tc disciplines are ignored, increased loss can occur
  969 +and such packets are also no longer visible to other PF_PACKET sockets. So,
  970 +you have been warned; generally, this can be useful for stress testing various
  971 +components of a system.
  972 +
  973 +By default, PACKET_QDISC_BYPASS is disabled and needs to be explicitly enabled
  974 +on PF_PACKET sockets.
  975 +
  976 +-------------------------------------------------------------------------------
956 977 + PACKET_TIMESTAMP
957 978 -------------------------------------------------------------------------------
958 979  
include/uapi/linux/if_packet.h
... ... @@ -51,6 +51,7 @@
51 51 #define PACKET_TIMESTAMP 17
52 52 #define PACKET_FANOUT 18
53 53 #define PACKET_TX_HAS_OFF 19
  54 +#define PACKET_QDISC_BYPASS 20
54 55  
55 56 #define PACKET_FANOUT_HASH 0
56 57 #define PACKET_FANOUT_LB 1
net/packet/af_packet.c
... ... @@ -237,6 +237,48 @@
237 237 static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
238 238 static void __fanout_link(struct sock *sk, struct packet_sock *po);
239 239  
  240 +static int packet_direct_xmit(struct sk_buff *skb)
  241 +{
  242 + struct net_device *dev = skb->dev;
  243 + const struct net_device_ops *ops = dev->netdev_ops;
  244 + netdev_features_t features;
  245 + struct netdev_queue *txq;
  246 + u16 queue_map;
  247 + int ret;
  248 +
  249 + if (unlikely(!netif_running(dev) ||
  250 + !netif_carrier_ok(dev))) {
  251 + kfree_skb(skb);
  252 + return NET_XMIT_DROP;
  253 + }
  254 +
  255 + features = netif_skb_features(skb);
  256 + if (skb_needs_linearize(skb, features) &&
  257 + __skb_linearize(skb)) {
  258 + kfree_skb(skb);
  259 + return NET_XMIT_DROP;
  260 + }
  261 +
  262 + queue_map = skb_get_queue_mapping(skb);
  263 + txq = netdev_get_tx_queue(dev, queue_map);
  264 +
  265 + __netif_tx_lock_bh(txq);
  266 + if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
  267 + ret = NETDEV_TX_BUSY;
  268 + kfree_skb(skb);
  269 + goto out;
  270 + }
  271 +
  272 + ret = ops->ndo_start_xmit(skb, dev);
  273 + if (likely(dev_xmit_complete(ret)))
  274 + txq_trans_update(txq);
  275 + else
  276 + kfree_skb(skb);
  277 +out:
  278 + __netif_tx_unlock_bh(txq);
  279 + return ret;
  280 +}
  281 +
240 282 static struct net_device *packet_cached_dev_get(struct packet_sock *po)
241 283 {
242 284 struct net_device *dev;
... ... @@ -261,6 +303,16 @@
261 303 RCU_INIT_POINTER(po->cached_dev, NULL);
262 304 }
263 305  
  306 +static bool packet_use_direct_xmit(const struct packet_sock *po)
  307 +{
  308 + return po->xmit == packet_direct_xmit;
  309 +}
  310 +
  311 +static u16 packet_pick_tx_queue(struct net_device *dev)
  312 +{
  313 + return (u16) smp_processor_id() % dev->real_num_tx_queues;
  314 +}
  315 +
264 316 /* register_prot_hook must be invoked with the po->bind_lock held,
265 317 * or from a context in which asynchronous accesses to the packet
266 318 * socket is not possible (packet_create()).
267 319  
... ... @@ -1994,9 +2046,10 @@
1994 2046  
1995 2047 skb_reserve(skb, hlen);
1996 2048 skb_reset_network_header(skb);
1997   - skb_probe_transport_header(skb, 0);
1998 2049  
1999   - if (po->tp_tx_has_off) {
  2050 + if (!packet_use_direct_xmit(po))
  2051 + skb_probe_transport_header(skb, 0);
  2052 + if (unlikely(po->tp_tx_has_off)) {
2000 2053 int off_min, off_max, off;
2001 2054 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
2002 2055 off_max = po->tx_ring.frame_size - tp_len;
2003 2056  
... ... @@ -2166,12 +2219,13 @@
2166 2219 }
2167 2220 }
2168 2221  
  2222 + skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
2169 2223 skb->destructor = tpacket_destruct_skb;
2170 2224 __packet_set_status(po, ph, TP_STATUS_SENDING);
2171 2225 atomic_inc(&po->tx_ring.pending);
2172 2226  
2173 2227 status = TP_STATUS_SEND_REQUEST;
2174   - err = dev_queue_xmit(skb);
  2228 + err = po->xmit(skb);
2175 2229 if (unlikely(err > 0)) {
2176 2230 err = net_xmit_errno(err);
2177 2231 if (err && __packet_get_status(po, ph) ==
... ... @@ -2230,8 +2284,7 @@
2230 2284 return skb;
2231 2285 }
2232 2286  
2233   -static int packet_snd(struct socket *sock,
2234   - struct msghdr *msg, size_t len)
  2287 +static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
2235 2288 {
2236 2289 struct sock *sk = sock->sk;
2237 2290 struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
... ... @@ -2376,6 +2429,7 @@
2376 2429 skb->dev = dev;
2377 2430 skb->priority = sk->sk_priority;
2378 2431 skb->mark = sk->sk_mark;
  2432 + skb_set_queue_mapping(skb, packet_pick_tx_queue(dev));
2379 2433  
2380 2434 if (po->has_vnet_hdr) {
2381 2435 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
2382 2436  
... ... @@ -2396,16 +2450,12 @@
2396 2450 len += vnet_hdr_len;
2397 2451 }
2398 2452  
2399   - skb_probe_transport_header(skb, reserve);
2400   -
  2453 + if (!packet_use_direct_xmit(po))
  2454 + skb_probe_transport_header(skb, reserve);
2401 2455 if (unlikely(extra_len == 4))
2402 2456 skb->no_fcs = 1;
2403 2457  
2404   - /*
2405   - * Now send it
2406   - */
2407   -
2408   - err = dev_queue_xmit(skb);
  2458 + err = po->xmit(skb);
2409 2459 if (err > 0 && (err = net_xmit_errno(err)) != 0)
2410 2460 goto out_unlock;
2411 2461  
... ... @@ -2427,6 +2477,7 @@
2427 2477 {
2428 2478 struct sock *sk = sock->sk;
2429 2479 struct packet_sock *po = pkt_sk(sk);
  2480 +
2430 2481 if (po->tx_ring.pg_vec)
2431 2482 return tpacket_snd(po, msg);
2432 2483 else
... ... @@ -2641,6 +2692,7 @@
2641 2692 po = pkt_sk(sk);
2642 2693 sk->sk_family = PF_PACKET;
2643 2694 po->num = proto;
  2695 + po->xmit = dev_queue_xmit;
2644 2696  
2645 2697 packet_cached_dev_reset(po);
2646 2698  
... ... @@ -3220,6 +3272,18 @@
3220 3272 po->tp_tx_has_off = !!val;
3221 3273 return 0;
3222 3274 }
  3275 + case PACKET_QDISC_BYPASS:
  3276 + {
  3277 + int val;
  3278 +
  3279 + if (optlen != sizeof(val))
  3280 + return -EINVAL;
  3281 + if (copy_from_user(&val, optval, sizeof(val)))
  3282 + return -EFAULT;
  3283 +
  3284 + po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
  3285 + return 0;
  3286 + }
3223 3287 default:
3224 3288 return -ENOPROTOOPT;
3225 3289 }
... ... @@ -3311,6 +3375,9 @@
3311 3375 break;
3312 3376 case PACKET_TX_HAS_OFF:
3313 3377 val = po->tp_tx_has_off;
  3378 + break;
  3379 + case PACKET_QDISC_BYPASS:
  3380 + val = packet_use_direct_xmit(po);
3314 3381 break;
3315 3382 default:
3316 3383 return -ENOPROTOOPT;
net/packet/internal.h
... ... @@ -114,6 +114,7 @@
114 114 unsigned int tp_tx_has_off:1;
115 115 unsigned int tp_tstamp;
116 116 struct net_device __rcu *cached_dev;
  117 + int (*xmit)(struct sk_buff *skb);
117 118 struct packet_type prot_hook ____cacheline_aligned_in_smp;
118 119 };
119 120