Commit c445477d74ab3779d1386ab797fbb9b628eb9f64

Authored by Ben Hutchings
Committed by David S. Miller
1 parent c39649c331

net: RPS: Enable hardware acceleration of RFS

Allow drivers for multiqueue hardware with flow filter tables to
accelerate RFS.  The driver must:

1. Set net_device::rx_cpu_rmap to a cpu_rmap of the RX completion
IRQs (in queue order).  This will provide a mapping from CPUs to the
queues for which completions are handled nearest to them.

2. Implement net_device_ops::ndo_rx_flow_steer.  This operation adds
or replaces a filter steering the given flow to the given RX queue, if
possible.

3. Periodically remove filters for which rps_may_expire_flow() returns
true.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 3 changed files with 127 additions and 9 deletions (side-by-side diff)

include/linux/netdevice.h
... ... @@ -554,14 +554,16 @@
554 554 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
555 555  
556 556 /*
557   - * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
558   - * tail pointer for that CPU's input queue at the time of last enqueue.
  557 + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
  558 + * tail pointer for that CPU's input queue at the time of last enqueue, and
  559 + * a hardware filter index.
559 560 */
560 561 struct rps_dev_flow {
561 562 u16 cpu;
562   - u16 fill;
  563 + u16 filter;
563 564 unsigned int last_qtail;
564 565 };
  566 +#define RPS_NO_FILTER 0xffff
565 567  
566 568 /*
567 569 * The rps_dev_flow_table structure contains a table of flow mappings.
... ... @@ -611,6 +613,11 @@
611 613  
612 614 extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
613 615  
  616 +#ifdef CONFIG_RFS_ACCEL
  617 +extern bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
  618 + u32 flow_id, u16 filter_id);
  619 +#endif
  620 +
614 621 /* This structure contains an instance of an RX queue. */
615 622 struct netdev_rx_queue {
616 623 struct rps_map __rcu *rps_map;
... ... @@ -769,6 +776,13 @@
769 776 * is always called from the stack with the rtnl lock held and netif tx
770 777 * queues stopped. This allows the netdevice to perform queue management
771 778 * safely.
  779 + *
  780 + * RFS acceleration.
  781 + * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
  782 + * u16 rxq_index, u32 flow_id);
  783 + * Set hardware filter for RFS. rxq_index is the target queue index;
  784 + * flow_id is a flow ID to be passed to rps_may_expire_flow() later.
  785 + * Return the filter ID on success, or a negative error code.
772 786 */
773 787 #define HAVE_NET_DEVICE_OPS
774 788 struct net_device_ops {
... ... @@ -842,6 +856,12 @@
842 856 int (*ndo_fcoe_get_wwn)(struct net_device *dev,
843 857 u64 *wwn, int type);
844 858 #endif
  859 +#ifdef CONFIG_RFS_ACCEL
  860 + int (*ndo_rx_flow_steer)(struct net_device *dev,
  861 + const struct sk_buff *skb,
  862 + u16 rxq_index,
  863 + u32 flow_id);
  864 +#endif
845 865 };
846 866  
847 867 /*
... ... @@ -1056,6 +1076,13 @@
1056 1076  
1057 1077 /* Number of RX queues currently active in device */
1058 1078 unsigned int real_num_rx_queues;
  1079 +
  1080 +#ifdef CONFIG_RFS_ACCEL
  1081 + /* CPU reverse-mapping for RX completion interrupts, indexed
  1082 + * by RX queue number. Assigned by driver. This must only be
  1083 + * set if the ndo_rx_flow_steer operation is defined. */
  1084 + struct cpu_rmap *rx_cpu_rmap;
  1085 +#endif
1059 1086 #endif
1060 1087  
1061 1088 rx_handler_func_t __rcu *rx_handler;
net/Kconfig
... ... @@ -221,6 +221,12 @@
221 221 depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
222 222 default y
223 223  
  224 +config RFS_ACCEL
  225 + boolean
  226 + depends on RPS && GENERIC_HARDIRQS
  227 + select CPU_RMAP
  228 + default y
  229 +
224 230 config XPS
225 231 boolean
226 232 depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
net/core/dev.c
... ... @@ -132,6 +132,7 @@
132 132 #include <trace/events/skb.h>
133 133 #include <linux/pci.h>
134 134 #include <linux/inetdevice.h>
  135 +#include <linux/cpu_rmap.h>
135 136  
136 137 #include "net-sysfs.h"
137 138  
... ... @@ -2588,6 +2589,53 @@
2588 2589 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2589 2590 EXPORT_SYMBOL(rps_sock_flow_table);
2590 2591  
  2592 +static struct rps_dev_flow *
  2593 +set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
  2594 + struct rps_dev_flow *rflow, u16 next_cpu)
  2595 +{
  2596 + u16 tcpu;
  2597 +
  2598 + tcpu = rflow->cpu = next_cpu;
  2599 + if (tcpu != RPS_NO_CPU) {
  2600 +#ifdef CONFIG_RFS_ACCEL
  2601 + struct netdev_rx_queue *rxqueue;
  2602 + struct rps_dev_flow_table *flow_table;
  2603 + struct rps_dev_flow *old_rflow;
  2604 + u32 flow_id;
  2605 + u16 rxq_index;
  2606 + int rc;
  2607 +
  2608 + /* Should we steer this flow to a different hardware queue? */
  2609 + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap)
  2610 + goto out;
  2611 + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
  2612 + if (rxq_index == skb_get_rx_queue(skb))
  2613 + goto out;
  2614 +
  2615 + rxqueue = dev->_rx + rxq_index;
  2616 + flow_table = rcu_dereference(rxqueue->rps_flow_table);
  2617 + if (!flow_table)
  2618 + goto out;
  2619 + flow_id = skb->rxhash & flow_table->mask;
  2620 + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
  2621 + rxq_index, flow_id);
  2622 + if (rc < 0)
  2623 + goto out;
  2624 + old_rflow = rflow;
  2625 + rflow = &flow_table->flows[flow_id];
  2626 + rflow->cpu = next_cpu;
  2627 + rflow->filter = rc;
  2628 + if (old_rflow->filter == rflow->filter)
  2629 + old_rflow->filter = RPS_NO_FILTER;
  2630 + out:
  2631 +#endif
  2632 + rflow->last_qtail =
  2633 + per_cpu(softnet_data, tcpu).input_queue_head;
  2634 + }
  2635 +
  2636 + return rflow;
  2637 +}
  2638 +
2591 2639 /*
2592 2640 * get_rps_cpu is called from netif_receive_skb and returns the target
2593 2641 * CPU from the RPS map of the receiving queue for a given skb.
... ... @@ -2658,12 +2706,9 @@
2658 2706 if (unlikely(tcpu != next_cpu) &&
2659 2707 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2660 2708 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2661   - rflow->last_qtail)) >= 0)) {
2662   - tcpu = rflow->cpu = next_cpu;
2663   - if (tcpu != RPS_NO_CPU)
2664   - rflow->last_qtail = per_cpu(softnet_data,
2665   - tcpu).input_queue_head;
2666   - }
  2709 + rflow->last_qtail)) >= 0))
  2710 + rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
  2711 +
2667 2712 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2668 2713 *rflowp = rflow;
2669 2714 cpu = tcpu;
... ... @@ -2683,6 +2728,46 @@
2683 2728 done:
2684 2729 return cpu;
2685 2730 }
  2731 +
  2732 +#ifdef CONFIG_RFS_ACCEL
  2733 +
  2734 +/**
  2735 + * rps_may_expire_flow - check whether an RFS hardware filter may be removed
  2736 + * @dev: Device on which the filter was set
  2737 + * @rxq_index: RX queue index
  2738 + * @flow_id: Flow ID passed to ndo_rx_flow_steer()
  2739 + * @filter_id: Filter ID returned by ndo_rx_flow_steer()
  2740 + *
  2741 + * Drivers that implement ndo_rx_flow_steer() should periodically call
  2742 + * this function for each installed filter and remove the filters for
  2743 + * which it returns %true.
  2744 + */
  2745 +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
  2746 + u32 flow_id, u16 filter_id)
  2747 +{
  2748 + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
  2749 + struct rps_dev_flow_table *flow_table;
  2750 + struct rps_dev_flow *rflow;
  2751 + bool expire = true;
  2752 + int cpu;
  2753 +
  2754 + rcu_read_lock();
  2755 + flow_table = rcu_dereference(rxqueue->rps_flow_table);
  2756 + if (flow_table && flow_id <= flow_table->mask) {
  2757 + rflow = &flow_table->flows[flow_id];
  2758 + cpu = ACCESS_ONCE(rflow->cpu);
  2759 + if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
  2760 + ((int)(per_cpu(softnet_data, cpu).input_queue_head -
  2761 + rflow->last_qtail) <
  2762 + (int)(10 * flow_table->mask)))
  2763 + expire = false;
  2764 + }
  2765 + rcu_read_unlock();
  2766 + return expire;
  2767 +}
  2768 +EXPORT_SYMBOL(rps_may_expire_flow);
  2769 +
  2770 +#endif /* CONFIG_RFS_ACCEL */
2686 2771  
2687 2772 /* Called from hardirq (IPI) context */
2688 2773 static void rps_trigger_softirq(void *data)