Commit c445477d74ab3779d1386ab797fbb9b628eb9f64
Committed by: David S. Miller
Parent: c39649c331
net: RPS: Enable hardware acceleration of RFS
Allow drivers for multiqueue hardware with flow filter tables to
accelerate RFS. The driver must:

1. Set net_device::rx_cpu_rmap to a cpu_rmap of the RX completion IRQs
   (in queue order). This will provide a mapping from CPUs to the
   queues for which completions are handled nearest to them.
2. Implement net_device_ops::ndo_rx_flow_steer. This operation adds or
   replaces a filter steering the given flow to the given RX queue, if
   possible.
3. Periodically remove filters for which rps_may_expire_flow() returns
   true.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
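A minimal driver-side sketch of step 1, using the cpu_rmap API from
<linux/cpu_rmap.h> (alloc_irq_cpu_rmap(), irq_cpu_rmap_add(),
free_irq_cpu_rmap()). The "foo" driver, its foo_nic structure and the
per-queue IRQ layout are hypothetical, not part of this commit:

    #include <linux/cpu_rmap.h>
    #include <linux/netdevice.h>

    static int foo_init_rx_cpu_rmap(struct foo_nic *nic)
    {
            struct net_device *dev = nic->netdev;
            int i, rc;

            /* Build a CPU reverse-map of the RX completion IRQs, in
             * RX queue order; the stack uses it to find the queue
             * whose interrupt is handled nearest to a given CPU. */
            dev->rx_cpu_rmap = alloc_irq_cpu_rmap(nic->n_rx_queues);
            if (!dev->rx_cpu_rmap)
                    return -ENOMEM;

            for (i = 0; i < nic->n_rx_queues; i++) {
                    rc = irq_cpu_rmap_add(dev->rx_cpu_rmap,
                                          nic->rx_queue[i].irq);
                    if (rc) {
                            free_irq_cpu_rmap(dev->rx_cpu_rmap);
                            dev->rx_cpu_rmap = NULL;
                            return rc;
                    }
            }
            return 0;
    }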
Showing 3 changed files with 127 additions and 9 deletions
include/linux/netdevice.h
... | ... | @@ -554,14 +554,16 @@ |
554 | 554 | #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16))) |
555 | 555 | |
556 | 556 | /* |
557 | - * The rps_dev_flow structure contains the mapping of a flow to a CPU and the | |
558 | - * tail pointer for that CPU's input queue at the time of last enqueue. | |
557 | + * The rps_dev_flow structure contains the mapping of a flow to a CPU, the | |
558 | + * tail pointer for that CPU's input queue at the time of last enqueue, and | |
559 | + * a hardware filter index. | |
559 | 560 | */ |
560 | 561 | struct rps_dev_flow { |
561 | 562 | u16 cpu; |
562 | - u16 fill; | |
563 | + u16 filter; | |
563 | 564 | unsigned int last_qtail; |
564 | 565 | }; |
566 | +#define RPS_NO_FILTER 0xffff | |
565 | 567 | |
566 | 568 | /* |
567 | 569 | * The rps_dev_flow_table structure contains a table of flow mappings. |
... | ... | @@ -611,6 +613,11 @@ |
611 | 613 | |
612 | 614 | extern struct rps_sock_flow_table __rcu *rps_sock_flow_table; |
613 | 615 | |
616 | +#ifdef CONFIG_RFS_ACCEL | |
617 | +extern bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, | |
618 | + u32 flow_id, u16 filter_id); | |
619 | +#endif | |
620 | + | |
614 | 621 | /* This structure contains an instance of an RX queue. */ |
615 | 622 | struct netdev_rx_queue { |
616 | 623 | struct rps_map __rcu *rps_map; |
... | ... | @@ -769,6 +776,13 @@ |
769 | 776 | * is always called from the stack with the rtnl lock held and netif tx |
770 | 777 | * queues stopped. This allows the netdevice to perform queue management |
771 | 778 | * safely. |
779 | + * | |
780 | + * RFS acceleration. | |
781 | + * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb, | |
782 | + * u16 rxq_index, u32 flow_id); | |
783 | + * Set hardware filter for RFS. rxq_index is the target queue index; | |
784 | + * flow_id is a flow ID to be passed to rps_may_expire_flow() later. | |
785 | + * Return the filter ID on success, or a negative error code. | |
772 | 786 | */ |
773 | 787 | #define HAVE_NET_DEVICE_OPS |
774 | 788 | struct net_device_ops { |
... | ... | @@ -842,6 +856,12 @@ |
842 | 856 | int (*ndo_fcoe_get_wwn)(struct net_device *dev, |
843 | 857 | u64 *wwn, int type); |
844 | 858 | #endif |
859 | +#ifdef CONFIG_RFS_ACCEL | |
860 | + int (*ndo_rx_flow_steer)(struct net_device *dev, | |
861 | + const struct sk_buff *skb, | |
862 | + u16 rxq_index, | |
863 | + u32 flow_id); | |
864 | +#endif | |
845 | 865 | }; |
846 | 866 | |
847 | 867 | /* |
... | ... | @@ -1056,6 +1076,13 @@ |
1056 | 1076 | |
1057 | 1077 | /* Number of RX queues currently active in device */ |
1058 | 1078 | unsigned int real_num_rx_queues; |
1079 | + | |
1080 | +#ifdef CONFIG_RFS_ACCEL | |
1081 | + /* CPU reverse-mapping for RX completion interrupts, indexed | |
1082 | + * by RX queue number. Assigned by driver. This must only be | |
1083 | + * set if the ndo_rx_flow_steer operation is defined. */ | |
1084 | + struct cpu_rmap *rx_cpu_rmap; | |
1085 | +#endif | |
1059 | 1086 | #endif |
1060 | 1087 | |
1061 | 1088 | rx_handler_func_t __rcu *rx_handler; |
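The ndo_rx_flow_steer operation documented above might be implemented
along the following lines for IPv4 TCP/UDP flows. This is a sketch
only: foo_nic and foo_filter_insert() are assumed hardware-specific
names, and it relies on skb->data pointing at the network header when
the operation is called from the RPS code path:

    #include <linux/if_ether.h>
    #include <linux/in.h>
    #include <linux/ip.h>
    #include <linux/netdevice.h>

    static int foo_rx_flow_steer(struct net_device *dev,
                                 const struct sk_buff *skb,
                                 u16 rxq_index, u32 flow_id)
    {
            struct foo_nic *nic = netdev_priv(dev);
            const struct iphdr *ip;
            const __be16 *ports;

            /* Only IPv4 TCP/UDP is handled in this sketch. */
            if (skb->protocol != htons(ETH_P_IP))
                    return -EPROTONOSUPPORT;
            ip = (const struct iphdr *)skb->data;
            if (ip->protocol != IPPROTO_TCP && ip->protocol != IPPROTO_UDP)
                    return -EPROTONOSUPPORT;
            ports = (const __be16 *)(skb->data + 4 * ip->ihl);

            /* Program a 5-tuple filter directing this flow to
             * rxq_index; the non-negative return value is the filter
             * ID the stack stores in rps_dev_flow::filter. */
            return foo_filter_insert(nic, rxq_index, ip->protocol,
                                     ip->saddr, ports[0],
                                     ip->daddr, ports[1], flow_id);
    }

The stack remembers the returned filter ID and hands it back to the
driver through rps_may_expire_flow(), as the dev.c changes below show.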
net/Kconfig
... | ... | @@ -221,6 +221,12 @@ |
221 | 221 | depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS |
222 | 222 | default y |
223 | 223 | |
224 | +config RFS_ACCEL | |
225 | + boolean | |
226 | + depends on RPS && GENERIC_HARDIRQS | |
227 | + select CPU_RMAP | |
228 | + default y | |
229 | + | |
224 | 230 | config XPS |
225 | 231 | boolean |
226 | 232 | depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS |
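RFS_ACCEL has no prompt, so it is enabled automatically whenever RPS
and GENERIC_HARDIRQS are set; a driver adds no Kconfig dependency of
its own and instead guards its acceleration hooks with
CONFIG_RFS_ACCEL. A sketch for the hypothetical foo driver (foo_open()
and friends assumed):

    static const struct net_device_ops foo_netdev_ops = {
            .ndo_open               = foo_open,
            .ndo_stop               = foo_stop,
            .ndo_start_xmit         = foo_start_xmit,
    #ifdef CONFIG_RFS_ACCEL
            /* Only wired up when the core acceleration code is built */
            .ndo_rx_flow_steer      = foo_rx_flow_steer,
    #endif
    };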
net/core/dev.c
... | ... | @@ -132,6 +132,7 @@ |
132 | 132 | #include <trace/events/skb.h> |
133 | 133 | #include <linux/pci.h> |
134 | 134 | #include <linux/inetdevice.h> |
135 | +#include <linux/cpu_rmap.h> | |
135 | 136 | |
136 | 137 | #include "net-sysfs.h" |
137 | 138 | |
... | ... | @@ -2588,6 +2589,53 @@ |
2588 | 2589 | struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; |
2589 | 2590 | EXPORT_SYMBOL(rps_sock_flow_table); |
2590 | 2591 | |
2592 | +static struct rps_dev_flow * | |
2593 | +set_rps_cpu(struct net_device *dev, struct sk_buff *skb, | |
2594 | + struct rps_dev_flow *rflow, u16 next_cpu) | |
2595 | +{ | |
2596 | + u16 tcpu; | |
2597 | + | |
2598 | + tcpu = rflow->cpu = next_cpu; | |
2599 | + if (tcpu != RPS_NO_CPU) { | |
2600 | +#ifdef CONFIG_RFS_ACCEL | |
2601 | + struct netdev_rx_queue *rxqueue; | |
2602 | + struct rps_dev_flow_table *flow_table; | |
2603 | + struct rps_dev_flow *old_rflow; | |
2604 | + u32 flow_id; | |
2605 | + u16 rxq_index; | |
2606 | + int rc; | |
2607 | + | |
2608 | + /* Should we steer this flow to a different hardware queue? */ | |
2609 | + if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap) | |
2610 | + goto out; | |
2611 | + rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); | |
2612 | + if (rxq_index == skb_get_rx_queue(skb)) | |
2613 | + goto out; | |
2614 | + | |
2615 | + rxqueue = dev->_rx + rxq_index; | |
2616 | + flow_table = rcu_dereference(rxqueue->rps_flow_table); | |
2617 | + if (!flow_table) | |
2618 | + goto out; | |
2619 | + flow_id = skb->rxhash & flow_table->mask; | |
2620 | + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, | |
2621 | + rxq_index, flow_id); | |
2622 | + if (rc < 0) | |
2623 | + goto out; | |
2624 | + old_rflow = rflow; | |
2625 | + rflow = &flow_table->flows[flow_id]; | |
2626 | + rflow->cpu = next_cpu; | |
2627 | + rflow->filter = rc; | |
2628 | + if (old_rflow->filter == rflow->filter) | |
2629 | + old_rflow->filter = RPS_NO_FILTER; | |
2630 | + out: | |
2631 | +#endif | |
2632 | + rflow->last_qtail = | |
2633 | + per_cpu(softnet_data, tcpu).input_queue_head; | |
2634 | + } | |
2635 | + | |
2636 | + return rflow; | |
2637 | +} | |
2638 | + | |
2591 | 2639 | /* |
2592 | 2640 | * get_rps_cpu is called from netif_receive_skb and returns the target |
2593 | 2641 | * CPU from the RPS map of the receiving queue for a given skb. |
... | ... | @@ -2658,12 +2706,9 @@ |
2658 | 2706 | if (unlikely(tcpu != next_cpu) && |
2659 | 2707 | (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || |
2660 | 2708 | ((int)(per_cpu(softnet_data, tcpu).input_queue_head - |
2661 | - rflow->last_qtail)) >= 0)) { | |
2662 | - tcpu = rflow->cpu = next_cpu; | |
2663 | - if (tcpu != RPS_NO_CPU) | |
2664 | - rflow->last_qtail = per_cpu(softnet_data, | |
2665 | - tcpu).input_queue_head; | |
2666 | - } | |
2709 | + rflow->last_qtail)) >= 0)) | |
2710 | + rflow = set_rps_cpu(dev, skb, rflow, next_cpu); | |
2711 | + | |
2667 | 2712 | if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { |
2668 | 2713 | *rflowp = rflow; |
2669 | 2714 | cpu = tcpu; |
... | ... | @@ -2683,6 +2728,46 @@ |
2683 | 2728 | done: |
2684 | 2729 | return cpu; |
2685 | 2730 | } |
2731 | + | |
2732 | +#ifdef CONFIG_RFS_ACCEL | |
2733 | + | |
2734 | +/** | |
2735 | + * rps_may_expire_flow - check whether an RFS hardware filter may be removed | |
2736 | + * @dev: Device on which the filter was set | |
2737 | + * @rxq_index: RX queue index | |
2738 | + * @flow_id: Flow ID passed to ndo_rx_flow_steer() | |
2739 | + * @filter_id: Filter ID returned by ndo_rx_flow_steer() | |
2740 | + * | |
2741 | + * Drivers that implement ndo_rx_flow_steer() should periodically call | |
2742 | + * this function for each installed filter and remove the filters for | |
2743 | + * which it returns %true. | |
2744 | + */ | |
2745 | +bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, | |
2746 | + u32 flow_id, u16 filter_id) | |
2747 | +{ | |
2748 | + struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; | |
2749 | + struct rps_dev_flow_table *flow_table; | |
2750 | + struct rps_dev_flow *rflow; | |
2751 | + bool expire = true; | |
2752 | + int cpu; | |
2753 | + | |
2754 | + rcu_read_lock(); | |
2755 | + flow_table = rcu_dereference(rxqueue->rps_flow_table); | |
2756 | + if (flow_table && flow_id <= flow_table->mask) { | |
2757 | + rflow = &flow_table->flows[flow_id]; | |
2758 | + cpu = ACCESS_ONCE(rflow->cpu); | |
2759 | + if (rflow->filter == filter_id && cpu != RPS_NO_CPU && | |
2760 | + ((int)(per_cpu(softnet_data, cpu).input_queue_head - | |
2761 | + rflow->last_qtail) < | |
2762 | + (int)(10 * flow_table->mask))) | |
2763 | + expire = false; | |
2764 | + } | |
2765 | + rcu_read_unlock(); | |
2766 | + return expire; | |
2767 | +} | |
2768 | +EXPORT_SYMBOL(rps_may_expire_flow); | |
2769 | + | |
2770 | +#endif /* CONFIG_RFS_ACCEL */ | |
2686 | 2771 | |
2687 | 2772 | /* Called from hardirq (IPI) context */ |
2688 | 2773 | static void rps_trigger_softirq(void *data) |
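Step 3 from the commit message, sketched for the same hypothetical foo
driver: scan the hardware filter table periodically (from a timer,
workqueue or NAPI poll, for example) and remove filters once
rps_may_expire_flow() says the stack no longer needs them. The filter
table layout and foo_filter_remove() are assumptions:

    #include <linux/netdevice.h>

    static void foo_expire_rfs_filters(struct foo_nic *nic)
    {
            unsigned int i;

            /* Walk the hardware filter table; index i doubles as the
             * filter ID that foo_rx_flow_steer() returned on insert. */
            for (i = 0; i < nic->n_filters; i++) {
                    struct foo_filter *f = &nic->filter_table[i];

                    if (!f->in_use)
                            continue;
                    if (rps_may_expire_flow(nic->netdev, f->rxq_index,
                                            f->flow_id, i))
                            foo_filter_remove(nic, i);
            }
    }

Using the table index as the filter ID keeps the bookkeeping trivial; a
device with hardware-assigned IDs would store that ID alongside flow_id
instead.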