Commit 99bbc70741903c063b3ccad90a3e06fc55df9245

Authored by Willem de Bruijn
Committed by David S. Miller
1 parent 4a5bddf7ea

rps: selective flow shedding during softnet overflow

A cpu executing the network receive path sheds packets when its input
queue grows to netdev_max_backlog. A single high-rate flow (such as a
spoofed-source DoS) can exceed a single cpu's processing rate and will
degrade the throughput of the other flows hashed onto the same cpu.

This patch adds a finer-grained hashtable. If the netdev backlog is
above a threshold, IRQ cpus track each flow's share of total traffic
(using 4096 buckets, configurable). The share is measured by counting
the number of packets per flow over the last 256 packets seen on the
source cpu. Any flow that occupies a large fraction of this history
(set at 50%) will have its packets dropped for as long as the backlog
stays above the threshold.
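
The accounting can be pictured as a sliding window: a ring of the last
256 packet hashes doubles as a set of per-bucket counters, so each new
packet increments its flow's bucket and the packet it evicts from the
ring decrements the bucket it came from. The stand-alone C sketch below
simulates only that window (the names, the 90/10 traffic mix and the
omission of the backlog-pressure check are illustrative, not part of the
patch); the in-kernel version is skb_flow_limit() in the diff below.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HISTORY     256   /* packets of history, must be a power of 2 */
#define NUM_BUCKETS 4096  /* flow hash buckets, must be a power of 2 */

struct flow_limit_sim {
	unsigned int history_head;
	uint16_t history[HISTORY];
	unsigned int buckets[NUM_BUCKETS];
};

/* Returns true if the packet with this receive hash should be dropped. */
static bool flow_limit_drop(struct flow_limit_sim *fl, uint32_t rxhash)
{
	unsigned int new_flow = rxhash & (NUM_BUCKETS - 1);
	unsigned int old_flow = fl->history[fl->history_head];

	/* Overwrite the oldest ring entry with the new flow. */
	fl->history[fl->history_head] = new_flow;
	fl->history_head = (fl->history_head + 1) & (HISTORY - 1);

	/* The evicted packet no longer counts toward its flow. */
	if (fl->buckets[old_flow])
		fl->buckets[old_flow]--;

	/* Drop if this flow now accounts for more than half of the window. */
	return ++fl->buckets[new_flow] > (HISTORY >> 1);
}

int main(void)
{
	struct flow_limit_sim fl = { 0 };
	unsigned int drops = 0;
	int i;

	/* 90% of packets from one elephant flow, 10% spread at random. */
	for (i = 0; i < 100000; i++) {
		uint32_t hash = (i % 10) ? 0xdeadbeefu : (uint32_t)rand();

		if (flow_limit_drop(&fl, hash))
			drops++;
	}
	printf("dropped %u of 100000 packets\n", drops);
	return 0;
}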

Tested:
Setup is a multi-threaded UDP echo server with network rx IRQ on cpu0,
kernel receive (RPS) on cpu0 and application threads on cpus 2--7
each handling 20k req/s. Throughput halves when hit with a 400 kpps
antagonist storm. With this patch applied, antagonist overload is
dropped and the server processes its complete load.

The patch is effective when kernel receive processing is the
bottleneck. The above RPS scenario is an extreme case, but the same
situation is reached with RFS and sufficient kernel processing
(iptables, packet socket tap, ..).
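
For reference, a minimal user-space sketch of driving the two knobs this
patch adds (the paths follow from the net_core_table entries below; the
chosen values and the helper are illustrative assumptions, not part of
the patch). The table length is read when a cpu's table is allocated, so
it is written before the cpu bitmap:

#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Table length must be a power of 2; read at enable time. */
	write_sysctl("/proc/sys/net/core/flow_limit_table_len", "8192\n");

	/* Hex cpumask of backlog cpus to protect; "1" means cpu0 only. */
	write_sysctl("/proc/sys/net/core/flow_limit_cpu_bitmap", "1\n");

	return 0;
}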

Signed-off-by: Willem de Bruijn <willemb@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 5 changed files with 194 additions and 3 deletions

include/linux/netdevice.h
... ... @@ -1778,6 +1778,19 @@
1778 1778 return register_gifconf(family, NULL);
1779 1779 }
1780 1780  
  1781 +#ifdef CONFIG_NET_FLOW_LIMIT
  1782 +#define FLOW_LIMIT_HISTORY (1 << 8) /* must be ^2 */
  1783 +struct sd_flow_limit {
  1784 + u64 count;
  1785 + unsigned int num_buckets;
  1786 + unsigned int history_head;
  1787 + u16 history[FLOW_LIMIT_HISTORY];
  1788 + u8 buckets[];
  1789 +};
  1790 +
  1791 +extern int netdev_flow_limit_table_len;
  1792 +#endif /* CONFIG_NET_FLOW_LIMIT */
  1793 +
1781 1794 /*
1782 1795 * Incoming packets are placed on per-cpu queues
1783 1796 */
... ... @@ -1807,6 +1820,10 @@
1807 1820 unsigned int dropped;
1808 1821 struct sk_buff_head input_pkt_queue;
1809 1822 struct napi_struct backlog;
  1823 +
  1824 +#ifdef CONFIG_NET_FLOW_LIMIT
  1825 + struct sd_flow_limit *flow_limit;
  1826 +#endif
1810 1827 };
1811 1828  
1812 1829 static inline void input_queue_head_incr(struct softnet_data *sd)
net/Kconfig
... ... @@ -259,6 +259,18 @@
259 259 packet sniffing (libpcap/tcpdump). Note : Admin should enable
260 260 this feature changing /proc/sys/net/core/bpf_jit_enable
261 261  
  262 +config NET_FLOW_LIMIT
  263 + boolean
  264 + depends on RPS
  265 + default y
  266 + ---help---
  267 + The network stack has to drop packets when a receive processing CPU's
  268 + backlog reaches netdev_max_backlog. If a few out of many active flows
  269 + generate the vast majority of load, drop their traffic earlier to
  270 + maintain capacity for the other flows. This feature provides servers
  271 + with many clients some protection against DoS by a single (spoofed)
  272 + flow that greatly exceeds average workload.
  273 +
262 274 menu "Network testing"
263 275  
264 276 config NET_PKTGEN
net/core/dev.c
... ... @@ -3064,6 +3064,46 @@
3064 3064 return 0;
3065 3065 }
3066 3066  
  3067 +#ifdef CONFIG_NET_FLOW_LIMIT
  3068 +int netdev_flow_limit_table_len __read_mostly = (1 << 12);
  3069 +#endif
  3070 +
  3071 +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
  3072 +{
  3073 +#ifdef CONFIG_NET_FLOW_LIMIT
  3074 + struct sd_flow_limit *fl;
  3075 + struct softnet_data *sd;
  3076 + unsigned int old_flow, new_flow;
  3077 +
  3078 + if (qlen < (netdev_max_backlog >> 1))
  3079 + return false;
  3080 +
  3081 + sd = &__get_cpu_var(softnet_data);
  3082 +
  3083 + rcu_read_lock();
  3084 + fl = rcu_dereference(sd->flow_limit);
  3085 + if (fl) {
  3086 + new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
  3087 + old_flow = fl->history[fl->history_head];
  3088 + fl->history[fl->history_head] = new_flow;
  3089 +
  3090 + fl->history_head++;
  3091 + fl->history_head &= FLOW_LIMIT_HISTORY - 1;
  3092 +
  3093 + if (likely(fl->buckets[old_flow]))
  3094 + fl->buckets[old_flow]--;
  3095 +
  3096 + if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
  3097 + fl->count++;
  3098 + rcu_read_unlock();
  3099 + return true;
  3100 + }
  3101 + }
  3102 + rcu_read_unlock();
  3103 +#endif
  3104 + return false;
  3105 +}
  3106 +
3067 3107 /*
3068 3108 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3069 3109 * queue (may be a remote CPU queue).
3070 3110  
... ... @@ -3073,13 +3113,15 @@
3073 3113 {
3074 3114 struct softnet_data *sd;
3075 3115 unsigned long flags;
  3116 + unsigned int qlen;
3076 3117  
3077 3118 sd = &per_cpu(softnet_data, cpu);
3078 3119  
3079 3120 local_irq_save(flags);
3080 3121  
3081 3122 rps_lock(sd);
3082   - if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
  3123 + qlen = skb_queue_len(&sd->input_pkt_queue);
  3124 + if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3083 3125 if (skb_queue_len(&sd->input_pkt_queue)) {
3084 3126 enqueue:
3085 3127 __skb_queue_tail(&sd->input_pkt_queue, skb);
... ... @@ -6269,6 +6311,10 @@
6269 6311 sd->backlog.weight = weight_p;
6270 6312 sd->backlog.gro_list = NULL;
6271 6313 sd->backlog.gro_count = 0;
  6314 +
  6315 +#ifdef CONFIG_NET_FLOW_LIMIT
  6316 + sd->flow_limit = NULL;
  6317 +#endif
6272 6318 }
6273 6319  
6274 6320 dev_boot_phase = 0;
net/core/net-procfs.c
... ... @@ -146,11 +146,23 @@
146 146 static int softnet_seq_show(struct seq_file *seq, void *v)
147 147 {
148 148 struct softnet_data *sd = v;
  149 + unsigned int flow_limit_count = 0;
149 150  
150   - seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
  151 +#ifdef CONFIG_NET_FLOW_LIMIT
  152 + struct sd_flow_limit *fl;
  153 +
  154 + rcu_read_lock();
  155 + fl = rcu_dereference(sd->flow_limit);
  156 + if (fl)
  157 + flow_limit_count = fl->count;
  158 + rcu_read_unlock();
  159 +#endif
  160 +
  161 + seq_printf(seq,
  162 + "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
151 163 sd->processed, sd->dropped, sd->time_squeeze, 0,
152 164 0, 0, 0, 0, /* was fastroute */
153   - sd->cpu_collision, sd->received_rps);
  165 + sd->cpu_collision, sd->received_rps, flow_limit_count);
154 166 return 0;
155 167 }
156 168  
net/core/sysctl_net_core.c
... ... @@ -87,6 +87,96 @@
87 87 }
88 88 #endif /* CONFIG_RPS */
89 89  
  90 +#ifdef CONFIG_NET_FLOW_LIMIT
  91 +static DEFINE_MUTEX(flow_limit_update_mutex);
  92 +
  93 +static int flow_limit_cpu_sysctl(ctl_table *table, int write,
  94 + void __user *buffer, size_t *lenp,
  95 + loff_t *ppos)
  96 +{
  97 + struct sd_flow_limit *cur;
  98 + struct softnet_data *sd;
  99 + cpumask_var_t mask;
  100 + int i, len, ret = 0;
  101 +
  102 + if (!alloc_cpumask_var(&mask, GFP_KERNEL))
  103 + return -ENOMEM;
  104 +
  105 + if (write) {
  106 + ret = cpumask_parse_user(buffer, *lenp, mask);
  107 + if (ret)
  108 + goto done;
  109 +
  110 + mutex_lock(&flow_limit_update_mutex);
  111 + len = sizeof(*cur) + netdev_flow_limit_table_len;
  112 + for_each_possible_cpu(i) {
  113 + sd = &per_cpu(softnet_data, i);
  114 + cur = rcu_dereference_protected(sd->flow_limit,
  115 + lockdep_is_held(&flow_limit_update_mutex));
  116 + if (cur && !cpumask_test_cpu(i, mask)) {
  117 + RCU_INIT_POINTER(sd->flow_limit, NULL);
  118 + synchronize_rcu();
  119 + kfree(cur);
  120 + } else if (!cur && cpumask_test_cpu(i, mask)) {
  121 + cur = kzalloc(len, GFP_KERNEL);
  122 + if (!cur) {
  123 + /* not unwinding previous changes */
  124 + ret = -ENOMEM;
  125 + goto write_unlock;
  126 + }
  127 + cur->num_buckets = netdev_flow_limit_table_len;
  128 + rcu_assign_pointer(sd->flow_limit, cur);
  129 + }
  130 + }
  131 +write_unlock:
  132 + mutex_unlock(&flow_limit_update_mutex);
  133 + } else {
  134 + if (*ppos || !*lenp) {
  135 + *lenp = 0;
  136 + goto done;
  137 + }
  138 +
  139 + cpumask_clear(mask);
  140 + rcu_read_lock();
  141 + for_each_possible_cpu(i) {
  142 + sd = &per_cpu(softnet_data, i);
  143 + if (rcu_dereference(sd->flow_limit))
  144 + cpumask_set_cpu(i, mask);
  145 + }
  146 + rcu_read_unlock();
  147 +
  148 + len = cpumask_scnprintf(buffer, *lenp, mask);
  149 + *lenp = len + 1;
  150 + *ppos += len + 1;
  151 + }
  152 +
  153 +done:
  154 + free_cpumask_var(mask);
  155 + return ret;
  156 +}
  157 +
  158 +static int flow_limit_table_len_sysctl(ctl_table *table, int write,
  159 + void __user *buffer, size_t *lenp,
  160 + loff_t *ppos)
  161 +{
  162 + unsigned int old, *ptr;
  163 + int ret;
  164 +
  165 + mutex_lock(&flow_limit_update_mutex);
  166 +
  167 + ptr = table->data;
  168 + old = *ptr;
  169 + ret = proc_dointvec(table, write, buffer, lenp, ppos);
  170 + if (!ret && write && !is_power_of_2(*ptr)) {
  171 + *ptr = old;
  172 + ret = -EINVAL;
  173 + }
  174 +
  175 + mutex_unlock(&flow_limit_update_mutex);
  176 + return ret;
  177 +}
  178 +#endif /* CONFIG_NET_FLOW_LIMIT */
  179 +
90 180 static struct ctl_table net_core_table[] = {
91 181 #ifdef CONFIG_NET
92 182 {
... ... @@ -180,6 +270,20 @@
180 270 .proc_handler = rps_sock_flow_sysctl
181 271 },
182 272 #endif
  273 +#ifdef CONFIG_NET_FLOW_LIMIT
  274 + {
  275 + .procname = "flow_limit_cpu_bitmap",
  276 + .mode = 0644,
  277 + .proc_handler = flow_limit_cpu_sysctl
  278 + },
  279 + {
  280 + .procname = "flow_limit_table_len",
  281 + .data = &netdev_flow_limit_table_len,
  282 + .maxlen = sizeof(int),
  283 + .mode = 0644,
  284 + .proc_handler = flow_limit_table_len_sysctl
  285 + },
  286 +#endif /* CONFIG_NET_FLOW_LIMIT */
183 287 #endif /* CONFIG_NET */
184 288 {
185 289 .procname = "netdev_budget",