Commit b82d9fdd848abfbe7263a4ecd9bbb55e575100a6
Committed by
Ingo Molnar
1 parent
3c90e6e99b
Exists in
master
and in
4 other branches
sched: avoid large irq-latencies in smp-balancing
SMP balancing is done with IRQs disabled and can iterate the full rq. When rqs are large this can cause large irq-latencies. Limit the nr of iterations on each run. This fixes a scheduling latency regression reported by the -rt folks. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Steven Rostedt <rostedt@goodmis.org> Tested-by: Gregory Haskins <ghaskins@novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
Showing 3 changed files with 19 additions and 5 deletions Side-by-side Diff
include/linux/sched.h
... | ... | @@ -1466,6 +1466,7 @@ |
1466 | 1466 | extern unsigned int sysctl_sched_child_runs_first; |
1467 | 1467 | extern unsigned int sysctl_sched_features; |
1468 | 1468 | extern unsigned int sysctl_sched_migration_cost; |
1469 | +extern unsigned int sysctl_sched_nr_migrate; | |
1469 | 1470 | |
1470 | 1471 | int sched_nr_latency_handler(struct ctl_table *table, int write, |
1471 | 1472 | struct file *file, void __user *buffer, size_t *length, |
kernel/sched.c
... | ... | @@ -472,6 +472,12 @@ |
472 | 472 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) |
473 | 473 | |
474 | 474 | /* |
475 | + * Number of tasks to iterate in a single balance run. | |
476 | + * Limited because this is done with IRQs disabled. | |
477 | + */ | |
478 | +const_debug unsigned int sysctl_sched_nr_migrate = 32; | |
479 | + | |
480 | +/* | |
475 | 481 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
476 | 482 | * clock constructed from sched_clock(): |
477 | 483 | */ |
... | ... | @@ -2235,7 +2241,7 @@ |
2235 | 2241 | enum cpu_idle_type idle, int *all_pinned, |
2236 | 2242 | int *this_best_prio, struct rq_iterator *iterator) |
2237 | 2243 | { |
2238 | - int pulled = 0, pinned = 0, skip_for_load; | |
2244 | + int loops = 0, pulled = 0, pinned = 0, skip_for_load; | |
2239 | 2245 | struct task_struct *p; |
2240 | 2246 | long rem_load_move = max_load_move; |
2241 | 2247 | |
2242 | 2248 | |
... | ... | @@ -2249,10 +2255,10 @@ |
2249 | 2255 | */ |
2250 | 2256 | p = iterator->start(iterator->arg); |
2251 | 2257 | next: |
2252 | - if (!p) | |
2258 | + if (!p || loops++ > sysctl_sched_nr_migrate) | |
2253 | 2259 | goto out; |
2254 | 2260 | /* |
2255 | - * To help distribute high priority tasks accross CPUs we don't | |
2261 | + * To help distribute high priority tasks across CPUs we don't | |
2256 | 2262 | * skip a task if it will be the highest priority task (i.e. smallest |
2257 | 2263 | * prio value) on its new queue regardless of its load weight |
2258 | 2264 | */ |
... | ... | @@ -2269,8 +2275,7 @@ |
2269 | 2275 | rem_load_move -= p->se.load.weight; |
2270 | 2276 | |
2271 | 2277 | /* |
2272 | - * We only want to steal up to the prescribed number of tasks | |
2273 | - * and the prescribed amount of weighted load. | |
2278 | + * We only want to steal up to the prescribed amount of weighted load. | |
2274 | 2279 | */ |
2275 | 2280 | if (rem_load_move > 0) { |
2276 | 2281 | if (p->prio < *this_best_prio) |
kernel/sysctl.c
... | ... | @@ -301,6 +301,14 @@ |
301 | 301 | .mode = 0644, |
302 | 302 | .proc_handler = &proc_dointvec, |
303 | 303 | }, |
304 | + { | |
305 | + .ctl_name = CTL_UNNUMBERED, | |
306 | + .procname = "sched_nr_migrate", | |
307 | + .data = &sysctl_sched_nr_migrate, | |
308 | + .maxlen = sizeof(unsigned int), | |
309 | + .mode = 0644, | |
310 | + .proc_handler = &proc_dointvec, | |
311 | + }, | |
304 | 312 | #endif |
305 | 313 | { |
306 | 314 | .ctl_name = CTL_UNNUMBERED, |