Commit fe8a45df368038566c62bf311accf4319b210123

Authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Four bugfixes and one performance fix"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid integer overflow
  sched: Optimize task_sched_runtime()
  sched/numa: Cure update_numa_stats() vs. hotplug
  sched/numa: Fix NULL pointer dereference in task_numa_migrate()
  sched: Fix endless sync_sched/rcu() loop inside _cpu_down()

Showing 3 changed files

kernel/cpu.c

@@ -306,7 +306,6 @@
                         __func__, cpu);
                 goto out_release;
         }
-        smpboot_park_threads(cpu);
 
         /*
          * By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -315,11 +314,15 @@
          *
          * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
          * not imply sync_sched(), so explicitly call both.
+         *
+         * Do sync before park smpboot threads to take care the rcu boost case.
          */
 #ifdef CONFIG_PREEMPT
         synchronize_sched();
 #endif
         synchronize_rcu();
+
+        smpboot_park_threads(cpu);
 
         /*
          * So now all preempt/rcu users must observe !cpu_active().
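
The two hunks above are "sched: Fix endless sync_sched/rcu() loop inside _cpu_down()". The "rcu boost case" in the new comment refers to RCU priority boosting, whose kthreads are among the per-CPU smpboot threads being parked here; parking them before the grace-period waits could leave synchronize_rcu() stuck, hence the endless loop in the commit title. Condensed from nothing but the lines already shown, the resulting order in _cpu_down() is:

    #ifdef CONFIG_PREEMPT
            synchronize_sched();            /* wait for preempt-disabled sections */
    #endif
            synchronize_rcu();              /* RCU boost kthreads are still unparked here */

            smpboot_park_threads(cpu);      /* park the per-CPU kthreads only after the syncs */
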

kernel/sched/core.c

@@ -2253,6 +2253,20 @@
         struct rq *rq;
         u64 ns = 0;
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+        /*
+         * 64-bit doesn't need locks to atomically read a 64bit value.
+         * So we have a optimization chance when the task's delta_exec is 0.
+         * Reading ->on_cpu is racy, but this is ok.
+         *
+         * If we race with it leaving cpu, we'll take a lock. So we're correct.
+         * If we race with it entering cpu, unaccounted time is 0. This is
+         * indistinguishable from the read occurring a few cycles earlier.
+         */
+        if (!p->on_cpu)
+                return p->se.sum_exec_runtime;
+#endif
+
         rq = task_rq_lock(p, &flags);
         ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, p, &flags);
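
This hunk is "sched: Optimize task_sched_runtime()". For readability, here it is re-assembled with its surrounding context into a sketch of the whole function; the signature, the flags declaration and the trailing return are filled in from context rather than shown in the diff, so treat them as assumptions, and the comment is paraphrased:

    unsigned long long task_sched_runtime(struct task_struct *p)
    {
            unsigned long flags;    /* assumed declaration, not shown in the hunk */
            struct rq *rq;
            u64 ns = 0;

    #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
            /*
             * Lockless fast path: on 64-bit a u64 load is a single atomic
             * access, and a task that is not currently on a CPU has no
             * unaccounted delta_exec, so the cached sum is already exact.
             */
            if (!p->on_cpu)
                    return p->se.sum_exec_runtime;
    #endif

            rq = task_rq_lock(p, &flags);
            ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
            task_rq_unlock(rq, p, &flags);

            return ns;
    }

The CONFIG_64BIT guard is what makes the lockless read safe: on 32-bit a u64 load is two accesses and could tear, so those configurations keep taking the rq lock.
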

kernel/sched/fair.c

@@ -1000,7 +1000,7 @@
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-        int cpu;
+        int cpu, cpus = 0;
 
         memset(ns, 0, sizeof(*ns));
         for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +1009,21 @@
                 ns->nr_running += rq->nr_running;
                 ns->load += weighted_cpuload(cpu);
                 ns->power += power_of(cpu);
+
+                cpus++;
         }
 
+        /*
+         * If we raced with hotplug and there are no CPUs left in our mask
+         * the @ns structure is NULL'ed and task_numa_compare() will
+         * not find this node attractive.
+         *
+         * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+         * and bail there.
+         */
+        if (!cpus)
+                return;
+
         ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
         ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
         ns->has_capacity = (ns->nr_running < ns->capacity);
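
These two hunks are "sched/numa: Cure update_numa_stats() vs. hotplug". The point of the early return is that when a CPU hotplug race leaves the node's cpumask empty, ns->power stays zero and the (ns->load * SCHED_POWER_SCALE) / ns->power line below it would divide by zero; bailing out leaves *ns zeroed so callers treat the node as unattractive, as the added comment says. A small standalone illustration of the same guard, with a simplified struct and made-up numbers standing in for struct numa_stats and SCHED_POWER_SCALE:

    #include <stdio.h>

    struct stats { unsigned long load, power, capacity; };

    /* Aggregate per-CPU figures the way update_numa_stats() does and bail
     * out before the divisions when no CPU contributed (the hotplug race). */
    static int aggregate(struct stats *ns, const unsigned long *load,
                         const unsigned long *power, int ncpus)
    {
            int cpu, cpus = 0;

            *ns = (struct stats){ 0 };
            for (cpu = 0; cpu < ncpus; cpu++) {
                    ns->load  += load[cpu];
                    ns->power += power[cpu];
                    cpus++;
            }

            if (!cpus)                      /* empty mask: leave *ns zeroed */
                    return -1;

            ns->load = ns->load * 1024 / ns->power;   /* would trap if power == 0 */
            ns->capacity = (ns->power + 512) / 1024;  /* rounded, like DIV_ROUND_CLOSEST */
            return 0;
    }

    int main(void)
    {
            struct stats ns;
            unsigned long load[] = { 300, 500 }, power[] = { 1024, 1024 };
            int rc;

            rc = aggregate(&ns, load, power, 2);
            printf("two cpus:   rc=%d load=%lu capacity=%lu\n", rc, ns.load, ns.capacity);

            rc = aggregate(&ns, load, power, 0);      /* as if hotplug emptied the mask */
            printf("empty mask: rc=%d load=%lu\n", rc, ns.load);
            return 0;
    }
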

@@ -1201,9 +1214,21 @@
          */
         rcu_read_lock();
         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-        env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+        if (sd)
+                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
         rcu_read_unlock();
 
+        /*
+         * Cpusets can break the scheduler domain tree into smaller
+         * balance domains, some of which do not cross NUMA boundaries.
+         * Tasks that are "trapped" in such domains cannot be migrated
+         * elsewhere, so there is no point in (re)trying.
+         */
+        if (unlikely(!sd)) {
+                p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+                return -EINVAL;
+        }
+
         taskweight = task_weight(p, env.src_nid);
         groupweight = group_weight(p, env.src_nid);
         update_numa_stats(&env.src_stats, env.src_nid);
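
This hunk is "sched/numa: Fix NULL pointer dereference in task_numa_migrate()". per_cpu(sd_numa, cpu) can legitimately be NULL when cpusets carve up the domain tree as the added comment describes, so the dereference is now guarded, and the bail-out path pins numa_preferred_nid to the task's current node so the NUMA balancer stops retrying a migration that cannot happen. Note that sd is tested again after rcu_read_unlock(), but only as a flag; it is never dereferenced outside the read-side critical section. Stripped of the scheduler specifics, the control flow of the fix looks like the toy program below (plain userspace C; the struct and the default of 100 are made up for illustration):

    #include <stdio.h>
    #include <stddef.h>

    struct domain { int imbalance_pct; };

    /* Kept in the same order as the hunk: refine the value only if the
     * optional domain exists, then decide about the NULL case afterwards. */
    static int pick_imbalance(const struct domain *sd, int *imbalance_pct)
    {
            *imbalance_pct = 100;                   /* illustrative default */
            if (sd)
                    *imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

            if (!sd)                                /* no NUMA-spanning domain */
                    return -1;                      /* caller should stop retrying */

            return 0;
    }

    int main(void)
    {
            struct domain d = { .imbalance_pct = 125 };
            int pct, rc;

            rc = pick_imbalance(&d, &pct);
            printf("with domain:    rc=%d pct=%d\n", rc, pct);

            rc = pick_imbalance(NULL, &pct);
            printf("without domain: rc=%d pct=%d\n", rc, pct);
            return 0;
    }
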

@@ -2153,7 +2178,7 @@
         long contrib;
 
         /* The fraction of a cpu used by this cfs_rq */
-        contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+        contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
                           sa->runnable_avg_period + 1);
         contrib -= cfs_rq->tg_runnable_contrib;
 
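
The final hunk is "sched/fair: Avoid integer overflow". sa->runnable_avg_sum is a u32, so without the cast the left shift is evaluated in 32-bit arithmetic and can wrap before the value ever reaches div_u64(); casting the operand to u64 first makes the shift happen in 64-bit arithmetic. A small standalone program showing the difference (the value and the shift count are made up for illustration; NICE_0_SHIFT itself is config-dependent):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t sum = 50000000;        /* made-up sum, large enough to overflow */
            unsigned int shift = 10;        /* stand-in for NICE_0_SHIFT */

            /* Old code: the shift is done in 32-bit arithmetic, so the result
             * wraps modulo 2^32 before being widened for the division. */
            uint64_t wrapped = (uint64_t)(sum << shift);

            /* Fixed code: widening the operand first keeps all 64 bits. */
            uint64_t exact = (uint64_t)sum << shift;

            printf("32-bit shift: %llu\n", (unsigned long long)wrapped); /* 3955359744 */
            printf("64-bit shift: %llu\n", (unsigned long long)exact);   /* 51200000000 */
            return 0;
    }

The wrapped result silently corrupts the numerator of the contrib calculation, which is why the fix is a plain widening cast rather than any change in the math.
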