Commit fe8a45df368038566c62bf311accf4319b210123
Exists in smarc-imx_3.14.28_1.0.0_ga and in 1 other branch
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
 "Four bugfixes and one performance fix"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid integer overflow
  sched: Optimize task_sched_runtime()
  sched/numa: Cure update_numa_stats() vs. hotplug
  sched/numa: Fix NULL pointer dereference in task_numa_migrate()
  sched: Fix endless sync_sched/rcu() loop inside _cpu_down()
3 changed files:
kernel/cpu.c
@@ -306,7 +306,6 @@
 			__func__, cpu);
 		goto out_release;
 	}
-	smpboot_park_threads(cpu);
 
 	/*
 	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -315,11 +314,15 @@
 	 *
 	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
 	 * not imply sync_sched(), so explicitly call both.
+	 *
+	 * Do sync before park smpboot threads to take care the rcu boost case.
 	 */
 #ifdef CONFIG_PREEMPT
 	synchronize_sched();
 #endif
 	synchronize_rcu();
+
+	smpboot_park_threads(cpu);
 
 	/*
 	 * So now all preempt/rcu users must observe !cpu_active().
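The change above is purely about ordering: the grace-period waits now complete before the smpboot threads are parked (the "rcu boost case" mentioned in the new comment). Below is a minimal, print-only C sketch of that sequence; the functions are stand-ins with the same names as the kernel primitives, not the real implementations.

/*
 * Stand-alone sketch (not kernel code) of the ordering this hunk
 * establishes in _cpu_down(): wait for the RCU grace periods first,
 * park the per-cpu smpboot threads afterwards.
 */
#include <stdio.h>

static void synchronize_sched(void)
{
	puts("synchronize_sched(): sched-RCU grace period complete");
}

static void synchronize_rcu(void)
{
	puts("synchronize_rcu(): preemptible-RCU grace period complete");
}

static void smpboot_park_threads(int cpu)
{
	printf("smpboot_park_threads(%d): per-cpu kthreads parked\n", cpu);
}

static void cpu_down_sketch(int cpu)
{
	/* The kernel calls synchronize_sched() only under CONFIG_PREEMPT;
	 * both are called unconditionally here for illustration. */
	synchronize_sched();
	synchronize_rcu();

	/* Moved below the grace-period waits by this commit. */
	smpboot_park_threads(cpu);
}

int main(void)
{
	cpu_down_sketch(1);
	return 0;
}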
kernel/sched/core.c
@@ -2253,6 +2253,20 @@
 	struct rq *rq;
 	u64 ns = 0;
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+	/*
+	 * 64-bit doesn't need locks to atomically read a 64bit value.
+	 * So we have a optimization chance when the task's delta_exec is 0.
+	 * Reading ->on_cpu is racy, but this is ok.
+	 *
+	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
+	 * If we race with it entering cpu, unaccounted time is 0. This is
+	 * indistinguishable from the read occurring a few cycles earlier.
+	 */
+	if (!p->on_cpu)
+		return p->se.sum_exec_runtime;
+#endif
+
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
 	task_rq_unlock(rq, p, &flags);
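The hunk above adds a lockless fast path to task_sched_runtime(): when the task is not on a CPU there is no unaccounted delta, and on 64-bit a 64-bit load is a single access. The sketch below is a rough user-space analogue of that pattern (compile with -pthread); struct fake_task, pending_delta and the __LP64__ check are illustrative stand-ins, not kernel structures or config symbols.

/*
 * Stand-alone sketch (not kernel code) of the lockless fast path:
 * take the lock only when a pending delta may exist.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct fake_task {
	pthread_mutex_t rq_lock;	/* stands in for the runqueue lock */
	bool on_cpu;			/* racy read is tolerated, as in the patch */
	uint64_t sum_exec_runtime;	/* accumulated runtime, in ns */
	uint64_t pending_delta;		/* runtime not yet folded in */
};

static uint64_t task_runtime(struct fake_task *p)
{
	uint64_t ns;

#if defined(__LP64__)	/* rough analogue of CONFIG_64BIT && CONFIG_SMP */
	if (!p->on_cpu)
		return p->sum_exec_runtime;	/* lockless fast path */
#endif
	pthread_mutex_lock(&p->rq_lock);
	ns = p->sum_exec_runtime + p->pending_delta;
	pthread_mutex_unlock(&p->rq_lock);
	return ns;
}

int main(void)
{
	struct fake_task t = {
		.rq_lock = PTHREAD_MUTEX_INITIALIZER,
		.on_cpu = false,
		.sum_exec_runtime = 123456789,
		.pending_delta = 0,
	};

	printf("runtime: %llu ns\n", (unsigned long long)task_runtime(&t));
	return 0;
}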
kernel/sched/fair.c
@@ -1000,7 +1000,7 @@
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-	int cpu;
+	int cpu, cpus = 0;
 
 	memset(ns, 0, sizeof(*ns));
 	for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +1009,21 @@
 		ns->nr_running += rq->nr_running;
 		ns->load += weighted_cpuload(cpu);
 		ns->power += power_of(cpu);
+
+		cpus++;
 	}
 
+	/*
+	 * If we raced with hotplug and there are no CPUs left in our mask
+	 * the @ns structure is NULL'ed and task_numa_compare() will
+	 * not find this node attractive.
+	 *
+	 * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+	 * and bail there.
+	 */
+	if (!cpus)
+		return;
+
 	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
 	ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
 	ns->has_capacity = (ns->nr_running < ns->capacity);
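The two hunks above count the CPUs actually visited: if a hotplug race leaves the node's cpumask empty, the stats stay zeroed and the function returns before dividing by ns->power, which would be zero. A compilable stand-alone sketch of the same guard, with a plain array standing in for the cpumask and made-up per-CPU numbers:

/*
 * Stand-alone sketch (not kernel code) of the hotplug guard added to
 * update_numa_stats(): an empty "node" leaves the stats zeroed and
 * skips the division.
 */
#include <stdio.h>
#include <string.h>

#define SCHED_POWER_SCALE 1024

struct numa_stats {
	unsigned long load;
	unsigned long power;
	unsigned long nr_running;
	unsigned long capacity;
	int has_capacity;
};

/* Illustrative per-node CPU counts: node 1 has lost all its CPUs. */
static const int node_ncpus[] = { 4, 0 };

static void update_numa_stats(struct numa_stats *ns, int nid)
{
	int cpu, cpus = 0;

	memset(ns, 0, sizeof(*ns));
	for (cpu = 0; cpu < node_ncpus[nid]; cpu++) {
		ns->nr_running += 1;			/* stand-in for rq->nr_running */
		ns->load += 512;			/* stand-in for weighted_cpuload() */
		ns->power += SCHED_POWER_SCALE;		/* stand-in for power_of() */
		cpus++;
	}

	if (!cpus)
		return;		/* raced with hotplug: leave stats zeroed */

	ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
	/* rounding as DIV_ROUND_CLOSEST would do */
	ns->capacity = (ns->power + SCHED_POWER_SCALE / 2) / SCHED_POWER_SCALE;
	ns->has_capacity = (ns->nr_running < ns->capacity);
}

int main(void)
{
	struct numa_stats ns;

	update_numa_stats(&ns, 1);	/* empty node: no division by zero */
	printf("node 1: load=%lu has_capacity=%d\n", ns.load, ns.has_capacity);
	return 0;
}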
@@ -1201,9 +1214,21 @@
 	 */
 	rcu_read_lock();
 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+	if (sd)
+		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
 	rcu_read_unlock();
 
+	/*
+	 * Cpusets can break the scheduler domain tree into smaller
+	 * balance domains, some of which do not cross NUMA boundaries.
+	 * Tasks that are "trapped" in such domains cannot be migrated
+	 * elsewhere, so there is no point in (re)trying.
+	 */
+	if (unlikely(!sd)) {
+		p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+		return -EINVAL;
+	}
+
 	taskweight = task_weight(p, env.src_nid);
 	groupweight = group_weight(p, env.src_nid);
 	update_numa_stats(&env.src_stats, env.src_nid);
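The hunk above stops task_numa_migrate() from dereferencing a NULL sd_numa pointer, which the new comment attributes to cpusets splitting the domain tree. A stand-alone C sketch of the same check; the names, the default imbalance_pct of 100 and the node arguments are illustrative, not values taken from the kernel.

/*
 * Stand-alone sketch (not kernel code) of the NULL-domain check:
 * never dereference sd when it is NULL, pin the preferred node and
 * give up on migration instead.
 */
#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct sched_domain { int imbalance_pct; };

static int task_numa_migrate_sketch(const struct sched_domain *sd,
				    int *numa_preferred_nid, int cur_nid)
{
	int imbalance_pct = 100;	/* default when no NUMA-spanning domain */

	if (sd)
		imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

	if (!sd) {
		/* Task is confined to a non-NUMA-spanning domain: remember
		 * the node it is on and stop retrying migration. */
		*numa_preferred_nid = cur_nid;
		return -EINVAL;
	}

	printf("would search for a better node, imbalance_pct=%d\n",
	       imbalance_pct);
	return 0;
}

int main(void)
{
	struct sched_domain sd = { .imbalance_pct = 125 };
	int preferred = -1;

	task_numa_migrate_sketch(&sd, &preferred, 0);	/* normal path */

	if (task_numa_migrate_sketch(NULL, &preferred, 0) == -EINVAL)
		printf("no NUMA domain: preferred nid pinned to %d\n",
		       preferred);
	return 0;
}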
@@ -2153,7 +2178,7 @@
 	long contrib;
 
 	/* The fraction of a cpu used by this cfs_rq */
-	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+	contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
 			  sa->runnable_avg_period + 1);
 	contrib -= cfs_rq->tg_runnable_contrib;
 
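The final hunk widens runnable_avg_sum to u64 before the shift: without the cast, the shift is performed in 32-bit arithmetic and can wrap before div_u64() ever sees the value. A small stand-alone demo of the difference; the shift value of 20 and the sample numbers are assumptions chosen to make the wrap visible, not values read out of the kernel.

/*
 * Stand-alone demo (not kernel code): a 32-bit left shift wraps, while
 * widening to 64 bits first, as the patch does, preserves the value.
 */
#include <stdint.h>
#include <stdio.h>

#define NICE_0_SHIFT 20		/* illustrative shift, large enough to wrap */

int main(void)
{
	uint32_t runnable_avg_sum    = 47742;	/* illustrative sample value */
	uint32_t runnable_avg_period = 47742;

	/* Before the fix: the shift wraps in 32 bits (well-defined for
	 * unsigned types, but numerically wrong). */
	uint64_t wrapped = (uint64_t)(runnable_avg_sum << NICE_0_SHIFT)
			   / (runnable_avg_period + 1);

	/* After the fix: widen first, then shift. */
	uint64_t widened = ((uint64_t)runnable_avg_sum << NICE_0_SHIFT)
			   / (runnable_avg_period + 1);

	printf("without cast: %llu, with cast: %llu\n",
	       (unsigned long long)wrapped, (unsigned long long)widened);
	return 0;
}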