Commit fe8a45df368038566c62bf311accf4319b210123

Authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Four bugfixes and one performance fix"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Avoid integer overflow
  sched: Optimize task_sched_runtime()
  sched/numa: Cure update_numa_stats() vs. hotplug
  sched/numa: Fix NULL pointer dereference in task_numa_migrate()
  sched: Fix endless sync_sched/rcu() loop inside _cpu_down()

Showing 3 changed files

kernel/cpu.c

@@ -306,7 +306,6 @@
                         __func__, cpu);
                 goto out_release;
         }
-        smpboot_park_threads(cpu);
 
         /*
          * By now we've cleared cpu_active_mask, wait for all preempt-disabled
@@ -315,11 +314,15 @@
          *
          * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
          * not imply sync_sched(), so explicitly call both.
+         *
+         * Do sync before park smpboot threads to take care the rcu boost case.
          */
 #ifdef CONFIG_PREEMPT
         synchronize_sched();
 #endif
         synchronize_rcu();
+
+        smpboot_park_threads(cpu);
 
         /*
          * So now all preempt/rcu users must observe !cpu_active().
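
The two hunks above are "sched: Fix endless sync_sched/rcu() loop inside _cpu_down()". The "rcu boost case" in the new comment refers to RCU priority boosting, whose kthreads are among the per-CPU smpboot threads being parked here; parking them before the grace-period waits could leave synchronize_rcu() stuck, hence the endless loop in the commit title. Condensed from nothing but the lines already shown, the resulting order in _cpu_down() is:

    #ifdef CONFIG_PREEMPT
            synchronize_sched();            /* wait for preempt-disabled sections */
    #endif
            synchronize_rcu();              /* RCU boost kthreads are still unparked here */

            smpboot_park_threads(cpu);      /* park the per-CPU kthreads only after the syncs */
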

kernel/sched/core.c

@@ -2253,6 +2253,20 @@
         struct rq *rq;
         u64 ns = 0;
 
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+        /*
+         * 64-bit doesn't need locks to atomically read a 64bit value.
+         * So we have a optimization chance when the task's delta_exec is 0.
+         * Reading ->on_cpu is racy, but this is ok.
+         *
+         * If we race with it leaving cpu, we'll take a lock. So we're correct.
+         * If we race with it entering cpu, unaccounted time is 0. This is
+         * indistinguishable from the read occurring a few cycles earlier.
+         */
+        if (!p->on_cpu)
+                return p->se.sum_exec_runtime;
+#endif
+
         rq = task_rq_lock(p, &flags);
         ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, p, &flags);
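
This hunk is "sched: Optimize task_sched_runtime()". For readability, here it is re-assembled with its surrounding context into a sketch of the whole function; the signature, the flags declaration and the trailing return are filled in from context rather than shown in the diff, so treat them as assumptions, and the comment is paraphrased:

    unsigned long long task_sched_runtime(struct task_struct *p)
    {
            unsigned long flags;    /* assumed declaration, not shown in the hunk */
            struct rq *rq;
            u64 ns = 0;

    #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
            /*
             * Lockless fast path: on 64-bit a u64 load is a single atomic
             * access, and a task that is not currently on a CPU has no
             * unaccounted delta_exec, so the cached sum is already exact.
             */
            if (!p->on_cpu)
                    return p->se.sum_exec_runtime;
    #endif

            rq = task_rq_lock(p, &flags);
            ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
            task_rq_unlock(rq, p, &flags);

            return ns;
    }

The CONFIG_64BIT guard is what makes the lockless read safe: on 32-bit a u64 load is two accesses and could tear, so those configurations keep taking the rq lock.
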

kernel/sched/fair.c

@@ -1000,7 +1000,7 @@
  */
 static void update_numa_stats(struct numa_stats *ns, int nid)
 {
-        int cpu;
+        int cpu, cpus = 0;
 
         memset(ns, 0, sizeof(*ns));
         for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1009,8 +1009,21 @@
                 ns->nr_running += rq->nr_running;
                 ns->load += weighted_cpuload(cpu);
                 ns->power += power_of(cpu);
+
+                cpus++;
         }
 
+        /*
+         * If we raced with hotplug and there are no CPUs left in our mask
+         * the @ns structure is NULL'ed and task_numa_compare() will
+         * not find this node attractive.
+         *
+         * We'll either bail at !has_capacity, or we'll detect a huge imbalance
+         * and bail there.
+         */
+        if (!cpus)
+                return;
+
         ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
         ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
         ns->has_capacity = (ns->nr_running < ns->capacity);
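
These two hunks are "sched/numa: Cure update_numa_stats() vs. hotplug". The point of the early return is that when a CPU hotplug race leaves the node's cpumask empty, ns->power stays zero and the (ns->load * SCHED_POWER_SCALE) / ns->power line below it would divide by zero; bailing out leaves *ns zeroed so callers treat the node as unattractive, as the added comment says. A small standalone illustration of the same guard, with a simplified struct and made-up numbers standing in for struct numa_stats and SCHED_POWER_SCALE:

    #include <stdio.h>

    struct stats { unsigned long load, power, capacity; };

    /* Aggregate per-CPU figures the way update_numa_stats() does and bail
     * out before the divisions when no CPU contributed (the hotplug race). */
    static int aggregate(struct stats *ns, const unsigned long *load,
                         const unsigned long *power, int ncpus)
    {
            int cpu, cpus = 0;

            *ns = (struct stats){ 0 };
            for (cpu = 0; cpu < ncpus; cpu++) {
                    ns->load  += load[cpu];
                    ns->power += power[cpu];
                    cpus++;
            }

            if (!cpus)                      /* empty mask: leave *ns zeroed */
                    return -1;

            ns->load = ns->load * 1024 / ns->power;   /* would trap if power == 0 */
            ns->capacity = (ns->power + 512) / 1024;  /* rounded, like DIV_ROUND_CLOSEST */
            return 0;
    }

    int main(void)
    {
            struct stats ns;
            unsigned long load[] = { 300, 500 }, power[] = { 1024, 1024 };
            int rc;

            rc = aggregate(&ns, load, power, 2);
            printf("two cpus:   rc=%d load=%lu capacity=%lu\n", rc, ns.load, ns.capacity);

            rc = aggregate(&ns, load, power, 0);      /* as if hotplug emptied the mask */
            printf("empty mask: rc=%d load=%lu\n", rc, ns.load);
            return 0;
    }
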

@@ -1201,9 +1214,21 @@
          */
         rcu_read_lock();
         sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
-        env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
+        if (sd)
+                env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
         rcu_read_unlock();
 
+        /*
+         * Cpusets can break the scheduler domain tree into smaller
+         * balance domains, some of which do not cross NUMA boundaries.
+         * Tasks that are "trapped" in such domains cannot be migrated
+         * elsewhere, so there is no point in (re)trying.
+         */
+        if (unlikely(!sd)) {
+                p->numa_preferred_nid = cpu_to_node(task_cpu(p));
+                return -EINVAL;
+        }
+
         taskweight = task_weight(p, env.src_nid);
         groupweight = group_weight(p, env.src_nid);
         update_numa_stats(&env.src_stats, env.src_nid);
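
This hunk is "sched/numa: Fix NULL pointer dereference in task_numa_migrate()". per_cpu(sd_numa, cpu) can legitimately be NULL when cpusets carve up the domain tree as the added comment describes, so the dereference is now guarded, and the bail-out path pins numa_preferred_nid to the task's current node so the NUMA balancer stops retrying a migration that cannot happen. Note that sd is tested again after rcu_read_unlock(), but only as a flag; it is never dereferenced outside the read-side critical section. Stripped of the scheduler specifics, the control flow of the fix looks like the toy program below (plain userspace C; the struct and the default of 100 are made up for illustration):

    #include <stdio.h>
    #include <stddef.h>

    struct domain { int imbalance_pct; };

    /* Kept in the same order as the hunk: refine the value only if the
     * optional domain exists, then decide about the NULL case afterwards. */
    static int pick_imbalance(const struct domain *sd, int *imbalance_pct)
    {
            *imbalance_pct = 100;                   /* illustrative default */
            if (sd)
                    *imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;

            if (!sd)                                /* no NUMA-spanning domain */
                    return -1;                      /* caller should stop retrying */

            return 0;
    }

    int main(void)
    {
            struct domain d = { .imbalance_pct = 125 };
            int pct, rc;

            rc = pick_imbalance(&d, &pct);
            printf("with domain:    rc=%d pct=%d\n", rc, pct);

            rc = pick_imbalance(NULL, &pct);
            printf("without domain: rc=%d pct=%d\n", rc, pct);
            return 0;
    }
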

@@ -2153,7 +2178,7 @@
         long contrib;
 
         /* The fraction of a cpu used by this cfs_rq */
-        contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
+        contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
                           sa->runnable_avg_period + 1);
         contrib -= cfs_rq->tg_runnable_contrib;
 
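
The final hunk is "sched/fair: Avoid integer overflow". sa->runnable_avg_sum is a u32, so without the cast the left shift is evaluated in 32-bit arithmetic and can wrap before the value ever reaches div_u64(); casting the operand to u64 first makes the shift happen in 64-bit arithmetic. A small standalone program showing the difference (the value and the shift count are made up for illustration; NICE_0_SHIFT itself is config-dependent):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t sum = 50000000;        /* made-up sum, large enough to overflow */
            unsigned int shift = 10;        /* stand-in for NICE_0_SHIFT */

            /* Old code: the shift is done in 32-bit arithmetic, so the result
             * wraps modulo 2^32 before being widened for the division. */
            uint64_t wrapped = (uint64_t)(sum << shift);

            /* Fixed code: widening the operand first keeps all 64 bits. */
            uint64_t exact = (uint64_t)sum << shift;

            printf("32-bit shift: %llu\n", (unsigned long long)wrapped); /* 3955359744 */
            printf("64-bit shift: %llu\n", (unsigned long long)exact);   /* 51200000000 */
            return 0;
    }

The wrapped result silently corrupts the numerator of the contrib calculation, which is why the fix is a plain widening cast rather than any change in the math.
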