Commit 27f4d28057adf98750cf863c40baefb12f5b6d21

Authored by Paul E. McKenney
Committed by Paul E. McKenney
1 parent a26ac2455f

rcu: priority boosting for TREE_PREEMPT_RCU

Add priority boosting for TREE_PREEMPT_RCU, similar to that already
provided for TINY_PREEMPT_RCU.  This is enabled by the default-off
RCU_BOOST kernel configuration parameter.  The priority to which
preempted RCU readers are boosted is controlled by the RCU_BOOST_PRIO
kernel configuration parameter (defaulting to real-time priority 1),
and the time to wait before boosting the readers that are blocking a
given grace period is controlled by the RCU_BOOST_DELAY kernel
configuration parameter (defaulting to 500 milliseconds).

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
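
For reference, a minimal .config sketch of the resulting selections
(RCU_BOOST_PRIO and RCU_BOOST_DELAY are assumed here to be the Kconfig
options introduced alongside the TINY_PREEMPT_RCU boosting work; the
values shown are the defaults described above):

    CONFIG_RCU_BOOST=y           # boost preempted RCU readers (default off)
    CONFIG_RCU_BOOST_PRIO=1      # real-time priority to boost readers to
    CONFIG_RCU_BOOST_DELAY=500   # milliseconds to wait before boosting

With HZ=250, for example, the 500-millisecond default maps to
DIV_ROUND_UP(500 * 250, 1000) = 125 jiffies via the RCU_BOOST_DELAY_JIFFIES
macro added in kernel/rcutree_plugin.h below.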

Showing 4 changed files with 411 additions and 51 deletions

init/Kconfig
... ... @@ -485,7 +485,7 @@
485 485  
486 486 config RCU_BOOST
487 487 bool "Enable RCU priority boosting"
488   - depends on RT_MUTEXES && TINY_PREEMPT_RCU
  488 + depends on RT_MUTEXES && PREEMPT_RCU
489 489 default n
490 490 help
491 491 This option boosts the priority of preempted RCU readers that
kernel/rcutree.c
... ... @@ -81,6 +81,8 @@
81 81 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82 82 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 83  
  84 +static struct rcu_state *rcu_state;
  85 +
84 86 int rcu_scheduler_active __read_mostly;
85 87 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
86 88  
... ... @@ -94,7 +96,7 @@
94 96 static char rcu_kthreads_spawnable;
95 97  
96 98 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp);
97   -static void invoke_rcu_kthread(void);
  99 +static void invoke_rcu_cpu_kthread(void);
98 100  
99 101 #define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
100 102  
... ... @@ -791,6 +793,7 @@
791 793 rnp->completed = rsp->completed;
792 794 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
793 795 rcu_start_gp_per_cpu(rsp, rnp, rdp);
  796 + rcu_preempt_boost_start_gp(rnp);
794 797 raw_spin_unlock_irqrestore(&rnp->lock, flags);
795 798 return;
796 799 }
... ... @@ -826,6 +829,7 @@
826 829 rnp->completed = rsp->completed;
827 830 if (rnp == rdp->mynode)
828 831 rcu_start_gp_per_cpu(rsp, rnp, rdp);
  832 + rcu_preempt_boost_start_gp(rnp);
829 833 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
830 834 }
831 835  
... ... @@ -882,7 +886,7 @@
882 886 return;
883 887 }
884 888 rnp->qsmask &= ~mask;
885   - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
  889 + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
886 890  
887 891 /* Other bits still set at this level, so done. */
888 892 raw_spin_unlock_irqrestore(&rnp->lock, flags);
889 893  
... ... @@ -1089,8 +1093,11 @@
1089 1093 t = rnp->node_kthread_task;
1090 1094 if (t != NULL &&
1091 1095 rnp->qsmaskinit == 0) {
1092   - kthread_stop(t);
  1096 + raw_spin_lock_irqsave(&rnp->lock, flags);
1093 1097 rnp->node_kthread_task = NULL;
  1098 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  1099 + kthread_stop(t);
  1100 + rcu_stop_boost_kthread(rnp);
1094 1101 } else
1095 1102 rcu_node_kthread_setaffinity(rnp);
1096 1103 }
... ... @@ -1190,7 +1197,7 @@
1190 1197  
1191 1198 /* Re-raise the RCU softirq if there are callbacks remaining. */
1192 1199 if (cpu_has_callbacks_ready_to_invoke(rdp))
1193   - invoke_rcu_kthread();
  1200 + invoke_rcu_cpu_kthread();
1194 1201 }
1195 1202  
1196 1203 /*
... ... @@ -1236,7 +1243,7 @@
1236 1243 }
1237 1244 rcu_preempt_check_callbacks(cpu);
1238 1245 if (rcu_pending(cpu))
1239   - invoke_rcu_kthread();
  1246 + invoke_rcu_cpu_kthread();
1240 1247 }
1241 1248  
1242 1249 #ifdef CONFIG_SMP
... ... @@ -1244,6 +1251,8 @@
1244 1251 /*
1245 1252 * Scan the leaf rcu_node structures, processing dyntick state for any that
1246 1253 * have not yet encountered a quiescent state, using the function specified.
  1254 + * Also initiate boosting for any threads blocked on the root rcu_node.
  1255 + *
1247 1256 * The caller must have suppressed start of new grace periods.
1248 1257 */
1249 1258 static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
... ... @@ -1262,6 +1271,7 @@
1262 1271 return;
1263 1272 }
1264 1273 if (rnp->qsmask == 0) {
  1274 + rcu_initiate_boost(rnp);
1265 1275 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1266 1276 continue;
1267 1277 }
... ... @@ -1280,6 +1290,11 @@
1280 1290 }
1281 1291 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1282 1292 }
  1293 + rnp = rcu_get_root(rsp);
  1294 + raw_spin_lock_irqsave(&rnp->lock, flags);
  1295 + if (rnp->qsmask == 0)
  1296 + rcu_initiate_boost(rnp);
  1297 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
1283 1298 }
1284 1299  
1285 1300 /*
... ... @@ -1417,7 +1432,7 @@
1417 1432 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
1418 1433 * cannot disappear out from under us.
1419 1434 */
1420   -static void invoke_rcu_kthread(void)
  1435 +static void invoke_rcu_cpu_kthread(void)
1421 1436 {
1422 1437 unsigned long flags;
1423 1438 wait_queue_head_t *q;
... ... @@ -1436,24 +1451,33 @@
1436 1451 }
1437 1452  
1438 1453 /*
  1454 + * Wake up the specified per-rcu_node-structure kthread.
  1455 + * The caller must hold ->lock.
  1456 + */
  1457 +static void invoke_rcu_node_kthread(struct rcu_node *rnp)
  1458 +{
  1459 + struct task_struct *t;
  1460 +
  1461 + t = rnp->node_kthread_task;
  1462 + if (t != NULL)
  1463 + wake_up_process(t);
  1464 +}
  1465 +
  1466 +/*
1439 1467 * Timer handler to initiate the waking up of per-CPU kthreads that
1440 1468 * have yielded the CPU due to excess numbers of RCU callbacks.
  1469 + * We wake up the per-rcu_node kthread, which in turn will wake up
  1470 + * the booster kthread.
1441 1471 */
1442 1472 static void rcu_cpu_kthread_timer(unsigned long arg)
1443 1473 {
1444 1474 unsigned long flags;
1445   - struct rcu_data *rdp = (struct rcu_data *)arg;
  1475 + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1446 1476 struct rcu_node *rnp = rdp->mynode;
1447   - struct task_struct *t;
1448 1477  
1449 1478 raw_spin_lock_irqsave(&rnp->lock, flags);
1450 1479 rnp->wakemask |= rdp->grpmask;
1451   - t = rnp->node_kthread_task;
1452   - if (t == NULL) {
1453   - raw_spin_unlock_irqrestore(&rnp->lock, flags);
1454   - return;
1455   - }
1456   - wake_up_process(t);
  1480 + invoke_rcu_node_kthread(rnp);
1457 1481 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1458 1482 }
1459 1483  
... ... @@ -1463,13 +1487,12 @@
1463 1487 * remain preempted. Either way, we restore our real-time priority
1464 1488 * before returning.
1465 1489 */
1466   -static void rcu_yield(int cpu)
  1490 +static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1467 1491 {
1468   - struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu);
1469 1492 struct sched_param sp;
1470 1493 struct timer_list yield_timer;
1471 1494  
1472   - setup_timer_on_stack(&yield_timer, rcu_cpu_kthread_timer, (unsigned long)rdp);
  1495 + setup_timer_on_stack(&yield_timer, f, arg);
1473 1496 mod_timer(&yield_timer, jiffies + 2);
1474 1497 sp.sched_priority = 0;
1475 1498 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
... ... @@ -1540,7 +1563,7 @@
1540 1563 else
1541 1564 spincnt = 0;
1542 1565 if (spincnt > 10) {
1543   - rcu_yield(cpu);
  1566 + rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1544 1567 spincnt = 0;
1545 1568 }
1546 1569 }
... ... @@ -1597,6 +1620,7 @@
1597 1620 raw_spin_lock_irqsave(&rnp->lock, flags);
1598 1621 mask = rnp->wakemask;
1599 1622 rnp->wakemask = 0;
  1623 + rcu_initiate_boost(rnp);
1600 1624 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1601 1625 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1602 1626 if ((mask & 0x1) == 0)
... ... @@ -1618,7 +1642,8 @@
1618 1642  
1619 1643 /*
1620 1644 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
1621   - * served by the rcu_node in question.
  1645 + * served by the rcu_node in question. The CPU hotplug lock is still
  1646 + * held, so the value of rnp->qsmaskinit will be stable.
1622 1647 */
1623 1648 static void rcu_node_kthread_setaffinity(struct rcu_node *rnp)
1624 1649 {
... ... @@ -1626,8 +1651,7 @@
1626 1651 int cpu;
1627 1652 unsigned long mask = rnp->qsmaskinit;
1628 1653  
1629   - if (rnp->node_kthread_task == NULL ||
1630   - rnp->qsmaskinit == 0)
  1654 + if (rnp->node_kthread_task == NULL || mask == 0)
1631 1655 return;
1632 1656 if (!alloc_cpumask_var(&cm, GFP_KERNEL))
1633 1657 return;
... ... @@ -1636,31 +1660,40 @@
1636 1660 if (mask & 0x1)
1637 1661 cpumask_set_cpu(cpu, cm);
1638 1662 set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
  1663 + rcu_boost_kthread_setaffinity(rnp, cm);
1639 1664 free_cpumask_var(cm);
1640 1665 }
1641 1666  
1642 1667 /*
1643 1668 * Spawn a per-rcu_node kthread, setting priority and affinity.
  1669 + * Called during boot before online/offline can happen, or, if
  1670 + * during runtime, with the main CPU-hotplug locks held. So only
  1671 + * one of these can be executing at a time.
1644 1672 */
1645 1673 static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1646 1674 struct rcu_node *rnp)
1647 1675 {
  1676 + unsigned long flags;
1648 1677 int rnp_index = rnp - &rsp->node[0];
1649 1678 struct sched_param sp;
1650 1679 struct task_struct *t;
1651 1680  
1652 1681 if (!rcu_kthreads_spawnable ||
1653   - rnp->qsmaskinit == 0 ||
1654   - rnp->node_kthread_task != NULL)
  1682 + rnp->qsmaskinit == 0)
1655 1683 return 0;
1656   - t = kthread_create(rcu_node_kthread, (void *)rnp, "rcun%d", rnp_index);
1657   - if (IS_ERR(t))
1658   - return PTR_ERR(t);
1659   - rnp->node_kthread_task = t;
1660   - wake_up_process(t);
1661   - sp.sched_priority = 99;
1662   - sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1663   - return 0;
  1684 + if (rnp->node_kthread_task == NULL) {
  1685 + t = kthread_create(rcu_node_kthread, (void *)rnp,
  1686 + "rcun%d", rnp_index);
  1687 + if (IS_ERR(t))
  1688 + return PTR_ERR(t);
  1689 + raw_spin_lock_irqsave(&rnp->lock, flags);
  1690 + rnp->node_kthread_task = t;
  1691 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  1692 + wake_up_process(t);
  1693 + sp.sched_priority = 99;
  1694 + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
  1695 + }
  1696 + return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1664 1697 }
1665 1698  
1666 1699 /*
... ... @@ -1678,10 +1711,16 @@
1678 1711 if (cpu_online(cpu))
1679 1712 (void)rcu_spawn_one_cpu_kthread(cpu);
1680 1713 }
1681   - rcu_for_each_leaf_node(&rcu_sched_state, rnp) {
1682   - init_waitqueue_head(&rnp->node_wq);
1683   - (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp);
1684   - }
  1714 + rnp = rcu_get_root(rcu_state);
  1715 + init_waitqueue_head(&rnp->node_wq);
  1716 + rcu_init_boost_waitqueue(rnp);
  1717 + (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
  1718 + if (NUM_RCU_NODES > 1)
  1719 + rcu_for_each_leaf_node(rcu_state, rnp) {
  1720 + init_waitqueue_head(&rnp->node_wq);
  1721 + rcu_init_boost_waitqueue(rnp);
  1722 + (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
  1723 + }
1685 1724 return 0;
1686 1725 }
1687 1726 early_initcall(rcu_spawn_kthreads);
1688 1727  
... ... @@ -2087,14 +2126,14 @@
2087 2126  
2088 2127 static void __cpuinit rcu_online_kthreads(int cpu)
2089 2128 {
2090   - struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu);
  2129 + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2091 2130 struct rcu_node *rnp = rdp->mynode;
2092 2131  
2093 2132 /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
2094 2133 if (rcu_kthreads_spawnable) {
2095 2134 (void)rcu_spawn_one_cpu_kthread(cpu);
2096 2135 if (rnp->node_kthread_task == NULL)
2097   - (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp);
  2136 + (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
2098 2137 }
2099 2138 }
2100 2139  
... ... @@ -2105,7 +2144,7 @@
2105 2144 unsigned long action, void *hcpu)
2106 2145 {
2107 2146 long cpu = (long)hcpu;
2108   - struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu);
  2147 + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2109 2148 struct rcu_node *rnp = rdp->mynode;
2110 2149  
2111 2150 switch (action) {
kernel/rcutree.h
... ... @@ -135,6 +135,24 @@
135 135 /* if there is no such task. If there */
136 136 /* is no current expedited grace period, */
137 137 /* then there cannot be any such task. */
  138 +#ifdef CONFIG_RCU_BOOST
  139 + struct list_head *boost_tasks;
  140 + /* Pointer to first task that needs to be */
  141 + /* priority boosted, or NULL if no priority */
  142 + /* boosting is needed for this rcu_node */
  143 + /* structure. If there are no tasks */
  144 + /* queued on this rcu_node structure that */
  145 + /* are blocking the current grace period, */
  146 + /* there can be no such task. */
  147 + unsigned long boost_time;
  148 + /* When to start boosting (jiffies). */
  149 + struct task_struct *boost_kthread_task;
  150 + /* kthread that takes care of priority */
  151 + /* boosting for this rcu_node structure. */
  152 + wait_queue_head_t boost_wq;
  153 + /* Wait queue on which to park the boost */
  154 + /* kthread. */
  155 +#endif /* #ifdef CONFIG_RCU_BOOST */
138 156 struct task_struct *node_kthread_task;
139 157 /* kthread that takes care of this rcu_node */
140 158 /* structure, for example, awakening the */
... ... @@ -365,7 +383,7 @@
365 383 static void rcu_bootup_announce(void);
366 384 long rcu_batches_completed(void);
367 385 static void rcu_preempt_note_context_switch(int cpu);
368   -static int rcu_preempted_readers(struct rcu_node *rnp);
  386 +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
369 387 #ifdef CONFIG_HOTPLUG_CPU
370 388 static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
371 389 unsigned long flags);
... ... @@ -392,6 +410,17 @@
392 410 static void rcu_preempt_send_cbs_to_online(void);
393 411 static void __init __rcu_init_preempt(void);
394 412 static void rcu_needs_cpu_flush(void);
  413 +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
  414 +static void rcu_initiate_boost(struct rcu_node *rnp);
  415 +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
  416 + cpumask_var_t cm);
  417 +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
  418 +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
  419 + struct rcu_node *rnp,
  420 + int rnp_index);
  421 +#ifdef CONFIG_HOTPLUG_CPU
  422 +static void rcu_stop_boost_kthread(struct rcu_node *rnp);
  423 +#endif /* #ifdef CONFIG_HOTPLUG_CPU */
395 424  
396 425 #endif /* #ifndef RCU_TREE_NONCORE */
kernel/rcutree_plugin.h
... ... @@ -66,6 +66,7 @@
66 66  
67 67 struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
68 68 DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
  69 +static struct rcu_state *rcu_state = &rcu_preempt_state;
69 70  
70 71 static int rcu_preempted_readers_exp(struct rcu_node *rnp);
71 72  
... ... @@ -179,6 +180,10 @@
179 180 if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
180 181 list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
181 182 rnp->gp_tasks = &t->rcu_node_entry;
  183 +#ifdef CONFIG_RCU_BOOST
  184 + if (rnp->boost_tasks != NULL)
  185 + rnp->boost_tasks = rnp->gp_tasks;
  186 +#endif /* #ifdef CONFIG_RCU_BOOST */
182 187 } else {
183 188 list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
184 189 if (rnp->qsmask & rdp->grpmask)
... ... @@ -218,7 +223,7 @@
218 223 * for the specified rcu_node structure. If the caller needs a reliable
219 224 * answer, it must hold the rcu_node's ->lock.
220 225 */
221   -static int rcu_preempted_readers(struct rcu_node *rnp)
  226 +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
222 227 {
223 228 return rnp->gp_tasks != NULL;
224 229 }
... ... @@ -236,7 +241,7 @@
236 241 unsigned long mask;
237 242 struct rcu_node *rnp_p;
238 243  
239   - if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
  244 + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
240 245 raw_spin_unlock_irqrestore(&rnp->lock, flags);
241 246 return; /* Still need more quiescent states! */
242 247 }
... ... @@ -325,7 +330,7 @@
325 330 break;
326 331 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
327 332 }
328   - empty = !rcu_preempted_readers(rnp);
  333 + empty = !rcu_preempt_blocked_readers_cgp(rnp);
329 334 empty_exp = !rcu_preempted_readers_exp(rnp);
330 335 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
331 336 np = rcu_next_node_entry(t, rnp);
... ... @@ -334,6 +339,10 @@
334 339 rnp->gp_tasks = np;
335 340 if (&t->rcu_node_entry == rnp->exp_tasks)
336 341 rnp->exp_tasks = np;
  342 +#ifdef CONFIG_RCU_BOOST
  343 + if (&t->rcu_node_entry == rnp->boost_tasks)
  344 + rnp->boost_tasks = np;
  345 +#endif /* #ifdef CONFIG_RCU_BOOST */
337 346 t->rcu_blocked_node = NULL;
338 347  
339 348 /*
... ... @@ -346,6 +355,15 @@
346 355 else
347 356 rcu_report_unblock_qs_rnp(rnp, flags);
348 357  
  358 +#ifdef CONFIG_RCU_BOOST
  359 + /* Unboost if we were boosted. */
  360 + if (special & RCU_READ_UNLOCK_BOOSTED) {
  361 + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
  362 + rt_mutex_unlock(t->rcu_boost_mutex);
  363 + t->rcu_boost_mutex = NULL;
  364 + }
  365 +#endif /* #ifdef CONFIG_RCU_BOOST */
  366 +
349 367 /*
350 368 * If this was the last task on the expedited lists,
351 369 * then we need to report up the rcu_node hierarchy.
... ... @@ -391,7 +409,7 @@
391 409 unsigned long flags;
392 410 struct task_struct *t;
393 411  
394   - if (!rcu_preempted_readers(rnp))
  412 + if (!rcu_preempt_blocked_readers_cgp(rnp))
395 413 return;
396 414 raw_spin_lock_irqsave(&rnp->lock, flags);
397 415 t = list_entry(rnp->gp_tasks,
... ... @@ -430,7 +448,7 @@
430 448 {
431 449 struct task_struct *t;
432 450  
433   - if (!rcu_preempted_readers(rnp))
  451 + if (!rcu_preempt_blocked_readers_cgp(rnp))
434 452 return;
435 453 t = list_entry(rnp->gp_tasks,
436 454 struct task_struct, rcu_node_entry);
... ... @@ -460,7 +478,7 @@
460 478 */
461 479 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
462 480 {
463   - WARN_ON_ONCE(rcu_preempted_readers(rnp));
  481 + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
464 482 if (!list_empty(&rnp->blkd_tasks))
465 483 rnp->gp_tasks = rnp->blkd_tasks.next;
466 484 WARN_ON_ONCE(rnp->qsmask);
... ... @@ -509,7 +527,7 @@
509 527 * absolutely necessary, but this is a good performance/complexity
510 528 * tradeoff.
511 529 */
512   - if (rcu_preempted_readers(rnp))
  530 + if (rcu_preempt_blocked_readers_cgp(rnp))
513 531 retval |= RCU_OFL_TASKS_NORM_GP;
514 532 if (rcu_preempted_readers_exp(rnp))
515 533 retval |= RCU_OFL_TASKS_EXP_GP;
516 534  
... ... @@ -525,8 +543,22 @@
525 543 rnp_root->gp_tasks = rnp->gp_tasks;
526 544 if (&t->rcu_node_entry == rnp->exp_tasks)
527 545 rnp_root->exp_tasks = rnp->exp_tasks;
  546 +#ifdef CONFIG_RCU_BOOST
  547 + if (&t->rcu_node_entry == rnp->boost_tasks)
  548 + rnp_root->boost_tasks = rnp->boost_tasks;
  549 +#endif /* #ifdef CONFIG_RCU_BOOST */
528 550 raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
529 551 }
  552 +
  553 +#ifdef CONFIG_RCU_BOOST
  554 + /* In case root is being boosted and leaf is not. */
  555 + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
  556 + if (rnp_root->boost_tasks != NULL &&
  557 + rnp_root->boost_tasks != rnp_root->gp_tasks)
  558 + rnp_root->boost_tasks = rnp_root->gp_tasks;
  559 + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
  560 +#endif /* #ifdef CONFIG_RCU_BOOST */
  561 +
530 562 rnp->gp_tasks = NULL;
531 563 rnp->exp_tasks = NULL;
532 564 return retval;
... ... @@ -684,6 +716,7 @@
684 716 raw_spin_lock(&rnp->lock); /* irqs already disabled */
685 717 if (!list_empty(&rnp->blkd_tasks)) {
686 718 rnp->exp_tasks = rnp->blkd_tasks.next;
  719 + rcu_initiate_boost(rnp);
687 720 must_wait = 1;
688 721 }
689 722 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
... ... @@ -830,6 +863,8 @@
830 863  
831 864 #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
832 865  
  866 +static struct rcu_state *rcu_state = &rcu_sched_state;
  867 +
833 868 /*
834 869 * Tell them what RCU they are running.
835 870 */
... ... @@ -870,7 +905,7 @@
870 905 * Because preemptable RCU does not exist, there are never any preempted
871 906 * RCU readers.
872 907 */
873   -static int rcu_preempted_readers(struct rcu_node *rnp)
  908 +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
874 909 {
875 910 return 0;
876 911 }
... ... @@ -1034,6 +1069,263 @@
1034 1069  
1035 1070 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
1036 1071  
  1072 +#ifdef CONFIG_RCU_BOOST
  1073 +
  1074 +#include "rtmutex_common.h"
  1075 +
  1076 +/*
  1077 + * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  1078 + * or ->boost_tasks, advancing the pointer to the next task in the
  1079 + * ->blkd_tasks list.
  1080 + *
  1081 + * Note that irqs must be enabled: boosting the task can block.
  1082 + * Returns 1 if there are more tasks needing to be boosted.
  1083 + */
  1084 +static int rcu_boost(struct rcu_node *rnp)
  1085 +{
  1086 + unsigned long flags;
  1087 + struct rt_mutex mtx;
  1088 + struct task_struct *t;
  1089 + struct list_head *tb;
  1090 +
  1091 + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
  1092 + return 0; /* Nothing left to boost. */
  1093 +
  1094 + raw_spin_lock_irqsave(&rnp->lock, flags);
  1095 +
  1096 + /*
  1097 + * Recheck under the lock: all tasks in need of boosting
  1098 + * might exit their RCU read-side critical sections on their own.
  1099 + */
  1100 + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
  1101 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  1102 + return 0;
  1103 + }
  1104 +
  1105 + /*
  1106 + * Preferentially boost tasks blocking expedited grace periods.
  1107 + * This cannot starve the normal grace periods because a second
  1108 + * expedited grace period must boost all blocked tasks, including
  1109 + * those blocking the pre-existing normal grace period.
  1110 + */
  1111 + if (rnp->exp_tasks != NULL)
  1112 + tb = rnp->exp_tasks;
  1113 + else
  1114 + tb = rnp->boost_tasks;
  1115 +
  1116 + /*
  1117 + * We boost task t by manufacturing an rt_mutex that appears to
  1118 + * be held by task t. We leave a pointer to that rt_mutex where
  1119 + * task t can find it, and task t will release the mutex when it
  1120 + * exits its outermost RCU read-side critical section. Then
  1121 + * simply acquiring this artificial rt_mutex will boost task
  1122 + * t's priority. (Thanks to tglx for suggesting this approach!)
  1123 + *
  1124 + * Note that task t must acquire rnp->lock to remove itself from
  1125 + * the ->blkd_tasks list, which it will do from exit() if from
  1126 + * nowhere else. We therefore are guaranteed that task t will
  1127 + * stay around at least until we drop rnp->lock. Note that
  1128 + * rnp->lock also resolves races between our priority boosting
  1129 + * and task t's exiting its outermost RCU read-side critical
  1130 + * section.
  1131 + */
  1132 + t = container_of(tb, struct task_struct, rcu_node_entry);
  1133 + rt_mutex_init_proxy_locked(&mtx, t);
  1134 + t->rcu_boost_mutex = &mtx;
  1135 + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
  1136 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  1137 + rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
  1138 + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
  1139 +
  1140 + return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
  1141 +}
  1142 +
  1143 +/*
  1144 + * Timer handler to initiate waking up of boost kthreads that
  1145 + * have yielded the CPU due to excessive numbers of tasks to
  1146 + * boost. We wake up the per-rcu_node kthread, which in turn
  1147 + * will wake up the booster kthread.
  1148 + */
  1149 +static void rcu_boost_kthread_timer(unsigned long arg)
  1150 +{
  1151 + unsigned long flags;
  1152 + struct rcu_node *rnp = (struct rcu_node *)arg;
  1153 +
  1154 + raw_spin_lock_irqsave(&rnp->lock, flags);
  1155 + invoke_rcu_node_kthread(rnp);
  1156 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  1157 +}
  1158 +
  1159 +/*
  1160 + * Priority-boosting kthread. One per leaf rcu_node and one for the
  1161 + * root rcu_node.
  1162 + */
  1163 +static int rcu_boost_kthread(void *arg)
  1164 +{
  1165 + struct rcu_node *rnp = (struct rcu_node *)arg;
  1166 + int spincnt = 0;
  1167 + int more2boost;
  1168 +
  1169 + for (;;) {
  1170 + wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks ||
  1171 + rnp->exp_tasks ||
  1172 + kthread_should_stop());
  1173 + if (kthread_should_stop())
  1174 + break;
  1175 + more2boost = rcu_boost(rnp);
  1176 + if (more2boost)
  1177 + spincnt++;
  1178 + else
  1179 + spincnt = 0;
  1180 + if (spincnt > 10) {
  1181 + rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
  1182 + spincnt = 0;
  1183 + }
  1184 + }
  1185 + return 0;
  1186 +}
  1187 +
  1188 +/*
  1189 + * Check to see if it is time to start boosting RCU readers that are
  1190 + * blocking the current grace period, and, if so, tell the per-rcu_node
  1191 + * kthread to start boosting them. If there is an expedited grace
  1192 + * period in progress, it is always time to boost.
  1193 + *
  1194 + * The caller must hold rnp->lock.
  1195 + */
  1196 +static void rcu_initiate_boost(struct rcu_node *rnp)
  1197 +{
  1198 + struct task_struct *t;
  1199 +
  1200 + if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL)
  1201 + return;
  1202 + if (rnp->exp_tasks != NULL ||
  1203 + (rnp->gp_tasks != NULL &&
  1204 + rnp->boost_tasks == NULL &&
  1205 + rnp->qsmask == 0 &&
  1206 + ULONG_CMP_GE(jiffies, rnp->boost_time))) {
  1207 + if (rnp->exp_tasks == NULL)
  1208 + rnp->boost_tasks = rnp->gp_tasks;
  1209 + t = rnp->boost_kthread_task;
  1210 + if (t != NULL)
  1211 + wake_up_process(t);
  1212 + }
  1213 +}
  1214 +
  1215 +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
  1216 + cpumask_var_t cm)
  1217 +{
  1218 + unsigned long flags;
  1219 + struct task_struct *t;
  1220 +
  1221 + raw_spin_lock_irqsave(&rnp->lock, flags);
  1222 + t = rnp->boost_kthread_task;
  1223 + if (t != NULL)
  1224 + set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
  1225 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  1226 +}
  1227 +
  1228 +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
  1229 +
  1230 +/*
  1231 + * Do priority-boost accounting for the start of a new grace period.
  1232 + */
  1233 +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  1234 +{
  1235 + rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
  1236 +}
  1237 +
  1238 +/*
  1239 + * Initialize the RCU-boost waitqueue.
  1240 + */
  1241 +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
  1242 +{
  1243 + init_waitqueue_head(&rnp->boost_wq);
  1244 +}
  1245 +
  1246 +/*
  1247 + * Create an RCU-boost kthread for the specified node if one does not
  1248 + * already exist. We only create this kthread for preemptible RCU.
  1249 + * Returns zero if all is well, a negated errno otherwise.
  1250 + */
  1251 +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
  1252 + struct rcu_node *rnp,
  1253 + int rnp_index)
  1254 +{
  1255 + unsigned long flags;
  1256 + struct sched_param sp;
  1257 + struct task_struct *t;
  1258 +
  1259 + if (&rcu_preempt_state != rsp)
  1260 + return 0;
  1261 + if (rnp->boost_kthread_task != NULL)
  1262 + return 0;
  1263 + t = kthread_create(rcu_boost_kthread, (void *)rnp,
  1264 + "rcub%d", rnp_index);
  1265 + if (IS_ERR(t))
  1266 + return PTR_ERR(t);
  1267 + raw_spin_lock_irqsave(&rnp->lock, flags);
  1268 + rnp->boost_kthread_task = t;
  1269 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  1270 + wake_up_process(t);
  1271 + sp.sched_priority = RCU_KTHREAD_PRIO;
  1272 + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
  1273 + return 0;
  1274 +}
  1275 +
  1276 +#ifdef CONFIG_HOTPLUG_CPU
  1277 +
  1278 +static void rcu_stop_boost_kthread(struct rcu_node *rnp)
  1279 +{
  1280 + unsigned long flags;
  1281 + struct task_struct *t;
  1282 +
  1283 + raw_spin_lock_irqsave(&rnp->lock, flags);
  1284 + t = rnp->boost_kthread_task;
  1285 + rnp->boost_kthread_task = NULL;
  1286 + raw_spin_unlock_irqrestore(&rnp->lock, flags);
  1287 + if (t != NULL)
  1288 + kthread_stop(t);
  1289 +}
  1290 +
  1291 +#endif /* #ifdef CONFIG_HOTPLUG_CPU */
  1292 +
  1293 +#else /* #ifdef CONFIG_RCU_BOOST */
  1294 +
  1295 +static void rcu_initiate_boost(struct rcu_node *rnp)
  1296 +{
  1297 +}
  1298 +
  1299 +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
  1300 + cpumask_var_t cm)
  1301 +{
  1302 +}
  1303 +
  1304 +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  1305 +{
  1306 +}
  1307 +
  1308 +static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
  1309 +{
  1310 +}
  1311 +
  1312 +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
  1313 + struct rcu_node *rnp,
  1314 + int rnp_index)
  1315 +{
  1316 + return 0;
  1317 +}
  1318 +
  1319 +#ifdef CONFIG_HOTPLUG_CPU
  1320 +
  1321 +static void rcu_stop_boost_kthread(struct rcu_node *rnp)
  1322 +{
  1323 +}
  1324 +
  1325 +#endif /* #ifdef CONFIG_HOTPLUG_CPU */
  1326 +
  1327 +#endif /* #else #ifdef CONFIG_RCU_BOOST */
  1328 +
1037 1329 #ifndef CONFIG_SMP
1038 1330  
1039 1331 void synchronize_sched_expedited(void)
... ... @@ -1206,8 +1498,8 @@
1206 1498 *
1207 1499 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1208 1500 * disabled, we do one pass of force_quiescent_state(), then do a
1209   - * invoke_rcu_kthread() to cause rcu_process_callbacks() to be invoked later.
1210   - * The per-cpu rcu_dyntick_drain variable controls the sequencing.
  1501 + * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked
  1502 + * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
1211 1503 */
1212 1504 int rcu_needs_cpu(int cpu)
1213 1505 {
... ... @@ -1257,7 +1549,7 @@
1257 1549  
1258 1550 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1259 1551 if (c)
1260   - invoke_rcu_kthread();
  1552 + invoke_rcu_cpu_kthread();
1261 1553 return c;
1262 1554 }
1263 1555