Commit 2af49b6058d857fa5b476db642d4452bf5833ecd
Merge branch 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'core-rcu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  rcu: remove unused __list_for_each_rcu() macro
  rculist: fix borked __list_for_each_rcu() macro
  rcu: reduce __call_rcu()-induced contention on rcu_node structures
  rcu: limit rcu_node leaf-level fanout
  rcu: fine-tune grace-period begin/end checks
  rcu: Keep gpnum and completed fields synchronized
  rcu: Stop chasing QS if another CPU did it for us
  rcu: increase synchronize_sched_expedited() batching
  rcu: Make synchronize_srcu_expedited() fast if running readers
  rcu: fix race condition in synchronize_sched_expedited()
  rcu: update documentation/comments for Lai's adoption patch
  rcu,cleanup: simplify the code when cpu is dying
  rcu,cleanup: move synchronize_sched_expedited() out of sched.c
  rcu: get rid of obsolete "classic" names in TREE_RCU tracing
  rcu: Distinguish between boosting and boosted
  rcu: document TINY_RCU and TINY_PREEMPT_RCU tracing.
  rcu: add tracing for TINY_RCU and TINY_PREEMPT_RCU
  rcu: priority boosting for TINY_PREEMPT_RCU
  rcu: move TINY_RCU from softirq to kthread
  rcu: add priority-inversion testing to rcutorture
17 changed files:
- Documentation/RCU/trace.txt
- include/linux/init_task.h
- include/linux/rculist.h
- include/linux/rcupdate.h
- include/linux/rcutiny.h
- include/linux/rcutree.h
- include/linux/sched.h
- init/Kconfig
- kernel/rcutiny.c
- kernel/rcutiny_plugin.h
- kernel/rcutorture.c
- kernel/rcutree.c
- kernel/rcutree.h
- kernel/rcutree_plugin.h
- kernel/rcutree_trace.c
- kernel/sched.c
- kernel/srcu.c
Documentation/RCU/trace.txt
1 | 1 | CONFIG_RCU_TRACE debugfs Files and Formats |
2 | 2 | |
3 | 3 | |
4 | -The rcutree implementation of RCU provides debugfs trace output that | |
5 | -summarizes counters and state. This information is useful for debugging | |
6 | -RCU itself, and can sometimes also help to debug abuses of RCU. | |
7 | -The following sections describe the debugfs files and formats. | |
4 | +The rcutree and rcutiny implementations of RCU provide debugfs trace | |
5 | +output that summarizes counters and state. This information is useful for | |
6 | +debugging RCU itself, and can sometimes also help to debug abuses of RCU. | |
7 | +The following sections describe the debugfs files and formats, first | |
8 | +for rcutree and next for rcutiny. | |
8 | 9 | |
9 | 10 | |
10 | -Hierarchical RCU debugfs Files and Formats | |
11 | +CONFIG_TREE_RCU and CONFIG_TREE_PREEMPT_RCU debugfs Files and Formats | |
11 | 12 | |
12 | -This implementation of RCU provides three debugfs files under the | |
13 | +These implementations of RCU provide five debugfs files under the |
13 | 14 | top-level directory RCU: rcu/rcudata (which displays fields in struct |
14 | -rcu_data), rcu/rcugp (which displays grace-period counters), and | |
15 | -rcu/rcuhier (which displays the struct rcu_node hierarchy). | |
15 | +rcu_data), rcu/rcudata.csv (which is a .csv spreadsheet version of | |
16 | +rcu/rcudata), rcu/rcugp (which displays grace-period counters), | |
17 | +rcu/rcuhier (which displays the struct rcu_node hierarchy), and | |
18 | +rcu/rcu_pending (which displays counts of the reasons that the | |
19 | +rcu_pending() function decided that there was core RCU work to do). | |
16 | 20 | |
17 | 21 | The output of "cat rcu/rcudata" looks as follows: |
18 | 22 | |
... | ... | @@ -130,7 +134,8 @@ |
130 | 134 | been registered in absence of CPU-hotplug activity. |
131 | 135 | |
132 | 136 | o "co" is the number of RCU callbacks that have been orphaned due to |
133 | - this CPU going offline. | |
137 | + this CPU going offline. These orphaned callbacks have been moved | |
138 | + to an arbitrarily chosen online CPU. | |
134 | 139 | |
135 | 140 | o "ca" is the number of RCU callbacks that have been adopted due to |
136 | 141 | other CPUs going offline. Note that ci+co-ca+ql is the number of |
137 | 142 | |
... | ... | @@ -168,12 +173,12 @@ |
168 | 173 | |
169 | 174 | The output of "cat rcu/rcuhier" looks as follows, with very long lines: |
170 | 175 | |
171 | -c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 oqlen=0 | |
176 | +c=6902 g=6903 s=2 jfq=3 j=72c7 nfqs=13142/nfqsng=0(13142) fqlh=6 | |
172 | 177 | 1/1 .>. 0:127 ^0 |
173 | 178 | 3/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 |
174 | 179 | 3/3f .>. 0:5 ^0 2/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 |
175 | 180 | rcu_bh: |
176 | -c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 oqlen=0 | |
181 | +c=-226 g=-226 s=1 jfq=-5701 j=72c7 nfqs=88/nfqsng=0(88) fqlh=0 | |
177 | 182 | 0/1 .>. 0:127 ^0 |
178 | 183 | 0/3 .>. 0:35 ^0 0/0 .>. 36:71 ^1 0/0 .>. 72:107 ^2 0/0 .>. 108:127 ^3 |
179 | 184 | 0/3f .>. 0:5 ^0 0/3 .>. 6:11 ^1 0/0 .>. 12:17 ^2 0/0 .>. 18:23 ^3 0/0 .>. 24:29 ^4 0/0 .>. 30:35 ^5 0/0 .>. 36:41 ^0 0/0 .>. 42:47 ^1 0/0 .>. 48:53 ^2 0/0 .>. 54:59 ^3 0/0 .>. 60:65 ^4 0/0 .>. 66:71 ^5 0/0 .>. 72:77 ^0 0/0 .>. 78:83 ^1 0/0 .>. 84:89 ^2 0/0 .>. 90:95 ^3 0/0 .>. 96:101 ^4 0/0 .>. 102:107 ^5 0/0 .>. 108:113 ^0 0/0 .>. 114:119 ^1 0/0 .>. 120:125 ^2 0/0 .>. 126:127 ^3 |
... | ... | @@ -212,11 +217,6 @@ |
212 | 217 | exited immediately (without even being counted in nfqs above) |
213 | 218 | due to contention on ->fqslock. |
214 | 219 | |
215 | -o "oqlen" is the number of callbacks on the "orphan" callback | |
216 | - list. RCU callbacks are placed on this list by CPUs going | |
217 | - offline, and are "adopted" either by the CPU helping the outgoing | |
218 | - CPU or by the next rcu_barrier*() call, whichever comes first. | |
219 | - | |
220 | 220 | o Each element of the form "1/1 0:127 ^0" represents one struct |
221 | 221 | rcu_node. Each line represents one level of the hierarchy, from |
222 | 222 | root to leaves. It is best to think of the rcu_data structures |
... | ... | @@ -326,4 +326,116 @@ |
326 | 326 | readers will note that the rcu "nn" number for a given CPU very |
327 | 327 | closely matches the rcu_bh "np" number for that same CPU. This |
328 | 328 | is due to short-circuit evaluation in rcu_pending(). |
329 | + | |
330 | + | |
331 | +CONFIG_TINY_RCU and CONFIG_TINY_PREEMPT_RCU debugfs Files and Formats | |
332 | + | |
333 | +These implementations of RCU provide a single debugfs file under the |
334 | +top-level directory RCU, namely rcu/rcudata, which displays fields in | |
335 | +rcu_bh_ctrlblk, rcu_sched_ctrlblk and, for CONFIG_TINY_PREEMPT_RCU, | |
336 | +rcu_preempt_ctrlblk. | |
337 | + | |
338 | +The output of "cat rcu/rcudata" is as follows: | |
339 | + | |
340 | +rcu_preempt: qlen=24 gp=1097669 g197/p197/c197 tasks=... | |
341 | + ttb=. btg=no ntb=184 neb=0 nnb=183 j=01f7 bt=0274 | |
342 | + normal balk: nt=1097669 gt=0 bt=371 b=0 ny=25073378 nos=0 | |
343 | + exp balk: bt=0 nos=0 | |
344 | +rcu_sched: qlen: 0 | |
345 | +rcu_bh: qlen: 0 | |
346 | + | |
347 | +This is split into rcu_preempt, rcu_sched, and rcu_bh sections, with the | |
348 | +rcu_preempt section appearing only in CONFIG_TINY_PREEMPT_RCU builds. | |
349 | +The last three lines of the rcu_preempt section appear only in | |
350 | +CONFIG_RCU_BOOST kernel builds. The fields are as follows: | |
351 | + | |
352 | +o "qlen" is the number of RCU callbacks currently waiting either | |
353 | + for an RCU grace period or waiting to be invoked. This is the | |
354 | + only field present for rcu_sched and rcu_bh, due to the | |
355 | + short-circuiting of grace period in those two cases. | |
356 | + | |
357 | +o "gp" is the number of grace periods that have completed. | |
358 | + | |
359 | +o "g197/p197/c197" displays the grace-period state, with the | |
360 | + "g" number being the number of grace periods that have started | |
361 | + (mod 256), the "p" number being the number of grace periods | |
362 | + that the CPU has responded to (also mod 256), and the "c" | |
363 | + number being the number of grace periods that have completed | |
364 | + (once again mod 256). |
365 | + | |
366 | + Why have both "gp" and "g"? Because the data flowing into | |
367 | + "gp" is only present in a CONFIG_RCU_TRACE kernel. | |
368 | + | |
369 | +o "tasks" is a set of bits. The first bit is "T" if there are | |
370 | + currently tasks that have recently blocked within an RCU | |
371 | + read-side critical section, the second bit is "N" if any of the | |
372 | + aforementioned tasks are blocking the current RCU grace period, | |
373 | + and the third bit is "E" if any of the aforementioned tasks are | |
374 | + blocking the current expedited grace period. Each bit is "." | |
375 | + if the corresponding condition does not hold. | |
376 | + | |
377 | +o "ttb" is a single bit. It is "B" if any of the blocked tasks | |
378 | + need to be priority boosted and "." otherwise. | |
379 | + | |
380 | +o "btg" indicates whether boosting has been carried out during | |
381 | + the current grace period, with "exp" indicating that boosting | |
382 | + is in progress for an expedited grace period, "no" indicating | |
383 | + that boosting has not yet started for a normal grace period, | |
384 | + "begun" indicating that boosting has begun for a normal grace |
385 | + period, and "done" indicating that boosting has completed for | |
386 | + a normal grace period. | |
387 | + | |
388 | +o "ntb" is the total number of tasks subjected to RCU priority |
389 | + boosting since boot. |
390 | + | |
391 | +o "neb" is the number of expedited grace periods that have had | |
392 | + to resort to RCU priority boosting since boot. | |
393 | + | |
394 | +o "nnb" is the number of normal grace periods that have had | |
395 | + to resort to RCU priority boosting since boot. | |
396 | + | |
397 | +o "j" is the low-order 12 bits of the jiffies counter in hexadecimal. | |
398 | + | |
399 | +o "bt" is the low-order 12 bits of the value that the jiffies counter | |
400 | + will have at the next time that boosting is scheduled to begin. | |
401 | + | |
402 | +o In the line beginning with "normal balk", the fields are as follows: | |
403 | + | |
404 | + o "nt" is the number of times that the system balked from | |
405 | + boosting because there were no blocked tasks to boost. | |
406 | + Note that the system will balk from boosting even if the | |
407 | + grace period is overdue when the currently running task | |
408 | + is looping within an RCU read-side critical section. | |
409 | + There is no point in boosting in this case, because | |
410 | + boosting a running task won't make it run any faster. | |
411 | + | |
412 | + o "gt" is the number of times that the system balked | |
413 | + from boosting because, although there were blocked tasks, | |
414 | + none of them were preventing the current grace period | |
415 | + from completing. | |
416 | + | |
417 | + o "bt" is the number of times that the system balked | |
418 | + from boosting because boosting was already in progress. | |
419 | + | |
420 | + o "b" is the number of times that the system balked from | |
421 | + boosting because boosting had already completed for | |
422 | + the grace period in question. | |
423 | + | |
424 | + o "ny" is the number of times that the system balked from | |
425 | + boosting because it was not yet time to start boosting | |
426 | + the grace period in question. | |
427 | + | |
428 | + o "nos" is the number of times that the system balked from | |
429 | + boosting for inexplicable ("not otherwise specified") | |
430 | + reasons. This can actually happen due to races involving | |
431 | + increments of the jiffies counter. | |
432 | + | |
433 | +o In the line beginning with "exp balk", the fields are as follows: | |
434 | + | |
435 | + o "bt" is the number of times that the system balked from | |
436 | + boosting because there were no blocked tasks to boost. | |
437 | + | |
438 | + o "nos" is the number of times that the system balked from | |
439 | + boosting for inexplicable ("not otherwise specified") | |
440 | + reasons. |
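
The single-character flag fields described above ("tasks=", "ttb=") are rendered by the string-indexing idiom used in show_tiny_preempt_stats() later in this commit: "T."[cond] yields 'T' when cond is zero and '.' otherwise. A minimal stand-alone sketch of that idiom, with purely illustrative variable names (not kernel code):

    #include <stdio.h>

    /*
     * Sketch of the "T."[cond] idiom: index 0 selects the flag letter,
     * index 1 selects '.', so each condition prints as a letter or a dot.
     */
    int main(void)
    {
            int blkd_tasks_empty = 0;       /* hypothetical sample state */
            int gp_tasks_null = 1;
            int exp_tasks_null = 1;

            printf("tasks=%c%c%c\n",
                   "T."[blkd_tasks_empty],  /* 'T': tasks blocked in a reader */
                   "N."[gp_tasks_null],     /* 'N': some block the current GP */
                   "E."[exp_tasks_null]);   /* 'E': some block the expedited GP */
            return 0;                       /* prints "tasks=T.." */
    }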
include/linux/init_task.h
... | ... | @@ -83,6 +83,12 @@ |
83 | 83 | */ |
84 | 84 | # define CAP_INIT_BSET CAP_FULL_SET |
85 | 85 | |
86 | +#ifdef CONFIG_RCU_BOOST | |
87 | +#define INIT_TASK_RCU_BOOST() \ | |
88 | + .rcu_boost_mutex = NULL, | |
89 | +#else | |
90 | +#define INIT_TASK_RCU_BOOST() | |
91 | +#endif | |
86 | 92 | #ifdef CONFIG_TREE_PREEMPT_RCU |
87 | 93 | #define INIT_TASK_RCU_TREE_PREEMPT() \ |
88 | 94 | .rcu_blocked_node = NULL, |
... | ... | @@ -94,7 +100,8 @@ |
94 | 100 | .rcu_read_lock_nesting = 0, \ |
95 | 101 | .rcu_read_unlock_special = 0, \ |
96 | 102 | .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \ |
97 | - INIT_TASK_RCU_TREE_PREEMPT() | |
103 | + INIT_TASK_RCU_TREE_PREEMPT() \ | |
104 | + INIT_TASK_RCU_BOOST() | |
98 | 105 | #else |
99 | 106 | #define INIT_TASK_RCU_PREEMPT(tsk) |
100 | 107 | #endif |
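
The INIT_TASK_RCU_BOOST() addition above follows the same pattern as INIT_TASK_RCU_TREE_PREEMPT(): a config-dependent macro expands either to a designated-initializer fragment or to nothing, so the combined initializer stays valid in every configuration without #ifdefs at the use site. A simplified stand-alone sketch of the pattern; MY_CONFIG_BOOST and struct demo_task are illustrative stand-ins, not kernel symbols:

    /* Field exists only in the "boost" configuration. */
    struct demo_task {
            int rcu_read_lock_nesting;
    #ifdef MY_CONFIG_BOOST
            void *rcu_boost_mutex;
    #endif
    };

    /* Expands to an initializer fragment, or to nothing at all. */
    #ifdef MY_CONFIG_BOOST
    #define DEMO_INIT_BOOST() \
            .rcu_boost_mutex = (void *)0,
    #else
    #define DEMO_INIT_BOOST()
    #endif

    #define DEMO_INIT_TASK() \
            .rcu_read_lock_nesting = 0, \
            DEMO_INIT_BOOST()

    /* Valid aggregate initializer whether or not MY_CONFIG_BOOST is set. */
    static struct demo_task demo = {
            DEMO_INIT_TASK()
    };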
include/linux/rculist.h
... | ... | @@ -241,11 +241,6 @@ |
241 | 241 | #define list_first_entry_rcu(ptr, type, member) \ |
242 | 242 | list_entry_rcu((ptr)->next, type, member) |
243 | 243 | |
244 | -#define __list_for_each_rcu(pos, head) \ | |
245 | - for (pos = rcu_dereference_raw(list_next_rcu(head)); \ | |
246 | - pos != (head); \ | |
247 | - pos = rcu_dereference_raw(list_next_rcu((pos))) | |
248 | - | |
249 | 244 | /** |
250 | 245 | * list_for_each_entry_rcu - iterate over rcu list of given type |
251 | 246 | * @pos: the type * to use as a loop cursor. |
include/linux/rcupdate.h
... | ... | @@ -47,6 +47,8 @@ |
47 | 47 | extern int rcutorture_runnable; /* for sysctl */ |
48 | 48 | #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ |
49 | 49 | |
50 | +#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b)) | |
51 | +#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) | |
50 | 52 | #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) |
51 | 53 | #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) |
52 | 54 | |
... | ... | @@ -66,7 +68,6 @@ |
66 | 68 | extern void synchronize_sched(void); |
67 | 69 | extern void rcu_barrier_bh(void); |
68 | 70 | extern void rcu_barrier_sched(void); |
69 | -extern void synchronize_sched_expedited(void); | |
70 | 71 | extern int sched_expedited_torture_stats(char *page); |
71 | 72 | |
72 | 73 | static inline void __rcu_read_lock_bh(void) |
... | ... | @@ -118,7 +119,6 @@ |
118 | 119 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
119 | 120 | |
120 | 121 | /* Internal to kernel */ |
121 | -extern void rcu_init(void); | |
122 | 122 | extern void rcu_sched_qs(int cpu); |
123 | 123 | extern void rcu_bh_qs(int cpu); |
124 | 124 | extern void rcu_check_callbacks(int cpu, int user); |
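
The UINT_CMP_GE() and UINT_CMP_LT() macros added above compare sequence counters in modular unsigned arithmetic, so the result stays correct across counter wraparound as long as the two values are less than UINT_MAX/2 apart, mirroring the existing ULONG_CMP_*() macros. A stand-alone user-space illustration (for demonstration only):

    #include <limits.h>
    #include <stdio.h>

    #define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
    #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))

    int main(void)
    {
            unsigned int before_wrap = UINT_MAX - 5;        /* e.g. a sequence counter */
            unsigned int after_wrap  = before_wrap + 10;    /* wraps around to 4 */

            /* The naive comparison gets this backwards; the macro does not. */
            printf("naive: %d, UINT_CMP_GE: %d\n",
                   after_wrap >= before_wrap,               /* 0 */
                   UINT_CMP_GE(after_wrap, before_wrap));   /* 1 */
            return 0;
    }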
include/linux/rcutiny.h
... | ... | @@ -27,7 +27,9 @@ |
27 | 27 | |
28 | 28 | #include <linux/cache.h> |
29 | 29 | |
30 | -#define rcu_init_sched() do { } while (0) | |
30 | +static inline void rcu_init(void) | |
31 | +{ | |
32 | +} | |
31 | 33 | |
32 | 34 | #ifdef CONFIG_TINY_RCU |
33 | 35 | |
... | ... | @@ -58,6 +60,11 @@ |
58 | 60 | synchronize_sched(); |
59 | 61 | } |
60 | 62 | |
63 | +static inline void synchronize_sched_expedited(void) | |
64 | +{ | |
65 | + synchronize_sched(); | |
66 | +} | |
67 | + | |
61 | 68 | #ifdef CONFIG_TINY_RCU |
62 | 69 | |
63 | 70 | static inline void rcu_preempt_note_context_switch(void) |
64 | 71 | |
65 | 72 | |
66 | 73 | |
... | ... | @@ -125,16 +132,12 @@ |
125 | 132 | } |
126 | 133 | |
127 | 134 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
128 | - | |
129 | 135 | extern int rcu_scheduler_active __read_mostly; |
130 | 136 | extern void rcu_scheduler_starting(void); |
131 | - | |
132 | 137 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
133 | - | |
134 | 138 | static inline void rcu_scheduler_starting(void) |
135 | 139 | { |
136 | 140 | } |
137 | - | |
138 | 141 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
139 | 142 | |
140 | 143 | #endif /* __LINUX_RCUTINY_H */ |
include/linux/rcutree.h
... | ... | @@ -30,6 +30,7 @@ |
30 | 30 | #ifndef __LINUX_RCUTREE_H |
31 | 31 | #define __LINUX_RCUTREE_H |
32 | 32 | |
33 | +extern void rcu_init(void); | |
33 | 34 | extern void rcu_note_context_switch(int cpu); |
34 | 35 | extern int rcu_needs_cpu(int cpu); |
35 | 36 | extern void rcu_cpu_stall_reset(void); |
... | ... | @@ -47,6 +48,7 @@ |
47 | 48 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
48 | 49 | |
49 | 50 | extern void synchronize_rcu_bh(void); |
51 | +extern void synchronize_sched_expedited(void); | |
50 | 52 | extern void synchronize_rcu_expedited(void); |
51 | 53 | |
52 | 54 | static inline void synchronize_rcu_bh_expedited(void) |
include/linux/sched.h
... | ... | @@ -1229,6 +1229,9 @@ |
1229 | 1229 | #ifdef CONFIG_TREE_PREEMPT_RCU |
1230 | 1230 | struct rcu_node *rcu_blocked_node; |
1231 | 1231 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1232 | +#ifdef CONFIG_RCU_BOOST | |
1233 | + struct rt_mutex *rcu_boost_mutex; | |
1234 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
1232 | 1235 | |
1233 | 1236 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1234 | 1237 | struct sched_info sched_info; |
... | ... | @@ -1759,7 +1762,8 @@ |
1759 | 1762 | #ifdef CONFIG_PREEMPT_RCU |
1760 | 1763 | |
1761 | 1764 | #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ |
1762 | -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ | |
1765 | +#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */ | |
1766 | +#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */ | |
1763 | 1767 | |
1764 | 1768 | static inline void rcu_copy_process(struct task_struct *p) |
1765 | 1769 | { |
... | ... | @@ -1767,7 +1771,10 @@ |
1767 | 1771 | p->rcu_read_unlock_special = 0; |
1768 | 1772 | #ifdef CONFIG_TREE_PREEMPT_RCU |
1769 | 1773 | p->rcu_blocked_node = NULL; |
1770 | -#endif | |
1774 | +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | |
1775 | +#ifdef CONFIG_RCU_BOOST | |
1776 | + p->rcu_boost_mutex = NULL; | |
1777 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
1771 | 1778 | INIT_LIST_HEAD(&p->rcu_node_entry); |
1772 | 1779 | } |
1773 | 1780 |
init/Kconfig
... | ... | @@ -393,7 +393,6 @@ |
393 | 393 | |
394 | 394 | config RCU_TRACE |
395 | 395 | bool "Enable tracing for RCU" |
396 | - depends on TREE_RCU || TREE_PREEMPT_RCU | |
397 | 396 | help |
398 | 397 | This option provides tracing in RCU which presents stats |
399 | 398 | in debugfs for debugging RCU implementation. |
... | ... | @@ -458,6 +457,60 @@ |
458 | 457 | This option provides tracing for the TREE_RCU and |
459 | 458 | TREE_PREEMPT_RCU implementations, permitting Makefile to |
460 | 459 | trivially select kernel/rcutree_trace.c. |
460 | + | |
461 | +config RCU_BOOST | |
462 | + bool "Enable RCU priority boosting" | |
463 | + depends on RT_MUTEXES && TINY_PREEMPT_RCU | |
464 | + default n | |
465 | + help | |
466 | + This option boosts the priority of preempted RCU readers that | |
467 | + block the current preemptible RCU grace period for too long. | |
468 | + This option also prevents heavy loads from blocking RCU | |
469 | + callback invocation for all flavors of RCU. | |
470 | + | |
471 | + Say Y here if you are working with real-time apps or heavy loads | |
472 | + Say N here if you are unsure. | |
473 | + | |
474 | +config RCU_BOOST_PRIO | |
475 | + int "Real-time priority to boost RCU readers to" | |
476 | + range 1 99 | |
477 | + depends on RCU_BOOST | |
478 | + default 1 | |
479 | + help | |
480 | + This option specifies the real-time priority to which preempted | |
481 | + RCU readers are to be boosted. If you are working with CPU-bound | |
482 | + real-time applications, you should specify a priority higher then | |
483 | + the highest-priority CPU-bound application. | |
484 | + | |
485 | + Specify the real-time priority, or take the default if unsure. | |
486 | + | |
487 | +config RCU_BOOST_DELAY | |
488 | + int "Milliseconds to delay boosting after RCU grace-period start" | |
489 | + range 0 3000 | |
490 | + depends on RCU_BOOST | |
491 | + default 500 | |
492 | + help | |
493 | + This option specifies the time to wait after the beginning of | |
494 | + a given grace period before priority-boosting preempted RCU | |
495 | + readers blocking that grace period. Note that any RCU reader | |
496 | + blocking an expedited RCU grace period is boosted immediately. | |
497 | + | |
498 | + Accept the default if unsure. | |
499 | + | |
500 | +config SRCU_SYNCHRONIZE_DELAY | |
501 | + int "Microseconds to delay before waiting for readers" | |
502 | + range 0 20 | |
503 | + default 10 | |
504 | + help | |
505 | + This option controls how long SRCU delays before entering its | |
506 | + loop waiting on SRCU readers. The purpose of this loop is | |
507 | + to avoid the unconditional context-switch penalty that would | |
508 | + otherwise be incurred if there was an active SRCU reader, | |
509 | + in a manner similar to adaptive locking schemes. This should | |
510 | + be set to be a bit longer than the common-case SRCU read-side | |
511 | + critical-section overhead. | |
512 | + | |
513 | + Accept the default if unsure. | |
461 | 514 | |
462 | 515 | endmenu # "RCU Subsystem" |
463 | 516 |
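
The RCU_BOOST_DELAY value above is expressed in milliseconds; kernel/rcutiny_plugin.h (further down in this commit) converts it to a jiffies offset with DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000). A small worked example, with HZ assumed to be 250 purely for illustration:

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

    #define HZ                      250     /* assumed tick rate, for the example */
    #define CONFIG_RCU_BOOST_DELAY  500     /* default from the Kconfig entry above */

    int main(void)
    {
            unsigned long delay_jiffies =
                    DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);

            /* 500 ms at HZ=250 -> 125 jiffies before boosting may begin. */
            printf("boost delay = %lu jiffies\n", delay_jiffies);
            return 0;
    }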
kernel/rcutiny.c
... | ... | @@ -36,31 +36,16 @@ |
36 | 36 | #include <linux/time.h> |
37 | 37 | #include <linux/cpu.h> |
38 | 38 | |
39 | -/* Global control variables for rcupdate callback mechanism. */ | |
40 | -struct rcu_ctrlblk { | |
41 | - struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | |
42 | - struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | |
43 | - struct rcu_head **curtail; /* ->next pointer of last CB. */ | |
44 | -}; | |
39 | +/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ | |
40 | +static struct task_struct *rcu_kthread_task; | |
41 | +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); | |
42 | +static unsigned long have_rcu_kthread_work; | |
43 | +static void invoke_rcu_kthread(void); | |
45 | 44 | |
46 | -/* Definition for rcupdate control block. */ | |
47 | -static struct rcu_ctrlblk rcu_sched_ctrlblk = { | |
48 | - .donetail = &rcu_sched_ctrlblk.rcucblist, | |
49 | - .curtail = &rcu_sched_ctrlblk.rcucblist, | |
50 | -}; | |
51 | - | |
52 | -static struct rcu_ctrlblk rcu_bh_ctrlblk = { | |
53 | - .donetail = &rcu_bh_ctrlblk.rcucblist, | |
54 | - .curtail = &rcu_bh_ctrlblk.rcucblist, | |
55 | -}; | |
56 | - | |
57 | -#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
58 | -int rcu_scheduler_active __read_mostly; | |
59 | -EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |
60 | -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | |
61 | - | |
62 | 45 | /* Forward declarations for rcutiny_plugin.h. */ |
63 | -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | |
46 | +struct rcu_ctrlblk; | |
47 | +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); | |
48 | +static int rcu_kthread(void *arg); | |
64 | 49 | static void __call_rcu(struct rcu_head *head, |
65 | 50 | void (*func)(struct rcu_head *rcu), |
66 | 51 | struct rcu_ctrlblk *rcp); |
... | ... | @@ -123,7 +108,7 @@ |
123 | 108 | { |
124 | 109 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + |
125 | 110 | rcu_qsctr_help(&rcu_bh_ctrlblk)) |
126 | - raise_softirq(RCU_SOFTIRQ); | |
111 | + invoke_rcu_kthread(); | |
127 | 112 | } |
128 | 113 | |
129 | 114 | /* |
... | ... | @@ -132,7 +117,7 @@ |
132 | 117 | void rcu_bh_qs(int cpu) |
133 | 118 | { |
134 | 119 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) |
135 | - raise_softirq(RCU_SOFTIRQ); | |
120 | + invoke_rcu_kthread(); | |
136 | 121 | } |
137 | 122 | |
138 | 123 | /* |
139 | 124 | |
140 | 125 | |
... | ... | @@ -152,13 +137,14 @@ |
152 | 137 | } |
153 | 138 | |
154 | 139 | /* |
155 | - * Helper function for rcu_process_callbacks() that operates on the | |
156 | - * specified rcu_ctrlkblk structure. | |
140 | + * Invoke the RCU callbacks on the specified rcu_ctrlblk structure |
141 | + * whose grace period has elapsed. | |
157 | 142 | */ |
158 | -static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |
143 | +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) | |
159 | 144 | { |
160 | 145 | struct rcu_head *next, *list; |
161 | 146 | unsigned long flags; |
147 | + RCU_TRACE(int cb_count = 0); | |
162 | 148 | |
163 | 149 | /* If no RCU callbacks ready to invoke, just return. */ |
164 | 150 | if (&rcp->rcucblist == rcp->donetail) |
165 | 151 | |
166 | 152 | |
167 | 153 | |
168 | 154 | |
169 | 155 | |
170 | 156 | |
171 | 157 | |
... | ... | @@ -180,22 +166,61 @@ |
180 | 166 | next = list->next; |
181 | 167 | prefetch(next); |
182 | 168 | debug_rcu_head_unqueue(list); |
169 | + local_bh_disable(); | |
183 | 170 | list->func(list); |
171 | + local_bh_enable(); | |
184 | 172 | list = next; |
173 | + RCU_TRACE(cb_count++); | |
185 | 174 | } |
175 | + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | |
186 | 176 | } |
187 | 177 | |
188 | 178 | /* |
189 | - * Invoke any callbacks whose grace period has completed. | |
179 | + * This kthread invokes RCU callbacks whose grace periods have | |
180 | + * elapsed. It is awakened as needed, and takes the place of the | |
181 | + * RCU_SOFTIRQ that was used previously for this purpose. | |
182 | + * This is a kthread, but it is never stopped, at least not until | |
183 | + * the system goes down. | |
190 | 184 | */ |
191 | -static void rcu_process_callbacks(struct softirq_action *unused) | |
185 | +static int rcu_kthread(void *arg) | |
192 | 186 | { |
193 | - __rcu_process_callbacks(&rcu_sched_ctrlblk); | |
194 | - __rcu_process_callbacks(&rcu_bh_ctrlblk); | |
195 | - rcu_preempt_process_callbacks(); | |
187 | + unsigned long work; | |
188 | + unsigned long morework; | |
189 | + unsigned long flags; | |
190 | + | |
191 | + for (;;) { | |
192 | + wait_event(rcu_kthread_wq, have_rcu_kthread_work != 0); | |
193 | + morework = rcu_boost(); | |
194 | + local_irq_save(flags); | |
195 | + work = have_rcu_kthread_work; | |
196 | + have_rcu_kthread_work = morework; | |
197 | + local_irq_restore(flags); | |
198 | + if (work) { | |
199 | + rcu_process_callbacks(&rcu_sched_ctrlblk); | |
200 | + rcu_process_callbacks(&rcu_bh_ctrlblk); | |
201 | + rcu_preempt_process_callbacks(); | |
202 | + } | |
203 | + schedule_timeout_interruptible(1); /* Leave CPU for others. */ | |
204 | + } | |
205 | + | |
206 | + return 0; /* Not reached, but needed to shut gcc up. */ | |
196 | 207 | } |
197 | 208 | |
198 | 209 | /* |
210 | + * Wake up rcu_kthread() to process callbacks now eligible for invocation | |
211 | + * or to boost readers. | |
212 | + */ | |
213 | +static void invoke_rcu_kthread(void) | |
214 | +{ | |
215 | + unsigned long flags; | |
216 | + | |
217 | + local_irq_save(flags); | |
218 | + have_rcu_kthread_work = 1; | |
219 | + wake_up(&rcu_kthread_wq); | |
220 | + local_irq_restore(flags); | |
221 | +} | |
222 | + | |
223 | +/* | |
199 | 224 | * Wait for a grace period to elapse. But it is illegal to invoke |
200 | 225 | * synchronize_sched() from within an RCU read-side critical section. |
201 | 226 | * Therefore, any legal call to synchronize_sched() is a quiescent |
... | ... | @@ -230,6 +255,7 @@ |
230 | 255 | local_irq_save(flags); |
231 | 256 | *rcp->curtail = head; |
232 | 257 | rcp->curtail = &head->next; |
258 | + RCU_TRACE(rcp->qlen++); | |
233 | 259 | local_irq_restore(flags); |
234 | 260 | } |
235 | 261 | |
236 | 262 | |
237 | 263 | |
... | ... | @@ -282,8 +308,17 @@ |
282 | 308 | } |
283 | 309 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); |
284 | 310 | |
285 | -void __init rcu_init(void) | |
311 | +/* | |
312 | + * Spawn the kthread that invokes RCU callbacks. | |
313 | + */ | |
314 | +static int __init rcu_spawn_kthreads(void) | |
286 | 315 | { |
287 | - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | |
316 | + struct sched_param sp; | |
317 | + | |
318 | + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); | |
319 | + sp.sched_priority = RCU_BOOST_PRIO; | |
320 | + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); | |
321 | + return 0; | |
288 | 322 | } |
323 | +early_initcall(rcu_spawn_kthreads); |
kernel/rcutiny_plugin.h
... | ... | @@ -22,6 +22,40 @@ |
22 | 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
23 | 23 | */ |
24 | 24 | |
25 | +#include <linux/kthread.h> | |
26 | +#include <linux/debugfs.h> | |
27 | +#include <linux/seq_file.h> | |
28 | + | |
29 | +#ifdef CONFIG_RCU_TRACE | |
30 | +#define RCU_TRACE(stmt) stmt | |
31 | +#else /* #ifdef CONFIG_RCU_TRACE */ | |
32 | +#define RCU_TRACE(stmt) | |
33 | +#endif /* #else #ifdef CONFIG_RCU_TRACE */ | |
34 | + | |
35 | +/* Global control variables for rcupdate callback mechanism. */ | |
36 | +struct rcu_ctrlblk { | |
37 | + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | |
38 | + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | |
39 | + struct rcu_head **curtail; /* ->next pointer of last CB. */ | |
40 | + RCU_TRACE(long qlen); /* Number of pending CBs. */ | |
41 | +}; | |
42 | + | |
43 | +/* Definition for rcupdate control block. */ | |
44 | +static struct rcu_ctrlblk rcu_sched_ctrlblk = { | |
45 | + .donetail = &rcu_sched_ctrlblk.rcucblist, | |
46 | + .curtail = &rcu_sched_ctrlblk.rcucblist, | |
47 | +}; | |
48 | + | |
49 | +static struct rcu_ctrlblk rcu_bh_ctrlblk = { | |
50 | + .donetail = &rcu_bh_ctrlblk.rcucblist, | |
51 | + .curtail = &rcu_bh_ctrlblk.rcucblist, | |
52 | +}; | |
53 | + | |
54 | +#ifdef CONFIG_DEBUG_LOCK_ALLOC | |
55 | +int rcu_scheduler_active __read_mostly; | |
56 | +EXPORT_SYMBOL_GPL(rcu_scheduler_active); | |
57 | +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | |
58 | + | |
25 | 59 | #ifdef CONFIG_TINY_PREEMPT_RCU |
26 | 60 | |
27 | 61 | #include <linux/delay.h> |
28 | 62 | |
29 | 63 | |
... | ... | @@ -46,17 +80,45 @@ |
46 | 80 | struct list_head *gp_tasks; |
47 | 81 | /* Pointer to the first task blocking the */ |
48 | 82 | /* current grace period, or NULL if there */ |
49 | - /* is not such task. */ | |
83 | + /* is no such task. */ | |
50 | 84 | struct list_head *exp_tasks; |
51 | 85 | /* Pointer to first task blocking the */ |
52 | 86 | /* current expedited grace period, or NULL */ |
53 | 87 | /* if there is no such task. If there */ |
54 | 88 | /* is no current expedited grace period, */ |
55 | 89 | /* then there cannot be any such task. */ |
90 | +#ifdef CONFIG_RCU_BOOST | |
91 | + struct list_head *boost_tasks; | |
92 | + /* Pointer to first task that needs to be */ | |
93 | + /* priority-boosted, or NULL if no priority */ | |
94 | + /* boosting is needed. If there is no */ | |
95 | + /* current or expedited grace period, there */ | |
96 | + /* can be no such task. */ | |
97 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
56 | 98 | u8 gpnum; /* Current grace period. */ |
57 | 99 | u8 gpcpu; /* Last grace period blocked by the CPU. */ |
58 | 100 | u8 completed; /* Last grace period completed. */ |
59 | 101 | /* If all three are equal, RCU is idle. */ |
102 | +#ifdef CONFIG_RCU_BOOST | |
103 | + s8 boosted_this_gp; /* Has boosting already happened? */ | |
104 | + unsigned long boost_time; /* When to start boosting (jiffies) */ | |
105 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
106 | +#ifdef CONFIG_RCU_TRACE | |
107 | + unsigned long n_grace_periods; | |
108 | +#ifdef CONFIG_RCU_BOOST | |
109 | + unsigned long n_tasks_boosted; | |
110 | + unsigned long n_exp_boosts; | |
111 | + unsigned long n_normal_boosts; | |
112 | + unsigned long n_normal_balk_blkd_tasks; | |
113 | + unsigned long n_normal_balk_gp_tasks; | |
114 | + unsigned long n_normal_balk_boost_tasks; | |
115 | + unsigned long n_normal_balk_boosted; | |
116 | + unsigned long n_normal_balk_notyet; | |
117 | + unsigned long n_normal_balk_nos; | |
118 | + unsigned long n_exp_balk_blkd_tasks; | |
119 | + unsigned long n_exp_balk_nos; | |
120 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
121 | +#endif /* #ifdef CONFIG_RCU_TRACE */ | |
60 | 122 | }; |
61 | 123 | |
62 | 124 | static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { |
... | ... | @@ -122,6 +184,210 @@ |
122 | 184 | } |
123 | 185 | |
124 | 186 | /* |
187 | + * Advance a ->blkd_tasks-list pointer to the next entry, |
188 | + * returning NULL instead if at the end of the list. |
189 | + */ | |
190 | +static struct list_head *rcu_next_node_entry(struct task_struct *t) | |
191 | +{ | |
192 | + struct list_head *np; | |
193 | + | |
194 | + np = t->rcu_node_entry.next; | |
195 | + if (np == &rcu_preempt_ctrlblk.blkd_tasks) | |
196 | + np = NULL; | |
197 | + return np; | |
198 | +} | |
199 | + | |
200 | +#ifdef CONFIG_RCU_TRACE | |
201 | + | |
202 | +#ifdef CONFIG_RCU_BOOST | |
203 | +static void rcu_initiate_boost_trace(void); | |
204 | +static void rcu_initiate_exp_boost_trace(void); | |
205 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
206 | + | |
207 | +/* | |
208 | + * Dump additional statistics for TINY_PREEMPT_RCU. |
209 | + */ | |
210 | +static void show_tiny_preempt_stats(struct seq_file *m) | |
211 | +{ | |
212 | + seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", | |
213 | + rcu_preempt_ctrlblk.rcb.qlen, | |
214 | + rcu_preempt_ctrlblk.n_grace_periods, | |
215 | + rcu_preempt_ctrlblk.gpnum, | |
216 | + rcu_preempt_ctrlblk.gpcpu, | |
217 | + rcu_preempt_ctrlblk.completed, | |
218 | + "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], | |
219 | + "N."[!rcu_preempt_ctrlblk.gp_tasks], | |
220 | + "E."[!rcu_preempt_ctrlblk.exp_tasks]); | |
221 | +#ifdef CONFIG_RCU_BOOST | |
222 | + seq_printf(m, " ttb=%c btg=", | |
223 | + "B."[!rcu_preempt_ctrlblk.boost_tasks]); | |
224 | + switch (rcu_preempt_ctrlblk.boosted_this_gp) { | |
225 | + case -1: | |
226 | + seq_puts(m, "exp"); | |
227 | + break; | |
228 | + case 0: | |
229 | + seq_puts(m, "no"); | |
230 | + break; | |
231 | + case 1: | |
232 | + seq_puts(m, "begun"); | |
233 | + break; | |
234 | + case 2: | |
235 | + seq_puts(m, "done"); | |
236 | + break; | |
237 | + default: | |
238 | + seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp); | |
239 | + } | |
240 | + seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", | |
241 | + rcu_preempt_ctrlblk.n_tasks_boosted, | |
242 | + rcu_preempt_ctrlblk.n_exp_boosts, | |
243 | + rcu_preempt_ctrlblk.n_normal_boosts, | |
244 | + (int)(jiffies & 0xffff), | |
245 | + (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); | |
246 | + seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n", | |
247 | + "normal balk", | |
248 | + rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks, | |
249 | + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks, | |
250 | + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks, | |
251 | + rcu_preempt_ctrlblk.n_normal_balk_boosted, | |
252 | + rcu_preempt_ctrlblk.n_normal_balk_notyet, | |
253 | + rcu_preempt_ctrlblk.n_normal_balk_nos); | |
254 | + seq_printf(m, " exp balk: bt=%lu nos=%lu\n", | |
255 | + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks, | |
256 | + rcu_preempt_ctrlblk.n_exp_balk_nos); | |
257 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
258 | +} | |
259 | + | |
260 | +#endif /* #ifdef CONFIG_RCU_TRACE */ | |
261 | + | |
262 | +#ifdef CONFIG_RCU_BOOST | |
263 | + | |
264 | +#include "rtmutex_common.h" | |
265 | + | |
266 | +/* | |
267 | + * Carry out RCU priority boosting on the task indicated by ->boost_tasks, | |
268 | + * and advance ->boost_tasks to the next task in the ->blkd_tasks list. | |
269 | + */ | |
270 | +static int rcu_boost(void) | |
271 | +{ | |
272 | + unsigned long flags; | |
273 | + struct rt_mutex mtx; | |
274 | + struct list_head *np; | |
275 | + struct task_struct *t; | |
276 | + | |
277 | + if (rcu_preempt_ctrlblk.boost_tasks == NULL) | |
278 | + return 0; /* Nothing to boost. */ | |
279 | + raw_local_irq_save(flags); | |
280 | + rcu_preempt_ctrlblk.boosted_this_gp++; | |
281 | + t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct, | |
282 | + rcu_node_entry); | |
283 | + np = rcu_next_node_entry(t); | |
284 | + rt_mutex_init_proxy_locked(&mtx, t); | |
285 | + t->rcu_boost_mutex = &mtx; | |
286 | + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; | |
287 | + raw_local_irq_restore(flags); | |
288 | + rt_mutex_lock(&mtx); | |
289 | + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); | |
290 | + rcu_preempt_ctrlblk.boosted_this_gp++; | |
291 | + rt_mutex_unlock(&mtx); | |
292 | + return rcu_preempt_ctrlblk.boost_tasks != NULL; | |
293 | +} | |
294 | + | |
295 | +/* | |
296 | + * Check to see if it is now time to start boosting RCU readers blocking | |
297 | + * the current grace period, and, if so, tell the rcu_kthread_task to | |
298 | + * start boosting them. If there is an expedited boost in progress, | |
299 | + * we wait for it to complete. | |
300 | + * | |
301 | + * If there are no blocked readers blocking the current grace period, | |
302 | + * return 0 to let the caller know, otherwise return 1. Note that this | |
303 | + * return value is independent of whether or not boosting was done. | |
304 | + */ | |
305 | +static int rcu_initiate_boost(void) | |
306 | +{ | |
307 | + if (!rcu_preempt_blocked_readers_cgp()) { | |
308 | + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++); | |
309 | + return 0; | |
310 | + } | |
311 | + if (rcu_preempt_ctrlblk.gp_tasks != NULL && | |
312 | + rcu_preempt_ctrlblk.boost_tasks == NULL && | |
313 | + rcu_preempt_ctrlblk.boosted_this_gp == 0 && | |
314 | + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) { | |
315 | + rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; | |
316 | + invoke_rcu_kthread(); | |
317 | + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); | |
318 | + } else | |
319 | + RCU_TRACE(rcu_initiate_boost_trace()); | |
320 | + return 1; | |
321 | +} | |
322 | + | |
323 | +/* | |
324 | + * Initiate boosting for an expedited grace period. | |
325 | + */ | |
326 | +static void rcu_initiate_expedited_boost(void) | |
327 | +{ | |
328 | + unsigned long flags; | |
329 | + | |
330 | + raw_local_irq_save(flags); | |
331 | + if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) { | |
332 | + rcu_preempt_ctrlblk.boost_tasks = | |
333 | + rcu_preempt_ctrlblk.blkd_tasks.next; | |
334 | + rcu_preempt_ctrlblk.boosted_this_gp = -1; | |
335 | + invoke_rcu_kthread(); | |
336 | + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); | |
337 | + } else | |
338 | + RCU_TRACE(rcu_initiate_exp_boost_trace()); | |
339 | + raw_local_irq_restore(flags); | |
340 | +} | |
341 | + | |
342 | +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000); | |
343 | + | |
344 | +/* | |
345 | + * Do priority-boost accounting for the start of a new grace period. | |
346 | + */ | |
347 | +static void rcu_preempt_boost_start_gp(void) | |
348 | +{ | |
349 | + rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | |
350 | + if (rcu_preempt_ctrlblk.boosted_this_gp > 0) | |
351 | + rcu_preempt_ctrlblk.boosted_this_gp = 0; | |
352 | +} | |
353 | + | |
354 | +#else /* #ifdef CONFIG_RCU_BOOST */ | |
355 | + | |
356 | +/* | |
357 | + * If there is no RCU priority boosting, we don't boost. | |
358 | + */ | |
359 | +static int rcu_boost(void) | |
360 | +{ | |
361 | + return 0; | |
362 | +} | |
363 | + | |
364 | +/* | |
365 | + * If there is no RCU priority boosting, we don't initiate boosting, | |
366 | + * but we do indicate whether there are blocked readers blocking the | |
367 | + * current grace period. | |
368 | + */ | |
369 | +static int rcu_initiate_boost(void) | |
370 | +{ | |
371 | + return rcu_preempt_blocked_readers_cgp(); | |
372 | +} | |
373 | + | |
374 | +/* | |
375 | + * If there is no RCU priority boosting, we don't initiate expedited boosting. | |
376 | + */ | |
377 | +static void rcu_initiate_expedited_boost(void) | |
378 | +{ | |
379 | +} | |
380 | + | |
381 | +/* | |
382 | + * If there is no RCU priority boosting, nothing to do at grace-period start. | |
383 | + */ | |
384 | +static void rcu_preempt_boost_start_gp(void) | |
385 | +{ | |
386 | +} | |
387 | + | |
388 | +#endif /* else #ifdef CONFIG_RCU_BOOST */ | |
389 | + | |
390 | +/* | |
125 | 391 | * Record a preemptible-RCU quiescent state for the specified CPU. Note |
126 | 392 | * that this just means that the task currently running on the CPU is |
127 | 393 | * in a quiescent state. There might be any number of tasks blocked |
128 | 394 | |
129 | 395 | |
... | ... | @@ -148,11 +414,14 @@ |
148 | 414 | rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; |
149 | 415 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; |
150 | 416 | |
417 | + /* If there is no GP then there is nothing more to do. */ | |
418 | + if (!rcu_preempt_gp_in_progress()) | |
419 | + return; | |
151 | 420 | /* |
152 | - * If there is no GP, or if blocked readers are still blocking GP, | |
153 | - * then there is nothing more to do. | |
421 | + * Check up on boosting. If there are no readers blocking the | |
422 | + * current grace period, leave. | |
154 | 423 | */ |
155 | - if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) | |
424 | + if (rcu_initiate_boost()) | |
156 | 425 | return; |
157 | 426 | |
158 | 427 | /* Advance callbacks. */ |
159 | 428 | |
... | ... | @@ -164,9 +433,9 @@ |
164 | 433 | if (!rcu_preempt_blocked_readers_any()) |
165 | 434 | rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; |
166 | 435 | |
167 | - /* If there are done callbacks, make RCU_SOFTIRQ process them. */ | |
436 | + /* If there are done callbacks, cause them to be invoked. */ | |
168 | 437 | if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) |
169 | - raise_softirq(RCU_SOFTIRQ); | |
438 | + invoke_rcu_kthread(); | |
170 | 439 | } |
171 | 440 | |
172 | 441 | /* |
173 | 442 | |
... | ... | @@ -178,12 +447,16 @@ |
178 | 447 | |
179 | 448 | /* Official start of GP. */ |
180 | 449 | rcu_preempt_ctrlblk.gpnum++; |
450 | + RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); | |
181 | 451 | |
182 | 452 | /* Any blocked RCU readers block new GP. */ |
183 | 453 | if (rcu_preempt_blocked_readers_any()) |
184 | 454 | rcu_preempt_ctrlblk.gp_tasks = |
185 | 455 | rcu_preempt_ctrlblk.blkd_tasks.next; |
186 | 456 | |
457 | + /* Set up for RCU priority boosting. */ | |
458 | + rcu_preempt_boost_start_gp(); | |
459 | + | |
187 | 460 | /* If there is no running reader, CPU is done with GP. */ |
188 | 461 | if (!rcu_preempt_running_reader()) |
189 | 462 | rcu_preempt_cpu_qs(); |
190 | 463 | |
... | ... | @@ -304,14 +577,16 @@ |
304 | 577 | */ |
305 | 578 | empty = !rcu_preempt_blocked_readers_cgp(); |
306 | 579 | empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; |
307 | - np = t->rcu_node_entry.next; | |
308 | - if (np == &rcu_preempt_ctrlblk.blkd_tasks) | |
309 | - np = NULL; | |
580 | + np = rcu_next_node_entry(t); | |
310 | 581 | list_del(&t->rcu_node_entry); |
311 | 582 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) |
312 | 583 | rcu_preempt_ctrlblk.gp_tasks = np; |
313 | 584 | if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) |
314 | 585 | rcu_preempt_ctrlblk.exp_tasks = np; |
586 | +#ifdef CONFIG_RCU_BOOST | |
587 | + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) | |
588 | + rcu_preempt_ctrlblk.boost_tasks = np; | |
589 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
315 | 590 | INIT_LIST_HEAD(&t->rcu_node_entry); |
316 | 591 | |
317 | 592 | /* |
... | ... | @@ -331,6 +606,14 @@ |
331 | 606 | if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) |
332 | 607 | rcu_report_exp_done(); |
333 | 608 | } |
609 | +#ifdef CONFIG_RCU_BOOST | |
610 | + /* Unboost self if was boosted. */ | |
611 | + if (special & RCU_READ_UNLOCK_BOOSTED) { | |
612 | + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; | |
613 | + rt_mutex_unlock(t->rcu_boost_mutex); | |
614 | + t->rcu_boost_mutex = NULL; | |
615 | + } | |
616 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
334 | 617 | local_irq_restore(flags); |
335 | 618 | } |
336 | 619 | |
... | ... | @@ -374,7 +657,7 @@ |
374 | 657 | rcu_preempt_cpu_qs(); |
375 | 658 | if (&rcu_preempt_ctrlblk.rcb.rcucblist != |
376 | 659 | rcu_preempt_ctrlblk.rcb.donetail) |
377 | - raise_softirq(RCU_SOFTIRQ); | |
660 | + invoke_rcu_kthread(); | |
378 | 661 | if (rcu_preempt_gp_in_progress() && |
379 | 662 | rcu_cpu_blocking_cur_gp() && |
380 | 663 | rcu_preempt_running_reader()) |
... | ... | @@ -383,7 +666,7 @@ |
383 | 666 | |
384 | 667 | /* |
385 | 668 | * TINY_PREEMPT_RCU has an extra callback-list tail pointer to |
386 | - * update, so this is invoked from __rcu_process_callbacks() to | |
669 | + * update, so this is invoked from rcu_process_callbacks() to | |
387 | 670 | * handle that case. Of course, it is invoked for all flavors of |
388 | 671 | * RCU, but RCU callbacks can appear only on one of the lists, and |
389 | 672 | * neither ->nexttail nor ->donetail can possibly be NULL, so there |
... | ... | @@ -400,7 +683,7 @@ |
400 | 683 | */ |
401 | 684 | static void rcu_preempt_process_callbacks(void) |
402 | 685 | { |
403 | - __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | |
686 | + rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); | |
404 | 687 | } |
405 | 688 | |
406 | 689 | /* |
... | ... | @@ -417,6 +700,7 @@ |
417 | 700 | local_irq_save(flags); |
418 | 701 | *rcu_preempt_ctrlblk.nexttail = head; |
419 | 702 | rcu_preempt_ctrlblk.nexttail = &head->next; |
703 | + RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); | |
420 | 704 | rcu_preempt_start_gp(); /* checks to see if GP needed. */ |
421 | 705 | local_irq_restore(flags); |
422 | 706 | } |
... | ... | @@ -532,6 +816,7 @@ |
532 | 816 | |
533 | 817 | /* Wait for tail of ->blkd_tasks list to drain. */ |
534 | 818 | if (rcu_preempted_readers_exp()) |
819 | + rcu_initiate_expedited_boost(); | |
535 | 820 | wait_event(sync_rcu_preempt_exp_wq, |
536 | 821 | !rcu_preempted_readers_exp()); |
537 | 822 | |
538 | 823 | |
... | ... | @@ -572,7 +857,28 @@ |
572 | 857 | |
573 | 858 | #else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ |
574 | 859 | |
860 | +#ifdef CONFIG_RCU_TRACE | |
861 | + | |
575 | 862 | /* |
863 | + * Because preemptible RCU does not exist, it is not necessary to | |
864 | + * dump out its statistics. | |
865 | + */ | |
866 | +static void show_tiny_preempt_stats(struct seq_file *m) | |
867 | +{ | |
868 | +} | |
869 | + | |
870 | +#endif /* #ifdef CONFIG_RCU_TRACE */ | |
871 | + | |
872 | +/* | |
873 | + * Because preemptible RCU does not exist, it is never necessary to | |
874 | + * boost preempted RCU readers. | |
875 | + */ | |
876 | +static int rcu_boost(void) | |
877 | +{ | |
878 | + return 0; | |
879 | +} | |
880 | + | |
881 | +/* | |
576 | 882 | * Because preemptible RCU does not exist, it never has any callbacks |
577 | 883 | * to check. |
578 | 884 | */ |
579 | 885 | |
580 | 886 | |
... | ... | @@ -599,18 +905,117 @@ |
599 | 905 | #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ |
600 | 906 | |
601 | 907 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
602 | - | |
603 | 908 | #include <linux/kernel_stat.h> |
604 | 909 | |
605 | 910 | /* |
606 | 911 | * During boot, we forgive RCU lockdep issues. After this function is |
607 | 912 | * invoked, we start taking RCU lockdep issues seriously. |
608 | 913 | */ |
609 | -void rcu_scheduler_starting(void) | |
914 | +void __init rcu_scheduler_starting(void) | |
610 | 915 | { |
611 | 916 | WARN_ON(nr_context_switches() > 0); |
612 | 917 | rcu_scheduler_active = 1; |
613 | 918 | } |
614 | 919 | |
615 | 920 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
921 | + | |
922 | +#ifdef CONFIG_RCU_BOOST | |
923 | +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | |
924 | +#else /* #ifdef CONFIG_RCU_BOOST */ | |
925 | +#define RCU_BOOST_PRIO 1 | |
926 | +#endif /* #else #ifdef CONFIG_RCU_BOOST */ | |
927 | + | |
928 | +#ifdef CONFIG_RCU_TRACE | |
929 | + | |
930 | +#ifdef CONFIG_RCU_BOOST | |
931 | + | |
932 | +static void rcu_initiate_boost_trace(void) | |
933 | +{ | |
934 | + if (rcu_preempt_ctrlblk.gp_tasks == NULL) | |
935 | + rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++; | |
936 | + else if (rcu_preempt_ctrlblk.boost_tasks != NULL) | |
937 | + rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++; | |
938 | + else if (rcu_preempt_ctrlblk.boosted_this_gp != 0) | |
939 | + rcu_preempt_ctrlblk.n_normal_balk_boosted++; | |
940 | + else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) | |
941 | + rcu_preempt_ctrlblk.n_normal_balk_notyet++; | |
942 | + else | |
943 | + rcu_preempt_ctrlblk.n_normal_balk_nos++; | |
944 | +} | |
945 | + | |
946 | +static void rcu_initiate_exp_boost_trace(void) | |
947 | +{ | |
948 | + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) | |
949 | + rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++; | |
950 | + else | |
951 | + rcu_preempt_ctrlblk.n_exp_balk_nos++; | |
952 | +} | |
953 | + | |
954 | +#endif /* #ifdef CONFIG_RCU_BOOST */ | |
955 | + | |
956 | +static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | |
957 | +{ | |
958 | + unsigned long flags; | |
959 | + | |
960 | + raw_local_irq_save(flags); | |
961 | + rcp->qlen -= n; | |
962 | + raw_local_irq_restore(flags); | |
963 | +} | |
964 | + | |
965 | +/* | |
966 | + * Dump statistics for TINY_RCU, such as they are. | |
967 | + */ | |
968 | +static int show_tiny_stats(struct seq_file *m, void *unused) | |
969 | +{ | |
970 | + show_tiny_preempt_stats(m); | |
971 | + seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); | |
972 | + seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); | |
973 | + return 0; | |
974 | +} | |
975 | + | |
976 | +static int show_tiny_stats_open(struct inode *inode, struct file *file) | |
977 | +{ | |
978 | + return single_open(file, show_tiny_stats, NULL); | |
979 | +} | |
980 | + | |
981 | +static const struct file_operations show_tiny_stats_fops = { | |
982 | + .owner = THIS_MODULE, | |
983 | + .open = show_tiny_stats_open, | |
984 | + .read = seq_read, | |
985 | + .llseek = seq_lseek, | |
986 | + .release = single_release, | |
987 | +}; | |
988 | + | |
989 | +static struct dentry *rcudir; | |
990 | + | |
991 | +static int __init rcutiny_trace_init(void) | |
992 | +{ | |
993 | + struct dentry *retval; | |
994 | + | |
995 | + rcudir = debugfs_create_dir("rcu", NULL); | |
996 | + if (!rcudir) | |
997 | + goto free_out; | |
998 | + retval = debugfs_create_file("rcudata", 0444, rcudir, | |
999 | + NULL, &show_tiny_stats_fops); | |
1000 | + if (!retval) | |
1001 | + goto free_out; | |
1002 | + return 0; | |
1003 | +free_out: | |
1004 | + debugfs_remove_recursive(rcudir); | |
1005 | + return 1; | |
1006 | +} | |
1007 | + | |
1008 | +static void __exit rcutiny_trace_cleanup(void) | |
1009 | +{ | |
1010 | + debugfs_remove_recursive(rcudir); | |
1011 | +} | |
1012 | + | |
1013 | +module_init(rcutiny_trace_init); | |
1014 | +module_exit(rcutiny_trace_cleanup); | |
1015 | + | |
1016 | +MODULE_AUTHOR("Paul E. McKenney"); | |
1017 | +MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); | |
1018 | +MODULE_LICENSE("GPL"); | |
1019 | + | |
1020 | +#endif /* #ifdef CONFIG_RCU_TRACE */ |
kernel/rcutorture.c
... | ... | @@ -47,6 +47,7 @@ |
47 | 47 | #include <linux/srcu.h> |
48 | 48 | #include <linux/slab.h> |
49 | 49 | #include <asm/byteorder.h> |
50 | +#include <linux/sched.h> | |
50 | 51 | |
51 | 52 | MODULE_LICENSE("GPL"); |
52 | 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " |
... | ... | @@ -64,6 +65,9 @@ |
64 | 65 | static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ |
65 | 66 | static int fqs_holdoff = 0; /* Hold time within burst (us). */ |
66 | 67 | static int fqs_stutter = 3; /* Wait time between bursts (s). */ |
68 | +static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ | |
69 | +static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ | |
70 | +static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ | |
67 | 71 | static char *torture_type = "rcu"; /* What RCU implementation to torture. */ |
68 | 72 | |
69 | 73 | module_param(nreaders, int, 0444); |
... | ... | @@ -88,6 +92,12 @@ |
88 | 92 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); |
89 | 93 | module_param(fqs_stutter, int, 0444); |
90 | 94 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); |
95 | +module_param(test_boost, int, 0444); | |
96 | +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | |
97 | +module_param(test_boost_interval, int, 0444); | |
98 | +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | |
99 | +module_param(test_boost_duration, int, 0444); | |
100 | +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | |
91 | 101 | module_param(torture_type, charp, 0444); |
92 | 102 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); |
93 | 103 | |
... | ... | @@ -109,6 +119,7 @@ |
109 | 119 | static struct task_struct *shuffler_task; |
110 | 120 | static struct task_struct *stutter_task; |
111 | 121 | static struct task_struct *fqs_task; |
122 | +static struct task_struct *boost_tasks[NR_CPUS]; | |
112 | 123 | |
113 | 124 | #define RCU_TORTURE_PIPE_LEN 10 |
114 | 125 | |
... | ... | @@ -134,6 +145,12 @@ |
134 | 145 | static atomic_t n_rcu_torture_free; |
135 | 146 | static atomic_t n_rcu_torture_mberror; |
136 | 147 | static atomic_t n_rcu_torture_error; |
148 | +static long n_rcu_torture_boost_ktrerror; | |
149 | +static long n_rcu_torture_boost_rterror; | |
150 | +static long n_rcu_torture_boost_allocerror; | |
151 | +static long n_rcu_torture_boost_afferror; | |
152 | +static long n_rcu_torture_boost_failure; | |
153 | +static long n_rcu_torture_boosts; | |
137 | 154 | static long n_rcu_torture_timers; |
138 | 155 | static struct list_head rcu_torture_removed; |
139 | 156 | static cpumask_var_t shuffle_tmp_mask; |
... | ... | @@ -147,6 +164,16 @@ |
147 | 164 | #endif |
148 | 165 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; |
149 | 166 | |
167 | +#ifdef CONFIG_RCU_BOOST | |
168 | +#define rcu_can_boost() 1 | |
169 | +#else /* #ifdef CONFIG_RCU_BOOST */ | |
170 | +#define rcu_can_boost() 0 | |
171 | +#endif /* #else #ifdef CONFIG_RCU_BOOST */ | |
172 | + | |
173 | +static unsigned long boost_starttime; /* jiffies of next boost test start. */ | |
174 | +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | |
175 | + /* and boost task create/destroy. */ | |
176 | + | |
150 | 177 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ |
151 | 178 | |
152 | 179 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ |
... | ... | @@ -277,6 +304,7 @@ |
277 | 304 | void (*fqs)(void); |
278 | 305 | int (*stats)(char *page); |
279 | 306 | int irq_capable; |
307 | + int can_boost; | |
280 | 308 | char *name; |
281 | 309 | }; |
282 | 310 | |
... | ... | @@ -366,6 +394,7 @@ |
366 | 394 | .fqs = rcu_force_quiescent_state, |
367 | 395 | .stats = NULL, |
368 | 396 | .irq_capable = 1, |
397 | + .can_boost = rcu_can_boost(), | |
369 | 398 | .name = "rcu" |
370 | 399 | }; |
371 | 400 | |
... | ... | @@ -408,6 +437,7 @@ |
408 | 437 | .fqs = rcu_force_quiescent_state, |
409 | 438 | .stats = NULL, |
410 | 439 | .irq_capable = 1, |
440 | + .can_boost = rcu_can_boost(), | |
411 | 441 | .name = "rcu_sync" |
412 | 442 | }; |
413 | 443 | |
... | ... | @@ -424,6 +454,7 @@ |
424 | 454 | .fqs = rcu_force_quiescent_state, |
425 | 455 | .stats = NULL, |
426 | 456 | .irq_capable = 1, |
457 | + .can_boost = rcu_can_boost(), | |
427 | 458 | .name = "rcu_expedited" |
428 | 459 | }; |
429 | 460 | |
... | ... | @@ -684,6 +715,110 @@ |
684 | 715 | }; |
685 | 716 | |
686 | 717 | /* |
718 | + * RCU torture priority-boost testing. Runs one real-time thread per | |
719 | + * CPU for moderate bursts, repeatedly registering RCU callbacks and | |
720 | + * spinning waiting for them to be invoked. If a given callback takes | |
721 | + * too long to be invoked, we assume that priority inversion has occurred. | |
722 | + */ | |
723 | + | |
724 | +struct rcu_boost_inflight { | |
725 | + struct rcu_head rcu; | |
726 | + int inflight; | |
727 | +}; | |
728 | + | |
729 | +static void rcu_torture_boost_cb(struct rcu_head *head) | |
730 | +{ | |
731 | + struct rcu_boost_inflight *rbip = | |
732 | + container_of(head, struct rcu_boost_inflight, rcu); | |
733 | + | |
734 | + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ | |
735 | + rbip->inflight = 0; | |
736 | +} | |
737 | + | |
738 | +static int rcu_torture_boost(void *arg) | |
739 | +{ | |
740 | + unsigned long call_rcu_time; | |
741 | + unsigned long endtime; | |
742 | + unsigned long oldstarttime; | |
743 | + struct rcu_boost_inflight rbi = { .inflight = 0 }; | |
744 | + struct sched_param sp; | |
745 | + | |
746 | + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); | |
747 | + | |
748 | + /* Set real-time priority. */ | |
749 | + sp.sched_priority = 1; | |
750 | + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { | |
751 | + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); | |
752 | + n_rcu_torture_boost_rterror++; | |
753 | + } | |
754 | + | |
755 | + /* Each pass through the following loop does one boost-test cycle. */ | |
756 | + do { | |
757 | + /* Wait for the next test interval. */ | |
758 | + oldstarttime = boost_starttime; | |
759 | + while (jiffies - oldstarttime > ULONG_MAX / 2) { | |
760 | + schedule_timeout_uninterruptible(1); | |
761 | + rcu_stutter_wait("rcu_torture_boost"); | |
762 | + if (kthread_should_stop() || | |
763 | + fullstop != FULLSTOP_DONTSTOP) | |
764 | + goto checkwait; | |
765 | + } | |
766 | + | |
767 | + /* Do one boost-test interval. */ | |
768 | + endtime = oldstarttime + test_boost_duration * HZ; | |
769 | + call_rcu_time = jiffies; | |
770 | + while (jiffies - endtime > ULONG_MAX / 2) { | |
771 | + /* If we don't have a callback in flight, post one. */ | |
772 | + if (!rbi.inflight) { | |
773 | + smp_mb(); /* RCU core before ->inflight = 1. */ | |
774 | + rbi.inflight = 1; | |
775 | + call_rcu(&rbi.rcu, rcu_torture_boost_cb); | |
776 | + if (jiffies - call_rcu_time > | |
777 | + test_boost_duration * HZ - HZ / 2) { | |
778 | + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); | |
779 | + n_rcu_torture_boost_failure++; | |
780 | + } | |
781 | + call_rcu_time = jiffies; | |
782 | + } | |
783 | + cond_resched(); | |
784 | + rcu_stutter_wait("rcu_torture_boost"); | |
785 | + if (kthread_should_stop() || | |
786 | + fullstop != FULLSTOP_DONTSTOP) | |
787 | + goto checkwait; | |
788 | + } | |
789 | + | |
790 | + /* | |
791 | + * Set the start time of the next test interval. | |
792 | + * Yes, this is vulnerable to long delays, but such | |
793 | + * delays simply cause a false negative for the next | |
794 | + * interval. Besides, we are running at RT priority, | |
795 | + * so delays should be relatively rare. | |
796 | + */ | |
797 | + while (oldstarttime == boost_starttime) { | |
798 | + if (mutex_trylock(&boost_mutex)) { | |
799 | + boost_starttime = jiffies + | |
800 | + test_boost_interval * HZ; | |
801 | + n_rcu_torture_boosts++; | |
802 | + mutex_unlock(&boost_mutex); | |
803 | + break; | |
804 | + } | |
805 | + schedule_timeout_uninterruptible(1); | |
806 | + } | |
807 | + | |
808 | + /* Go do the stutter. */ | |
809 | +checkwait: rcu_stutter_wait("rcu_torture_boost"); | |
810 | + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | |
811 | + | |
812 | + /* Clean up and exit. */ | |
813 | + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | |
814 | + rcutorture_shutdown_absorb("rcu_torture_boost"); | |
815 | + while (!kthread_should_stop() || rbi.inflight) | |
816 | + schedule_timeout_uninterruptible(1); | |
817 | + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | |
818 | + return 0; | |
819 | +} | |
820 | + | |
821 | +/* | |
687 | 822 | * RCU torture force-quiescent-state kthread. Repeatedly induces |
688 | 823 | * bursts of calls to force_quiescent_state(), increasing the probability |
689 | 824 | * of occurrence of some important types of race conditions. |
... | ... | @@ -933,7 +1068,8 @@ |
933 | 1068 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
934 | 1069 | cnt += sprintf(&page[cnt], |
935 | 1070 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
936 | - "rtmbe: %d nt: %ld", | |
1071 | + "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld " | |
1072 | + "rtbf: %ld rtb: %ld nt: %ld", | |
937 | 1073 | rcu_torture_current, |
938 | 1074 | rcu_torture_current_version, |
939 | 1075 | list_empty(&rcu_torture_freelist), |
940 | 1076 | atomic_read(&n_rcu_torture_alloc), |
... | ... | @@ -941,8 +1077,19 @@ |
941 | 1077 | atomic_read(&n_rcu_torture_alloc_fail), |
942 | 1078 | atomic_read(&n_rcu_torture_free), |
943 | 1079 | atomic_read(&n_rcu_torture_mberror), |
1080 | + n_rcu_torture_boost_ktrerror, | |
1081 | + n_rcu_torture_boost_rterror, | |
1082 | + n_rcu_torture_boost_allocerror, | |
1083 | + n_rcu_torture_boost_afferror, | |
1084 | + n_rcu_torture_boost_failure, | |
1085 | + n_rcu_torture_boosts, | |
944 | 1086 | n_rcu_torture_timers); |
945 | - if (atomic_read(&n_rcu_torture_mberror) != 0) | |
1087 | + if (atomic_read(&n_rcu_torture_mberror) != 0 || | |
1088 | + n_rcu_torture_boost_ktrerror != 0 || | |
1089 | + n_rcu_torture_boost_rterror != 0 || | |
1090 | + n_rcu_torture_boost_allocerror != 0 || | |
1091 | + n_rcu_torture_boost_afferror != 0 || | |
1092 | + n_rcu_torture_boost_failure != 0) | |
946 | 1093 | cnt += sprintf(&page[cnt], " !!!"); |
947 | 1094 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
948 | 1095 | if (i > 1) { |
949 | 1096 | |
950 | 1097 | |
951 | 1098 | |
952 | 1099 | |
... | ... | @@ -1094,22 +1241,91 @@ |
1094 | 1241 | } |
1095 | 1242 | |
1096 | 1243 | static inline void |
1097 | -rcu_torture_print_module_parms(char *tag) | |
1244 | +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | |
1098 | 1245 | { |
1099 | 1246 | printk(KERN_ALERT "%s" TORTURE_FLAG |
1100 | 1247 | "--- %s: nreaders=%d nfakewriters=%d " |
1101 | 1248 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
1102 | 1249 | "shuffle_interval=%d stutter=%d irqreader=%d " |
1103 | - "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", | |
1250 | + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | |
1251 | + "test_boost=%d/%d test_boost_interval=%d " | |
1252 | + "test_boost_duration=%d\n", | |
1104 | 1253 | torture_type, tag, nrealreaders, nfakewriters, |
1105 | 1254 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
1106 | - stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); | |
1255 | + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | |
1256 | + test_boost, cur_ops->can_boost, | |
1257 | + test_boost_interval, test_boost_duration); | |
1107 | 1258 | } |
1108 | 1259 | |
1109 | -static struct notifier_block rcutorture_nb = { | |
1260 | +static struct notifier_block rcutorture_shutdown_nb = { | |
1110 | 1261 | .notifier_call = rcutorture_shutdown_notify, |
1111 | 1262 | }; |
1112 | 1263 | |
1264 | +static void rcutorture_booster_cleanup(int cpu) | |
1265 | +{ | |
1266 | + struct task_struct *t; | |
1267 | + | |
1268 | + if (boost_tasks[cpu] == NULL) | |
1269 | + return; | |
1270 | + mutex_lock(&boost_mutex); | |
1271 | + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); | |
1272 | + t = boost_tasks[cpu]; | |
1273 | + boost_tasks[cpu] = NULL; | |
1274 | + mutex_unlock(&boost_mutex); | |
1275 | + | |
1276 | + /* This must be outside of the mutex, otherwise deadlock! */ | |
1277 | + kthread_stop(t); | |
1278 | +} | |
1279 | + | |
1280 | +static int rcutorture_booster_init(int cpu) | |
1281 | +{ | |
1282 | + int retval; | |
1283 | + | |
1284 | + if (boost_tasks[cpu] != NULL) | |
1285 | + return 0; /* Already created, nothing more to do. */ | |
1286 | + | |
1287 | + /* Don't allow time recalculation while creating a new task. */ | |
1288 | + mutex_lock(&boost_mutex); | |
1289 | + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | |
1290 | + boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, | |
1291 | + "rcu_torture_boost"); | |
1292 | + if (IS_ERR(boost_tasks[cpu])) { | |
1293 | + retval = PTR_ERR(boost_tasks[cpu]); | |
1294 | + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | |
1295 | + n_rcu_torture_boost_ktrerror++; | |
1296 | + boost_tasks[cpu] = NULL; | |
1297 | + mutex_unlock(&boost_mutex); | |
1298 | + return retval; | |
1299 | + } | |
1300 | + kthread_bind(boost_tasks[cpu], cpu); | |
1301 | + wake_up_process(boost_tasks[cpu]); | |
1302 | + mutex_unlock(&boost_mutex); | |
1303 | + return 0; | |
1304 | +} | |
1305 | + | |
1306 | +static int rcutorture_cpu_notify(struct notifier_block *self, | |
1307 | + unsigned long action, void *hcpu) | |
1308 | +{ | |
1309 | + long cpu = (long)hcpu; | |
1310 | + | |
1311 | + switch (action) { | |
1312 | + case CPU_ONLINE: | |
1313 | + case CPU_DOWN_FAILED: | |
1314 | + (void)rcutorture_booster_init(cpu); | |
1315 | + break; | |
1316 | + case CPU_DOWN_PREPARE: | |
1317 | + rcutorture_booster_cleanup(cpu); | |
1318 | + break; | |
1319 | + default: | |
1320 | + break; | |
1321 | + } | |
1322 | + return NOTIFY_OK; | |
1323 | +} | |
1324 | + | |
1325 | +static struct notifier_block rcutorture_cpu_nb = { | |
1326 | + .notifier_call = rcutorture_cpu_notify, | |
1327 | +}; | |
1328 | + | |
1113 | 1329 | static void |
1114 | 1330 | rcu_torture_cleanup(void) |
1115 | 1331 | { |
... | ... | @@ -1127,7 +1343,7 @@ |
1127 | 1343 | } |
1128 | 1344 | fullstop = FULLSTOP_RMMOD; |
1129 | 1345 | mutex_unlock(&fullstop_mutex); |
1130 | - unregister_reboot_notifier(&rcutorture_nb); | |
1346 | + unregister_reboot_notifier(&rcutorture_shutdown_nb); | |
1131 | 1347 | if (stutter_task) { |
1132 | 1348 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); |
1133 | 1349 | kthread_stop(stutter_task); |
... | ... | @@ -1184,6 +1400,12 @@ |
1184 | 1400 | kthread_stop(fqs_task); |
1185 | 1401 | } |
1186 | 1402 | fqs_task = NULL; |
1403 | + if ((test_boost == 1 && cur_ops->can_boost) || | |
1404 | + test_boost == 2) { | |
1405 | + unregister_cpu_notifier(&rcutorture_cpu_nb); | |
1406 | + for_each_possible_cpu(i) | |
1407 | + rcutorture_booster_cleanup(i); | |
1408 | + } | |
1187 | 1409 | |
1188 | 1410 | /* Wait for all RCU callbacks to fire. */ |
1189 | 1411 | |
1190 | 1412 | |
... | ... | @@ -1195,9 +1417,9 @@ |
1195 | 1417 | if (cur_ops->cleanup) |
1196 | 1418 | cur_ops->cleanup(); |
1197 | 1419 | if (atomic_read(&n_rcu_torture_error)) |
1198 | - rcu_torture_print_module_parms("End of test: FAILURE"); | |
1420 | + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | |
1199 | 1421 | else |
1200 | - rcu_torture_print_module_parms("End of test: SUCCESS"); | |
1422 | + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | |
1201 | 1423 | } |
1202 | 1424 | |
1203 | 1425 | static int __init |
... | ... | @@ -1242,7 +1464,7 @@ |
1242 | 1464 | nrealreaders = nreaders; |
1243 | 1465 | else |
1244 | 1466 | nrealreaders = 2 * num_online_cpus(); |
1245 | - rcu_torture_print_module_parms("Start of test"); | |
1467 | + rcu_torture_print_module_parms(cur_ops, "Start of test"); | |
1246 | 1468 | fullstop = FULLSTOP_DONTSTOP; |
1247 | 1469 | |
1248 | 1470 | /* Set up the freelist. */ |
... | ... | @@ -1263,6 +1485,12 @@ |
1263 | 1485 | atomic_set(&n_rcu_torture_free, 0); |
1264 | 1486 | atomic_set(&n_rcu_torture_mberror, 0); |
1265 | 1487 | atomic_set(&n_rcu_torture_error, 0); |
1488 | + n_rcu_torture_boost_ktrerror = 0; | |
1489 | + n_rcu_torture_boost_rterror = 0; | |
1490 | + n_rcu_torture_boost_allocerror = 0; | |
1491 | + n_rcu_torture_boost_afferror = 0; | |
1492 | + n_rcu_torture_boost_failure = 0; | |
1493 | + n_rcu_torture_boosts = 0; | |
1266 | 1494 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
1267 | 1495 | atomic_set(&rcu_torture_wcount[i], 0); |
1268 | 1496 | for_each_possible_cpu(cpu) { |
... | ... | @@ -1376,7 +1604,27 @@ |
1376 | 1604 | goto unwind; |
1377 | 1605 | } |
1378 | 1606 | } |
1379 | - register_reboot_notifier(&rcutorture_nb); | |
1607 | + if (test_boost_interval < 1) | |
1608 | + test_boost_interval = 1; | |
1609 | + if (test_boost_duration < 2) | |
1610 | + test_boost_duration = 2; | |
1611 | + if ((test_boost == 1 && cur_ops->can_boost) || | |
1612 | + test_boost == 2) { | |
1613 | + int retval; | |
1614 | + | |
1615 | + boost_starttime = jiffies + test_boost_interval * HZ; | |
1616 | + register_cpu_notifier(&rcutorture_cpu_nb); | |
1617 | + for_each_possible_cpu(i) { | |
1618 | + if (cpu_is_offline(i)) | |
1619 | + continue; /* Heuristic: CPU can go offline. */ | |
1620 | + retval = rcutorture_booster_init(i); | |
1621 | + if (retval < 0) { | |
1622 | + firsterr = retval; | |
1623 | + goto unwind; | |
1624 | + } | |
1625 | + } | |
1626 | + } | |
1627 | + register_reboot_notifier(&rcutorture_shutdown_nb); | |
1380 | 1628 | mutex_unlock(&fullstop_mutex); |
1381 | 1629 | return 0; |
1382 | 1630 |
kernel/rcutree.c
... | ... | @@ -67,9 +67,6 @@ |
67 | 67 | .gpnum = -300, \ |
68 | 68 | .completed = -300, \ |
69 | 69 | .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ |
70 | - .orphan_cbs_list = NULL, \ | |
71 | - .orphan_cbs_tail = &structname.orphan_cbs_list, \ | |
72 | - .orphan_qlen = 0, \ | |
73 | 70 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ |
74 | 71 | .n_force_qs = 0, \ |
75 | 72 | .n_force_qs_ngp = 0, \ |
76 | 73 | |
... | ... | @@ -620,9 +617,17 @@ |
620 | 617 | static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) |
621 | 618 | { |
622 | 619 | if (rdp->gpnum != rnp->gpnum) { |
623 | - rdp->qs_pending = 1; | |
624 | - rdp->passed_quiesc = 0; | |
620 | + /* | |
621 | + * If the current grace period is waiting for this CPU, | |
622 | + * set up to detect a quiescent state, otherwise don't | |
623 | + * go looking for one. | |
624 | + */ | |
625 | 625 | rdp->gpnum = rnp->gpnum; |
626 | + if (rnp->qsmask & rdp->grpmask) { | |
627 | + rdp->qs_pending = 1; | |
628 | + rdp->passed_quiesc = 0; | |
629 | + } else | |
630 | + rdp->qs_pending = 0; | |
626 | 631 | } |
627 | 632 | } |
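An illustrative gloss on the new mask test (terminology from the rcu_node design, not new code):

/* rnp->qsmask:  one bit per child of this rcu_node; at the leaf level,
 *               one bit per CPU from which the current grace period
 *               still needs a quiescent state.
 * rdp->grpmask: this CPU's bit within its leaf rcu_node.
 * A nonzero AND means the new grace period is waiting on this CPU, so
 * quiescent-state detection is armed; otherwise it is left disarmed.
 */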
628 | 633 | |
... | ... | @@ -681,6 +686,24 @@ |
681 | 686 | |
682 | 687 | /* Remember that we saw this grace-period completion. */ |
683 | 688 | rdp->completed = rnp->completed; |
689 | + | |
690 | + /* | |
691 | + * If we were in an extended quiescent state, we may have | |
692 | + * missed some grace periods that other CPUs handled on | |
693 | + * our behalf. Catch up with this state to avoid noting | |
694 | + * spurious new grace periods. If another grace period | |
695 | + * has started, then rnp->gpnum will have advanced, so | |
696 | + * we will detect this later on. | |
697 | + */ | |
698 | + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) | |
699 | + rdp->gpnum = rdp->completed; | |
700 | + | |
701 | + /* | |
702 | + * If RCU does not need a quiescent state from this CPU, | |
703 | + * then make sure that this CPU doesn't go looking for one. | |
704 | + */ | |
705 | + if ((rnp->qsmask & rdp->grpmask) == 0) | |
706 | + rdp->qs_pending = 0; | |
684 | 707 | } |
685 | 708 | } |
... | ... | @@ -984,56 +1007,34 @@ |
984 | 1007 | #ifdef CONFIG_HOTPLUG_CPU |
985 | 1008 | |
986 | 1009 | /* |
987 | - * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the | |
988 | - * specified flavor of RCU. The callbacks will be adopted by the next | |
989 | - * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever | |
990 | - * comes first. Because this is invoked from the CPU_DYING notifier, | |
991 | - * irqs are already disabled. | |
1010 | + * Move a dying CPU's RCU callbacks to an online CPU's callback list. | |
1011 | + * Synchronization is not required because this function executes | |
1012 | + * in stop_machine() context. | |
992 | 1013 | */ |
993 | -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |
1014 | +static void rcu_send_cbs_to_online(struct rcu_state *rsp) | |
994 | 1015 | { |
995 | 1016 | int i; |
1017 | + /* The dying CPU is already cleared from cpu_online_mask. */ | |
1018 | + int receive_cpu = cpumask_any(cpu_online_mask); | |
996 | 1019 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); |
1020 | + struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); | |
997 | 1021 | |
998 | 1022 | if (rdp->nxtlist == NULL) |
999 | 1023 | return; /* irqs disabled, so comparison is stable. */ |
1000 | - raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | |
1001 | - *rsp->orphan_cbs_tail = rdp->nxtlist; | |
1002 | - rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; | |
1024 | + | |
1025 | + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; | |
1026 | + receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | |
1027 | + receive_rdp->qlen += rdp->qlen; | |
1028 | + receive_rdp->n_cbs_adopted += rdp->qlen; | |
1029 | + rdp->n_cbs_orphaned += rdp->qlen; | |
1030 | + | |
1003 | 1031 | rdp->nxtlist = NULL; |
1004 | 1032 | for (i = 0; i < RCU_NEXT_SIZE; i++) |
1005 | 1033 | rdp->nxttail[i] = &rdp->nxtlist; |
1006 | - rsp->orphan_qlen += rdp->qlen; | |
1007 | - rdp->n_cbs_orphaned += rdp->qlen; | |
1008 | 1034 | rdp->qlen = 0; |
1009 | - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ | |
1010 | 1035 | } |
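The splice above relies on each callback list being a singly linked list plus a pointer to its final ->next field, which makes donation O(1). A hedged sketch of the pattern, with hypothetical names:

*dst_tail = src_head;     /* graft donor list onto destination's tail */
dst_tail  = src_tail;     /* destination tail advances to donor's tail */
src_head  = NULL;         /* donor list is now empty...                */
src_tail  = &src_head;    /* ...and its tail pointer is reset          */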
1011 | 1036 | |
1012 | 1037 | /* |
1013 | - * Adopt previously orphaned RCU callbacks. | |
1014 | - */ | |
1015 | -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |
1016 | -{ | |
1017 | - unsigned long flags; | |
1018 | - struct rcu_data *rdp; | |
1019 | - | |
1020 | - raw_spin_lock_irqsave(&rsp->onofflock, flags); | |
1021 | - rdp = this_cpu_ptr(rsp->rda); | |
1022 | - if (rsp->orphan_cbs_list == NULL) { | |
1023 | - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | |
1024 | - return; | |
1025 | - } | |
1026 | - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; | |
1027 | - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; | |
1028 | - rdp->qlen += rsp->orphan_qlen; | |
1029 | - rdp->n_cbs_adopted += rsp->orphan_qlen; | |
1030 | - rsp->orphan_cbs_list = NULL; | |
1031 | - rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; | |
1032 | - rsp->orphan_qlen = 0; | |
1033 | - raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | |
1034 | -} | |
1035 | - | |
1036 | -/* | |
1037 | 1038 | * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy |
1038 | 1039 | * and move all callbacks from the outgoing CPU to the current one. |
1039 | 1040 | */ |
... | ... | @@ -1081,8 +1082,6 @@ |
1081 | 1082 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
1082 | 1083 | if (need_report & RCU_OFL_TASKS_EXP_GP) |
1083 | 1084 | rcu_report_exp_rnp(rsp, rnp); |
1084 | - | |
1085 | - rcu_adopt_orphan_cbs(rsp); | |
1086 | 1085 | } |
1087 | 1086 | |
1088 | 1087 | /* |
1089 | 1088 | |
... | ... | @@ -1100,14 +1099,10 @@ |
1100 | 1099 | |
1101 | 1100 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
1102 | 1101 | |
1103 | -static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) | |
1102 | +static void rcu_send_cbs_to_online(struct rcu_state *rsp) | |
1104 | 1103 | { |
1105 | 1104 | } |
1106 | 1105 | |
1107 | -static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |
1108 | -{ | |
1109 | -} | |
1110 | - | |
1111 | 1106 | static void rcu_offline_cpu(int cpu) |
1112 | 1107 | { |
1113 | 1108 | } |
1114 | 1109 | |
... | ... | @@ -1440,22 +1435,11 @@ |
1440 | 1435 | */ |
1441 | 1436 | local_irq_save(flags); |
1442 | 1437 | rdp = this_cpu_ptr(rsp->rda); |
1443 | - rcu_process_gp_end(rsp, rdp); | |
1444 | - check_for_new_grace_period(rsp, rdp); | |
1445 | 1438 | |
1446 | 1439 | /* Add the callback to our list. */ |
1447 | 1440 | *rdp->nxttail[RCU_NEXT_TAIL] = head; |
1448 | 1441 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; |
1449 | 1442 | |
1450 | - /* Start a new grace period if one not already started. */ | |
1451 | - if (!rcu_gp_in_progress(rsp)) { | |
1452 | - unsigned long nestflag; | |
1453 | - struct rcu_node *rnp_root = rcu_get_root(rsp); | |
1454 | - | |
1455 | - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | |
1456 | - rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ | |
1457 | - } | |
1458 | - | |
1459 | 1443 | /* |
1460 | 1444 | * Force the grace period if too many callbacks or too long waiting. |
1461 | 1445 | * Enforce hysteresis, and don't invoke force_quiescent_state() |
... | ... | @@ -1464,12 +1448,27 @@ |
1464 | 1448 | * is the only one waiting for a grace period to complete. |
1465 | 1449 | */ |
1466 | 1450 | if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { |
1467 | - rdp->blimit = LONG_MAX; | |
1468 | - if (rsp->n_force_qs == rdp->n_force_qs_snap && | |
1469 | - *rdp->nxttail[RCU_DONE_TAIL] != head) | |
1470 | - force_quiescent_state(rsp, 0); | |
1471 | - rdp->n_force_qs_snap = rsp->n_force_qs; | |
1472 | - rdp->qlen_last_fqs_check = rdp->qlen; | |
1451 | + | |
1452 | + /* Are we ignoring a completed grace period? */ | |
1453 | + rcu_process_gp_end(rsp, rdp); | |
1454 | + check_for_new_grace_period(rsp, rdp); | |
1455 | + | |
1456 | + /* Start a new grace period if one not already started. */ | |
1457 | + if (!rcu_gp_in_progress(rsp)) { | |
1458 | + unsigned long nestflag; | |
1459 | + struct rcu_node *rnp_root = rcu_get_root(rsp); | |
1460 | + | |
1461 | + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); | |
1462 | + rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */ | |
1463 | + } else { | |
1464 | + /* Give the grace period a kick. */ | |
1465 | + rdp->blimit = LONG_MAX; | |
1466 | + if (rsp->n_force_qs == rdp->n_force_qs_snap && | |
1467 | + *rdp->nxttail[RCU_DONE_TAIL] != head) | |
1468 | + force_quiescent_state(rsp, 0); | |
1469 | + rdp->n_force_qs_snap = rsp->n_force_qs; | |
1470 | + rdp->qlen_last_fqs_check = rdp->qlen; | |
1471 | + } | |
1473 | 1472 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) |
1474 | 1473 | force_quiescent_state(rsp, 1); |
1475 | 1474 | local_irq_restore(flags); |
1476 | 1475 | |
1477 | 1476 | |
... | ... | @@ -1699,13 +1698,12 @@ |
1699 | 1698 | * decrement rcu_barrier_cpu_count -- otherwise the first CPU |
1700 | 1699 | * might complete its grace period before all of the other CPUs |
1701 | 1700 | * did their increment, causing this function to return too |
1702 | - * early. | |
1701 | + * early. Note that on_each_cpu() disables irqs, which prevents | |
1702 | + * any CPUs from coming online or going offline until each online | |
1703 | + * CPU has queued its RCU-barrier callback. | |
1703 | 1704 | */ |
1704 | 1705 | atomic_set(&rcu_barrier_cpu_count, 1); |
1705 | - preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ | |
1706 | - rcu_adopt_orphan_cbs(rsp); | |
1707 | 1706 | on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); |
1708 | - preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ | |
1709 | 1707 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
1710 | 1708 | complete(&rcu_barrier_completion); |
1711 | 1709 | wait_for_completion(&rcu_barrier_completion); |
1712 | 1710 | |
... | ... | @@ -1831,18 +1829,13 @@ |
1831 | 1829 | case CPU_DYING: |
1832 | 1830 | case CPU_DYING_FROZEN: |
1833 | 1831 | /* |
1834 | - * preempt_disable() in _rcu_barrier() prevents stop_machine(), | |
1835 | - * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" | |
1836 | - * returns, all online cpus have queued rcu_barrier_func(). | |
1837 | - * The dying CPU clears its cpu_online_mask bit and | |
1838 | - * moves all of its RCU callbacks to ->orphan_cbs_list | |
1839 | - * in the context of stop_machine(), so subsequent calls | |
1840 | - * to _rcu_barrier() will adopt these callbacks and only | |
1841 | - * then queue rcu_barrier_func() on all remaining CPUs. | |
1832 | + * The whole machine is "stopped" except this CPU, so we can | |
1833 | + * touch any data without introducing corruption. We send the | |
1834 | + * dying CPU's callbacks to an arbitrarily chosen online CPU. | |
1842 | 1835 | */ |
1843 | - rcu_send_cbs_to_orphanage(&rcu_bh_state); | |
1844 | - rcu_send_cbs_to_orphanage(&rcu_sched_state); | |
1845 | - rcu_preempt_send_cbs_to_orphanage(); | |
1836 | + rcu_send_cbs_to_online(&rcu_bh_state); | |
1837 | + rcu_send_cbs_to_online(&rcu_sched_state); | |
1838 | + rcu_preempt_send_cbs_to_online(); | |
1846 | 1839 | break; |
1847 | 1840 | case CPU_DEAD: |
1848 | 1841 | case CPU_DEAD_FROZEN: |
1849 | 1842 | |
... | ... | @@ -1880,8 +1873,9 @@ |
1880 | 1873 | { |
1881 | 1874 | int i; |
1882 | 1875 | |
1883 | - for (i = NUM_RCU_LVLS - 1; i >= 0; i--) | |
1876 | + for (i = NUM_RCU_LVLS - 1; i > 0; i--) | |
1884 | 1877 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; |
1878 | + rsp->levelspread[0] = RCU_FANOUT_LEAF; | |
1885 | 1879 | } |
1886 | 1880 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ |
1887 | 1881 | static void __init rcu_init_levelspread(struct rcu_state *rsp) |
kernel/rcutree.h
... | ... | @@ -31,46 +31,51 @@ |
31 | 31 | /* |
32 | 32 | * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. |
33 | 33 | * In theory, it should be possible to add more levels straightforwardly. |
34 | - * In practice, this has not been tested, so there is probably some | |
35 | - * bug somewhere. | |
34 | + * In practice, this did work well going from three levels to four. | |
35 | + * Of course, your mileage may vary. | |
36 | 36 | */ |
37 | 37 | #define MAX_RCU_LVLS 4 |
38 | -#define RCU_FANOUT (CONFIG_RCU_FANOUT) | |
39 | -#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) | |
40 | -#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) | |
41 | -#define RCU_FANOUT_FOURTH (RCU_FANOUT_CUBE * RCU_FANOUT) | |
38 | +#if CONFIG_RCU_FANOUT > 16 | |
39 | +#define RCU_FANOUT_LEAF 16 | |
40 | +#else /* #if CONFIG_RCU_FANOUT > 16 */ | |
41 | +#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) | |
42 | +#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ | |
43 | +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) | |
44 | +#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | |
45 | +#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | |
46 | +#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | |
42 | 47 | |
43 | -#if NR_CPUS <= RCU_FANOUT | |
48 | +#if NR_CPUS <= RCU_FANOUT_1 | |
44 | 49 | # define NUM_RCU_LVLS 1 |
45 | 50 | # define NUM_RCU_LVL_0 1 |
46 | 51 | # define NUM_RCU_LVL_1 (NR_CPUS) |
47 | 52 | # define NUM_RCU_LVL_2 0 |
48 | 53 | # define NUM_RCU_LVL_3 0 |
49 | 54 | # define NUM_RCU_LVL_4 0 |
50 | -#elif NR_CPUS <= RCU_FANOUT_SQ | |
55 | +#elif NR_CPUS <= RCU_FANOUT_2 | |
51 | 56 | # define NUM_RCU_LVLS 2 |
52 | 57 | # define NUM_RCU_LVL_0 1 |
53 | -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | |
58 | +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | |
54 | 59 | # define NUM_RCU_LVL_2 (NR_CPUS) |
55 | 60 | # define NUM_RCU_LVL_3 0 |
56 | 61 | # define NUM_RCU_LVL_4 0 |
57 | -#elif NR_CPUS <= RCU_FANOUT_CUBE | |
62 | +#elif NR_CPUS <= RCU_FANOUT_3 | |
58 | 63 | # define NUM_RCU_LVLS 3 |
59 | 64 | # define NUM_RCU_LVL_0 1 |
60 | -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | |
61 | -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | |
62 | -# define NUM_RCU_LVL_3 NR_CPUS | |
65 | +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | |
66 | +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | |
67 | +# define NUM_RCU_LVL_3 (NR_CPUS) | |
63 | 68 | # define NUM_RCU_LVL_4 0 |
64 | -#elif NR_CPUS <= RCU_FANOUT_FOURTH | |
69 | +#elif NR_CPUS <= RCU_FANOUT_4 | |
65 | 70 | # define NUM_RCU_LVLS 4 |
66 | 71 | # define NUM_RCU_LVL_0 1 |
67 | -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_CUBE) | |
68 | -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ) | |
69 | -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT) | |
70 | -# define NUM_RCU_LVL_4 NR_CPUS | |
72 | +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | |
73 | +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | |
74 | +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | |
75 | +# define NUM_RCU_LVL_4 (NR_CPUS) | |
71 | 76 | #else |
72 | 77 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" |
73 | -#endif /* #if (NR_CPUS) <= RCU_FANOUT */ | |
78 | +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | |
74 | 79 | |
75 | 80 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) |
76 | 81 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) |
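A worked example of the macros above, under assumed values: with CONFIG_RCU_FANOUT=64 and NR_CPUS=4096, the leaf fanout is clamped to 16, giving:

/* RCU_FANOUT_LEAF = 16, RCU_FANOUT_1 = 16,
 * RCU_FANOUT_2 = 16 * 64 = 1024, RCU_FANOUT_3 = 1024 * 64 = 65536.
 * NR_CPUS = 4096 <= RCU_FANOUT_3, so NUM_RCU_LVLS = 3 and:
 *   NUM_RCU_LVL_0 = 1
 *   NUM_RCU_LVL_1 = DIV_ROUND_UP(4096, 1024) = 4
 *   NUM_RCU_LVL_2 = DIV_ROUND_UP(4096, 16)   = 256
 *   NUM_RCU_LVL_3 = 4096
 * RCU_SUM = 1 + 4 + 256 + 4096 = 4357; NUM_RCU_NODES = 4357 - 4096 = 261.
 */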
... | ... | @@ -203,8 +208,8 @@ |
203 | 208 | long qlen_last_fqs_check; |
204 | 209 | /* qlen at last check for QS forcing */ |
205 | 210 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ |
206 | - unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ | |
207 | - unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ | |
211 | + unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ | |
212 | + unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ | |
208 | 213 | unsigned long n_force_qs_snap; |
209 | 214 | /* did other CPU force QS recently? */ |
210 | 215 | long blimit; /* Upper limit on a processed batch */ |
... | ... | @@ -309,15 +314,7 @@ |
309 | 314 | /* End of fields guarded by root rcu_node's lock. */ |
310 | 315 | |
311 | 316 | raw_spinlock_t onofflock; /* exclude on/offline and */ |
312 | - /* starting new GP. Also */ | |
313 | - /* protects the following */ | |
314 | - /* orphan_cbs fields. */ | |
315 | - struct rcu_head *orphan_cbs_list; /* list of rcu_head structs */ | |
316 | - /* orphaned by all CPUs in */ | |
317 | - /* a given leaf rcu_node */ | |
318 | - /* going offline. */ | |
319 | - struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ | |
320 | - long orphan_qlen; /* Number of orphaned cbs. */ | |
317 | + /* starting new GP. */ | |
321 | 318 | raw_spinlock_t fqslock; /* Only one task forcing */ |
322 | 319 | /* quiescent states. */ |
323 | 320 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
... | ... | @@ -390,7 +387,7 @@ |
390 | 387 | static int rcu_preempt_pending(int cpu); |
391 | 388 | static int rcu_preempt_needs_cpu(int cpu); |
392 | 389 | static void __cpuinit rcu_preempt_init_percpu_data(int cpu); |
393 | -static void rcu_preempt_send_cbs_to_orphanage(void); | |
390 | +static void rcu_preempt_send_cbs_to_online(void); | |
394 | 391 | static void __init __rcu_init_preempt(void); |
395 | 392 | static void rcu_needs_cpu_flush(void); |
396 | 393 |
kernel/rcutree_plugin.h
... | ... | @@ -25,6 +25,7 @@ |
25 | 25 | */ |
26 | 26 | |
27 | 27 | #include <linux/delay.h> |
28 | +#include <linux/stop_machine.h> | |
28 | 29 | |
29 | 30 | /* |
30 | 31 | * Check the RCU kernel configuration parameters and print informative |
31 | 32 | |
32 | 33 | |
... | ... | @@ -773,11 +774,11 @@ |
773 | 774 | } |
774 | 775 | |
775 | 776 | /* |
776 | - * Move preemptable RCU's callbacks to ->orphan_cbs_list. | |
777 | + * Move preemptable RCU's callbacks from the dying CPU to an online CPU. | |
777 | 778 | */ |
778 | -static void rcu_preempt_send_cbs_to_orphanage(void) | |
779 | +static void rcu_preempt_send_cbs_to_online(void) | |
779 | 780 | { |
780 | - rcu_send_cbs_to_orphanage(&rcu_preempt_state); | |
781 | + rcu_send_cbs_to_online(&rcu_preempt_state); | |
781 | 782 | } |
782 | 783 | |
783 | 784 | /* |
... | ... | @@ -1001,7 +1002,7 @@ |
1001 | 1002 | /* |
1002 | 1003 | * Because there is no preemptable RCU, there are no callbacks to move. |
1003 | 1004 | */ |
1004 | -static void rcu_preempt_send_cbs_to_orphanage(void) | |
1005 | +static void rcu_preempt_send_cbs_to_online(void) | |
1005 | 1006 | { |
1006 | 1007 | } |
1007 | 1008 | |
... | ... | @@ -1013,6 +1014,132 @@ |
1013 | 1014 | } |
1014 | 1015 | |
1015 | 1016 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ |
1017 | + | |
1018 | +#ifndef CONFIG_SMP | |
1019 | + | |
1020 | +void synchronize_sched_expedited(void) | |
1021 | +{ | |
1022 | + cond_resched(); | |
1023 | +} | |
1024 | +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | |
1025 | + | |
1026 | +#else /* #ifndef CONFIG_SMP */ | |
1027 | + | |
1028 | +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); | |
1029 | +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); | |
1030 | + | |
1031 | +static int synchronize_sched_expedited_cpu_stop(void *data) | |
1032 | +{ | |
1033 | + /* | |
1034 | + * There must be a full memory barrier on each affected CPU | |
1035 | + * between the time that try_stop_cpus() is called and the | |
1036 | + * time that it returns. | |
1037 | + * | |
1038 | + * In the current initial implementation of cpu_stop, the | |
1039 | + * above condition is already met when the control reaches | |
1040 | + * this point and the following smp_mb() is not strictly | |
1041 | + * necessary. Do smp_mb() anyway for documentation and | |
1042 | + * robustness against future implementation changes. | |
1043 | + */ | |
1044 | + smp_mb(); /* See above comment block. */ | |
1045 | + return 0; | |
1046 | +} | |
1047 | + | |
1048 | +/* | |
1049 | + * Wait for an rcu-sched grace period to elapse, but use "big hammer" | |
1050 | + * approach to force grace period to end quickly. This consumes | |
1051 | + * significant time on all CPUs, and is thus not recommended for | |
1052 | + * any sort of common-case code. | |
1053 | + * | |
1054 | + * Note that it is illegal to call this function while holding any | |
1055 | + * lock that is acquired by a CPU-hotplug notifier. Failing to | |
1056 | + * observe this restriction will result in deadlock. | |
1057 | + * | |
1058 | + * This implementation can be thought of as an application of ticket | |
1059 | + * locking to RCU, with sync_sched_expedited_started and | |
1060 | + * sync_sched_expedited_done taking on the roles of the halves | |
1061 | + * of the ticket-lock word. Each task atomically increments | |
1062 | + * sync_sched_expedited_started upon entry, snapshotting the old value, | |
1063 | + * then attempts to stop all the CPUs. If this succeeds, then each | |
1064 | + * CPU will have executed a context switch, resulting in an RCU-sched | |
1065 | + * grace period. We are then done, so we use atomic_cmpxchg() to | |
1066 | + * update sync_sched_expedited_done to match our snapshot -- but | |
1067 | + * only if someone else has not already advanced past our snapshot. | |
1068 | + * | |
1069 | + * On the other hand, if try_stop_cpus() fails, we check the value | |
1070 | + * of sync_sched_expedited_done. If it has advanced past our | |
1071 | + * initial snapshot, then someone else must have forced a grace period | |
1072 | + * some time after we took our snapshot. In this case, our work is | |
1073 | + * done for us, and we can simply return. Otherwise, we try again, | |
1074 | + * but keep our initial snapshot for purposes of checking for someone | |
1075 | + * doing our work for us. | |
1076 | + * | |
1077 | + * If we fail too many times in a row, we fall back to synchronize_sched(). | |
1078 | + */ | |
1079 | +void synchronize_sched_expedited(void) | |
1080 | +{ | |
1081 | + int firstsnap, s, snap, trycount = 0; | |
1082 | + | |
1083 | + /* Note that atomic_inc_return() implies full memory barrier. */ | |
1084 | + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); | |
1085 | + get_online_cpus(); | |
1086 | + | |
1087 | + /* | |
1088 | + * Each pass through the following loop attempts to force a | |
1089 | + * context switch on each CPU. | |
1090 | + */ | |
1091 | + while (try_stop_cpus(cpu_online_mask, | |
1092 | + synchronize_sched_expedited_cpu_stop, | |
1093 | + NULL) == -EAGAIN) { | |
1094 | + put_online_cpus(); | |
1095 | + | |
1096 | + /* No joy, try again later. Or just synchronize_sched(). */ | |
1097 | + if (trycount++ < 10) | |
1098 | + udelay(trycount * num_online_cpus()); | |
1099 | + else { | |
1100 | + synchronize_sched(); | |
1101 | + return; | |
1102 | + } | |
1103 | + | |
1104 | + /* Check to see if someone else did our work for us. */ | |
1105 | + s = atomic_read(&sync_sched_expedited_done); | |
1106 | + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { | |
1107 | + smp_mb(); /* ensure test happens before caller kfree */ | |
1108 | + return; | |
1109 | + } | |
1110 | + | |
1111 | + /* | |
1112 | + * Refetching sync_sched_expedited_started allows later | |
1113 | + * callers to piggyback on our grace period. We subtract | |
1114 | + * 1 to get the same token that the last incrementer got. | |
1115 | + * We retry after they started, so our grace period works | |
1116 | + * for them, and they started after our first try, so their | |
1117 | + * grace period works for us. | |
1118 | + */ | |
1119 | + get_online_cpus(); | |
1120 | + snap = atomic_read(&sync_sched_expedited_started) - 1; | |
1121 | + smp_mb(); /* ensure read is before try_stop_cpus(). */ | |
1122 | + } | |
1123 | + | |
1124 | + /* | |
1125 | + * Everyone up to our most recent fetch is covered by our grace | |
1126 | + * period. Update the counter, but only if our work is still | |
1127 | + * relevant -- which it won't be if someone who started later | |
1128 | + * than we did beat us to the punch. | |
1129 | + */ | |
1130 | + do { | |
1131 | + s = atomic_read(&sync_sched_expedited_done); | |
1132 | + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { | |
1133 | + smp_mb(); /* ensure test happens before caller kfree */ | |
1134 | + break; | |
1135 | + } | |
1136 | + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); | |
1137 | + | |
1138 | + put_online_cpus(); | |
1139 | +} | |
1140 | +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | |
1141 | + | |
1142 | +#endif /* #else #ifndef CONFIG_SMP */ | |
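A hedged usage sketch for synchronize_sched_expedited(); "global_conf" and struct conf are hypothetical, but the pattern is the standard RCU-sched publish-then-free sequence:

static void update_conf(struct conf *new_conf)
{
	struct conf *old = global_conf;

	rcu_assign_pointer(global_conf, new_conf); /* publish replacement */
	synchronize_sched_expedited();  /* expedited wait for pre-existing
					 * rcu-sched (preemption-disabled)
					 * readers */
	kfree(old);                     /* no reader can still see it */
}

Readers on the other side would use rcu_read_lock_sched()/rcu_dereference_sched(), or any region that runs with preemption disabled.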
1016 | 1143 | |
1017 | 1144 | #if !defined(CONFIG_RCU_FAST_NO_HZ) |
1018 | 1145 |
kernel/rcutree_trace.c
... | ... | @@ -166,13 +166,13 @@ |
166 | 166 | |
167 | 167 | gpnum = rsp->gpnum; |
168 | 168 | seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " |
169 | - "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", | |
169 | + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", | |
170 | 170 | rsp->completed, gpnum, rsp->signaled, |
171 | 171 | (long)(rsp->jiffies_force_qs - jiffies), |
172 | 172 | (int)(jiffies & 0xffff), |
173 | 173 | rsp->n_force_qs, rsp->n_force_qs_ngp, |
174 | 174 | rsp->n_force_qs - rsp->n_force_qs_ngp, |
175 | - rsp->n_force_qs_lh, rsp->orphan_qlen); | |
175 | + rsp->n_force_qs_lh); | |
176 | 176 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { |
177 | 177 | if (rnp->level != level) { |
178 | 178 | seq_puts(m, "\n"); |
... | ... | @@ -300,7 +300,7 @@ |
300 | 300 | |
301 | 301 | static struct dentry *rcudir; |
302 | 302 | |
303 | -static int __init rcuclassic_trace_init(void) | |
303 | +static int __init rcutree_trace_init(void) | |
304 | 304 | { |
305 | 305 | struct dentry *retval; |
306 | 306 | |
307 | 307 | |
... | ... | @@ -337,14 +337,14 @@ |
337 | 337 | return 1; |
338 | 338 | } |
339 | 339 | |
340 | -static void __exit rcuclassic_trace_cleanup(void) | |
340 | +static void __exit rcutree_trace_cleanup(void) | |
341 | 341 | { |
342 | 342 | debugfs_remove_recursive(rcudir); |
343 | 343 | } |
344 | 344 | |
345 | 345 | |
346 | -module_init(rcuclassic_trace_init); | |
347 | -module_exit(rcuclassic_trace_cleanup); | |
346 | +module_init(rcutree_trace_init); | |
347 | +module_exit(rcutree_trace_cleanup); | |
348 | 348 | |
349 | 349 | MODULE_AUTHOR("Paul E. McKenney"); |
350 | 350 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); |
kernel/sched.c
... | ... | @@ -9533,74 +9533,4 @@ |
9533 | 9533 | .subsys_id = cpuacct_subsys_id, |
9534 | 9534 | }; |
9535 | 9535 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9536 | - | |
9537 | -#ifndef CONFIG_SMP | |
9538 | - | |
9539 | -void synchronize_sched_expedited(void) | |
9540 | -{ | |
9541 | - barrier(); | |
9542 | -} | |
9543 | -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | |
9544 | - | |
9545 | -#else /* #ifndef CONFIG_SMP */ | |
9546 | - | |
9547 | -static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | |
9548 | - | |
9549 | -static int synchronize_sched_expedited_cpu_stop(void *data) | |
9550 | -{ | |
9551 | - /* | |
9552 | - * There must be a full memory barrier on each affected CPU | |
9553 | - * between the time that try_stop_cpus() is called and the | |
9554 | - * time that it returns. | |
9555 | - * | |
9556 | - * In the current initial implementation of cpu_stop, the | |
9557 | - * above condition is already met when the control reaches | |
9558 | - * this point and the following smp_mb() is not strictly | |
9559 | - * necessary. Do smp_mb() anyway for documentation and | |
9560 | - * robustness against future implementation changes. | |
9561 | - */ | |
9562 | - smp_mb(); /* See above comment block. */ | |
9563 | - return 0; | |
9564 | -} | |
9565 | - | |
9566 | -/* | |
9567 | - * Wait for an rcu-sched grace period to elapse, but use "big hammer" | |
9568 | - * approach to force grace period to end quickly. This consumes | |
9569 | - * significant time on all CPUs, and is thus not recommended for | |
9570 | - * any sort of common-case code. | |
9571 | - * | |
9572 | - * Note that it is illegal to call this function while holding any | |
9573 | - * lock that is acquired by a CPU-hotplug notifier. Failing to | |
9574 | - * observe this restriction will result in deadlock. | |
9575 | - */ | |
9576 | -void synchronize_sched_expedited(void) | |
9577 | -{ | |
9578 | - int snap, trycount = 0; | |
9579 | - | |
9580 | - smp_mb(); /* ensure prior mod happens before capturing snap. */ | |
9581 | - snap = atomic_read(&synchronize_sched_expedited_count) + 1; | |
9582 | - get_online_cpus(); | |
9583 | - while (try_stop_cpus(cpu_online_mask, | |
9584 | - synchronize_sched_expedited_cpu_stop, | |
9585 | - NULL) == -EAGAIN) { | |
9586 | - put_online_cpus(); | |
9587 | - if (trycount++ < 10) | |
9588 | - udelay(trycount * num_online_cpus()); | |
9589 | - else { | |
9590 | - synchronize_sched(); | |
9591 | - return; | |
9592 | - } | |
9593 | - if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | |
9594 | - smp_mb(); /* ensure test happens before caller kfree */ | |
9595 | - return; | |
9596 | - } | |
9597 | - get_online_cpus(); | |
9598 | - } | |
9599 | - atomic_inc(&synchronize_sched_expedited_count); | |
9600 | - smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | |
9601 | - put_online_cpus(); | |
9602 | -} | |
9603 | -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | |
9604 | - | |
9605 | -#endif /* #else #ifndef CONFIG_SMP */ |
kernel/srcu.c
... | ... | @@ -31,6 +31,7 @@ |
31 | 31 | #include <linux/rcupdate.h> |
32 | 32 | #include <linux/sched.h> |
33 | 33 | #include <linux/smp.h> |
34 | +#include <linux/delay.h> | |
34 | 35 | #include <linux/srcu.h> |
35 | 36 | |
36 | 37 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
37 | 38 | |
... | ... | @@ -203,9 +204,14 @@ |
203 | 204 | * all srcu_read_lock() calls using the old counters have completed. |
204 | 205 | * Their corresponding critical sections might well be still |
205 | 206 | * executing, but the srcu_read_lock() primitives themselves |
206 | - * will have finished executing. | |
207 | + * will have finished executing. We initially give readers | |
208 | + * an arbitrarily chosen CONFIG_SRCU_SYNCHRONIZE_DELAY microseconds | |
209 | + * (10 by default) to get out of their SRCU read-side critical | |
210 | + * sections, then loop waiting 1/HZ seconds per iteration. | |
207 | 211 | */ |
208 | 212 | |
213 | + if (srcu_readers_active_idx(sp, idx)) | |
214 | + udelay(CONFIG_SRCU_SYNCHRONIZE_DELAY); | |
209 | 215 | while (srcu_readers_active_idx(sp, idx)) |
210 | 216 | schedule_timeout_interruptible(1); |
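For context, a hedged sketch of the SRCU usage this two-stage wait serves; "my_srcu" and the reader/updater split are hypothetical, the primitives are the standard SRCU API:

static struct srcu_struct my_srcu;	/* init_srcu_struct() at setup */

static void reader(void)
{
	int idx = srcu_read_lock(&my_srcu);
	/* ... access SRCU-protected data ... */
	srcu_read_unlock(&my_srcu, idx);
}

static void updater(void)
{
	/* ... unlink the old data ... */
	synchronize_srcu_expedited(&my_srcu);	/* uses the two-stage
						 * wait shown above */
	/* ... free the old data ... */
}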
211 | 217 |