Commit 21a92735f660eaecf69a6f2e777f18463760ec32
Committed by
Linus Torvalds
1 parent
48af0d7cb3
Exists in
master
and in
20 other branches
mm: mmu_notifier: have mmu_notifiers use a global SRCU so they may safely schedule
With an RCU based mmu_notifier implementation, any callout to mmu_notifier_invalidate_range_{start,end}() or mmu_notifier_invalidate_page() would not be allowed to call schedule() as that could potentially allow a modification to the mmu_notifier structure while it is currently being used. Since srcu allocs 4 machine words per instance per cpu, we may end up with memory exhaustion if we use srcu per mm. So all mms share a global srcu. Note that during large mmu_notifier activity exit & unregister paths might hang for longer periods, but it is tolerable for current mmu_notifier clients. Signed-off-by: Sagi Grimberg <sagig@mellanox.co.il> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Haggai Eran <haggaie@mellanox.com> Cc: "Paul E. McKenney" <paulmck@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 2 changed files with 49 additions and 25 deletions Side-by-side Diff
include/linux/mmu_notifier.h
mm/mmu_notifier.c
... | ... | @@ -14,10 +14,14 @@ |
14 | 14 | #include <linux/export.h> |
15 | 15 | #include <linux/mm.h> |
16 | 16 | #include <linux/err.h> |
17 | +#include <linux/srcu.h> | |
17 | 18 | #include <linux/rcupdate.h> |
18 | 19 | #include <linux/sched.h> |
19 | 20 | #include <linux/slab.h> |
20 | 21 | |
22 | +/* global SRCU for all MMs */ | |
23 | +struct srcu_struct srcu; | |
24 | + | |
21 | 25 | /* |
22 | 26 | * This function can't run concurrently against mmu_notifier_register |
23 | 27 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap |
... | ... | @@ -25,8 +29,8 @@ |
25 | 29 | * in parallel despite there being no task using this mm any more, |
26 | 30 | * through the vmas outside of the exit_mmap context, such as with |
27 | 31 | * vmtruncate. This serializes against mmu_notifier_unregister with |
28 | - * the mmu_notifier_mm->lock in addition to RCU and it serializes | |
29 | - * against the other mmu notifiers with RCU. struct mmu_notifier_mm | |
32 | + * the mmu_notifier_mm->lock in addition to SRCU and it serializes | |
33 | + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm | |
30 | 34 | * can't go away from under us as exit_mmap holds an mm_count pin |
31 | 35 | * itself. |
32 | 36 | */ |
33 | 37 | |
... | ... | @@ -34,12 +38,13 @@ |
34 | 38 | { |
35 | 39 | struct mmu_notifier *mn; |
36 | 40 | struct hlist_node *n; |
41 | + int id; | |
37 | 42 | |
38 | 43 | /* |
39 | 44 | * RCU here will block mmu_notifier_unregister until |
40 | 45 | * ->release returns. |
41 | 46 | */ |
42 | - rcu_read_lock(); | |
47 | + id = srcu_read_lock(&srcu); | |
43 | 48 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) |
44 | 49 | /* |
45 | 50 | * if ->release runs before mmu_notifier_unregister it |
... | ... | @@ -50,7 +55,7 @@ |
50 | 55 | */ |
51 | 56 | if (mn->ops->release) |
52 | 57 | mn->ops->release(mn, mm); |
53 | - rcu_read_unlock(); | |
58 | + srcu_read_unlock(&srcu, id); | |
54 | 59 | |
55 | 60 | spin_lock(&mm->mmu_notifier_mm->lock); |
56 | 61 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
... | ... | @@ -68,7 +73,7 @@ |
68 | 73 | spin_unlock(&mm->mmu_notifier_mm->lock); |
69 | 74 | |
70 | 75 | /* |
71 | - * synchronize_rcu here prevents mmu_notifier_release to | |
76 | + * synchronize_srcu here prevents mmu_notifier_release to | |
72 | 77 | * return to exit_mmap (which would proceed freeing all pages |
73 | 78 | * in the mm) until the ->release method returns, if it was |
74 | 79 | * invoked by mmu_notifier_unregister. |
... | ... | @@ -76,7 +81,7 @@ |
76 | 81 | * The mmu_notifier_mm can't go away from under us because one |
77 | 82 | * mm_count is hold by exit_mmap. |
78 | 83 | */ |
79 | - synchronize_rcu(); | |
84 | + synchronize_srcu(&srcu); | |
80 | 85 | } |
81 | 86 | |
82 | 87 | /* |
83 | 88 | |
84 | 89 | |
... | ... | @@ -89,14 +94,14 @@ |
89 | 94 | { |
90 | 95 | struct mmu_notifier *mn; |
91 | 96 | struct hlist_node *n; |
92 | - int young = 0; | |
97 | + int young = 0, id; | |
93 | 98 | |
94 | - rcu_read_lock(); | |
99 | + id = srcu_read_lock(&srcu); | |
95 | 100 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
96 | 101 | if (mn->ops->clear_flush_young) |
97 | 102 | young |= mn->ops->clear_flush_young(mn, mm, address); |
98 | 103 | } |
99 | - rcu_read_unlock(); | |
104 | + srcu_read_unlock(&srcu, id); | |
100 | 105 | |
101 | 106 | return young; |
102 | 107 | } |
103 | 108 | |
... | ... | @@ -106,9 +111,9 @@ |
106 | 111 | { |
107 | 112 | struct mmu_notifier *mn; |
108 | 113 | struct hlist_node *n; |
109 | - int young = 0; | |
114 | + int young = 0, id; | |
110 | 115 | |
111 | - rcu_read_lock(); | |
116 | + id = srcu_read_lock(&srcu); | |
112 | 117 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
113 | 118 | if (mn->ops->test_young) { |
114 | 119 | young = mn->ops->test_young(mn, mm, address); |
... | ... | @@ -116,7 +121,7 @@ |
116 | 121 | break; |
117 | 122 | } |
118 | 123 | } |
119 | - rcu_read_unlock(); | |
124 | + srcu_read_unlock(&srcu, id); | |
120 | 125 | |
121 | 126 | return young; |
122 | 127 | } |
123 | 128 | |
... | ... | @@ -126,8 +131,9 @@ |
126 | 131 | { |
127 | 132 | struct mmu_notifier *mn; |
128 | 133 | struct hlist_node *n; |
134 | + int id; | |
129 | 135 | |
130 | - rcu_read_lock(); | |
136 | + id = srcu_read_lock(&srcu); | |
131 | 137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
132 | 138 | if (mn->ops->change_pte) |
133 | 139 | mn->ops->change_pte(mn, mm, address, pte); |
... | ... | @@ -138,7 +144,7 @@ |
138 | 144 | else if (mn->ops->invalidate_page) |
139 | 145 | mn->ops->invalidate_page(mn, mm, address); |
140 | 146 | } |
141 | - rcu_read_unlock(); | |
147 | + srcu_read_unlock(&srcu, id); | |
142 | 148 | } |
143 | 149 | |
144 | 150 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, |
145 | 151 | |
146 | 152 | |
... | ... | @@ -146,13 +152,14 @@ |
146 | 152 | { |
147 | 153 | struct mmu_notifier *mn; |
148 | 154 | struct hlist_node *n; |
155 | + int id; | |
149 | 156 | |
150 | - rcu_read_lock(); | |
157 | + id = srcu_read_lock(&srcu); | |
151 | 158 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
152 | 159 | if (mn->ops->invalidate_page) |
153 | 160 | mn->ops->invalidate_page(mn, mm, address); |
154 | 161 | } |
155 | - rcu_read_unlock(); | |
162 | + srcu_read_unlock(&srcu, id); | |
156 | 163 | } |
157 | 164 | |
158 | 165 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, |
159 | 166 | |
160 | 167 | |
... | ... | @@ -160,13 +167,14 @@ |
160 | 167 | { |
161 | 168 | struct mmu_notifier *mn; |
162 | 169 | struct hlist_node *n; |
170 | + int id; | |
163 | 171 | |
164 | - rcu_read_lock(); | |
172 | + id = srcu_read_lock(&srcu); | |
165 | 173 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
166 | 174 | if (mn->ops->invalidate_range_start) |
167 | 175 | mn->ops->invalidate_range_start(mn, mm, start, end); |
168 | 176 | } |
169 | - rcu_read_unlock(); | |
177 | + srcu_read_unlock(&srcu, id); | |
170 | 178 | } |
171 | 179 | |
172 | 180 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, |
173 | 181 | |
174 | 182 | |
... | ... | @@ -174,13 +182,14 @@ |
174 | 182 | { |
175 | 183 | struct mmu_notifier *mn; |
176 | 184 | struct hlist_node *n; |
185 | + int id; | |
177 | 186 | |
178 | - rcu_read_lock(); | |
187 | + id = srcu_read_lock(&srcu); | |
179 | 188 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { |
180 | 189 | if (mn->ops->invalidate_range_end) |
181 | 190 | mn->ops->invalidate_range_end(mn, mm, start, end); |
182 | 191 | } |
183 | - rcu_read_unlock(); | |
192 | + srcu_read_unlock(&srcu, id); | |
184 | 193 | } |
185 | 194 | |
186 | 195 | static int do_mmu_notifier_register(struct mmu_notifier *mn, |
... | ... | @@ -192,6 +201,12 @@ |
192 | 201 | |
193 | 202 | BUG_ON(atomic_read(&mm->mm_users) <= 0); |
194 | 203 | |
204 | + /* | |
205 | + * Verify that mmu_notifier_init() already run and the global srcu is | |
206 | + * initialized. | |
207 | + */ | |
208 | + BUG_ON(!srcu.per_cpu_ref); | |
209 | + | |
195 | 210 | ret = -ENOMEM; |
196 | 211 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); |
197 | 212 | if (unlikely(!mmu_notifier_mm)) |
... | ... | @@ -274,8 +289,8 @@ |
274 | 289 | /* |
275 | 290 | * This releases the mm_count pin automatically and frees the mm |
276 | 291 | * structure if it was the last user of it. It serializes against |
277 | - * running mmu notifiers with RCU and against mmu_notifier_unregister | |
278 | - * with the unregister lock + RCU. All sptes must be dropped before | |
292 | + * running mmu notifiers with SRCU and against mmu_notifier_unregister | |
293 | + * with the unregister lock + SRCU. All sptes must be dropped before | |
279 | 294 | * calling mmu_notifier_unregister. ->release or any other notifier |
280 | 295 | * method may be invoked concurrently with mmu_notifier_unregister, |
281 | 296 | * and only after mmu_notifier_unregister returned we're guaranteed |
282 | 297 | |
... | ... | @@ -290,8 +305,9 @@ |
290 | 305 | * RCU here will force exit_mmap to wait ->release to finish |
291 | 306 | * before freeing the pages. |
292 | 307 | */ |
293 | - rcu_read_lock(); | |
308 | + int id; | |
294 | 309 | |
310 | + id = srcu_read_lock(&srcu); | |
295 | 311 | /* |
296 | 312 | * exit_mmap will block in mmu_notifier_release to |
297 | 313 | * guarantee ->release is called before freeing the |
... | ... | @@ -299,7 +315,7 @@ |
299 | 315 | */ |
300 | 316 | if (mn->ops->release) |
301 | 317 | mn->ops->release(mn, mm); |
302 | - rcu_read_unlock(); | |
318 | + srcu_read_unlock(&srcu, id); | |
303 | 319 | |
304 | 320 | spin_lock(&mm->mmu_notifier_mm->lock); |
305 | 321 | hlist_del_rcu(&mn->hlist); |
306 | 322 | |
... | ... | @@ -310,11 +326,18 @@ |
310 | 326 | * Wait any running method to finish, of course including |
311 | 327 | * ->release if it was run by mmu_notifier_release instead of us. |
312 | 328 | */ |
313 | - synchronize_rcu(); | |
329 | + synchronize_srcu(&srcu); | |
314 | 330 | |
315 | 331 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
316 | 332 | |
317 | 333 | mmdrop(mm); |
318 | 334 | } |
319 | 335 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); |
336 | + | |
337 | +static int __init mmu_notifier_init(void) | |
338 | +{ | |
339 | + return init_srcu_struct(&srcu); | |
340 | +} | |
341 | + | |
342 | +module_init(mmu_notifier_init); |