Commit 21a92735f660eaecf69a6f2e777f18463760ec32

Authored by Sagi Grimberg
Committed by Linus Torvalds
1 parent 48af0d7cb3

mm: mmu_notifier: have mmu_notifiers use a global SRCU so they may safely schedule

With an RCU-based mmu_notifier implementation, any callout to
mmu_notifier_invalidate_range_{start,end}() or
mmu_notifier_invalidate_page() must not call schedule(), as sleeping
there could allow the mmu_notifier structure to be modified while it is
still in use.

Since SRCU allocates four machine words per instance per CPU, using one
SRCU instance per mm could exhaust memory, so all mms share a single
global SRCU instance.  Note that during heavy mmu_notifier activity the
exit and unregister paths may stall for longer periods, but this is
tolerable for current mmu_notifier clients.
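
For illustration, a minimal hypothetical client sketch (not part of this
patch; all names here are invented): with the shared global SRCU, the
callouts above run inside srcu_read_lock()/srcu_read_unlock() instead of
rcu_read_lock(), so a callback such as ->invalidate_range_start() may now
take a sleeping lock:

	/* Hypothetical example client -- illustrative only. */
	#include <linux/mmu_notifier.h>
	#include <linux/mutex.h>

	static DEFINE_MUTEX(example_lock);	/* protects the client's own state */

	static void example_invalidate_range_start(struct mmu_notifier *mn,
						   struct mm_struct *mm,
						   unsigned long start,
						   unsigned long end)
	{
		/* Runs under srcu_read_lock(&srcu), so sleeping is allowed. */
		mutex_lock(&example_lock);
		/* ... invalidate secondary mappings covering [start, end) ... */
		mutex_unlock(&example_lock);
	}

	static const struct mmu_notifier_ops example_ops = {
		.invalidate_range_start	= example_invalidate_range_start,
	};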

Signed-off-by: Sagi Grimberg <sagig@mellanox.co.il>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Haggai Eran <haggaie@mellanox.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 49 additions and 25 deletions

include/linux/mmu_notifier.h
... ... @@ -4,6 +4,7 @@
4 4 #include <linux/list.h>
5 5 #include <linux/spinlock.h>
6 6 #include <linux/mm_types.h>
  7 +#include <linux/srcu.h>
7 8  
8 9 struct mmu_notifier;
9 10 struct mmu_notifier_ops;

mm/mmu_notifier.c
... ... @@ -14,10 +14,14 @@
14 14 #include <linux/export.h>
15 15 #include <linux/mm.h>
16 16 #include <linux/err.h>
  17 +#include <linux/srcu.h>
17 18 #include <linux/rcupdate.h>
18 19 #include <linux/sched.h>
19 20 #include <linux/slab.h>
20 21  
  22 +/* global SRCU for all MMs */
  23 +struct srcu_struct srcu;
  24 +
21 25 /*
22 26 * This function can't run concurrently against mmu_notifier_register
23 27 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
... ... @@ -25,8 +29,8 @@
25 29 * in parallel despite there being no task using this mm any more,
26 30 * through the vmas outside of the exit_mmap context, such as with
27 31 * vmtruncate. This serializes against mmu_notifier_unregister with
28   - * the mmu_notifier_mm->lock in addition to RCU and it serializes
29   - * against the other mmu notifiers with RCU. struct mmu_notifier_mm
  32 + * the mmu_notifier_mm->lock in addition to SRCU and it serializes
  33 + * against the other mmu notifiers with SRCU. struct mmu_notifier_mm
30 34 * can't go away from under us as exit_mmap holds an mm_count pin
31 35 * itself.
32 36 */
33 37  
... ... @@ -34,12 +38,13 @@
34 38 {
35 39 struct mmu_notifier *mn;
36 40 struct hlist_node *n;
  41 + int id;
37 42  
38 43 /*
39 44 * RCU here will block mmu_notifier_unregister until
40 45 * ->release returns.
41 46 */
42   - rcu_read_lock();
  47 + id = srcu_read_lock(&srcu);
43 48 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 49 /*
45 50 * if ->release runs before mmu_notifier_unregister it
... ... @@ -50,7 +55,7 @@
50 55 */
51 56 if (mn->ops->release)
52 57 mn->ops->release(mn, mm);
53   - rcu_read_unlock();
  58 + srcu_read_unlock(&srcu, id);
54 59  
55 60 spin_lock(&mm->mmu_notifier_mm->lock);
56 61 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
... ... @@ -68,7 +73,7 @@
68 73 spin_unlock(&mm->mmu_notifier_mm->lock);
69 74  
70 75 /*
71   - * synchronize_rcu here prevents mmu_notifier_release to
  76 + * synchronize_srcu here prevents mmu_notifier_release to
72 77 * return to exit_mmap (which would proceed freeing all pages
73 78 * in the mm) until the ->release method returns, if it was
74 79 * invoked by mmu_notifier_unregister.
... ... @@ -76,7 +81,7 @@
76 81 * The mmu_notifier_mm can't go away from under us because one
77 82 * mm_count is hold by exit_mmap.
78 83 */
79   - synchronize_rcu();
  84 + synchronize_srcu(&srcu);
80 85 }
81 86  
82 87 /*
83 88  
84 89  
... ... @@ -89,14 +94,14 @@
89 94 {
90 95 struct mmu_notifier *mn;
91 96 struct hlist_node *n;
92   - int young = 0;
  97 + int young = 0, id;
93 98  
94   - rcu_read_lock();
  99 + id = srcu_read_lock(&srcu);
95 100 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
96 101 if (mn->ops->clear_flush_young)
97 102 young |= mn->ops->clear_flush_young(mn, mm, address);
98 103 }
99   - rcu_read_unlock();
  104 + srcu_read_unlock(&srcu, id);
100 105  
101 106 return young;
102 107 }
103 108  
... ... @@ -106,9 +111,9 @@
106 111 {
107 112 struct mmu_notifier *mn;
108 113 struct hlist_node *n;
109   - int young = 0;
  114 + int young = 0, id;
110 115  
111   - rcu_read_lock();
  116 + id = srcu_read_lock(&srcu);
112 117 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
113 118 if (mn->ops->test_young) {
114 119 young = mn->ops->test_young(mn, mm, address);
... ... @@ -116,7 +121,7 @@
116 121 break;
117 122 }
118 123 }
119   - rcu_read_unlock();
  124 + srcu_read_unlock(&srcu, id);
120 125  
121 126 return young;
122 127 }
123 128  
... ... @@ -126,8 +131,9 @@
126 131 {
127 132 struct mmu_notifier *mn;
128 133 struct hlist_node *n;
  134 + int id;
129 135  
130   - rcu_read_lock();
  136 + id = srcu_read_lock(&srcu);
131 137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
132 138 if (mn->ops->change_pte)
133 139 mn->ops->change_pte(mn, mm, address, pte);
... ... @@ -138,7 +144,7 @@
138 144 else if (mn->ops->invalidate_page)
139 145 mn->ops->invalidate_page(mn, mm, address);
140 146 }
141   - rcu_read_unlock();
  147 + srcu_read_unlock(&srcu, id);
142 148 }
143 149  
144 150 void __mmu_notifier_invalidate_page(struct mm_struct *mm,
145 151  
146 152  
... ... @@ -146,13 +152,14 @@
146 152 {
147 153 struct mmu_notifier *mn;
148 154 struct hlist_node *n;
  155 + int id;
149 156  
150   - rcu_read_lock();
  157 + id = srcu_read_lock(&srcu);
151 158 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
152 159 if (mn->ops->invalidate_page)
153 160 mn->ops->invalidate_page(mn, mm, address);
154 161 }
155   - rcu_read_unlock();
  162 + srcu_read_unlock(&srcu, id);
156 163 }
157 164  
158 165 void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
159 166  
160 167  
... ... @@ -160,13 +167,14 @@
160 167 {
161 168 struct mmu_notifier *mn;
162 169 struct hlist_node *n;
  170 + int id;
163 171  
164   - rcu_read_lock();
  172 + id = srcu_read_lock(&srcu);
165 173 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
166 174 if (mn->ops->invalidate_range_start)
167 175 mn->ops->invalidate_range_start(mn, mm, start, end);
168 176 }
169   - rcu_read_unlock();
  177 + srcu_read_unlock(&srcu, id);
170 178 }
171 179  
172 180 void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
173 181  
174 182  
... ... @@ -174,13 +182,14 @@
174 182 {
175 183 struct mmu_notifier *mn;
176 184 struct hlist_node *n;
  185 + int id;
177 186  
178   - rcu_read_lock();
  187 + id = srcu_read_lock(&srcu);
179 188 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
180 189 if (mn->ops->invalidate_range_end)
181 190 mn->ops->invalidate_range_end(mn, mm, start, end);
182 191 }
183   - rcu_read_unlock();
  192 + srcu_read_unlock(&srcu, id);
184 193 }
185 194  
186 195 static int do_mmu_notifier_register(struct mmu_notifier *mn,
... ... @@ -192,6 +201,12 @@
192 201  
193 202 BUG_ON(atomic_read(&mm->mm_users) <= 0);
194 203  
  204 + /*
  205 + * Verify that mmu_notifier_init() already run and the global srcu is
  206 + * initialized.
  207 + */
  208 + BUG_ON(!srcu.per_cpu_ref);
  209 +
195 210 ret = -ENOMEM;
196 211 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
197 212 if (unlikely(!mmu_notifier_mm))
... ... @@ -274,8 +289,8 @@
274 289 /*
275 290 * This releases the mm_count pin automatically and frees the mm
276 291 * structure if it was the last user of it. It serializes against
277   - * running mmu notifiers with RCU and against mmu_notifier_unregister
278   - * with the unregister lock + RCU. All sptes must be dropped before
  292 + * running mmu notifiers with SRCU and against mmu_notifier_unregister
  293 + * with the unregister lock + SRCU. All sptes must be dropped before
279 294 * calling mmu_notifier_unregister. ->release or any other notifier
280 295 * method may be invoked concurrently with mmu_notifier_unregister,
281 296 * and only after mmu_notifier_unregister returned we're guaranteed
282 297  
... ... @@ -290,8 +305,9 @@
290 305 * RCU here will force exit_mmap to wait ->release to finish
291 306 * before freeing the pages.
292 307 */
293   - rcu_read_lock();
  308 + int id;
294 309  
  310 + id = srcu_read_lock(&srcu);
295 311 /*
296 312 * exit_mmap will block in mmu_notifier_release to
297 313 * guarantee ->release is called before freeing the
... ... @@ -299,7 +315,7 @@
299 315 */
300 316 if (mn->ops->release)
301 317 mn->ops->release(mn, mm);
302   - rcu_read_unlock();
  318 + srcu_read_unlock(&srcu, id);
303 319  
304 320 spin_lock(&mm->mmu_notifier_mm->lock);
305 321 hlist_del_rcu(&mn->hlist);
306 322  
... ... @@ -310,11 +326,18 @@
310 326 * Wait any running method to finish, of course including
311 327 * ->release if it was run by mmu_notifier_relase instead of us.
312 328 */
313   - synchronize_rcu();
  329 + synchronize_srcu(&srcu);
314 330  
315 331 BUG_ON(atomic_read(&mm->mm_count) <= 0);
316 332  
317 333 mmdrop(mm);
318 334 }
319 335 EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
  336 +
  337 +static int __init mmu_notifier_init(void)
  338 +{
  339 + return init_srcu_struct(&srcu);
  340 +}
  341 +
  342 +module_init(mmu_notifier_init);
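
For context, continuing the hypothetical client sketched above (again
illustrative, not part of this commit): registration itself is unchanged
by this patch, only the read-side primitive around the callouts changes.

	static struct mmu_notifier example_mn = {
		.ops	= &example_ops,
	};

	static int example_attach(struct mm_struct *mm)
	{
		/* mmu_notifier_register() takes mmap_sem internally. */
		return mmu_notifier_register(&example_mn, mm);
	}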