Commit 7a3ef208e662f4b63d43a23f61a64a129c525bbc

Authored by Konstantin Khlebnikov
Committed by Linus Torvalds
1 parent 3245d6acab

mm: prevent endless growth of anon_vma hierarchy

A constantly forking task causes unlimited growth of the anon_vma chain.
Each successive child allocates a new level of anon_vmas and links its
vmas to all previous levels, because pages might be inherited from any
level.

This patch adds a heuristic which decides whether to reuse an existing
anon_vma instead of forking a new one.  It adds a counter,
anon_vma->degree, which counts linked vmas and directly descending
anon_vmas, and reuses an anon_vma if its counter is lower than two.  As a
result, each anon_vma has either a vma or at least two descending
anon_vmas.  In such trees half of the nodes are leaves with live vmas,
thus the count of anon_vmas is no more than twice the count of vmas.

This heuristic reuses anon_vmas as rarely as possible, because each reuse
adds false aliasing among vmas, and the rmap walker has to scan more ptes
when it searches for where a page might be mapped.

Link: http://lkml.kernel.org/r/20120816024610.GA5350@evergreen.ssec.wisc.edu
Fixes: 5beb49305251 ("mm: change anon_vma linking to fix multi-process server scalability issue")
[akpm@linux-foundation.org: fix typo, per Rik]
Signed-off-by: Konstantin Khlebnikov <koct9i@gmail.com>
Reported-by: Daniel Forrest <dan.forrest@ssec.wisc.edu>
Tested-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Jerome Marchand <jmarchan@redhat.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>	[2.6.34+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 2 changed files with 51 additions and 1 deletions Side-by-side Diff

include/linux/rmap.h
... ... @@ -37,6 +37,16 @@
37 37 atomic_t refcount;
38 38  
39 39 /*
  40 + * Count of child anon_vmas and VMAs which point to this anon_vma.
  41 + *
  42 + * This counter is used for making decision about reusing anon_vma
  43 + * instead of forking new one. See comments in function anon_vma_clone.
  44 + */
  45 + unsigned degree;
  46 +
  47 + struct anon_vma *parent; /* Parent of this anon_vma */
  48 +
  49 + /*
40 50 * NOTE: the LSB of the rb_root.rb_node is set by
41 51 * mm_take_all_locks() _after_ taking the above lock. So the
42 52 * rb_root must only be read/written after taking the above lock
... ... @@ -72,6 +72,8 @@
72 72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
73 73 if (anon_vma) {
74 74 atomic_set(&anon_vma->refcount, 1);
  75 + anon_vma->degree = 1; /* Reference for first vma */
  76 + anon_vma->parent = anon_vma;
75 77 /*
76 78 * Initialise the anon_vma root to point to itself. If called
77 79 * from fork, the root will be reset to the parents anon_vma.
... ... @@ -188,6 +190,8 @@
188 190 if (likely(!vma->anon_vma)) {
189 191 vma->anon_vma = anon_vma;
190 192 anon_vma_chain_link(vma, avc, anon_vma);
  193 + /* vma reference or self-parent link for new root */
  194 + anon_vma->degree++;
191 195 allocated = NULL;
192 196 avc = NULL;
193 197 }
... ... @@ -236,6 +240,14 @@
236 240 /*
237 241 * Attach the anon_vmas from src to dst.
238 242 * Returns 0 on success, -ENOMEM on failure.
  243 + *
  244 + * If dst->anon_vma is NULL this function tries to find and reuse existing
  245 + * anon_vma which has no vmas and only one child anon_vma. This prevents
  246 + * degradation of anon_vma hierarchy to endless linear chain in case of
  247 + * constantly forking task. On the other hand, an anon_vma with more than one
  248 + * child isn't reused even if there was no alive vma, thus rmap walker has a
  249 + * good chance of avoiding scanning the whole hierarchy when it searches where
  250 + * page is mapped.
239 251 */
240 252 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
241 253 {
242 254  
... ... @@ -256,7 +268,21 @@
256 268 anon_vma = pavc->anon_vma;
257 269 root = lock_anon_vma_root(root, anon_vma);
258 270 anon_vma_chain_link(dst, avc, anon_vma);
  271 +
  272 + /*
  273 + * Reuse existing anon_vma if its degree is lower than two,
  274 + * that means it has no vma and only one anon_vma child.
  275 + *
  276 + * Do not choose parent anon_vma, otherwise first child
  277 + * will always reuse it. Root anon_vma is never reused:
  278 + * it has self-parent reference and at least one child.
  279 + */
  280 + if (!dst->anon_vma && anon_vma != src->anon_vma &&
  281 + anon_vma->degree < 2)
  282 + dst->anon_vma = anon_vma;
259 283 }
  284 + if (dst->anon_vma)
  285 + dst->anon_vma->degree++;
260 286 unlock_anon_vma_root(root);
261 287 return 0;
262 288  
... ... @@ -280,6 +306,9 @@
280 306 if (!pvma->anon_vma)
281 307 return 0;
282 308  
  309 + /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
  310 + vma->anon_vma = NULL;
  311 +
283 312 /*
284 313 * First, attach the new VMA to the parent VMA's anon_vmas,
285 314 * so rmap can find non-COWed pages in child processes.
... ... @@ -288,6 +317,10 @@
288 317 if (error)
289 318 return error;
290 319  
  320 + /* An existing anon_vma has been reused, all done then. */
  321 + if (vma->anon_vma)
  322 + return 0;
  323 +
291 324 /* Then add our own anon_vma. */
292 325 anon_vma = anon_vma_alloc();
293 326 if (!anon_vma)
... ... @@ -301,6 +334,7 @@
301 334 * lock any of the anon_vmas in this anon_vma tree.
302 335 */
303 336 anon_vma->root = pvma->anon_vma->root;
  337 + anon_vma->parent = pvma->anon_vma;
304 338 /*
305 339 * With refcounts, an anon_vma can stay around longer than the
306 340 * process it belongs to. The root anon_vma needs to be pinned until
... ... @@ -311,6 +345,7 @@
311 345 vma->anon_vma = anon_vma;
312 346 anon_vma_lock_write(anon_vma);
313 347 anon_vma_chain_link(vma, avc, anon_vma);
  348 + anon_vma->parent->degree++;
314 349 anon_vma_unlock_write(anon_vma);
315 350  
316 351 return 0;
317 352  
318 353  
... ... @@ -341,12 +376,16 @@
341 376 * Leave empty anon_vmas on the list - we'll need
342 377 * to free them outside the lock.
343 378 */
344   - if (RB_EMPTY_ROOT(&anon_vma->rb_root))
  379 + if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
  380 + anon_vma->parent->degree--;
345 381 continue;
  382 + }
346 383  
347 384 list_del(&avc->same_vma);
348 385 anon_vma_chain_free(avc);
349 386 }
  387 + if (vma->anon_vma)
  388 + vma->anon_vma->degree--;
350 389 unlock_anon_vma_root(root);
351 390  
352 391 /*
... ... @@ -357,6 +396,7 @@
357 396 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
358 397 struct anon_vma *anon_vma = avc->anon_vma;
359 398  
  399 + BUG_ON(anon_vma->degree);
360 400 put_anon_vma(anon_vma);
361 401  
362 402 list_del(&avc->same_vma);