Commit 044d66c1d2b1c5aa50b4d6d68c21c6c93dd678da

Authored by Hugh Dickins
Committed by Linus Torvalds
1 parent 3062fc67da

memcgroup: reinstate swapoff mod

This patch reinstates the "swapoff: scan ptes preemptibly" mod we started
with: in due course it should be rendered down into the earlier patches,
leaving us with a more straightforward mem_cgroup_charge mod to unuse_pte,
allocating with GFP_KERNEL while holding no spinlock and no atomic kmap.
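
To make the ordering concrete, here is a condensed sketch of unuse_pte() as it looks after this patch; it is abridged from the diff below (the diff is authoritative), with the pte-installation step elided.  The reinstated mod shows up purely in the ordering: the possibly-sleeping mem_cgroup_charge(..., GFP_KERNEL) runs first, with no spinlock held and no atomic kmap in place, and only then is the pte mapped and locked and the swap entry rechecked, because the caller now scans the page table without holding the pte lock.

static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct page *page)
{
        spinlock_t *ptl;
        pte_t *pte;
        int ret = 1;

        /* May sleep: no spinlock is held and no atomic kmap is mapped. */
        if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
                ret = -ENOMEM;

        /*
         * Only now map and lock the pte, and recheck the swap entry:
         * the caller scanned the ptes without the lock, so the entry
         * may have been zapped in the meantime.
         */
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
                if (ret > 0)            /* charge succeeded, so undo it */
                        mem_cgroup_uncharge_page(page);
                ret = 0;
                goto out;
        }

        /* ... install the pte for the swapped-in page, as in the diff ... */
out:
        pte_unmap_unlock(pte, ptl);
        return ret;
}

This is also why unuse_pte() now takes a pmd_t * instead of a pte_t *: the caller drops its pte mapping (and any atomic kmap) before calling in, and unuse_pte() re-establishes the mapping under the pte lock itself.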

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 34 additions and 8 deletions

@@ -507,12 +507,24 @@
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long addr, swp_entry_t entry, struct page *page)
 {
+        spinlock_t *ptl;
+        pte_t *pte;
+        int ret = 1;
+
         if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
-                return -ENOMEM;
+                ret = -ENOMEM;
 
+        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+        if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
+                if (ret > 0)
+                        mem_cgroup_uncharge_page(page);
+                ret = 0;
+                goto out;
+        }
+
         inc_mm_counter(vma->vm_mm, anon_rss);
         get_page(page);
         set_pte_at(vma->vm_mm, addr, pte,
@@ -524,7 +536,9 @@
          * immediately swapped out again after swapon.
          */
         activate_page(page);
-        return 1;
+out:
+        pte_unmap_unlock(pte, ptl);
+        return ret;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -533,21 +547,33 @@
 {
         pte_t swp_pte = swp_entry_to_pte(entry);
         pte_t *pte;
-        spinlock_t *ptl;
         int ret = 0;
 
-        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+        /*
+         * We don't actually need pte lock while scanning for swp_pte: since
+         * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
+         * page table while we're scanning; though it could get zapped, and on
+         * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
+         * of unmatched parts which look like swp_pte, so unuse_pte must
+         * recheck under pte lock.  Scanning without pte lock lets it be
+         * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+         */
+        pte = pte_offset_map(pmd, addr);
         do {
                 /*
                  * swapoff spends a _lot_ of time in this loop!
                  * Test inline before going to call unuse_pte.
                  */
                 if (unlikely(pte_same(*pte, swp_pte))) {
-                        ret = unuse_pte(vma, pte++, addr, entry, page);
-                        break;
+                        pte_unmap(pte);
+                        ret = unuse_pte(vma, pmd, addr, entry, page);
+                        if (ret)
+                                goto out;
+                        pte = pte_offset_map(pmd, addr);
                 }
         } while (pte++, addr += PAGE_SIZE, addr != end);
-        pte_unmap_unlock(pte - 1, ptl);
+        pte_unmap(pte - 1);
+out:
         return ret;
 }
 