Commit c5a647d09fe9fc3e0241c89845cf8e6220b916f5

Authored by Kirill A. Shutemov
Committed by Linus Torvalds
1 parent e180377f1a

thp: implement splitting pmd for huge zero page

We can't split the huge zero page itself (and it's a bug if we try), but
we can split a pmd which points to it.

On splitting such a pmd, we create a page table with all ptes set to the
normal zero page.

[akpm@linux-foundation.org: fix build error]
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: "H. Peter Anvin" <hpa@linux.intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
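
Before the diff, a hedged userspace sketch (not part of the commit) of one
way such a split can be triggered: with THP and the huge zero page enabled,
a read fault on an untouched anonymous region may be backed by the huge zero
page under a single pmd, and a later mprotect() of a sub-range then forces
the kernel to split that pmd, which is the case handled below. Whether a
huge-zero pmd is actually installed depends on alignment and THP
configuration, so treat this as an illustration only.

#define _GNU_SOURCE
#include <sys/mman.h>

#define LEN (2UL << 20)         /* one pmd's worth on x86-64: 2MB */

int main(void)
{
        /* Private anonymous mapping, hinted for THP (alignment is not
         * forced in this sketch). */
        char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;
        madvise(p, LEN, MADV_HUGEPAGE);

        volatile char c = p[0]; /* read fault: may install a huge-zero pmd */
        (void)c;

        /* Changing protection on a sub-range cannot keep the huge pmd, so
         * the kernel has to split it: with this patch, a huge-zero pmd is
         * replaced by a pte table full of normal zero pages. */
        mprotect(p, 4096, PROT_READ);

        munmap(p, LEN);
        return 0;
}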

Showing 1 changed file with 42 additions and 1 deletion

@@ -1616,6 +1616,7 @@
 	struct anon_vma *anon_vma;
 	int ret = 1;
 
+	BUG_ON(is_huge_zero_pfn(page_to_pfn(page)));
 	BUG_ON(!PageAnon(page));
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
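
For context (not part of the diff): the new BUG_ON documents the contract
that split_huge_page() must never be handed the huge zero page; callers are
expected to split at the pmd level, where the zero-page case added below is
handled first. A rough caller-side fragment, assuming the
split_huge_page_pmd() interface from the parent commit e180377f1a (a sketch,
not code from this patch):

	/* Sketch only: split at the pmd level; a huge-zero pmd becomes a
	 * pte table of zero pages, a real THP falls through to
	 * split_huge_page(). */
	if (pmd_trans_huge(*pmd))
		split_huge_page_pmd(vma, address, pmd);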
@@ -2475,24 +2476,64 @@
 	return 0;
 }
 
+static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
+		unsigned long haddr, pmd_t *pmd)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgtable_t pgtable;
+	pmd_t _pmd;
+	int i;
+
+	pmdp_clear_flush(vma, haddr, pmd);
+	/* leave pmd empty until pte is filled */
+
+	pgtable = pgtable_trans_huge_withdraw(mm);
+	pmd_populate(mm, &_pmd, pgtable);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+		pte_t *pte, entry;
+		entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+		entry = pte_mkspecial(entry);
+		pte = pte_offset_map(&_pmd, haddr);
+		VM_BUG_ON(!pte_none(*pte));
+		set_pte_at(mm, haddr, pte, entry);
+		pte_unmap(pte);
+	}
+	smp_wmb(); /* make pte visible before pmd */
+	pmd_populate(mm, pmd, pgtable);
+}
+
 void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
 		pmd_t *pmd)
 {
 	struct page *page;
-	unsigned long haddr = address & HPAGE_PMD_MASK;
 	struct mm_struct *mm = vma->vm_mm;
+	unsigned long haddr = address & HPAGE_PMD_MASK;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
 
 	BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
 
+	mmun_start = haddr;
+	mmun_end = haddr + HPAGE_PMD_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	spin_lock(&mm->page_table_lock);
 	if (unlikely(!pmd_trans_huge(*pmd))) {
 		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 		return;
 	}
+	if (is_huge_zero_pmd(*pmd)) {
+		__split_huge_zero_page_pmd(vma, haddr, pmd);
+		spin_unlock(&mm->page_table_lock);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+		return;
+	}
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!page_count(page));
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	split_huge_page(page);
 
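
For reference, the is_huge_zero_pfn()/is_huge_zero_pmd() helpers used above
come from the earlier huge-zero-page patches in this series, not from this
commit. Roughly (a sketch of those helpers as I recall them, assumptions
rather than part of this diff):

	/* Sketch of the helpers assumed by the diff (added earlier in the
	 * series): the single huge zero page's pfn is cached and compared. */
	static unsigned long huge_zero_pfn __read_mostly;

	static inline bool is_huge_zero_pfn(unsigned long pfn)
	{
		return huge_zero_pfn && pfn == huge_zero_pfn;
	}

	static inline bool is_huge_zero_pmd(pmd_t pmd)
	{
		return is_huge_zero_pfn(pmd_pfn(pmd));
	}

Note also the ordering in __split_huge_zero_page_pmd(): the pmd is cleared
and flushed first, the ptes are filled in the deposited page table, and only
after the smp_wmb() is the pmd repopulated, so anyone who sees the new pmd
sees a fully populated pte table. Each pte is made pte_special() so that
vm_normal_page() does not treat the shared zero page as a normal refcounted
page.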