Commit 3f926ab945b60a5824369d21add7710622a2eac0

Authored by Mel Gorman
Committed by Ingo Molnar
1 parent c61109e34f

mm: Close races between THP migration and PMD numa clearing

THP migration uses the page lock to guard against parallel migrations,
but cases like the following are still open

  Task A					Task B
  ---------------------				---------------------
  do_huge_pmd_numa_page				do_huge_pmd_numa_page
  lock_page
  mpol_misplaced == -1
  unlock_page
  goto clear_pmdnuma
						lock_page
						mpol_misplaced == 2
						migrate_misplaced_transhuge
  pmd = pmd_mknonnuma
  set_pmd_at

During hours of testing, one machine crashed with weird errors and, while
I have no direct evidence, I suspect something like the race above
happened. This patch extends the page lock to be held until pmd_numa is
cleared, so that a migration cannot start in parallel while the bit is
being cleared. It also flushes the old pmd entry and orders the pagetable
insertion before the rmap insertion.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: <stable@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
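
To make the window concrete, here is a minimal userspace sketch of the
pattern the patch enforces. It is an illustration only, not kernel code:
the pthread mutex standing in for the page lock, the pmd_numa and
migrating flags, and the task_a/task_b helpers are all hypothetical
names. The point is that the "correctly placed, just clear the hint"
path must hold the lock across both the check and the clear, exactly as
the patch keeps the page lock held until set_pmd_at() has run.

  /*
   * Hypothetical userspace sketch, not kernel code: a pthread mutex stands
   * in for the THP page lock, and two flags stand in for the pmd_numa bit
   * and an in-flight migration. task_a clears the NUMA hint, task_b
   * migrates. Because task_a holds the lock across the check and the
   * clear, task_b can never begin a migration in the middle of task_a's
   * update, which is the window described in the changelog above.
   */
  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
  static bool pmd_numa = true;	/* the NUMA hinting bit is still set */
  static bool migrating;	/* a migration is in flight */

  /* task_a: the page is correctly placed, so just clear the hinting bit. */
  static void *task_a(void *arg)
  {
  	(void)arg;
  	pthread_mutex_lock(&page_lock);
  	/* With the lock held, no migration can be in flight here. */
  	if (!migrating)
  		pmd_numa = false;	/* analogue of pmd_mknonnuma + set_pmd_at */
  	pthread_mutex_unlock(&page_lock);
  	return NULL;
  }

  /* task_b: the page is misplaced, so migrate it under the same lock. */
  static void *task_b(void *arg)
  {
  	(void)arg;
  	pthread_mutex_lock(&page_lock);
  	if (pmd_numa) {		/* still worth migrating? */
  		migrating = true;
  		/* ... copy the page and switch the mapping over ... */
  		migrating = false;
  		pmd_numa = false;
  	}
  	pthread_mutex_unlock(&page_lock);
  	return NULL;
  }

  int main(void)
  {
  	pthread_t a, b;

  	pthread_create(&a, NULL, task_a, NULL);
  	pthread_create(&b, NULL, task_b, NULL);
  	pthread_join(a, NULL);
  	pthread_join(b, NULL);
  	printf("pmd_numa=%d migrating=%d\n", pmd_numa, migrating);
  	return 0;
  }

Built with cc -pthread, this always ends with both flags clear; the
pre-patch bug corresponds to task_a dropping the lock before clearing
the flag, which opens exactly the window Task B exploits in the table
above.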

2 changed files with 26 additions and 26 deletions

@@ -1304,24 +1304,25 @@
 	target_nid = mpol_misplaced(page, vma, haddr);
 	if (target_nid == -1) {
 		/* If the page was locked, there are no parallel migrations */
-		if (page_locked) {
-			unlock_page(page);
+		if (page_locked)
 			goto clear_pmdnuma;
-		}

-		/* Otherwise wait for potential migrations and retry fault */
+		/*
+		 * Otherwise wait for potential migrations and retry. We do
+		 * relock and check_same as the page may no longer be mapped.
+		 * As the fault is being retried, do not account for it.
+		 */
 		spin_unlock(&mm->page_table_lock);
 		wait_on_page_locked(page);
+		page_nid = -1;
 		goto out;
 	}

 	/* Page is misplaced, serialise migrations and parallel THP splits */
 	get_page(page);
 	spin_unlock(&mm->page_table_lock);
-	if (!page_locked) {
+	if (!page_locked)
 		lock_page(page);
-		page_locked = true;
-	}
 	anon_vma = page_lock_anon_vma_read(page);

 	/* Confirm the PTE did not change while locked */
@@ -1329,32 +1330,28 @@
 	if (unlikely(!pmd_same(pmd, *pmdp))) {
 		unlock_page(page);
 		put_page(page);
+		page_nid = -1;
 		goto out_unlock;
 	}

-	/* Migrate the THP to the requested node */
+	/*
+	 * Migrate the THP to the requested node, returns with page unlocked
+	 * and pmd_numa cleared.
+	 */
 	spin_unlock(&mm->page_table_lock);
 	migrated = migrate_misplaced_transhuge_page(mm, vma,
 				pmdp, pmd, addr, page, target_nid);
 	if (migrated)
 		page_nid = target_nid;
-	else
-		goto check_same;

 	goto out;
-
-check_same:
-	spin_lock(&mm->page_table_lock);
-	if (unlikely(!pmd_same(pmd, *pmdp))) {
-		/* Someone else took our fault */
-		page_nid = -1;
-		goto out_unlock;
-	}
 clear_pmdnuma:
+	BUG_ON(!PageLocked(page));
 	pmd = pmd_mknonnuma(pmd);
 	set_pmd_at(mm, haddr, pmdp, pmd);
 	VM_BUG_ON(pmd_numa(*pmdp));
 	update_mmu_cache_pmd(vma, addr, pmdp);
+	unlock_page(page);
 out_unlock:
 	spin_unlock(&mm->page_table_lock);

@@ -1715,12 +1715,12 @@
 		unlock_page(new_page);
 		put_page(new_page);		/* Free it */

-		unlock_page(page);
+		/* Retake the callers reference and putback on LRU */
+		get_page(page);
 		putback_lru_page(page);
-
-		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-		isolated = 0;
-		goto out;
+		mod_zone_page_state(page_zone(page),
+			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+		goto out_fail;
 	}

 	/*
@@ -1737,9 +1737,9 @@
 	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	entry = pmd_mkhuge(entry);

-	page_add_new_anon_rmap(new_page, vma, haddr);
-
+	pmdp_clear_flush(vma, haddr, pmd);
 	set_pmd_at(mm, haddr, pmd, entry);
+	page_add_new_anon_rmap(new_page, vma, haddr);
 	update_mmu_cache_pmd(vma, address, &entry);
 	page_remove_rmap(page);
 	/*
@@ -1758,7 +1758,6 @@
 	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);

-out:
 	mod_zone_page_state(page_zone(page),
 			NR_ISOLATED_ANON + page_lru,
 			-HPAGE_PMD_NR);
@@ -1767,6 +1766,10 @@
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
+	entry = pmd_mknonnuma(entry);
+	set_pmd_at(mm, haddr, pmd, entry);
+	update_mmu_cache_pmd(vma, address, &entry);
+
 	unlock_page(page);
 	put_page(page);
 	return 0;