Commit a54a407fbf7735fd8f7841375574f5d9b0375f93

Authored by Mel Gorman
Committed by Ingo Molnar
1 parent 8191acbd30

mm: Close races between THP migration and PMD numa clearing

THP migration uses the page lock to guard against parallel allocations,
but there are still cases like the following left open

  Task A					Task B
  ---------------------				---------------------
  do_huge_pmd_numa_page				do_huge_pmd_numa_page
  lock_page
  mpol_misplaced == -1
  unlock_page
  goto clear_pmdnuma
						lock_page
						mpol_misplaced == 2
						migrate_misplaced_transhuge
  pmd = pmd_mknonnuma
  set_pmd_at

During hours of testing, one test machine crashed with weird errors and while
I have no direct evidence, I suspect something like the race above happened.
This patch extends the page lock to being held until the pmd_numa is
cleared to prevent migration starting in parallel while the pmd_numa is
being cleared. It also flushes the old pmd entry and orders pagetable
insertion before rmap insertion.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Showing 2 changed files with 26 additions and 26 deletions Side-by-side Diff

... ... @@ -1304,24 +1304,25 @@
1304 1304 target_nid = mpol_misplaced(page, vma, haddr);
1305 1305 if (target_nid == -1) {
1306 1306 /* If the page was locked, there are no parallel migrations */
1307   - if (page_locked) {
1308   - unlock_page(page);
  1307 + if (page_locked)
1309 1308 goto clear_pmdnuma;
1310   - }
1311 1309  
1312   - /* Otherwise wait for potential migrations and retry fault */
  1310 + /*
  1311 + * Otherwise wait for potential migrations and retry. We do
  1312 + * relock and check_same as the page may no longer be mapped.
  1313 + * As the fault is being retried, do not account for it.
  1314 + */
1313 1315 spin_unlock(&mm->page_table_lock);
1314 1316 wait_on_page_locked(page);
  1317 + page_nid = -1;
1315 1318 goto out;
1316 1319 }
1317 1320  
1318 1321 /* Page is misplaced, serialise migrations and parallel THP splits */
1319 1322 get_page(page);
1320 1323 spin_unlock(&mm->page_table_lock);
1321   - if (!page_locked) {
  1324 + if (!page_locked)
1322 1325 lock_page(page);
1323   - page_locked = true;
1324   - }
1325 1326 anon_vma = page_lock_anon_vma_read(page);
1326 1327  
1327 1328 /* Confirm the PMD did not change while page_table_lock was released */
1328 1329  
1329 1330  
1330 1331  
1331 1332  
1332 1333  
... ... @@ -1329,32 +1330,28 @@
1329 1330 if (unlikely(!pmd_same(pmd, *pmdp))) {
1330 1331 unlock_page(page);
1331 1332 put_page(page);
  1333 + page_nid = -1;
1332 1334 goto out_unlock;
1333 1335 }
1334 1336  
1335   - /* Migrate the THP to the requested node */
  1337 + /*
  1338 + * Migrate the THP to the requested node, returns with page unlocked
  1339 + * and pmd_numa cleared.
  1340 + */
1336 1341 spin_unlock(&mm->page_table_lock);
1337 1342 migrated = migrate_misplaced_transhuge_page(mm, vma,
1338 1343 pmdp, pmd, addr, page, target_nid);
1339 1344 if (migrated)
1340 1345 page_nid = target_nid;
1341   - else
1342   - goto check_same;
1343 1346  
1344 1347 goto out;
1345   -
1346   -check_same:
1347   - spin_lock(&mm->page_table_lock);
1348   - if (unlikely(!pmd_same(pmd, *pmdp))) {
1349   - /* Someone else took our fault */
1350   - page_nid = -1;
1351   - goto out_unlock;
1352   - }
1353 1348 clear_pmdnuma:
  1349 + BUG_ON(!PageLocked(page));
1354 1350 pmd = pmd_mknonnuma(pmd);
1355 1351 set_pmd_at(mm, haddr, pmdp, pmd);
1356 1352 VM_BUG_ON(pmd_numa(*pmdp));
1357 1353 update_mmu_cache_pmd(vma, addr, pmdp);
  1354 + unlock_page(page);
1358 1355 out_unlock:
1359 1356 spin_unlock(&mm->page_table_lock);
1360 1357  
... ... @@ -1713,12 +1713,12 @@
1713 1713 unlock_page(new_page);
1714 1714 put_page(new_page); /* Free it */
1715 1715  
1716   - unlock_page(page);
  1716 + /* Retake the callers reference and putback on LRU */
  1717 + get_page(page);
1717 1718 putback_lru_page(page);
1718   -
1719   - count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1720   - isolated = 0;
1721   - goto out;
  1719 + mod_zone_page_state(page_zone(page),
  1720 + NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
  1721 + goto out_fail;
1722 1722 }
1723 1723  
1724 1724 /*
1725 1725  
... ... @@ -1735,9 +1735,9 @@
1735 1735 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1736 1736 entry = pmd_mkhuge(entry);
1737 1737  
1738   - page_add_new_anon_rmap(new_page, vma, haddr);
1739   -
  1738 + pmdp_clear_flush(vma, haddr, pmd);
1740 1739 set_pmd_at(mm, haddr, pmd, entry);
  1740 + page_add_new_anon_rmap(new_page, vma, haddr);
1741 1741 update_mmu_cache_pmd(vma, address, &entry);
1742 1742 page_remove_rmap(page);
1743 1743 /*
... ... @@ -1756,7 +1756,6 @@
1756 1756 count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
1757 1757 count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
1758 1758  
1759   -out:
1760 1759 mod_zone_page_state(page_zone(page),
1761 1760 NR_ISOLATED_ANON + page_lru,
1762 1761 -HPAGE_PMD_NR);
... ... @@ -1765,6 +1764,10 @@
1765 1764 out_fail:
1766 1765 count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
1767 1766 out_dropref:
  1767 + entry = pmd_mknonnuma(entry);
  1768 + set_pmd_at(mm, haddr, pmd, entry);
  1769 + update_mmu_cache_pmd(vma, address, &entry);
  1770 +
1768 1771 unlock_page(page);
1769 1772 put_page(page);
1770 1773 return 0;