Commit 4146d2d673e8d6abf9b30a5b5dd8cd95f29632eb

Authored by Hugh Dickins
Committed by Linus Torvalds
Parent: c8d6553b95

ksm: make !merge_across_nodes migration safe

The new KSM NUMA merge_across_nodes knob introduces a problem when it's
set to the non-default 0: if a KSM page is migrated to a different NUMA
node, how do we migrate its stable node to the right tree?  And what if
that collides with an existing stable node?

ksm_migrate_page() can do no more than it's already doing, updating
stable_node->kpfn: the stable tree itself cannot be manipulated without
holding ksm_thread_mutex.  So accept that a stable tree may temporarily
indicate a page belonging to the wrong NUMA node; leave updating until the
next pass of ksmd, and just be careful not to merge other pages onto a
misplaced page.  Note the nid of the holding tree in stable_node, and
recognize that it will not always match the nid of kpfn.
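
For reference, a minimal sketch of what the migration hook is limited to
under this scheme; this is simplified from the real ksm_migrate_page()
(which also carries VM_BUG_ON sanity checks), not a verbatim copy:

        void ksm_migrate_page(struct page *newpage, struct page *oldpage)
        {
                struct stable_node *stable_node;

                stable_node = page_stable_node(newpage);
                if (stable_node) {
                        /*
                         * Point the stable_node at the new pfn: its nid
                         * may now disagree with the tree it is linked in,
                         * until ksmd notices on a later pass.
                         */
                        stable_node->kpfn = page_to_pfn(newpage);
                }
        }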

A misplaced KSM page is discovered either when ksm_do_scan() next comes
around to one of its rmap_items (we now have to go to cmp_and_merge_page()
even on pages in a stable tree), or when stable_tree_search() arrives at a
matching node for another page and finds that node's page to be misplaced.
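
In both discovery paths the misplacement test is the same comparison,
visible in the hunks below.  Written as a hypothetical helper (the patch
itself open-codes these two lines), it would read:

        /*
         * NUMA(x) is (x) under CONFIG_NUMA and 0 otherwise; get_kpfn_nid()
         * likewise collapses to 0 when merging across nodes, so this can
         * only report a misplacement when merge_across_nodes is 0.
         */
        static bool stable_node_misplaced(struct stable_node *stable_node)
        {
                return get_kpfn_nid(stable_node->kpfn) !=
                       NUMA(stable_node->nid);
        }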

In each case, move the misplaced stable_node to a list of migrate_nodes
(and use the address of migrate_nodes as magic by which to identify them):
we don't need them in a tree.  If stable_tree_search() finds no match for
a page, but it's currently exiled to this list, then slot its stable_node
right there into the tree, bringing all of its mappings with it; otherwise
they get migrated one by one to the original page of the colliding node.
stable_tree_search() is now modelled more like stable_tree_insert(), in
order to handle these insertions of migrated nodes.
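
The magic works because of the union shown in the first hunk below: list
overlays the rb_node, and head overlays its parent pointer, which can never
legitimately hold the address of migrate_nodes while the node is linked in
a tree.  As hypothetical helpers for the two open-coded operations:

        static void stable_node_exile(struct stable_node *stable_node)
        {
                /* head overlays the rb parent: doubles as the list marker */
                stable_node->head = &migrate_nodes;
                list_add(&stable_node->list, stable_node->head);
        }

        static bool stable_node_exiled(struct stable_node *stable_node)
        {
                return stable_node->head == &migrate_nodes;
        }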

remove_node_from_stable_tree(), remove_all_stable_nodes() and
ksm_check_stable_tree() have to handle the migrate_nodes list as well as
the stable tree itself.  Less obviously, we do need to prune the list of
stale entries from time to time (scan_get_next_rmap_item() does it once
each full scan): whereas stale nodes in the stable tree get naturally
pruned as searches try to brush past them, these migrate_nodes may get
forgotten and accumulate.
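
Why does merely looking up each page prune the list?  get_ksm_page()
already unlinks a stable_node whose page has gone stale, via
remove_node_from_stable_tree(), which after this patch knows how to unlink
from the migrate_nodes list too.  A sketch of the once-per-scan loop (the
same code appears in the scan_get_next_rmap_item() hunk below), annotated
with the side effect it relies on:

        list_for_each_safe(this, next, &migrate_nodes) {
                stable_node = list_entry(this, struct stable_node, list);
                /*
                 * A stale node is removed and freed inside
                 * get_ksm_page(), which returns NULL for it.
                 */
                page = get_ksm_page(stable_node, false);
                if (page)       /* still live: left for cmp_and_merge_page() */
                        put_page(page);
                cond_resched();
        }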

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Petr Holasek <pholasek@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Izik Eidus <izik.eidus@ravellosystems.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 134 additions and 30 deletions

--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -122,13 +122,25 @@
 /**
  * struct stable_node - node of the stable rbtree
  * @node: rb node of this ksm page in the stable tree
+ * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
+ * @list: linked into migrate_nodes, pending placement in the proper node tree
  * @hlist: hlist head of rmap_items using this ksm page
- * @kpfn: page frame number of this ksm page
+ * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
+ * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
  */
 struct stable_node {
-        struct rb_node node;
+        union {
+                struct rb_node node;    /* when node of stable tree */
+                struct {                /* when listed for migration */
+                        struct list_head *head;
+                        struct list_head list;
+                };
+        };
         struct hlist_head hlist;
         unsigned long kpfn;
+#ifdef CONFIG_NUMA
+        int nid;
+#endif
 };

 /**
@@ -169,6 +181,9 @@
 static struct rb_root root_unstable_tree[MAX_NUMNODES];
 static struct rb_root root_stable_tree[MAX_NUMNODES];

+/* Recently migrated nodes of stable tree, pending proper placement */
+static LIST_HEAD(migrate_nodes);
+
 #define MM_SLOTS_HASH_BITS 10
 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

@@ -311,11 +326,6 @@
         hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm);
 }

-static inline int in_stable_tree(struct rmap_item *rmap_item)
-{
-        return rmap_item->address & STABLE_FLAG;
-}
-
 /*
  * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
  * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -476,7 +486,6 @@
 {
         struct rmap_item *rmap_item;
         struct hlist_node *hlist;
-        int nid;

         hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
                 if (rmap_item->hlist.next)
@@ -488,8 +497,11 @@
                 cond_resched();
         }

-        nid = get_kpfn_nid(stable_node->kpfn);
-        rb_erase(&stable_node->node, &root_stable_tree[nid]);
+        if (stable_node->head == &migrate_nodes)
+                list_del(&stable_node->list);
+        else
+                rb_erase(&stable_node->node,
+                         &root_stable_tree[NUMA(stable_node->nid)]);
         free_stable_node(stable_node);
 }

@@ -712,6 +724,7 @@
 static int remove_all_stable_nodes(void)
 {
         struct stable_node *stable_node;
+        struct list_head *this, *next;
         int nid;
         int err = 0;

@@ -726,6 +739,12 @@
                         cond_resched();
                 }
         }
+        list_for_each_safe(this, next, &migrate_nodes) {
+                stable_node = list_entry(this, struct stable_node, list);
+                if (remove_stable_node(stable_node))
+                        err = -EBUSY;
+                cond_resched();
+        }
         return err;
 }

@@ -1113,25 +1132,30 @@
  */
 static struct page *stable_tree_search(struct page *page)
 {
-        struct rb_node *node;
-        struct stable_node *stable_node;
         int nid;
+        struct rb_node **new;
+        struct rb_node *parent;
+        struct stable_node *stable_node;
+        struct stable_node *page_node;

-        stable_node = page_stable_node(page);
-        if (stable_node) {              /* ksm page forked */
+        page_node = page_stable_node(page);
+        if (page_node && page_node->head != &migrate_nodes) {
+                /* ksm page forked */
                 get_page(page);
                 return page;
         }

         nid = get_kpfn_nid(page_to_pfn(page));
-        node = root_stable_tree[nid].rb_node;
+again:
+        new = &root_stable_tree[nid].rb_node;
+        parent = NULL;

-        while (node) {
+        while (*new) {
                 struct page *tree_page;
                 int ret;

                 cond_resched();
-                stable_node = rb_entry(node, struct stable_node, node);
+                stable_node = rb_entry(*new, struct stable_node, node);
                 tree_page = get_ksm_page(stable_node, false);
                 if (!tree_page)
                         return NULL;
@@ -1139,10 +1163,11 @@
                 ret = memcmp_pages(page, tree_page);
                 put_page(tree_page);

+                parent = *new;
                 if (ret < 0)
-                        node = node->rb_left;
+                        new = &parent->rb_left;
                 else if (ret > 0)
-                        node = node->rb_right;
+                        new = &parent->rb_right;
                 else {
                         /*
                          * Lock and unlock the stable_node's page (which
@@ -1152,13 +1177,49 @@
                          * than kpage, but that involves more changes.
                          */
                         tree_page = get_ksm_page(stable_node, true);
-                        if (tree_page)
+                        if (tree_page) {
                                 unlock_page(tree_page);
-                        return tree_page;
+                                if (get_kpfn_nid(stable_node->kpfn) !=
+                                                NUMA(stable_node->nid)) {
+                                        put_page(tree_page);
+                                        goto replace;
+                                }
+                                return tree_page;
+                        }
+                        /*
+                         * There is now a place for page_node, but the tree may
+                         * have been rebalanced, so re-evaluate parent and new.
+                         */
+                        if (page_node)
+                                goto again;
+                        return NULL;
                 }
         }

-        return NULL;
+        if (!page_node)
+                return NULL;
+
+        list_del(&page_node->list);
+        DO_NUMA(page_node->nid = nid);
+        rb_link_node(&page_node->node, parent, new);
+        rb_insert_color(&page_node->node, &root_stable_tree[nid]);
+        get_page(page);
+        return page;
+
+replace:
+        if (page_node) {
+                list_del(&page_node->list);
+                DO_NUMA(page_node->nid = nid);
+                rb_replace_node(&stable_node->node,
+                                &page_node->node, &root_stable_tree[nid]);
+                get_page(page);
+        } else {
+                rb_erase(&stable_node->node, &root_stable_tree[nid]);
+                page = NULL;
+        }
+        stable_node->head = &migrate_nodes;
+        list_add(&stable_node->list, stable_node->head);
+        return page;
 }

 /*
@@ -1215,6 +1276,7 @@
         INIT_HLIST_HEAD(&stable_node->hlist);
         stable_node->kpfn = kpfn;
         set_page_stable_node(kpage, stable_node);
+        DO_NUMA(stable_node->nid = nid);
         rb_link_node(&stable_node->node, parent, new);
         rb_insert_color(&stable_node->node, &root_stable_tree[nid]);

@@ -1311,11 +1373,6 @@
 static void stable_tree_append(struct rmap_item *rmap_item,
                                struct stable_node *stable_node)
 {
-        /*
-         * Usually rmap_item->nid is already set correctly,
-         * but it may be wrong after switching merge_across_nodes.
-         */
-        DO_NUMA(rmap_item->nid = get_kpfn_nid(stable_node->kpfn));
         rmap_item->head = stable_node;
         rmap_item->address |= STABLE_FLAG;
         hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
@@ -1344,10 +1401,29 @@
         unsigned int checksum;
         int err;

-        remove_rmap_item_from_tree(rmap_item);
+        stable_node = page_stable_node(page);
+        if (stable_node) {
+                if (stable_node->head != &migrate_nodes &&
+                    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
+                        rb_erase(&stable_node->node,
+                                 &root_stable_tree[NUMA(stable_node->nid)]);
+                        stable_node->head = &migrate_nodes;
+                        list_add(&stable_node->list, stable_node->head);
+                }
+                if (stable_node->head != &migrate_nodes &&
+                    rmap_item->head == stable_node)
+                        return;
+        }

         /* We first start with searching the page inside the stable tree */
         kpage = stable_tree_search(page);
+        if (kpage == page && rmap_item->head == stable_node) {
+                put_page(kpage);
+                return;
+        }
+
+        remove_rmap_item_from_tree(rmap_item);
+
         if (kpage) {
                 err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
                 if (!err) {
@@ -1464,6 +1540,27 @@
          */
         lru_add_drain_all();

+        /*
+         * Whereas stale stable_nodes on the stable_tree itself
+         * get pruned in the regular course of stable_tree_search(),
+         * those moved out to the migrate_nodes list can accumulate:
+         * so prune them once before each full scan.
+         */
+        if (!ksm_merge_across_nodes) {
+                struct stable_node *stable_node;
+                struct list_head *this, *next;
+                struct page *page;
+
+                list_for_each_safe(this, next, &migrate_nodes) {
+                        stable_node = list_entry(this,
+                                        struct stable_node, list);
+                        page = get_ksm_page(stable_node, false);
+                        if (page)
+                                put_page(page);
+                        cond_resched();
+                }
+        }
+
         for (nid = 0; nid < nr_node_ids; nid++)
                 root_unstable_tree[nid] = RB_ROOT;

@@ -1586,8 +1683,7 @@
                 rmap_item = scan_get_next_rmap_item(&page);
                 if (!rmap_item)
                         return;
-                if (!PageKsm(page) || !in_stable_tree(rmap_item))
-                        cmp_and_merge_page(page, rmap_item);
+                cmp_and_merge_page(page, rmap_item);
                 put_page(page);
         }
 }
@@ -1964,6 +2060,7 @@
                                   unsigned long end_pfn)
 {
         struct stable_node *stable_node;
+        struct list_head *this, *next;
         struct rb_node *node;
         int nid;

@@ -1983,6 +2080,13 @@
                                 node = rb_next(node);
                         cond_resched();
                 }
+        }
+        list_for_each_safe(this, next, &migrate_nodes) {
+                stable_node = list_entry(this, struct stable_node, list);
+                if (stable_node->kpfn >= start_pfn &&
+                    stable_node->kpfn < end_pfn)
+                        remove_node_from_stable_tree(stable_node);
+                cond_resched();
         }
 }
