Commit 4146d2d673e8d6abf9b30a5b5dd8cd95f29632eb
Committed by
Linus Torvalds
1 parent
c8d6553b95
Exists in
master
and in
20 other branches
ksm: make !merge_across_nodes migration safe
The new KSM NUMA merge_across_nodes knob introduces a problem when it's set to non-default 0: if a KSM page is migrated to a different NUMA node, how do we migrate its stable node to the right tree? And what if that collides with an existing stable node? ksm_migrate_page() can do no more than it's already doing, updating stable_node->kpfn: the stable tree itself cannot be manipulated without holding ksm_thread_mutex. So accept that a stable tree may temporarily indicate a page belonging to the wrong NUMA node, leave updating until the next pass of ksmd, just be careful not to merge other pages on to a misplaced page. Note nid of holding tree in stable_node, and recognize that it will not always match nid of kpfn. A misplaced KSM page is discovered, either when ksm_do_scan() next comes around to one of its rmap_items (we now have to go to cmp_and_merge_page even on pages in a stable tree), or when stable_tree_search() arrives at a matching node for another page, and this node page is found misplaced. In each case, move the misplaced stable_node to a list of migrate_nodes (and use the address of migrate_nodes as magic by which to identify them): we don't need them in a tree. If stable_tree_search() finds no match for a page, but it's currently exiled to this list, then slot its stable_node right there into the tree, bringing all of its mappings with it; otherwise they get migrated one by one to the original page of the colliding node. stable_tree_search() is now modelled more like stable_tree_insert(), in order to handle these insertions of migrated nodes. remove_node_from_stable_tree(), remove_all_stable_nodes() and ksm_check_stable_tree() have to handle the migrate_nodes list as well as the stable tree itself. 
Less obviously, we do need to prune the list of stale entries from time to time (scan_get_next_rmap_item() does it once each full scan): whereas stale nodes in the stable tree get naturally pruned as searches try to brush past them, these migrate_nodes may get forgotten and accumulate. Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Petr Holasek <pholasek@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Izik Eidus <izik.eidus@ravellosystems.com> Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 134 additions and 30 deletions Side-by-side Diff
mm/ksm.c
... | ... | @@ -122,13 +122,25 @@ |
122 | 122 | /** |
123 | 123 | * struct stable_node - node of the stable rbtree |
124 | 124 | * @node: rb node of this ksm page in the stable tree |
125 | + * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list | |
126 | + * @list: linked into migrate_nodes, pending placement in the proper node tree | |
125 | 127 | * @hlist: hlist head of rmap_items using this ksm page |
126 | - * @kpfn: page frame number of this ksm page | |
128 | + * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid) | |
129 | + * @nid: NUMA node id of stable tree in which linked (may not match kpfn) | |
127 | 130 | */ |
128 | 131 | struct stable_node { |
129 | - struct rb_node node; | |
132 | + union { | |
133 | + struct rb_node node; /* when node of stable tree */ | |
134 | + struct { /* when listed for migration */ | |
135 | + struct list_head *head; | |
136 | + struct list_head list; | |
137 | + }; | |
138 | + }; | |
130 | 139 | struct hlist_head hlist; |
131 | 140 | unsigned long kpfn; |
141 | +#ifdef CONFIG_NUMA | |
142 | + int nid; | |
143 | +#endif | |
132 | 144 | }; |
133 | 145 | |
134 | 146 | /** |
... | ... | @@ -169,6 +181,9 @@ |
169 | 181 | static struct rb_root root_unstable_tree[MAX_NUMNODES]; |
170 | 182 | static struct rb_root root_stable_tree[MAX_NUMNODES]; |
171 | 183 | |
184 | +/* Recently migrated nodes of stable tree, pending proper placement */ | |
185 | +static LIST_HEAD(migrate_nodes); | |
186 | + | |
172 | 187 | #define MM_SLOTS_HASH_BITS 10 |
173 | 188 | static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); |
174 | 189 | |
... | ... | @@ -311,11 +326,6 @@ |
311 | 326 | hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); |
312 | 327 | } |
313 | 328 | |
314 | -static inline int in_stable_tree(struct rmap_item *rmap_item) | |
315 | -{ | |
316 | - return rmap_item->address & STABLE_FLAG; | |
317 | -} | |
318 | - | |
319 | 329 | /* |
320 | 330 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
321 | 331 | * page tables after it has passed through ksm_exit() - which, if necessary, |
... | ... | @@ -476,7 +486,6 @@ |
476 | 486 | { |
477 | 487 | struct rmap_item *rmap_item; |
478 | 488 | struct hlist_node *hlist; |
479 | - int nid; | |
480 | 489 | |
481 | 490 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
482 | 491 | if (rmap_item->hlist.next) |
... | ... | @@ -488,8 +497,11 @@ |
488 | 497 | cond_resched(); |
489 | 498 | } |
490 | 499 | |
491 | - nid = get_kpfn_nid(stable_node->kpfn); | |
492 | - rb_erase(&stable_node->node, &root_stable_tree[nid]); | |
500 | + if (stable_node->head == &migrate_nodes) | |
501 | + list_del(&stable_node->list); | |
502 | + else | |
503 | + rb_erase(&stable_node->node, | |
504 | + &root_stable_tree[NUMA(stable_node->nid)]); | |
493 | 505 | free_stable_node(stable_node); |
494 | 506 | } |
495 | 507 | |
... | ... | @@ -712,6 +724,7 @@ |
712 | 724 | static int remove_all_stable_nodes(void) |
713 | 725 | { |
714 | 726 | struct stable_node *stable_node; |
727 | + struct list_head *this, *next; | |
715 | 728 | int nid; |
716 | 729 | int err = 0; |
717 | 730 | |
... | ... | @@ -726,6 +739,12 @@ |
726 | 739 | cond_resched(); |
727 | 740 | } |
728 | 741 | } |
742 | + list_for_each_safe(this, next, &migrate_nodes) { | |
743 | + stable_node = list_entry(this, struct stable_node, list); | |
744 | + if (remove_stable_node(stable_node)) | |
745 | + err = -EBUSY; | |
746 | + cond_resched(); | |
747 | + } | |
729 | 748 | return err; |
730 | 749 | } |
731 | 750 | |
732 | 751 | |
733 | 752 | |
734 | 753 | |
735 | 754 | |
736 | 755 | |
... | ... | @@ -1113,25 +1132,30 @@ |
1113 | 1132 | */ |
1114 | 1133 | static struct page *stable_tree_search(struct page *page) |
1115 | 1134 | { |
1116 | - struct rb_node *node; | |
1117 | - struct stable_node *stable_node; | |
1118 | 1135 | int nid; |
1136 | + struct rb_node **new; | |
1137 | + struct rb_node *parent; | |
1138 | + struct stable_node *stable_node; | |
1139 | + struct stable_node *page_node; | |
1119 | 1140 | |
1120 | - stable_node = page_stable_node(page); | |
1121 | - if (stable_node) { /* ksm page forked */ | |
1141 | + page_node = page_stable_node(page); | |
1142 | + if (page_node && page_node->head != &migrate_nodes) { | |
1143 | + /* ksm page forked */ | |
1122 | 1144 | get_page(page); |
1123 | 1145 | return page; |
1124 | 1146 | } |
1125 | 1147 | |
1126 | 1148 | nid = get_kpfn_nid(page_to_pfn(page)); |
1127 | - node = root_stable_tree[nid].rb_node; | |
1149 | +again: | |
1150 | + new = &root_stable_tree[nid].rb_node; | |
1151 | + parent = NULL; | |
1128 | 1152 | |
1129 | - while (node) { | |
1153 | + while (*new) { | |
1130 | 1154 | struct page *tree_page; |
1131 | 1155 | int ret; |
1132 | 1156 | |
1133 | 1157 | cond_resched(); |
1134 | - stable_node = rb_entry(node, struct stable_node, node); | |
1158 | + stable_node = rb_entry(*new, struct stable_node, node); | |
1135 | 1159 | tree_page = get_ksm_page(stable_node, false); |
1136 | 1160 | if (!tree_page) |
1137 | 1161 | return NULL; |
1138 | 1162 | |
1139 | 1163 | |
... | ... | @@ -1139,10 +1163,11 @@ |
1139 | 1163 | ret = memcmp_pages(page, tree_page); |
1140 | 1164 | put_page(tree_page); |
1141 | 1165 | |
1166 | + parent = *new; | |
1142 | 1167 | if (ret < 0) |
1143 | - node = node->rb_left; | |
1168 | + new = &parent->rb_left; | |
1144 | 1169 | else if (ret > 0) |
1145 | - node = node->rb_right; | |
1170 | + new = &parent->rb_right; | |
1146 | 1171 | else { |
1147 | 1172 | /* |
1148 | 1173 | * Lock and unlock the stable_node's page (which |
1149 | 1174 | |
1150 | 1175 | |
... | ... | @@ -1152,13 +1177,49 @@ |
1152 | 1177 | * than kpage, but that involves more changes. |
1153 | 1178 | */ |
1154 | 1179 | tree_page = get_ksm_page(stable_node, true); |
1155 | - if (tree_page) | |
1180 | + if (tree_page) { | |
1156 | 1181 | unlock_page(tree_page); |
1157 | - return tree_page; | |
1182 | + if (get_kpfn_nid(stable_node->kpfn) != | |
1183 | + NUMA(stable_node->nid)) { | |
1184 | + put_page(tree_page); | |
1185 | + goto replace; | |
1186 | + } | |
1187 | + return tree_page; | |
1188 | + } | |
1189 | + /* | |
1190 | + * There is now a place for page_node, but the tree may | |
1191 | + * have been rebalanced, so re-evaluate parent and new. | |
1192 | + */ | |
1193 | + if (page_node) | |
1194 | + goto again; | |
1195 | + return NULL; | |
1158 | 1196 | } |
1159 | 1197 | } |
1160 | 1198 | |
1161 | - return NULL; | |
1199 | + if (!page_node) | |
1200 | + return NULL; | |
1201 | + | |
1202 | + list_del(&page_node->list); | |
1203 | + DO_NUMA(page_node->nid = nid); | |
1204 | + rb_link_node(&page_node->node, parent, new); | |
1205 | + rb_insert_color(&page_node->node, &root_stable_tree[nid]); | |
1206 | + get_page(page); | |
1207 | + return page; | |
1208 | + | |
1209 | +replace: | |
1210 | + if (page_node) { | |
1211 | + list_del(&page_node->list); | |
1212 | + DO_NUMA(page_node->nid = nid); | |
1213 | + rb_replace_node(&stable_node->node, | |
1214 | + &page_node->node, &root_stable_tree[nid]); | |
1215 | + get_page(page); | |
1216 | + } else { | |
1217 | + rb_erase(&stable_node->node, &root_stable_tree[nid]); | |
1218 | + page = NULL; | |
1219 | + } | |
1220 | + stable_node->head = &migrate_nodes; | |
1221 | + list_add(&stable_node->list, stable_node->head); | |
1222 | + return page; | |
1162 | 1223 | } |
1163 | 1224 | |
1164 | 1225 | /* |
... | ... | @@ -1215,6 +1276,7 @@ |
1215 | 1276 | INIT_HLIST_HEAD(&stable_node->hlist); |
1216 | 1277 | stable_node->kpfn = kpfn; |
1217 | 1278 | set_page_stable_node(kpage, stable_node); |
1279 | + DO_NUMA(stable_node->nid = nid); | |
1218 | 1280 | rb_link_node(&stable_node->node, parent, new); |
1219 | 1281 | rb_insert_color(&stable_node->node, &root_stable_tree[nid]); |
1220 | 1282 | |
... | ... | @@ -1311,11 +1373,6 @@ |
1311 | 1373 | static void stable_tree_append(struct rmap_item *rmap_item, |
1312 | 1374 | struct stable_node *stable_node) |
1313 | 1375 | { |
1314 | - /* | |
1315 | - * Usually rmap_item->nid is already set correctly, | |
1316 | - * but it may be wrong after switching merge_across_nodes. | |
1317 | - */ | |
1318 | - DO_NUMA(rmap_item->nid = get_kpfn_nid(stable_node->kpfn)); | |
1319 | 1376 | rmap_item->head = stable_node; |
1320 | 1377 | rmap_item->address |= STABLE_FLAG; |
1321 | 1378 | hlist_add_head(&rmap_item->hlist, &stable_node->hlist); |
1322 | 1379 | |
... | ... | @@ -1344,10 +1401,29 @@ |
1344 | 1401 | unsigned int checksum; |
1345 | 1402 | int err; |
1346 | 1403 | |
1347 | - remove_rmap_item_from_tree(rmap_item); | |
1404 | + stable_node = page_stable_node(page); | |
1405 | + if (stable_node) { | |
1406 | + if (stable_node->head != &migrate_nodes && | |
1407 | + get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) { | |
1408 | + rb_erase(&stable_node->node, | |
1409 | + &root_stable_tree[NUMA(stable_node->nid)]); | |
1410 | + stable_node->head = &migrate_nodes; | |
1411 | + list_add(&stable_node->list, stable_node->head); | |
1412 | + } | |
1413 | + if (stable_node->head != &migrate_nodes && | |
1414 | + rmap_item->head == stable_node) | |
1415 | + return; | |
1416 | + } | |
1348 | 1417 | |
1349 | 1418 | /* We first start with searching the page inside the stable tree */ |
1350 | 1419 | kpage = stable_tree_search(page); |
1420 | + if (kpage == page && rmap_item->head == stable_node) { | |
1421 | + put_page(kpage); | |
1422 | + return; | |
1423 | + } | |
1424 | + | |
1425 | + remove_rmap_item_from_tree(rmap_item); | |
1426 | + | |
1351 | 1427 | if (kpage) { |
1352 | 1428 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
1353 | 1429 | if (!err) { |
... | ... | @@ -1464,6 +1540,27 @@ |
1464 | 1540 | */ |
1465 | 1541 | lru_add_drain_all(); |
1466 | 1542 | |
1543 | + /* | |
1544 | + * Whereas stale stable_nodes on the stable_tree itself | |
1545 | + * get pruned in the regular course of stable_tree_search(), | |
1546 | + * those moved out to the migrate_nodes list can accumulate: | |
1547 | + * so prune them once before each full scan. | |
1548 | + */ | |
1549 | + if (!ksm_merge_across_nodes) { | |
1550 | + struct stable_node *stable_node; | |
1551 | + struct list_head *this, *next; | |
1552 | + struct page *page; | |
1553 | + | |
1554 | + list_for_each_safe(this, next, &migrate_nodes) { | |
1555 | + stable_node = list_entry(this, | |
1556 | + struct stable_node, list); | |
1557 | + page = get_ksm_page(stable_node, false); | |
1558 | + if (page) | |
1559 | + put_page(page); | |
1560 | + cond_resched(); | |
1561 | + } | |
1562 | + } | |
1563 | + | |
1467 | 1564 | for (nid = 0; nid < nr_node_ids; nid++) |
1468 | 1565 | root_unstable_tree[nid] = RB_ROOT; |
1469 | 1566 | |
... | ... | @@ -1586,8 +1683,7 @@ |
1586 | 1683 | rmap_item = scan_get_next_rmap_item(&page); |
1587 | 1684 | if (!rmap_item) |
1588 | 1685 | return; |
1589 | - if (!PageKsm(page) || !in_stable_tree(rmap_item)) | |
1590 | - cmp_and_merge_page(page, rmap_item); | |
1686 | + cmp_and_merge_page(page, rmap_item); | |
1591 | 1687 | put_page(page); |
1592 | 1688 | } |
1593 | 1689 | } |
... | ... | @@ -1964,6 +2060,7 @@ |
1964 | 2060 | unsigned long end_pfn) |
1965 | 2061 | { |
1966 | 2062 | struct stable_node *stable_node; |
2063 | + struct list_head *this, *next; | |
1967 | 2064 | struct rb_node *node; |
1968 | 2065 | int nid; |
1969 | 2066 | |
... | ... | @@ -1983,6 +2080,13 @@ |
1983 | 2080 | node = rb_next(node); |
1984 | 2081 | cond_resched(); |
1985 | 2082 | } |
2083 | + } | |
2084 | + list_for_each_safe(this, next, &migrate_nodes) { | |
2085 | + stable_node = list_entry(this, struct stable_node, list); | |
2086 | + if (stable_node->kpfn >= start_pfn && | |
2087 | + stable_node->kpfn < end_pfn) | |
2088 | + remove_node_from_stable_tree(stable_node); | |
2089 | + cond_resched(); | |
1986 | 2090 | } |
1987 | 2091 | } |
1988 | 2092 |