Commit e2bdb933ab8b7db71c318a4ddcf78a9fffd61ecb
Committed by: Linus Torvalds
Parent: 928da837ac
Exists in: master and 38 other branches
radix_tree: take radix_tree_path off stack
Down, down in the deepest depths of GFP_NOIO page reclaim, we have
shrink_page_list() calling __remove_mapping() calling
__delete_from_swap_cache() or __delete_from_page_cache().

You would not expect those to need much stack, but in fact they call
radix_tree_delete(): which declares a 192-byte radix_tree_path array on
its stack (to record the node,offsets it visits when descending, in case
it needs to ascend to update them).  And if any tag is still set [1],
that calls radix_tree_tag_clear(), which declares a further such
192-byte radix_tree_path array on the stack.  (At least we have
interrupts disabled here, so won't then be pushing registers too.)

That was probably a good choice when most users were 32-bit (array of
half the size), and adding fields to radix_tree_node would have bloated
it unnecessarily.  But nowadays many are 64-bit, and each
radix_tree_node contains a struct rcu_head, which is only used when
freeing; whereas the radix_tree_path info is only used for updating the
tree (deleting, clearing tags or setting tags if tagged) when a lock
must be held, of no interest when accessing the tree locklessly.

So add a parent pointer to the radix_tree_node, in union with the
rcu_head, and remove all uses of the radix_tree_path.  There would be
space in that union to save the offset when descending as before (we
can argue that a lock must already be held to exclude other users), but
recalculating it when ascending is both easy (a constant shift and a
constant mask) and uncommon, so it seems better just to do that.

Two little optimizations: no need to decrement height when descending,
adjusting shift is enough; and once radix_tree_tag_if_tagged() has set
tag on a node and its ancestors, it need not ascend from that node
again.

perf on the radix tree test harness reports radix_tree_insert() as 2%
slower (now having to set parent), but radix_tree_delete() 24% faster.
Surely that's an exaggeration from rtth's artificially low map shift 3,
but forcing it back to 6 still rates radix_tree_delete() 8% faster.

[1] Can a pagecache tag (dirty, writeback or towrite) actually still be
set at the time of radix_tree_delete()?  Perhaps not if the filesystem
is well-behaved.  But although I've not tracked any stack overflow down
to this cause, I have observed a curious case in which a dirty tag is
set and left set on tmpfs: page migration's migrate_page_copy() happens
to use __set_page_dirty_nobuffers() to set PageDirty on the newpage,
and that sets PAGECACHE_TAG_DIRTY as a side-effect - harmless to a
filesystem which doesn't use tags, except for this stack depth issue.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Nai Xia <nai.xia@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
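The shape of the rewrite is the same in radix_tree_tag_clear(),
radix_tree_range_tag_if_tagged() and radix_tree_delete(): descend by
driving shift down to zero, then ascend through the new parent pointers,
recomputing the offset from the index at each level.  As a minimal
sketch (not code from the patch; NULL checks, locking and the actual
per-node work are omitted, and the function name is invented):

/*
 * Minimal sketch of the descend/ascend pattern shared by the rewritten
 * functions in the diff below; names follow lib/radix-tree.c.
 */
static void walk_down_then_up(struct radix_tree_root *root, unsigned long index)
{
	struct radix_tree_node *node = NULL;	/* parent of 'slot' */
	struct radix_tree_node *slot = indirect_to_ptr(root->rnode);
	unsigned int shift = root->height * RADIX_TREE_MAP_SHIFT;
	int offset = 0;

	/* Descend: no height counter needed, shift alone drives the loop */
	while (shift) {
		shift -= RADIX_TREE_MAP_SHIFT;
		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
		node = slot;
		slot = slot->slots[offset];
	}

	/* Ascend: follow ->parent, recomputing offset by shift-and-mask */
	while (node) {
		/* ... update tags or slots at node->slots[offset] here ... */
		index >>= RADIX_TREE_MAP_SHIFT;
		offset = index & RADIX_TREE_MAP_MASK;
		node = node->parent;
	}
}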
1 file changed, 76 insertions(+), 78 deletions(-)

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -48,16 +48,14 @@
 struct radix_tree_node {
 	unsigned int height;	/* Height from the bottom */
 	unsigned int count;
-	struct rcu_head rcu_head;
+	union {
+		struct radix_tree_node *parent;	/* Used when ascending tree */
+		struct rcu_head rcu_head;	/* Used when freeing node */
+	};
 	void __rcu *slots[RADIX_TREE_MAP_SIZE];
 	unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
-struct radix_tree_path {
-	struct radix_tree_node *node;
-	int offset;
-};
-
 #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
 					  RADIX_TREE_MAP_SHIFT))
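For context, the 192-byte figure in the commit message follows directly
from the definitions above.  A worked example, assuming a common 64-bit
build with the default RADIX_TREE_MAP_SHIFT of 6 (the value the commit
message's rtth comparison is forced back to):

/*
 * Worked example, not part of the patch:
 *
 *   RADIX_TREE_INDEX_BITS = 8 * sizeof(unsigned long) = 64
 *   RADIX_TREE_MAX_PATH   = DIV_ROUND_UP(64, 6)        = 11
 *
 * struct radix_tree_path held a pointer plus an int, padded to 16 bytes
 * on 64-bit, so the on-stack array in radix_tree_delete() was
 *
 *   (RADIX_TREE_MAX_PATH + 1) * 16 = 12 * 16 = 192 bytes
 *
 * and radix_tree_tag_clear() could add a second such array on top of it.
 */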
@@ -256,6 +254,7 @@
 static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_node *node;
+	struct radix_tree_node *slot;
 	unsigned int height;
 	int tag;
 
@@ -274,18 +273,23 @@
 		if (!(node = radix_tree_node_alloc(root)))
 			return -ENOMEM;
 
-		/* Increase the height.  */
-		node->slots[0] = indirect_to_ptr(root->rnode);
-
 		/* Propagate the aggregated tag info into the new root */
 		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 			if (root_tag_get(root, tag))
				tag_set(node, tag, 0);
 		}
 
+		/* Increase the height.  */
 		newheight = root->height+1;
 		node->height = newheight;
 		node->count = 1;
+		node->parent = NULL;
+		slot = root->rnode;
+		if (newheight > 1) {
+			slot = indirect_to_ptr(slot);
+			slot->parent = node;
+		}
+		node->slots[0] = slot;
 		node = ptr_to_indirect(node);
 		rcu_assign_pointer(root->rnode, node);
 		root->height = newheight;
@@ -331,6 +335,7 @@
 			if (!(slot = radix_tree_node_alloc(root)))
 				return -ENOMEM;
 			slot->height = height;
+			slot->parent = node;
 			if (node) {
 				rcu_assign_pointer(node->slots[offset], slot);
 				node->count++;
@@ -504,47 +509,41 @@
 void *radix_tree_tag_clear(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag)
 {
-	/*
-	 * The radix tree path needs to be one longer than the maximum path
-	 * since the "list" is null terminated.
-	 */
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
+	struct radix_tree_node *node = NULL;
 	struct radix_tree_node *slot = NULL;
 	unsigned int height, shift;
+	int uninitialized_var(offset);
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
 		goto out;
 
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
+	shift = height * RADIX_TREE_MAP_SHIFT;
 	slot = indirect_to_ptr(root->rnode);
 
-	while (height > 0) {
-		int offset;
-
+	while (shift) {
 		if (slot == NULL)
 			goto out;
 
+		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp[1].offset = offset;
-		pathp[1].node = slot;
+		node = slot;
 		slot = slot->slots[offset];
-		pathp++;
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
 	}
 
 	if (slot == NULL)
 		goto out;
 
-	while (pathp->node) {
-		if (!tag_get(pathp->node, tag, pathp->offset))
+	while (node) {
+		if (!tag_get(node, tag, offset))
 			goto out;
-		tag_clear(pathp->node, tag, pathp->offset);
-		if (any_tag_set(pathp->node, tag))
+		tag_clear(node, tag, offset);
+		if (any_tag_set(node, tag))
 			goto out;
-		pathp--;
+
+		index >>= RADIX_TREE_MAP_SHIFT;
+		offset = index & RADIX_TREE_MAP_MASK;
+		node = node->parent;
 	}
 
 	/* clear the root's tag bit */
@@ -646,8 +645,7 @@
 		unsigned int iftag, unsigned int settag)
 {
 	unsigned int height = root->height;
-	struct radix_tree_path path[height];
-	struct radix_tree_path *pathp = path;
+	struct radix_tree_node *node = NULL;
 	struct radix_tree_node *slot;
 	unsigned int shift;
 	unsigned long tagged = 0;
@@ -671,14 +669,8 @@
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	slot = indirect_to_ptr(root->rnode);
 
-	/*
-	 * we fill the path from (root->height - 2) to 0, leaving the index at
-	 * (root->height - 1) as a terminator. Zero the node in the terminator
-	 * so that we can use this to end walk loops back up the path.
-	 */
-	path[height - 1].node = NULL;
-
 	for (;;) {
+		unsigned long upindex;
 		int offset;
 
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
@@ -686,12 +678,10 @@
 			goto next;
 		if (!tag_get(slot, iftag, offset))
 			goto next;
-		if (height > 1) {
+		if (shift) {
 			/* Go down one level */
-			height--;
 			shift -= RADIX_TREE_MAP_SHIFT;
-			path[height - 1].node = slot;
-			path[height - 1].offset = offset;
+			node = slot;
 			slot = slot->slots[offset];
 			continue;
 		}
@@ -701,15 +691,27 @@
 		tag_set(slot, settag, offset);
 
 		/* walk back up the path tagging interior nodes */
-		pathp = &path[0];
-		while (pathp->node) {
+		upindex = index;
+		while (node) {
+			upindex >>= RADIX_TREE_MAP_SHIFT;
+			offset = upindex & RADIX_TREE_MAP_MASK;
+
 			/* stop if we find a node with the tag already set */
-			if (tag_get(pathp->node, settag, pathp->offset))
+			if (tag_get(node, settag, offset))
 				break;
-			tag_set(pathp->node, settag, pathp->offset);
-			pathp++;
+			tag_set(node, settag, offset);
+			node = node->parent;
 		}
 
+		/*
+		 * Small optimization: now clear that node pointer.
+		 * Since all of this slot's ancestors now have the tag set
+		 * from setting it above, we have no further need to walk
+		 * back up the tree setting tags, until we update slot to
+		 * point to another radix_tree_node.
+		 */
+		node = NULL;
+
 next:
 		/* Go to next item at level determined by 'shift' */
 		index = ((index >> shift) + 1) << shift;
@@ -724,8 +726,7 @@
 			 * last_index is guaranteed to be in the tree, what
 			 * we do below cannot wander astray.
 			 */
-			slot = path[height - 1].node;
-			height++;
+			slot = slot->parent;
 			shift += RADIX_TREE_MAP_SHIFT;
 		}
 	}
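radix_tree_range_tag_if_tagged() is the gang-tagging primitive reworked
in the hunks above.  For orientation, the sketch below shows roughly how
writeback code of this era drives it, batching the retagging of dirty
pages as "towrite" under the mapping's tree lock; the helper name, the
batch size and the exact loop condition are recalled from
mm/page-writeback.c and should be treated as illustrative rather than
authoritative.

/*
 * Caller sketch, not from this patch: batched dirty -> towrite
 * retagging under mapping->tree_lock, roughly as mm/page-writeback.c
 * does it.  Names and the batch size are illustrative.
 */
#define TAG_BATCH 4096

static void tag_dirty_for_writeback(struct address_space *mapping,
				    pgoff_t start, pgoff_t end)
{
	unsigned long tagged;

	do {
		spin_lock_irq(&mapping->tree_lock);
		tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
				&start, end, TAG_BATCH,
				PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
		spin_unlock_irq(&mapping->tree_lock);
		cond_resched();
		/* 'start' wraps to 0 once the walk passes end == ~0UL */
	} while (tagged >= TAG_BATCH && start);
}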
@@ -1299,7 +1300,7 @@
 	/* try to shrink tree height */
 	while (root->height > 0) {
 		struct radix_tree_node *to_free = root->rnode;
-		void *newptr;
+		struct radix_tree_node *slot;
 
 		BUG_ON(!radix_tree_is_indirect_ptr(to_free));
 		to_free = indirect_to_ptr(to_free);
@@ -1320,10 +1321,12 @@
 		 * (to_free->slots[0]), it will be safe to dereference the new
 		 * one (root->rnode) as far as dependent read barriers go.
 		 */
-		newptr = to_free->slots[0];
-		if (root->height > 1)
-			newptr = ptr_to_indirect(newptr);
-		root->rnode = newptr;
+		slot = to_free->slots[0];
+		if (root->height > 1) {
+			slot->parent = NULL;
+			slot = ptr_to_indirect(slot);
+		}
+		root->rnode = slot;
 		root->height--;
 
 		/*
@@ -1363,16 +1366,12 @@
  */
 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
-	/*
-	 * The radix tree path needs to be one longer than the maximum path
-	 * since the "list" is null terminated.
-	 */
-	struct radix_tree_path path[RADIX_TREE_MAX_PATH + 1], *pathp = path;
+	struct radix_tree_node *node = NULL;
 	struct radix_tree_node *slot = NULL;
 	struct radix_tree_node *to_free;
 	unsigned int height, shift;
 	int tag;
-	int offset;
+	int uninitialized_var(offset);
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
@@ -1385,39 +1384,35 @@
 		goto out;
 	}
 	slot = indirect_to_ptr(slot);
+	shift = height * RADIX_TREE_MAP_SHIFT;
 
-	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
-	pathp->node = NULL;
-
 	do {
 		if (slot == NULL)
 			goto out;
 
-		pathp++;
+		shift -= RADIX_TREE_MAP_SHIFT;
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		pathp->offset = offset;
-		pathp->node = slot;
+		node = slot;
 		slot = slot->slots[offset];
-		shift -= RADIX_TREE_MAP_SHIFT;
-		height--;
-	} while (height > 0);
+	} while (shift);
 
 	if (slot == NULL)
 		goto out;
 
 	/*
-	 * Clear all tags associated with the just-deleted item
+	 * Clear all tags associated with the item to be deleted.
+	 * This way of doing it would be inefficient, but seldom is any set.
 	 */
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (tag_get(pathp->node, tag, pathp->offset))
+		if (tag_get(node, tag, offset))
 			radix_tree_tag_clear(root, index, tag);
 	}
 
 	to_free = NULL;
 	/* Now free the nodes we do not need anymore */
-	while (pathp->node) {
-		pathp->node->slots[pathp->offset] = NULL;
-		pathp->node->count--;
+	while (node) {
+		node->slots[offset] = NULL;
+		node->count--;
 		/*
 		 * Queue the node for deferred freeing after the
 		 * last reference to it disappears (set NULL, above).
@@ -1425,17 +1420,20 @@
 		if (to_free)
 			radix_tree_node_free(to_free);
 
-		if (pathp->node->count) {
-			if (pathp->node == indirect_to_ptr(root->rnode))
+		if (node->count) {
+			if (node == indirect_to_ptr(root->rnode))
 				radix_tree_shrink(root);
 			goto out;
 		}
 
 		/* Node with zero slots in use so free it */
-		to_free = pathp->node;
-		pathp--;
+		to_free = node;
 
+		index >>= RADIX_TREE_MAP_SHIFT;
+		offset = index & RADIX_TREE_MAP_MASK;
+		node = node->parent;
 	}
+
 	root_tag_clear_all(root);
 	root->height = 0;
 	root->rnode = NULL;
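The perf comparison in the commit message comes from rtth, the userspace
radix tree test harness, which exercises the tree through the same
public entry points touched above.  A minimal in-kernel user of that API
looks roughly like the sketch below; it is not from the patch, locking
and radix_tree_preload() are omitted, and the item, index and tag number
are arbitrary.

/*
 * Hedged usage sketch of the public API changed above; not part of the
 * patch.  The GFP mask, index and tag number are chosen arbitrarily.
 */
static RADIX_TREE(example_tree, GFP_ATOMIC);	/* declares and initializes a root */

static int radix_tree_example(void)
{
	static int item;		/* any non-NULL pointer will do */
	unsigned long index = 12345;

	if (radix_tree_insert(&example_tree, index, &item))
		return -ENOMEM;		/* fails with -EEXIST or -ENOMEM */

	radix_tree_tag_set(&example_tree, index, 0);
	radix_tree_tag_clear(&example_tree, index, 0);	/* ascends via ->parent now */

	/* returns the deleted item; emptied nodes are freed bottom-up */
	return radix_tree_delete(&example_tree, index) == &item ? 0 : -ENOENT;
}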