Commit a4912123b688e057084e6557cef8924f7ae5bbde

Authored by Theodore Ts'o
1 parent 2dc6b0d48c

ext4: New inode/block allocation algorithms for flex_bg filesystems

The find_group_flex() inode allocator is now only used if the
filesystem is mounted using the "oldalloc" mount option.  It is
replaced with the original Orlov allocator that has been updated for
flex_bg filesystems (it should behave the same way if flex_bg is
disabled).  The inode allocator now functions by taking into account
each flex_bg group, instead of each block group, when deciding whether
or not it's time to allocate a new directory into a fresh flex_bg.

The block allocator has also been changed so that the first block
group in each flex_bg is preferred for use for storing directory
blocks.  This keeps directory blocks close together, which is good for
speeding up e2fsck since large directories are more likely to look
like this:

debugfs:  stat /home/tytso/Maildir/cur
Inode: 1844562   Type: directory    Mode:  0700   Flags: 0x81000
Generation: 1132745781    Version: 0x00000000:0000ad71
User: 15806   Group: 15806   Size: 1060864
File ACL: 0    Directory ACL: 0
Links: 2   Blockcount: 2072
Fragment:  Address: 0    Number: 0    Size: 0
 ctime: 0x499c0ff4:164961f4 -- Wed Feb 18 08:41:08 2009
 atime: 0x499c0ff4:00000000 -- Wed Feb 18 08:41:08 2009
 mtime: 0x49957f51:00000000 -- Fri Feb 13 09:10:25 2009
crtime: 0x499c0f57:00d51440 -- Wed Feb 18 08:38:31 2009
Size of extra inode fields: 28
BLOCKS:
(0):7348651, (1-258):7348654-7348911
TOTAL: 259

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Showing 6 changed files with 216 additions and 58 deletions Side-by-side Diff

... ... @@ -828,6 +828,12 @@
828 828 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
829 829  
830 830 /*
  831 + * Minimum number of groups in a flexgroup before we separate out
  832 + * directories into the first block group of a flexgroup
  833 + */
  834 +#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
  835 +
  836 +/*
831 837 * Structure of a directory entry
832 838 */
833 839 #define EXT4_NAME_LEN 255
... ... @@ -122,6 +122,9 @@
122 122 struct list_head i_prealloc_list;
123 123 spinlock_t i_prealloc_lock;
124 124  
  125 + /* ialloc */
  126 + ext4_group_t i_last_alloc_group;
  127 +
125 128 /* allocation reservation info for delalloc */
126 129 unsigned int i_reserved_data_blocks;
127 130 unsigned int i_reserved_meta_blocks;
... ... @@ -152,6 +152,8 @@
152 152 ext4_fsblk_t bg_start;
153 153 ext4_fsblk_t last_block;
154 154 ext4_grpblk_t colour;
  155 + ext4_group_t block_group;
  156 + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
155 157 int depth;
156 158  
157 159 if (path) {
158 160  
... ... @@ -170,9 +172,30 @@
170 172 }
171 173  
172 174 /* OK. use inode's group */
173   - bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
  175 + block_group = ei->i_block_group;
  176 + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
  177 + /*
  178 + * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
  179 + * block groups per flexgroup, reserve the first block
  180 + * group for directories and special files. Regular
  181 + * files will start at the second block group. This
  182 + * tends to speed up directory access and improves
  183 + * fsck times.
  184 + */
  185 + block_group &= ~(flex_size-1);
  186 + if (S_ISREG(inode->i_mode))
  187 + block_group++;
  188 + }
  189 + bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
174 190 le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
175 191 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
  192 +
  193 + /*
  194 + * If we are doing delayed allocation, we don't need to take
  195 + * colour into account.
  196 + */
  197 + if (test_opt(inode->i_sb, DELALLOC))
  198 + return bg_start;
176 199  
177 200 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
178 201 colour = (current->pid % 16) *
... ... @@ -410,7 +410,44 @@
410 410 return 0;
411 411 }
412 412  
  413 +struct orlov_stats {
  414 + __u32 free_inodes;
  415 + __u32 free_blocks;
  416 + __u32 used_dirs;
  417 +};
  418 +
413 419 /*
  420 + * Helper function for Orlov's allocator; returns critical information
  421 + * for a particular block group or flex_bg. If flex_size is 1, then g
  422 + * is a block group number; otherwise it is flex_bg number.
  423 + */
  424 +void get_orlov_stats(struct super_block *sb, ext4_group_t g,
  425 + int flex_size, struct orlov_stats *stats)
  426 +{
  427 + struct ext4_group_desc *desc;
  428 + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
  429 + int i;
  430 +
  431 + stats->free_inodes = 0;
  432 + stats->free_blocks = 0;
  433 + stats->used_dirs = 0;
  434 +
  435 + g *= flex_size;
  436 +
  437 + for (i = 0; i < flex_size; i++) {
  438 + if (g >= ngroups)
  439 + break;
  440 + desc = ext4_get_group_desc(sb, g++, NULL);
  441 + if (!desc)
  442 + continue;
  443 +
  444 + stats->free_inodes += ext4_free_inodes_count(sb, desc);
  445 + stats->free_blocks += ext4_free_blks_count(sb, desc);
  446 + stats->used_dirs += ext4_used_dirs_count(sb, desc);
  447 + }
  448 +}
  449 +
  450 +/*
414 451 * Orlov's allocator for directories.
415 452 *
416 453 * We always try to spread first-level directories.
417 454  
418 455  
419 456  
420 457  
421 458  
422 459  
423 460  
424 461  
425 462  
... ... @@ -425,36 +462,35 @@
425 462 * it has too many directories already (max_dirs) or
426 463 * it has too few free inodes left (min_inodes) or
427 464 * it has too few free blocks left (min_blocks) or
428   - * it's already running too large debt (max_debt).
429 465 * Parent's group is preferred, if it doesn't satisfy these
430 466 * conditions we search cyclically through the rest. If none
431 467 * of the groups look good we just look for a group with more
432 468 * free inodes than average (starting at parent's group).
433   - *
434   - * Debt is incremented each time we allocate a directory and decremented
435   - * when we allocate an inode, within 0--255.
436 469 */
437 470  
438   -#define INODE_COST 64
439   -#define BLOCK_COST 256
440   -
441 471 static int find_group_orlov(struct super_block *sb, struct inode *parent,
442   - ext4_group_t *group)
  472 + ext4_group_t *group, int mode)
443 473 {
444 474 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
445 475 struct ext4_sb_info *sbi = EXT4_SB(sb);
446   - struct ext4_super_block *es = sbi->s_es;
447 476 ext4_group_t ngroups = sbi->s_groups_count;
448 477 int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
449 478 unsigned int freei, avefreei;
450 479 ext4_fsblk_t freeb, avefreeb;
451   - ext4_fsblk_t blocks_per_dir;
452 480 unsigned int ndirs;
453   - int max_debt, max_dirs, min_inodes;
  481 + int max_dirs, min_inodes;
454 482 ext4_grpblk_t min_blocks;
455   - ext4_group_t i;
  483 + ext4_group_t i, grp, g;
456 484 struct ext4_group_desc *desc;
  485 + struct orlov_stats stats;
  486 + int flex_size = ext4_flex_bg_size(sbi);
457 487  
  488 + if (flex_size > 1) {
  489 + ngroups = (ngroups + flex_size - 1) >>
  490 + sbi->s_log_groups_per_flex;
  491 + parent_group >>= sbi->s_log_groups_per_flex;
  492 + }
  493 +
458 494 freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
459 495 avefreei = freei / ngroups;
460 496 freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
461 497  
462 498  
463 499  
464 500  
465 501  
466 502  
467 503  
468 504  
469 505  
470 506  
471 507  
472 508  
473 509  
474 510  
475 511  
476 512  
477 513  
478 514  
479 515  
... ... @@ -462,71 +498,97 @@
462 498 do_div(avefreeb, ngroups);
463 499 ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
464 500  
465   - if ((parent == sb->s_root->d_inode) ||
466   - (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
  501 + if (S_ISDIR(mode) &&
  502 + ((parent == sb->s_root->d_inode) ||
  503 + (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
467 504 int best_ndir = inodes_per_group;
468   - ext4_group_t grp;
469 505 int ret = -1;
470 506  
471 507 get_random_bytes(&grp, sizeof(grp));
472 508 parent_group = (unsigned)grp % ngroups;
473 509 for (i = 0; i < ngroups; i++) {
474   - grp = (parent_group + i) % ngroups;
475   - desc = ext4_get_group_desc(sb, grp, NULL);
476   - if (!desc || !ext4_free_inodes_count(sb, desc))
  510 + g = (parent_group + i) % ngroups;
  511 + get_orlov_stats(sb, g, flex_size, &stats);
  512 + if (!stats.free_inodes)
477 513 continue;
478   - if (ext4_used_dirs_count(sb, desc) >= best_ndir)
  514 + if (stats.used_dirs >= best_ndir)
479 515 continue;
480   - if (ext4_free_inodes_count(sb, desc) < avefreei)
  516 + if (stats.free_inodes < avefreei)
481 517 continue;
482   - if (ext4_free_blks_count(sb, desc) < avefreeb)
  518 + if (stats.free_blocks < avefreeb)
483 519 continue;
484   - *group = grp;
  520 + grp = g;
485 521 ret = 0;
486   - best_ndir = ext4_used_dirs_count(sb, desc);
  522 + best_ndir = stats.used_dirs;
487 523 }
488   - if (ret == 0)
489   - return ret;
  524 + if (ret)
  525 + goto fallback;
  526 + found_flex_bg:
  527 + if (flex_size == 1) {
  528 + *group = grp;
  529 + return 0;
  530 + }
  531 +
  532 + /*
  533 + * We pack inodes at the beginning of the flexgroup's
  534 + * inode tables. Block allocation decisions will do
  535 + * something similar, although regular files will
  536 + * start at 2nd block group of the flexgroup. See
  537 + * ext4_ext_find_goal() and ext4_find_near().
  538 + */
  539 + grp *= flex_size;
  540 + for (i = 0; i < flex_size; i++) {
  541 + if (grp+i >= sbi->s_groups_count)
  542 + break;
  543 + desc = ext4_get_group_desc(sb, grp+i, NULL);
  544 + if (desc && ext4_free_inodes_count(sb, desc)) {
  545 + *group = grp+i;
  546 + return 0;
  547 + }
  548 + }
490 549 goto fallback;
491 550 }
492 551  
493   - blocks_per_dir = ext4_blocks_count(es) - freeb;
494   - do_div(blocks_per_dir, ndirs);
495   -
496 552 max_dirs = ndirs / ngroups + inodes_per_group / 16;
497   - min_inodes = avefreei - inodes_per_group / 4;
498   - min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
  553 + min_inodes = avefreei - inodes_per_group*flex_size / 4;
  554 + if (min_inodes < 1)
  555 + min_inodes = 1;
  556 + min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
499 557  
500   - max_debt = EXT4_BLOCKS_PER_GROUP(sb);
501   - max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
502   - if (max_debt * INODE_COST > inodes_per_group)
503   - max_debt = inodes_per_group / INODE_COST;
504   - if (max_debt > 255)
505   - max_debt = 255;
506   - if (max_debt == 0)
507   - max_debt = 1;
  558 + /*
  559 + * Start looking in the flex group where we last allocated an
  560 + * inode for this parent directory
  561 + */
  562 + if (EXT4_I(parent)->i_last_alloc_group != ~0) {
  563 + parent_group = EXT4_I(parent)->i_last_alloc_group;
  564 + if (flex_size > 1)
  565 + parent_group >>= sbi->s_log_groups_per_flex;
  566 + }
508 567  
509 568 for (i = 0; i < ngroups; i++) {
510   - *group = (parent_group + i) % ngroups;
511   - desc = ext4_get_group_desc(sb, *group, NULL);
512   - if (!desc || !ext4_free_inodes_count(sb, desc))
  569 + grp = (parent_group + i) % ngroups;
  570 + get_orlov_stats(sb, grp, flex_size, &stats);
  571 + if (stats.used_dirs >= max_dirs)
513 572 continue;
514   - if (ext4_used_dirs_count(sb, desc) >= max_dirs)
  573 + if (stats.free_inodes < min_inodes)
515 574 continue;
516   - if (ext4_free_inodes_count(sb, desc) < min_inodes)
  575 + if (stats.free_blocks < min_blocks)
517 576 continue;
518   - if (ext4_free_blks_count(sb, desc) < min_blocks)
519   - continue;
520   - return 0;
  577 + goto found_flex_bg;
521 578 }
522 579  
523 580 fallback:
  581 + ngroups = sbi->s_groups_count;
  582 + avefreei = freei / ngroups;
  583 + parent_group = EXT4_I(parent)->i_block_group;
524 584 for (i = 0; i < ngroups; i++) {
525   - *group = (parent_group + i) % ngroups;
526   - desc = ext4_get_group_desc(sb, *group, NULL);
  585 + grp = (parent_group + i) % ngroups;
  586 + desc = ext4_get_group_desc(sb, grp, NULL);
527 587 if (desc && ext4_free_inodes_count(sb, desc) &&
528   - ext4_free_inodes_count(sb, desc) >= avefreei)
  588 + ext4_free_inodes_count(sb, desc) >= avefreei) {
  589 + *group = grp;
529 590 return 0;
  591 + }
530 592 }
531 593  
532 594 if (avefreei) {
533 595  
534 596  
... ... @@ -542,14 +604,53 @@
542 604 }
543 605  
544 606 static int find_group_other(struct super_block *sb, struct inode *parent,
545   - ext4_group_t *group)
  607 + ext4_group_t *group, int mode)
546 608 {
547 609 ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
548 610 ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
549 611 struct ext4_group_desc *desc;
550   - ext4_group_t i;
  612 + ext4_group_t i, last;
  613 + int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
551 614  
552 615 /*
  616 + * Try to place the inode in the same flex group as its
  617 + * parent. If we can't find space, use the Orlov algorithm to
  618 + * find another flex group, and store that information in the
  619 + * parent directory's inode information so that we use that flex
  620 + * group for future allocations.
  621 + */
  622 + if (flex_size > 1) {
  623 + int retry = 0;
  624 +
  625 + try_again:
  626 + parent_group &= ~(flex_size-1);
  627 + last = parent_group + flex_size;
  628 + if (last > ngroups)
  629 + last = ngroups;
  630 + for (i = parent_group; i < last; i++) {
  631 + desc = ext4_get_group_desc(sb, i, NULL);
  632 + if (desc && ext4_free_inodes_count(sb, desc)) {
  633 + *group = i;
  634 + return 0;
  635 + }
  636 + }
  637 + if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
  638 + retry = 1;
  639 + parent_group = EXT4_I(parent)->i_last_alloc_group;
  640 + goto try_again;
  641 + }
  642 + /*
  643 + * If this didn't work, use the Orlov search algorithm
  644 + * to find a new flex group; we pass in the mode to
  645 + * avoid the topdir algorithms.
  646 + */
  647 + *group = parent_group + flex_size;
  648 + if (*group > ngroups)
  649 + *group = 0;
  650 + return find_group_orlov(sb, parent, group, mode);
  651 + }
  652 +
  653 + /*
553 654 * Try to place the inode in its parent directory
554 655 */
555 656 *group = parent_group;
556 657  
... ... @@ -716,10 +817,10 @@
716 817 sbi = EXT4_SB(sb);
717 818 es = sbi->s_es;
718 819  
719   - if (sbi->s_log_groups_per_flex) {
  820 + if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
720 821 ret2 = find_group_flex(sb, dir, &group);
721 822 if (ret2 == -1) {
722   - ret2 = find_group_other(sb, dir, &group);
  823 + ret2 = find_group_other(sb, dir, &group, mode);
723 824 if (ret2 == 0 && once)
724 825 once = 0;
725 826 printk(KERN_NOTICE "ext4: find_group_flex "
726 827  
727 828  
... ... @@ -733,11 +834,12 @@
733 834 if (test_opt(sb, OLDALLOC))
734 835 ret2 = find_group_dir(sb, dir, &group);
735 836 else
736   - ret2 = find_group_orlov(sb, dir, &group);
  837 + ret2 = find_group_orlov(sb, dir, &group, mode);
737 838 } else
738   - ret2 = find_group_other(sb, dir, &group);
  839 + ret2 = find_group_other(sb, dir, &group, mode);
739 840  
740 841 got_group:
  842 + EXT4_I(dir)->i_last_alloc_group = group;
741 843 err = -ENOSPC;
742 844 if (ret2 == -1)
743 845 goto out;
... ... @@ -894,6 +996,7 @@
894 996 ei->i_file_acl = 0;
895 997 ei->i_dtime = 0;
896 998 ei->i_block_group = group;
  999 + ei->i_last_alloc_group = ~0;
897 1000  
898 1001 ext4_set_inode_flags(inode);
899 1002 if (IS_DIRSYNC(inode))
... ... @@ -459,6 +459,8 @@
459 459 ext4_fsblk_t bg_start;
460 460 ext4_fsblk_t last_block;
461 461 ext4_grpblk_t colour;
  462 + ext4_group_t block_group;
  463 + int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
462 464  
463 465 /* Try to find previous block */
464 466 for (p = ind->p - 1; p >= start; p--) {
465 467  
... ... @@ -474,9 +476,22 @@
474 476 * It is going to be referred to from the inode itself? OK, just put it
475 477 * into the same cylinder group then.
476 478 */
477   - bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
  479 + block_group = ei->i_block_group;
  480 + if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
  481 + block_group &= ~(flex_size-1);
  482 + if (S_ISREG(inode->i_mode))
  483 + block_group++;
  484 + }
  485 + bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
478 486 last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
479 487  
  488 + /*
  489 + * If we are doing delayed allocation, we don't need to take
  490 + * colour into account.
  491 + */
  492 + if (test_opt(inode->i_sb, DELALLOC))
  493 + return bg_start;
  494 +
480 495 if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
481 496 colour = (current->pid % 16) *
482 497 (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
... ... @@ -4287,6 +4302,7 @@
4287 4302 ei->i_disksize = inode->i_size;
4288 4303 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
4289 4304 ei->i_block_group = iloc.block_group;
  4305 + ei->i_last_alloc_group = ~0;
4290 4306 /*
4291 4307 * NOTE! The in-memory inode i_data array is in little-endian order
4292 4308 * even on big-endian machines: we do NOT byteswap the block numbers!
... ... @@ -1726,6 +1726,7 @@
1726 1726 {
1727 1727 unsigned free, fragments;
1728 1728 unsigned i, bits;
  1729 + int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1729 1730 struct ext4_group_desc *desc;
1730 1731 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1731 1732  
... ... @@ -1745,6 +1746,12 @@
1745 1746 /* If this group is uninitialized, skip it initially */
1746 1747 desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
1747 1748 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
  1749 + return 0;
  1750 +
  1751 + /* Avoid using the first bg of a flexgroup for data files */
  1752 + if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
  1753 + (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
  1754 + ((group % flex_size) == 0))
1748 1755 return 0;
1749 1756  
1750 1757 bits = ac->ac_sb->s_blocksize_bits + 1;