Commit a97b52022a73ec12e43f0b2c7d4bd1f40f89c81d

Authored by Linus Torvalds

Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix data corruption regression by reverting commit 6de9843dab3f
  ext4: Allow indirect-block file to grow the file size to max file size
  ext4: allow an active handle to be started when freezing
  ext4: sync the directory inode in ext4_sync_parent()
  ext4: init timer earlier to avoid a kernel panic in __save_error_info
  jbd2: fix potential memory leak on transaction commit
  ext4: fix a double free in ext4_register_li_request
  ext4: fix credits computing for indirect mapped files
  ext4: remove unnecessary [cm]time update of quota file
  jbd2: move bdget out of critical section

Showing 6 changed files Side-by-side Diff

... ... @@ -86,8 +86,8 @@
86 86  
87 87 #ifdef CONFIG_QUOTA
88 88 /* Amount of blocks needed for quota update - we know that the structure was
89   - * allocated so we need to update only inode+data */
90   -#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
  89 + * allocated so we need to update only data block */
  90 +#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
91 91 /* Amount of blocks needed for quota insert/delete - we do some block writes
92 92 * but inode, sb and group updates are done only once */
93 93 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
... ... @@ -125,9 +125,11 @@
125 125 * the parent directory's parent as well, and so on recursively, if
126 126 * they are also freshly created.
127 127 */
128   -static void ext4_sync_parent(struct inode *inode)
  128 +static int ext4_sync_parent(struct inode *inode)
129 129 {
  130 + struct writeback_control wbc;
130 131 struct dentry *dentry = NULL;
  132 + int ret = 0;
131 133  
132 134 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
133 135 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
134 136  
... ... @@ -136,8 +138,17 @@
136 138 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
137 139 break;
138 140 inode = dentry->d_parent->d_inode;
139   - sync_mapping_buffers(inode->i_mapping);
  141 + ret = sync_mapping_buffers(inode->i_mapping);
  142 + if (ret)
  143 + break;
  144 + memset(&wbc, 0, sizeof(wbc));
  145 + wbc.sync_mode = WB_SYNC_ALL;
  146 + wbc.nr_to_write = 0; /* only write out the inode */
  147 + ret = sync_inode(inode, &wbc);
  148 + if (ret)
  149 + break;
140 150 }
  151 + return ret;
141 152 }
142 153  
143 154 /*
... ... @@ -176,7 +187,7 @@
176 187 if (!journal) {
177 188 ret = generic_file_fsync(file, datasync);
178 189 if (!ret && !list_empty(&inode->i_dentry))
179   - ext4_sync_parent(inode);
  190 + ret = ext4_sync_parent(inode);
180 191 goto out;
181 192 }
182 193  
... ... @@ -2502,6 +2502,7 @@
2502 2502 * for partial write.
2503 2503 */
2504 2504 set_buffer_new(bh);
  2505 + set_buffer_mapped(bh);
2505 2506 }
2506 2507 return 0;
2507 2508 }
... ... @@ -4429,8 +4430,8 @@
4429 4430 Indirect chain[4];
4430 4431 Indirect *partial;
4431 4432 __le32 nr = 0;
4432   - int n;
4433   - ext4_lblk_t last_block;
  4433 + int n = 0;
  4434 + ext4_lblk_t last_block, max_block;
4434 4435 unsigned blocksize = inode->i_sb->s_blocksize;
4435 4436  
4436 4437 trace_ext4_truncate_enter(inode);
4437 4438  
... ... @@ -4455,14 +4456,18 @@
4455 4456  
4456 4457 last_block = (inode->i_size + blocksize-1)
4457 4458 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
  4459 + max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
  4460 + >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4458 4461  
4459 4462 if (inode->i_size & (blocksize - 1))
4460 4463 if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4461 4464 goto out_stop;
4462 4465  
4463   - n = ext4_block_to_path(inode, last_block, offsets, NULL);
4464   - if (n == 0)
4465   - goto out_stop; /* error */
  4466 + if (last_block != max_block) {
  4467 + n = ext4_block_to_path(inode, last_block, offsets, NULL);
  4468 + if (n == 0)
  4469 + goto out_stop; /* error */
  4470 + }
4466 4471  
4467 4472 /*
4468 4473 * OK. This truncate is going to happen. We add the inode to the
... ... @@ -4493,7 +4498,13 @@
4493 4498 */
4494 4499 ei->i_disksize = inode->i_size;
4495 4500  
4496   - if (n == 1) { /* direct blocks */
  4501 + if (last_block == max_block) {
  4502 + /*
  4503 + * It is unnecessary to free any data blocks if last_block is
  4504 + * equal to the indirect block limit.
  4505 + */
  4506 + goto out_unlock;
  4507 + } else if (n == 1) { /* direct blocks */
4497 4508 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4498 4509 i_data + EXT4_NDIR_BLOCKS);
4499 4510 goto do_indirects;
... ... @@ -4553,6 +4564,7 @@
4553 4564 ;
4554 4565 }
4555 4566  
  4567 +out_unlock:
4556 4568 up_write(&ei->i_data_sem);
4557 4569 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4558 4570 ext4_mark_inode_dirty(handle, inode);
4559 4571  
... ... @@ -5398,13 +5410,12 @@
5398 5410 /* if nrblocks are contiguous */
5399 5411 if (chunk) {
5400 5412 /*
5401   - * With N contiguous data blocks, it need at most
5402   - * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
5403   - * 2 dindirect blocks
5404   - * 1 tindirect block
  5413 + * With N contiguous data blocks, we need at most
  5414 + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
  5415 + * 2 dindirect blocks, and 1 tindirect block
5405 5416 */
5406   - indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
5407   - return indirects + 3;
  5417 + return DIV_ROUND_UP(nrblocks,
  5418 + EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
5408 5419 }
5409 5420 /*
5410 5421 * if nrblocks are not contiguous, worse case, each block touch
... ... @@ -242,27 +242,44 @@
242 242 * journal_end calls result in the superblock being marked dirty, so
243 243 * that sync() will call the filesystem's write_super callback if
244 244 * appropriate.
  245 + *
  246 + * To avoid j_barrier hold in userspace when a user calls freeze(),
  247 + * ext4 prevents a new handle from being started by s_frozen, which
  248 + * is in an upper layer.
245 249 */
246 250 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
247 251 {
248 252 journal_t *journal;
  253 + handle_t *handle;
249 254  
250 255 if (sb->s_flags & MS_RDONLY)
251 256 return ERR_PTR(-EROFS);
252 257  
253   - vfs_check_frozen(sb, SB_FREEZE_TRANS);
254   - /* Special case here: if the journal has aborted behind our
255   - * backs (eg. EIO in the commit thread), then we still need to
256   - * take the FS itself readonly cleanly. */
257 258 journal = EXT4_SB(sb)->s_journal;
258   - if (journal) {
259   - if (is_journal_aborted(journal)) {
260   - ext4_abort(sb, "Detected aborted journal");
261   - return ERR_PTR(-EROFS);
262   - }
263   - return jbd2_journal_start(journal, nblocks);
  259 + handle = ext4_journal_current_handle();
  260 +
  261 + /*
  262 + * If a handle has been started, it should be allowed to
  263 + * finish, otherwise deadlock could happen between freeze
  264 + * and others (e.g. truncate) due to the restart of the
  265 + * journal handle if the filesystem is frozen and active
  266 + * handles are not stopped.
  267 + */
  268 + if (!handle)
  269 + vfs_check_frozen(sb, SB_FREEZE_TRANS);
  270 +
  271 + if (!journal)
  272 + return ext4_get_nojournal();
  273 + /*
  274 + * Special case here: if the journal has aborted behind our
  275 + * backs (eg. EIO in the commit thread), then we still need to
  276 + * take the FS itself readonly cleanly.
  277 + */
  278 + if (is_journal_aborted(journal)) {
  279 + ext4_abort(sb, "Detected aborted journal");
  280 + return ERR_PTR(-EROFS);
264 281 }
265   - return ext4_get_nojournal();
  282 + return jbd2_journal_start(journal, nblocks);
266 283 }
267 284  
268 285 /*
... ... @@ -2975,6 +2992,12 @@
2975 2992 mutex_unlock(&ext4_li_info->li_list_mtx);
2976 2993  
2977 2994 sbi->s_li_request = elr;
  2995 + /*
  2996 + * set elr to NULL here since it has been inserted to
  2997 + * the request_list and the removal and free of it is
  2998 + * handled by ext4_clear_request_list from now on.
  2999 + */
  3000 + elr = NULL;
2978 3001  
2979 3002 if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
2980 3003 ret = ext4_run_lazyinit_thread();
... ... @@ -3385,6 +3408,10 @@
3385 3408 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
3386 3409 spin_lock_init(&sbi->s_next_gen_lock);
3387 3410  
  3411 + init_timer(&sbi->s_err_report);
  3412 + sbi->s_err_report.function = print_daily_error_info;
  3413 + sbi->s_err_report.data = (unsigned long) sb;
  3414 +
3388 3415 err = percpu_counter_init(&sbi->s_freeblocks_counter,
3389 3416 ext4_count_free_blocks(sb));
3390 3417 if (!err) {
... ... @@ -3646,9 +3673,6 @@
3646 3673 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
3647 3674 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
3648 3675  
3649   - init_timer(&sbi->s_err_report);
3650   - sbi->s_err_report.function = print_daily_error_info;
3651   - sbi->s_err_report.data = (unsigned long) sb;
3652 3676 if (es->s_error_count)
3653 3677 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
3654 3678  
... ... @@ -3672,6 +3696,7 @@
3672 3696 sbi->s_journal = NULL;
3673 3697 }
3674 3698 failed_mount3:
  3699 + del_timer(&sbi->s_err_report);
3675 3700 if (sbi->s_flex_groups) {
3676 3701 if (is_vmalloc_addr(sbi->s_flex_groups))
3677 3702 vfree(sbi->s_flex_groups);
... ... @@ -4138,6 +4163,11 @@
4138 4163 /*
4139 4164 * LVM calls this function before a (read-only) snapshot is created. This
4140 4165 * gives us a chance to flush the journal completely and mark the fs clean.
  4166 + *
  4167 + * Note that this function alone cannot bring the filesystem into a
  4168 + * clean state, because ext4 prevents a new handle from being started
  4169 + * by @sb->s_frozen, which is set in an upper layer. It thus needs help
  4170 + * from the upper layer.
4141 4171 */
4142 4172 static int ext4_freeze(struct super_block *sb)
4143 4173 {
4144 4174  
... ... @@ -4614,11 +4644,24 @@
4614 4644  
4615 4645 static int ext4_quota_off(struct super_block *sb, int type)
4616 4646 {
  4647 + struct inode *inode = sb_dqopt(sb)->files[type];
  4648 + handle_t *handle;
  4649 +
4617 4650 /* Force all delayed allocation blocks to be allocated.
4618 4651 * Caller already holds s_umount sem */
4619 4652 if (test_opt(sb, DELALLOC))
4620 4653 sync_filesystem(sb);
4621 4654  
  4655 + /* Update modification times of quota files when userspace can
  4656 + * start looking at them */
  4657 + handle = ext4_journal_start(inode, 1);
  4658 + if (IS_ERR(handle))
  4659 + goto out;
  4660 + inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  4661 + ext4_mark_inode_dirty(handle, inode);
  4662 + ext4_journal_stop(handle);
  4663 +
  4664 +out:
4622 4665 return dquot_quota_off(sb, type);
4623 4666 }
4624 4667  
4625 4668  
... ... @@ -4714,9 +4757,8 @@
4714 4757 if (inode->i_size < off + len) {
4715 4758 i_size_write(inode, off + len);
4716 4759 EXT4_I(inode)->i_disksize = inode->i_size;
  4760 + ext4_mark_inode_dirty(handle, inode);
4717 4761 }
4718   - inode->i_mtime = inode->i_ctime = CURRENT_TIME;
4719   - ext4_mark_inode_dirty(handle, inode);
4720 4762 mutex_unlock(&inode->i_mutex);
4721 4763 return len;
4722 4764 }
... ... @@ -105,6 +105,8 @@
105 105 int ret;
106 106 struct timespec now = current_kernel_time();
107 107  
  108 + *cbh = NULL;
  109 +
108 110 if (is_journal_aborted(journal))
109 111 return 0;
110 112  
... ... @@ -806,7 +808,7 @@
806 808 if (err)
807 809 __jbd2_journal_abort_hard(journal);
808 810 }
809   - if (!err && !is_journal_aborted(journal))
  811 + if (cbh)
810 812 err = journal_wait_on_commit_record(journal, cbh);
811 813 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
812 814 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
... ... @@ -2413,10 +2413,12 @@
2413 2413 new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
2414 2414 if (!new_dev)
2415 2415 return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
  2416 + bd = bdget(device);
2416 2417 spin_lock(&devname_cache_lock);
2417 2418 if (devcache[i]) {
2418 2419 if (devcache[i]->device == device) {
2419 2420 kfree(new_dev);
  2421 + bdput(bd);
2420 2422 ret = devcache[i]->devname;
2421 2423 spin_unlock(&devname_cache_lock);
2422 2424 return ret;
... ... @@ -2425,7 +2427,6 @@
2425 2427 }
2426 2428 devcache[i] = new_dev;
2427 2429 devcache[i]->device = device;
2428   - bd = bdget(device);
2429 2430 if (bd) {
2430 2431 bdevname(bd, devcache[i]->devname);
2431 2432 bdput(bd);