Commit a97b52022a73ec12e43f0b2c7d4bd1f40f89c81d
Exists in master and in 39 other branches
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix data corruption regression by reverting commit 6de9843dab3f
  ext4: Allow indirect-block file to grow the file size to max file size
  ext4: allow an active handle to be started when freezing
  ext4: sync the directory inode in ext4_sync_parent()
  ext4: init timer earlier to avoid a kernel panic in __save_error_info
  jbd2: fix potential memory leak on transaction commit
  ext4: fix a double free in ext4_register_li_request
  ext4: fix credits computing for indirect mapped files
  ext4: remove unnecessary [cm]time update of quota file
  jbd2: move bdget out of critical section
6 changed files:
fs/ext4/ext4_jbd2.h
@@ -86,8 +86,8 @@
 
 #ifdef CONFIG_QUOTA
 /* Amount of blocks needed for quota update - we know that the structure was
- * allocated so we need to update only inode+data */
-#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0)
+ * allocated so we need to update only data block */
+#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
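The quota fix above rests on a simple accounting argument: once the quota structure is allocated, a quota update dirties only its data block, so reserving two journal credits per quota type over-reserves by one. A minimal standalone sketch of how such a macro feeds a transaction reservation (the helper and the credit breakdown are illustrative, not the kernel's actual computation):

#include <stdio.h>

/* Illustrative only: 1 credit per enabled quota type (was 2). */
#define QUOTA_TRANS_BLOCKS(quota_on)	((quota_on) ? 1 : 0)

/* Hypothetical credit sum for a small write: data + inode + quota. */
static int write_trans_credits(int data_blocks, int quota_on)
{
	return data_blocks + 1 + QUOTA_TRANS_BLOCKS(quota_on);
}

int main(void)
{
	/* One-block write with quota on: 3 credits now, 4 before. */
	printf("credits = %d\n", write_trans_credits(1, 1));
	return 0;
}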
fs/ext4/fsync.c
@@ -125,9 +125,11 @@
  * the parent directory's parent as well, and so on recursively, if
  * they are also freshly created.
  */
-static void ext4_sync_parent(struct inode *inode)
+static int ext4_sync_parent(struct inode *inode)
 {
+	struct writeback_control wbc;
 	struct dentry *dentry = NULL;
+	int ret = 0;
 
 	while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
 		ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
 
@@ -136,8 +138,17 @@
 		if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
 			break;
 		inode = dentry->d_parent->d_inode;
-		sync_mapping_buffers(inode->i_mapping);
+		ret = sync_mapping_buffers(inode->i_mapping);
+		if (ret)
+			break;
+		memset(&wbc, 0, sizeof(wbc));
+		wbc.sync_mode = WB_SYNC_ALL;
+		wbc.nr_to_write = 0; /* only write out the inode */
+		ret = sync_inode(inode, &wbc);
+		if (ret)
+			break;
 	}
+	return ret;
 }
 
 /*
@@ -176,7 +187,7 @@
 	if (!journal) {
 		ret = generic_file_fsync(file, datasync);
 		if (!ret && !list_empty(&inode->i_dentry))
-			ext4_sync_parent(inode);
+			ret = ext4_sync_parent(inode);
 		goto out;
 	}
 
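For context on the fsync.c change: in no-journal mode, generic_file_fsync() makes the file itself durable, but a freshly created file is only reachable after a crash if its directory entry is durable too. The patch therefore writes each freshly created parent's inode as well as its mapped buffers, and propagates errors instead of dropping them. A hedged userspace sketch of the pattern this protects (the mount point and file name are hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical path on an ext4 mount running without a journal. */
	int fd = open("/mnt/ext4/newfile", O_CREAT | O_WRONLY, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "data", 4) != 4) {
		perror("write");
		return 1;
	}
	/*
	 * ext4 tags the parent directory with EXT4_STATE_NEWENTRY at
	 * creation; this fsync() then also syncs the parent directory
	 * (and its freshly created ancestors), so the new name survives
	 * a crash along with the data. After the patch, failures in that
	 * parent writeback are reported here instead of being ignored.
	 */
	if (fsync(fd) < 0) {
		perror("fsync");
		return 1;
	}
	close(fd);
	return 0;
}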
fs/ext4/inode.c
@@ -2502,6 +2502,7 @@
 		 * for partial write.
 		 */
 		set_buffer_new(bh);
+		set_buffer_mapped(bh);
 	}
 	return 0;
 }
@@ -4429,8 +4430,8 @@
 	Indirect chain[4];
 	Indirect *partial;
 	__le32 nr = 0;
-	int n;
-	ext4_lblk_t last_block;
+	int n = 0;
+	ext4_lblk_t last_block, max_block;
 	unsigned blocksize = inode->i_sb->s_blocksize;
 
 	trace_ext4_truncate_enter(inode);
 
@@ -4455,14 +4456,18 @@
 
 	last_block = (inode->i_size + blocksize-1)
 					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
+	max_block = (EXT4_SB(inode->i_sb)->s_bitmap_maxbytes + blocksize-1)
+					>> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
 
 	if (inode->i_size & (blocksize - 1))
 		if (ext4_block_truncate_page(handle, mapping, inode->i_size))
 			goto out_stop;
 
-	n = ext4_block_to_path(inode, last_block, offsets, NULL);
-	if (n == 0)
-		goto out_stop;	/* error */
+	if (last_block != max_block) {
+		n = ext4_block_to_path(inode, last_block, offsets, NULL);
+		if (n == 0)
+			goto out_stop;	/* error */
+	}
 
 	/*
 	 * OK. This truncate is going to happen. We add the inode to the
@@ -4493,7 +4498,13 @@
 	 */
 	ei->i_disksize = inode->i_size;
 
-	if (n == 1) {		/* direct blocks */
+	if (last_block == max_block) {
+		/*
+		 * It is unnecessary to free any data blocks if last_block is
+		 * equal to the indirect block limit.
+		 */
+		goto out_unlock;
+	} else if (n == 1) {		/* direct blocks */
 		ext4_free_data(handle, inode, NULL, i_data+offsets[0],
 			       i_data + EXT4_NDIR_BLOCKS);
 		goto do_indirects;
@@ -4553,6 +4564,7 @@
 		;
 	}
 
+out_unlock:
 	up_write(&ei->i_data_sem);
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 
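The truncate change pairs with the "grow to max file size" fix: when i_size has been pushed all the way to s_bitmap_maxbytes, last_block equals the indirect-addressing limit and ext4_block_to_path() would reject it, so truncate now skips the path lookup and block freeing in that case. A rough standalone calculation of that limit, assuming 4 KiB blocks and 4-byte on-disk block numbers (the real s_bitmap_maxbytes is also capped by the i_blocks sector count, which this sketch ignores):

#include <stdio.h>

int main(void)
{
	/* Assumed geometry: 4 KiB blocks, 4-byte block numbers. */
	unsigned long long block_size = 4096;
	unsigned long long aper = block_size / 4;	/* addrs per block: 1024 */
	unsigned long long ndir = 12;			/* EXT4_NDIR_BLOCKS */

	/* direct + indirect + double indirect + triple indirect */
	unsigned long long max_blocks =
		ndir + aper + aper * aper + aper * aper * aper;

	printf("max addressable blocks: %llu\n", max_blocks);
	printf("max file size: %llu bytes (~4 TiB)\n",
	       max_blocks * block_size);
	return 0;
}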
@@ -5398,13 +5410,12 @@
 	/* if nrblocks are contiguous */
 	if (chunk) {
 		/*
-		 * With N contiguous data blocks, it need at most
-		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
-		 * 2 dindirect blocks
-		 * 1 tindirect block
+		 * With N contiguous data blocks, we need at most
+		 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+		 * 2 dindirect blocks, and 1 tindirect block
 		 */
-		indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
-		return indirects + 3;
+		return DIV_ROUND_UP(nrblocks,
+			EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
 	}
 	/*
 	 * if nrblocks are not contiguous, worse case, each block touch
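The credits fix addresses a boundary case: a run of N contiguous blocks need not start at an indirect-block boundary, so it can span one more indirect block than N/EXT4_ADDR_PER_BLOCK suggests; rounding up and counting one extra indirect, two dindirect, and one tindirect block gives the +4. A small standalone check of the arithmetic (values assumed for a 4 KiB block size):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int addr_per_block = 1024;	/* assumed 4 KiB blocks */
	unsigned int nrblocks = 1025;		/* spans an extra indirect */

	unsigned int old_credits = nrblocks / addr_per_block + 3;
	unsigned int new_credits =
		DIV_ROUND_UP(nrblocks, addr_per_block) + 4;

	/*
	 * 1025 contiguous blocks can touch two indirect blocks when the
	 * run straddles a boundary; the old formula reserved only one,
	 * risking a credit shortfall mid-transaction.
	 */
	printf("old = %u, new = %u\n", old_credits, new_credits);
	return 0;
}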
fs/ext4/super.c
@@ -242,27 +242,44 @@
  * journal_end calls result in the superblock being marked dirty, so
  * that sync() will call the filesystem's write_super callback if
  * appropriate.
+ *
+ * To avoid j_barrier hold in userspace when a user calls freeze(),
+ * ext4 prevents a new handle from being started by s_frozen, which
+ * is in an upper layer.
  */
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 {
 	journal_t *journal;
+	handle_t  *handle;
 
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
-	vfs_check_frozen(sb, SB_FREEZE_TRANS);
-	/* Special case here: if the journal has aborted behind our
-	 * backs (eg. EIO in the commit thread), then we still need to
-	 * take the FS itself readonly cleanly. */
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal) {
-		if (is_journal_aborted(journal)) {
-			ext4_abort(sb, "Detected aborted journal");
-			return ERR_PTR(-EROFS);
-		}
-		return jbd2_journal_start(journal, nblocks);
+	handle = ext4_journal_current_handle();
+
+	/*
+	 * If a handle has been started, it should be allowed to
+	 * finish, otherwise deadlock could happen between freeze
+	 * and others (e.g. truncate) due to the restart of the
+	 * journal handle if the filesystem is frozen and active
+	 * handles are not stopped.
+	 */
+	if (!handle)
+		vfs_check_frozen(sb, SB_FREEZE_TRANS);
+
+	if (!journal)
+		return ext4_get_nojournal();
+	/*
+	 * Special case here: if the journal has aborted behind our
+	 * backs (eg. EIO in the commit thread), then we still need to
+	 * take the FS itself readonly cleanly.
+	 */
+	if (is_journal_aborted(journal)) {
+		ext4_abort(sb, "Detected aborted journal");
+		return ERR_PTR(-EROFS);
 	}
-	return ext4_get_nojournal();
+	return jbd2_journal_start(journal, nblocks);
 }
 
 /*
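The freeze change breaks a deadlock: freeze waits for the running transaction to drain, while a task that restarts its handle on a frozen filesystem would wait for thaw, and neither can proceed. The rework therefore only blocks tasks that hold no handle. A toy model of that guard, with stand-ins for vfs_check_frozen() and the handle bookkeeping:

#include <stdio.h>

static int fs_frozen;		/* toy stand-in for sb->s_frozen */
static int current_handle;	/* 1 if this task already holds a handle */

static void wait_for_unfreeze(void)
{
	/* Stand-in for vfs_check_frozen(): would sleep until thaw. */
	printf("would block until the filesystem is thawed\n");
}

static void journal_start(void)
{
	/*
	 * Only tasks with no active handle may block on the frozen
	 * state. A task restarting its own handle (e.g. truncate)
	 * must proceed, or freeze -- which waits for the transaction
	 * to drain -- and this task would wait on each other forever.
	 */
	if (!current_handle && fs_frozen)
		wait_for_unfreeze();
	current_handle = 1;
}

int main(void)
{
	fs_frozen = 1;
	current_handle = 1;	/* mid-truncate handle restart */
	journal_start();	/* proceeds instead of deadlocking */
	return 0;
}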
@@ -2975,6 +2992,12 @@
 	mutex_unlock(&ext4_li_info->li_list_mtx);
 
 	sbi->s_li_request = elr;
+	/*
+	 * set elr to NULL here since it has been inserted to
+	 * the request_list and the removal and free of it is
+	 * handled by ext4_clear_request_list from now on.
+	 */
+	elr = NULL;
 
 	if (!(ext4_li_info->li_state & EXT4_LAZYINIT_RUNNING)) {
 		ret = ext4_run_lazyinit_thread();
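The double-free fix is the classic ownership-transfer pattern: once elr is linked into the request list, ext4_clear_request_list() owns its lifetime, so the caller clears its local pointer and any later free on the error path becomes a harmless free(NULL). A hedged standalone sketch of the pattern (names and the failing step are illustrative):

#include <stdlib.h>

struct request {
	struct request *next;
};

static struct request *request_list;	/* freed by the list teardown */

static int start_thread(void)
{
	return -1;	/* pretend the later step fails */
}

static int register_request(void)
{
	struct request *elr = calloc(1, sizeof(*elr));
	int ret = 0;

	if (!elr)
		return -1;

	elr->next = request_list;
	request_list = elr;
	/*
	 * Ownership has moved to the list: clear the local pointer so
	 * the cleanup below cannot free the object a second time.
	 */
	elr = NULL;

	if (start_thread() != 0)
		ret = -1;

	free(elr);	/* free(NULL) is a no-op on every path */
	return ret;
}

int main(void)
{
	return register_request() ? 1 : 0;
}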
@@ -3385,6 +3408,10 @@
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
 
+	init_timer(&sbi->s_err_report);
+	sbi->s_err_report.function = print_daily_error_info;
+	sbi->s_err_report.data = (unsigned long) sb;
+
 	err = percpu_counter_init(&sbi->s_freeblocks_counter,
 			ext4_count_free_blocks(sb));
 	if (!err) {
@@ -3646,9 +3673,6 @@
 		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
 		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
 
-	init_timer(&sbi->s_err_report);
-	sbi->s_err_report.function = print_daily_error_info;
-	sbi->s_err_report.data = (unsigned long) sb;
 	if (es->s_error_count)
 		mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */
 
@@ -3672,6 +3696,7 @@
 		sbi->s_journal = NULL;
 	}
 failed_mount3:
+	del_timer(&sbi->s_err_report);
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
 			vfree(sbi->s_flex_groups);
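The timer fix is an initialization-ordering rule: __save_error_info() can run from almost any failure point during mount and arms s_err_report via mod_timer(), so the timer must be set up before the first such point and torn down on the unwind path (the new del_timer in failed_mount3). A toy model of the hazard, with simplified stand-ins for the timer API:

#include <stdio.h>

struct toy_timer {
	int initialized;
	int armed;
};

static void toy_init_timer(struct toy_timer *t)
{
	t->initialized = 1;
	t->armed = 0;
}

static void toy_mod_timer(struct toy_timer *t)
{
	if (!t->initialized) {
		/* In the kernel this was a crash in __save_error_info. */
		printf("BUG: arming an uninitialized timer\n");
		return;
	}
	t->armed = 1;
}

/* Error handling that may run from any early failure point in mount. */
static void save_error_info(struct toy_timer *err_report)
{
	toy_mod_timer(err_report);
}

int main(void)
{
	struct toy_timer err_report;

	toy_init_timer(&err_report);	/* now done before failure points */
	save_error_info(&err_report);	/* safe even during early errors */
	printf("armed = %d\n", err_report.armed);
	return 0;
}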
@@ -4138,6 +4163,11 @@
 /*
  * LVM calls this function before a (read-only) snapshot is created. This
  * gives us a chance to flush the journal completely and mark the fs clean.
+ *
+ * Note that this function cannot bring the filesystem to a clean state
+ * by itself, because ext4 prevents a new handle from being started via
+ * @sb->s_frozen, which stays in an upper layer. It thus needs help from
+ * the upper layer.
  */
 static int ext4_freeze(struct super_block *sb)
 {
@@ -4614,11 +4644,24 @@
 
 static int ext4_quota_off(struct super_block *sb, int type)
 {
+	struct inode *inode = sb_dqopt(sb)->files[type];
+	handle_t *handle;
+
 	/* Force all delayed allocation blocks to be allocated.
 	 * Caller already holds s_umount sem */
 	if (test_opt(sb, DELALLOC))
 		sync_filesystem(sb);
 
+	/* Update modification times of quota files when userspace can
+	 * start looking at them */
+	handle = ext4_journal_start(inode, 1);
+	if (IS_ERR(handle))
+		goto out;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	ext4_mark_inode_dirty(handle, inode);
+	ext4_journal_stop(handle);
+
+out:
 	return dquot_quota_off(sb, type);
 }
 
@@ -4714,9 +4757,8 @@
 	if (inode->i_size < off + len) {
 		i_size_write(inode, off + len);
 		EXT4_I(inode)->i_disksize = inode->i_size;
+		ext4_mark_inode_dirty(handle, inode);
 	}
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	ext4_mark_inode_dirty(handle, inode);
 	mutex_unlock(&inode->i_mutex);
 	return len;
 }
fs/jbd2/commit.c
@@ -105,6 +105,8 @@
 	int ret;
 	struct timespec now = current_kernel_time();
 
+	*cbh = NULL;
+
 	if (is_journal_aborted(journal))
 		return 0;
 
@@ -806,7 +808,7 @@
 		if (err)
 			__jbd2_journal_abort_hard(journal);
 	}
-	if (!err && !is_journal_aborted(journal))
+	if (cbh)
 		err = journal_wait_on_commit_record(journal, cbh);
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
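The jbd2 leak fix is an out-parameter hygiene rule: journal_submit_commit_record() could return early with *cbh never written, and the caller then acted on stack garbage. Setting *cbh = NULL up front makes the caller's pointer test meaningful. A standalone toy model of the before/after behavior (types and names simplified):

#include <stdio.h>

struct buffer_head {
	int submitted;
};

static int submit_commit_record(int journal_aborted,
				struct buffer_head **cbh)
{
	static struct buffer_head bh;

	*cbh = NULL;	/* defined value even on the early return */

	if (journal_aborted)
		return 0;

	bh.submitted = 1;
	*cbh = &bh;
	return 0;
}

int main(void)
{
	struct buffer_head *cbh;

	submit_commit_record(1, &cbh);
	if (cbh)	/* the caller-side test from the patch */
		printf("wait on the commit record\n");
	else
		printf("nothing was submitted, nothing to wait for\n");
	return 0;
}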
fs/jbd2/journal.c
@@ -2413,10 +2413,12 @@
 	new_dev = kmalloc(sizeof(struct devname_cache), GFP_KERNEL);
 	if (!new_dev)
 		return "NODEV-ALLOCFAILURE"; /* Something non-NULL */
+	bd = bdget(device);
 	spin_lock(&devname_cache_lock);
 	if (devcache[i]) {
 		if (devcache[i]->device == device) {
 			kfree(new_dev);
+			bdput(bd);
 			ret = devcache[i]->devname;
 			spin_unlock(&devname_cache_lock);
 			return ret;
@@ -2425,7 +2427,6 @@
 	}
 	devcache[i] = new_dev;
 	devcache[i]->device = device;
-	bd = bdget(device);
 	if (bd) {
 		bdevname(bd, devcache[i]->devname);
 		bdput(bd);
2431 | 2432 | bdput(bd); |