Commit 2cd05cc39347ddd8994b7f63ab5cb886f042477f

Authored by Theodore Ts'o
1 parent 2846e82004

ext4: remove unneeded ext4_journal_get_undo_access

The block allocation code used to use jbd2_journal_get_undo_access as
a way to make changes that wouldn't show up until the commit took
place.  The new multi-block allocation code has its own way of
preventing newly freed blocks from getting reused until the commit
takes place (it avoids updating the buddy bitmaps until the commit is
done), so we don't need to use jbd2_journal_get_undo_access(), which
has extra overhead compared to jbd2_journal_get_write_access().

There was one last vestigial use of ext4_journal_get_undo_access() in
ext4_add_groupblocks(); change it to use ext4_journal_get_write_access()
and then remove the ext4_journal_get_undo_access() support.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Showing 3 changed files with 4 additions and 27 deletions
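For orientation, here is a hedged sketch of the kind of change the message describes in ext4_add_groupblocks() (the mballoc.c hunk itself is not shown in the excerpt below): the one remaining caller swaps the undo-access call for a plain write-access call on the block bitmap buffer. The bitmap_bh name, the BUFFER_TRACE strings, and the error_return label are assumed here for illustration, not copied from the patch.

	/* Before: undo access, which also snapshots the buffer's
	 * committed contents and so costs more than write access. */
	BUFFER_TRACE(bitmap_bh, "getting undo access");
	err = ext4_journal_get_undo_access(handle, bitmap_bh);
	if (err)
		goto error_return;

	/* After: plain write access suffices, because mballoc already
	 * defers buddy bitmap updates until the commit completes, so
	 * newly freed blocks cannot be reused before then. */
	BUFFER_TRACE(bitmap_bh, "getting write access");
	err = ext4_journal_get_write_access(handle, bitmap_bh);
	if (err)
		goto error_return;

With no callers left, the __ext4_journal_get_undo_access() wrapper and its ext4_journal_get_undo_access() macro can be deleted outright, as the diff below shows.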

1 /* 1 /*
2 * Interface between ext4 and JBD 2 * Interface between ext4 and JBD
3 */ 3 */
4 4
5 #include "ext4_jbd2.h" 5 #include "ext4_jbd2.h"
6 6
7 #include <trace/events/ext4.h> 7 #include <trace/events/ext4.h>
8 8
9 int __ext4_journal_get_undo_access(const char *where, unsigned int line,
10 handle_t *handle, struct buffer_head *bh)
11 {
12 int err = 0;
13
14 if (ext4_handle_valid(handle)) {
15 err = jbd2_journal_get_undo_access(handle, bh);
16 if (err)
17 ext4_journal_abort_handle(where, line, __func__, bh,
18 handle, err);
19 }
20 return err;
21 }
22
23 int __ext4_journal_get_write_access(const char *where, unsigned int line, 9 int __ext4_journal_get_write_access(const char *where, unsigned int line,
24 handle_t *handle, struct buffer_head *bh) 10 handle_t *handle, struct buffer_head *bh)
25 { 11 {
26 int err = 0; 12 int err = 0;
27 13
28 if (ext4_handle_valid(handle)) { 14 if (ext4_handle_valid(handle)) {
29 err = jbd2_journal_get_write_access(handle, bh); 15 err = jbd2_journal_get_write_access(handle, bh);
30 if (err) 16 if (err)
31 ext4_journal_abort_handle(where, line, __func__, bh, 17 ext4_journal_abort_handle(where, line, __func__, bh,
32 handle, err); 18 handle, err);
33 } 19 }
34 return err; 20 return err;
35 } 21 }
36 22
37 /* 23 /*
38 * The ext4 forget function must perform a revoke if we are freeing data 24 * The ext4 forget function must perform a revoke if we are freeing data
39 * which has been journaled. Metadata (eg. indirect blocks) must be 25 * which has been journaled. Metadata (eg. indirect blocks) must be
40 * revoked in all cases. 26 * revoked in all cases.
41 * 27 *
42 * "bh" may be NULL: a metadata block may have been freed from memory 28 * "bh" may be NULL: a metadata block may have been freed from memory
43 * but there may still be a record of it in the journal, and that record 29 * but there may still be a record of it in the journal, and that record
44 * still needs to be revoked. 30 * still needs to be revoked.
45 * 31 *
46 * If the handle isn't valid we're not journaling, but we still need to 32 * If the handle isn't valid we're not journaling, but we still need to
47 * call into ext4_journal_revoke() to put the buffer head. 33 * call into ext4_journal_revoke() to put the buffer head.
48 */ 34 */
49 int __ext4_forget(const char *where, unsigned int line, handle_t *handle, 35 int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
50 int is_metadata, struct inode *inode, 36 int is_metadata, struct inode *inode,
51 struct buffer_head *bh, ext4_fsblk_t blocknr) 37 struct buffer_head *bh, ext4_fsblk_t blocknr)
52 { 38 {
53 int err; 39 int err;
54 40
55 might_sleep(); 41 might_sleep();
56 42
57 trace_ext4_forget(inode, is_metadata, blocknr); 43 trace_ext4_forget(inode, is_metadata, blocknr);
58 BUFFER_TRACE(bh, "enter"); 44 BUFFER_TRACE(bh, "enter");
59 45
60 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, " 46 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
61 "data mode %x\n", 47 "data mode %x\n",
62 bh, is_metadata, inode->i_mode, 48 bh, is_metadata, inode->i_mode,
63 test_opt(inode->i_sb, DATA_FLAGS)); 49 test_opt(inode->i_sb, DATA_FLAGS));
64 50
65 /* In the no journal case, we can just do a bforget and return */ 51 /* In the no journal case, we can just do a bforget and return */
66 if (!ext4_handle_valid(handle)) { 52 if (!ext4_handle_valid(handle)) {
67 bforget(bh); 53 bforget(bh);
68 return 0; 54 return 0;
69 } 55 }
70 56
71 /* Never use the revoke function if we are doing full data 57 /* Never use the revoke function if we are doing full data
72 * journaling: there is no need to, and a V1 superblock won't 58 * journaling: there is no need to, and a V1 superblock won't
73 * support it. Otherwise, only skip the revoke on un-journaled 59 * support it. Otherwise, only skip the revoke on un-journaled
74 * data blocks. */ 60 * data blocks. */
75 61
76 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 62 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
77 (!is_metadata && !ext4_should_journal_data(inode))) { 63 (!is_metadata && !ext4_should_journal_data(inode))) {
78 if (bh) { 64 if (bh) {
79 BUFFER_TRACE(bh, "call jbd2_journal_forget"); 65 BUFFER_TRACE(bh, "call jbd2_journal_forget");
80 err = jbd2_journal_forget(handle, bh); 66 err = jbd2_journal_forget(handle, bh);
81 if (err) 67 if (err)
82 ext4_journal_abort_handle(where, line, __func__, 68 ext4_journal_abort_handle(where, line, __func__,
83 bh, handle, err); 69 bh, handle, err);
84 return err; 70 return err;
85 } 71 }
86 return 0; 72 return 0;
87 } 73 }
88 74
89 /* 75 /*
90 * data!=journal && (is_metadata || should_journal_data(inode)) 76 * data!=journal && (is_metadata || should_journal_data(inode))
91 */ 77 */
92 BUFFER_TRACE(bh, "call jbd2_journal_revoke"); 78 BUFFER_TRACE(bh, "call jbd2_journal_revoke");
93 err = jbd2_journal_revoke(handle, blocknr, bh); 79 err = jbd2_journal_revoke(handle, blocknr, bh);
94 if (err) { 80 if (err) {
95 ext4_journal_abort_handle(where, line, __func__, 81 ext4_journal_abort_handle(where, line, __func__,
96 bh, handle, err); 82 bh, handle, err);
97 __ext4_abort(inode->i_sb, where, line, 83 __ext4_abort(inode->i_sb, where, line,
98 "error %d when attempting revoke", err); 84 "error %d when attempting revoke", err);
99 } 85 }
100 BUFFER_TRACE(bh, "exit"); 86 BUFFER_TRACE(bh, "exit");
101 return err; 87 return err;
102 } 88 }
103 89
104 int __ext4_journal_get_create_access(const char *where, unsigned int line, 90 int __ext4_journal_get_create_access(const char *where, unsigned int line,
105 handle_t *handle, struct buffer_head *bh) 91 handle_t *handle, struct buffer_head *bh)
106 { 92 {
107 int err = 0; 93 int err = 0;
108 94
109 if (ext4_handle_valid(handle)) { 95 if (ext4_handle_valid(handle)) {
110 err = jbd2_journal_get_create_access(handle, bh); 96 err = jbd2_journal_get_create_access(handle, bh);
111 if (err) 97 if (err)
112 ext4_journal_abort_handle(where, line, __func__, 98 ext4_journal_abort_handle(where, line, __func__,
113 bh, handle, err); 99 bh, handle, err);
114 } 100 }
115 return err; 101 return err;
116 } 102 }
117 103
118 int __ext4_handle_dirty_metadata(const char *where, unsigned int line, 104 int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
119 handle_t *handle, struct inode *inode, 105 handle_t *handle, struct inode *inode,
120 struct buffer_head *bh) 106 struct buffer_head *bh)
121 { 107 {
122 int err = 0; 108 int err = 0;
123 109
124 if (ext4_handle_valid(handle)) { 110 if (ext4_handle_valid(handle)) {
125 err = jbd2_journal_dirty_metadata(handle, bh); 111 err = jbd2_journal_dirty_metadata(handle, bh);
126 if (err) 112 if (err)
127 ext4_journal_abort_handle(where, line, __func__, 113 ext4_journal_abort_handle(where, line, __func__,
128 bh, handle, err); 114 bh, handle, err);
129 } else { 115 } else {
130 if (inode) 116 if (inode)
131 mark_buffer_dirty_inode(bh, inode); 117 mark_buffer_dirty_inode(bh, inode);
132 else 118 else
133 mark_buffer_dirty(bh); 119 mark_buffer_dirty(bh);
134 if (inode && inode_needs_sync(inode)) { 120 if (inode && inode_needs_sync(inode)) {
135 sync_dirty_buffer(bh); 121 sync_dirty_buffer(bh);
136 if (buffer_req(bh) && !buffer_uptodate(bh)) { 122 if (buffer_req(bh) && !buffer_uptodate(bh)) {
137 struct ext4_super_block *es; 123 struct ext4_super_block *es;
138 124
139 es = EXT4_SB(inode->i_sb)->s_es; 125 es = EXT4_SB(inode->i_sb)->s_es;
140 es->s_last_error_block = 126 es->s_last_error_block =
141 cpu_to_le64(bh->b_blocknr); 127 cpu_to_le64(bh->b_blocknr);
142 ext4_error_inode(inode, where, line, 128 ext4_error_inode(inode, where, line,
143 bh->b_blocknr, 129 bh->b_blocknr,
144 "IO error syncing itable block"); 130 "IO error syncing itable block");
145 err = -EIO; 131 err = -EIO;
146 } 132 }
147 } 133 }
148 } 134 }
149 return err; 135 return err;
150 } 136 }
151 137
152 int __ext4_handle_dirty_super(const char *where, unsigned int line, 138 int __ext4_handle_dirty_super(const char *where, unsigned int line,
153 handle_t *handle, struct super_block *sb) 139 handle_t *handle, struct super_block *sb)
154 { 140 {
155 struct buffer_head *bh = EXT4_SB(sb)->s_sbh; 141 struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
156 int err = 0; 142 int err = 0;
157 143
158 if (ext4_handle_valid(handle)) { 144 if (ext4_handle_valid(handle)) {
159 err = jbd2_journal_dirty_metadata(handle, bh); 145 err = jbd2_journal_dirty_metadata(handle, bh);
160 if (err) 146 if (err)
161 ext4_journal_abort_handle(where, line, __func__, 147 ext4_journal_abort_handle(where, line, __func__,
162 bh, handle, err); 148 bh, handle, err);
163 } else 149 } else
164 sb->s_dirt = 1; 150 sb->s_dirt = 1;
165 return err; 151 return err;
166 } 152 }
167 153
1 /* 1 /*
2 * ext4_jbd2.h 2 * ext4_jbd2.h
3 * 3 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
5 * 5 *
6 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved 6 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
7 * 7 *
8 * This file is part of the Linux kernel and is made available under 8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your 9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference. 10 * option, any later version, incorporated herein by reference.
11 * 11 *
12 * Ext4-specific journaling extensions. 12 * Ext4-specific journaling extensions.
13 */ 13 */
14 14
15 #ifndef _EXT4_JBD2_H 15 #ifndef _EXT4_JBD2_H
16 #define _EXT4_JBD2_H 16 #define _EXT4_JBD2_H
17 17
18 #include <linux/fs.h> 18 #include <linux/fs.h>
19 #include <linux/jbd2.h> 19 #include <linux/jbd2.h>
20 #include "ext4.h" 20 #include "ext4.h"
21 21
22 #define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) 22 #define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal)
23 23
24 /* Define the number of blocks we need to account to a transaction to 24 /* Define the number of blocks we need to account to a transaction to
25 * modify one block of data. 25 * modify one block of data.
26 * 26 *
27 * We may have to touch one inode, one bitmap buffer, up to three 27 * We may have to touch one inode, one bitmap buffer, up to three
28 * indirection blocks, the group and superblock summaries, and the data 28 * indirection blocks, the group and superblock summaries, and the data
29 * block to complete the transaction. 29 * block to complete the transaction.
30 * 30 *
31 * For extents-enabled fs we may have to allocate and modify up to 31 * For extents-enabled fs we may have to allocate and modify up to
32 * 5 levels of tree + root which are stored in the inode. */ 32 * 5 levels of tree + root which are stored in the inode. */
33 33
34 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ 34 #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ 35 (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
36 ? 27U : 8U) 36 ? 27U : 8U)
37 37
38 /* Extended attribute operations touch at most two data buffers, 38 /* Extended attribute operations touch at most two data buffers,
39 * two bitmap buffers, and two group summaries, in addition to the inode 39 * two bitmap buffers, and two group summaries, in addition to the inode
40 * and the superblock, which are already accounted for. */ 40 * and the superblock, which are already accounted for. */
41 41
42 #define EXT4_XATTR_TRANS_BLOCKS 6U 42 #define EXT4_XATTR_TRANS_BLOCKS 6U
43 43
44 /* Define the minimum size for a transaction which modifies data. This 44 /* Define the minimum size for a transaction which modifies data. This
45 * needs to take into account the fact that we may end up modifying two 45 * needs to take into account the fact that we may end up modifying two
46 * quota files too (one for the group, one for the user quota). The 46 * quota files too (one for the group, one for the user quota). The
47 * superblock only gets updated once, of course, so don't bother 47 * superblock only gets updated once, of course, so don't bother
48 * counting that again for the quota updates. */ 48 * counting that again for the quota updates. */
49 49
50 #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ 50 #define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \
52 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) 52 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
53 53
54 /* 54 /*
55 * Define the number of metadata blocks we need to account to modify data. 55 * Define the number of metadata blocks we need to account to modify data.
56 * 56 *
57 * This include super block, inode block, quota blocks and xattr blocks 57 * This include super block, inode block, quota blocks and xattr blocks
58 */ 58 */
59 #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ 59 #define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
60 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb)) 60 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
61 61
62 /* Delete operations potentially hit one directory's namespace plus an 62 /* Delete operations potentially hit one directory's namespace plus an
63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
64 * generous. We can grow the delete transaction later if necessary. */ 64 * generous. We can grow the delete transaction later if necessary. */
65 65
66 #define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) 66 #define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64)
67 67
68 /* Define an arbitrary limit for the amount of data we will anticipate 68 /* Define an arbitrary limit for the amount of data we will anticipate
69 * writing to any given transaction. For unbounded transactions such as 69 * writing to any given transaction. For unbounded transactions such as
70 * write(2) and truncate(2) we can write more than this, but we always 70 * write(2) and truncate(2) we can write more than this, but we always
71 * start off at the maximum transaction size and grow the transaction 71 * start off at the maximum transaction size and grow the transaction
72 * optimistically as we go. */ 72 * optimistically as we go. */
73 73
74 #define EXT4_MAX_TRANS_DATA 64U 74 #define EXT4_MAX_TRANS_DATA 64U
75 75
76 /* We break up a large truncate or write transaction once the handle's 76 /* We break up a large truncate or write transaction once the handle's
77 * buffer credits gets this low, we need either to extend the 77 * buffer credits gets this low, we need either to extend the
78 * transaction or to start a new one. Reserve enough space here for 78 * transaction or to start a new one. Reserve enough space here for
79 * inode, bitmap, superblock, group and indirection updates for at least 79 * inode, bitmap, superblock, group and indirection updates for at least
80 * one block, plus two quota updates. Quota allocations are not 80 * one block, plus two quota updates. Quota allocations are not
81 * needed. */ 81 * needed. */
82 82
83 #define EXT4_RESERVE_TRANS_BLOCKS 12U 83 #define EXT4_RESERVE_TRANS_BLOCKS 12U
84 84
85 #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 85 #define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8
86 86
87 #ifdef CONFIG_QUOTA 87 #ifdef CONFIG_QUOTA
88 /* Amount of blocks needed for quota update - we know that the structure was 88 /* Amount of blocks needed for quota update - we know that the structure was
89 * allocated so we need to update only data block */ 89 * allocated so we need to update only data block */
90 #define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0) 90 #define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 1 : 0)
91 /* Amount of blocks needed for quota insert/delete - we do some block writes 91 /* Amount of blocks needed for quota insert/delete - we do some block writes
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) 94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
95 95
96 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ 96 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
97 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) 97 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
98 #else 98 #else
99 #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 99 #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
100 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 100 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0
101 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 101 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0
102 #endif 102 #endif
103 #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) 103 #define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
104 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) 104 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
105 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) 105 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
106 106
107 int 107 int
108 ext4_mark_iloc_dirty(handle_t *handle, 108 ext4_mark_iloc_dirty(handle_t *handle,
109 struct inode *inode, 109 struct inode *inode,
110 struct ext4_iloc *iloc); 110 struct ext4_iloc *iloc);
111 111
112 /* 112 /*
113 * On success, We end up with an outstanding reference count against 113 * On success, We end up with an outstanding reference count against
114 * iloc->bh. This _must_ be cleaned up later. 114 * iloc->bh. This _must_ be cleaned up later.
115 */ 115 */
116 116
117 int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, 117 int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
118 struct ext4_iloc *iloc); 118 struct ext4_iloc *iloc);
119 119
120 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); 120 int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
121 121
122 /* 122 /*
123 * Wrapper functions with which ext4 calls into JBD. 123 * Wrapper functions with which ext4 calls into JBD.
124 */ 124 */
125 void ext4_journal_abort_handle(const char *caller, unsigned int line, 125 void ext4_journal_abort_handle(const char *caller, unsigned int line,
126 const char *err_fn, 126 const char *err_fn,
127 struct buffer_head *bh, handle_t *handle, int err); 127 struct buffer_head *bh, handle_t *handle, int err);
128 128
129 int __ext4_journal_get_undo_access(const char *where, unsigned int line,
130 handle_t *handle, struct buffer_head *bh);
131
132 int __ext4_journal_get_write_access(const char *where, unsigned int line, 129 int __ext4_journal_get_write_access(const char *where, unsigned int line,
133 handle_t *handle, struct buffer_head *bh); 130 handle_t *handle, struct buffer_head *bh);
134 131
135 int __ext4_forget(const char *where, unsigned int line, handle_t *handle, 132 int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
136 int is_metadata, struct inode *inode, 133 int is_metadata, struct inode *inode,
137 struct buffer_head *bh, ext4_fsblk_t blocknr); 134 struct buffer_head *bh, ext4_fsblk_t blocknr);
138 135
139 int __ext4_journal_get_create_access(const char *where, unsigned int line, 136 int __ext4_journal_get_create_access(const char *where, unsigned int line,
140 handle_t *handle, struct buffer_head *bh); 137 handle_t *handle, struct buffer_head *bh);
141 138
142 int __ext4_handle_dirty_metadata(const char *where, unsigned int line, 139 int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
143 handle_t *handle, struct inode *inode, 140 handle_t *handle, struct inode *inode,
144 struct buffer_head *bh); 141 struct buffer_head *bh);
145 142
146 int __ext4_handle_dirty_super(const char *where, unsigned int line, 143 int __ext4_handle_dirty_super(const char *where, unsigned int line,
147 handle_t *handle, struct super_block *sb); 144 handle_t *handle, struct super_block *sb);
148 145
149 #define ext4_journal_get_undo_access(handle, bh) \
150 __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh))
151 #define ext4_journal_get_write_access(handle, bh) \ 146 #define ext4_journal_get_write_access(handle, bh) \
152 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
153 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ 148 #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
154 __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ 149 __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
155 (bh), (block_nr)) 150 (bh), (block_nr))
156 #define ext4_journal_get_create_access(handle, bh) \ 151 #define ext4_journal_get_create_access(handle, bh) \
157 __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) 152 __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh))
158 #define ext4_handle_dirty_metadata(handle, inode, bh) \ 153 #define ext4_handle_dirty_metadata(handle, inode, bh) \
159 __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ 154 __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
160 (bh)) 155 (bh))
161 #define ext4_handle_dirty_super(handle, sb) \ 156 #define ext4_handle_dirty_super(handle, sb) \
162 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) 157 __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
163 158
164 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); 159 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
165 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); 160 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
166 161
167 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) 162 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
168 163
169 /* Note: Do not use this for NULL handles. This is only to determine if 164 /* Note: Do not use this for NULL handles. This is only to determine if
170 * a properly allocated handle is using a journal or not. */ 165 * a properly allocated handle is using a journal or not. */
171 static inline int ext4_handle_valid(handle_t *handle) 166 static inline int ext4_handle_valid(handle_t *handle)
172 { 167 {
173 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT) 168 if ((unsigned long)handle < EXT4_NOJOURNAL_MAX_REF_COUNT)
174 return 0; 169 return 0;
175 return 1; 170 return 1;
176 } 171 }
177 172
178 static inline void ext4_handle_sync(handle_t *handle) 173 static inline void ext4_handle_sync(handle_t *handle)
179 { 174 {
180 if (ext4_handle_valid(handle)) 175 if (ext4_handle_valid(handle))
181 handle->h_sync = 1; 176 handle->h_sync = 1;
182 } 177 }
183 178
184 static inline void ext4_handle_release_buffer(handle_t *handle, 179 static inline void ext4_handle_release_buffer(handle_t *handle,
185 struct buffer_head *bh) 180 struct buffer_head *bh)
186 { 181 {
187 if (ext4_handle_valid(handle)) 182 if (ext4_handle_valid(handle))
188 jbd2_journal_release_buffer(handle, bh); 183 jbd2_journal_release_buffer(handle, bh);
189 } 184 }
190 185
191 static inline int ext4_handle_is_aborted(handle_t *handle) 186 static inline int ext4_handle_is_aborted(handle_t *handle)
192 { 187 {
193 if (ext4_handle_valid(handle)) 188 if (ext4_handle_valid(handle))
194 return is_handle_aborted(handle); 189 return is_handle_aborted(handle);
195 return 0; 190 return 0;
196 } 191 }
197 192
198 static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) 193 static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
199 { 194 {
200 if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) 195 if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed)
201 return 0; 196 return 0;
202 return 1; 197 return 1;
203 } 198 }
204 199
205 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) 200 static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
206 { 201 {
207 return ext4_journal_start_sb(inode->i_sb, nblocks); 202 return ext4_journal_start_sb(inode->i_sb, nblocks);
208 } 203 }
209 204
210 #define ext4_journal_stop(handle) \ 205 #define ext4_journal_stop(handle) \
211 __ext4_journal_stop(__func__, __LINE__, (handle)) 206 __ext4_journal_stop(__func__, __LINE__, (handle))
212 207
213 static inline handle_t *ext4_journal_current_handle(void) 208 static inline handle_t *ext4_journal_current_handle(void)
214 { 209 {
215 return journal_current_handle(); 210 return journal_current_handle();
216 } 211 }
217 212
218 static inline int ext4_journal_extend(handle_t *handle, int nblocks) 213 static inline int ext4_journal_extend(handle_t *handle, int nblocks)
219 { 214 {
220 if (ext4_handle_valid(handle)) 215 if (ext4_handle_valid(handle))
221 return jbd2_journal_extend(handle, nblocks); 216 return jbd2_journal_extend(handle, nblocks);
222 return 0; 217 return 0;
223 } 218 }
224 219
225 static inline int ext4_journal_restart(handle_t *handle, int nblocks) 220 static inline int ext4_journal_restart(handle_t *handle, int nblocks)
226 { 221 {
227 if (ext4_handle_valid(handle)) 222 if (ext4_handle_valid(handle))
228 return jbd2_journal_restart(handle, nblocks); 223 return jbd2_journal_restart(handle, nblocks);
229 return 0; 224 return 0;
230 } 225 }
231 226
232 static inline int ext4_journal_blocks_per_page(struct inode *inode) 227 static inline int ext4_journal_blocks_per_page(struct inode *inode)
233 { 228 {
234 if (EXT4_JOURNAL(inode) != NULL) 229 if (EXT4_JOURNAL(inode) != NULL)
235 return jbd2_journal_blocks_per_page(inode); 230 return jbd2_journal_blocks_per_page(inode);
236 return 0; 231 return 0;
237 } 232 }
238 233
239 static inline int ext4_journal_force_commit(journal_t *journal) 234 static inline int ext4_journal_force_commit(journal_t *journal)
240 { 235 {
241 if (journal) 236 if (journal)
242 return jbd2_journal_force_commit(journal); 237 return jbd2_journal_force_commit(journal);
243 return 0; 238 return 0;
244 } 239 }
245 240
246 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) 241 static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
247 { 242 {
248 if (ext4_handle_valid(handle)) 243 if (ext4_handle_valid(handle))
249 return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); 244 return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode);
250 return 0; 245 return 0;
251 } 246 }
252 247
253 static inline void ext4_update_inode_fsync_trans(handle_t *handle, 248 static inline void ext4_update_inode_fsync_trans(handle_t *handle,
254 struct inode *inode, 249 struct inode *inode,
255 int datasync) 250 int datasync)
256 { 251 {
257 struct ext4_inode_info *ei = EXT4_I(inode); 252 struct ext4_inode_info *ei = EXT4_I(inode);
258 253
259 if (ext4_handle_valid(handle)) { 254 if (ext4_handle_valid(handle)) {
260 ei->i_sync_tid = handle->h_transaction->t_tid; 255 ei->i_sync_tid = handle->h_transaction->t_tid;
261 if (datasync) 256 if (datasync)
262 ei->i_datasync_tid = handle->h_transaction->t_tid; 257 ei->i_datasync_tid = handle->h_transaction->t_tid;
263 } 258 }
264 } 259 }
265 260
266 /* super.c */ 261 /* super.c */
267 int ext4_force_commit(struct super_block *sb); 262 int ext4_force_commit(struct super_block *sb);
268 263
269 static inline int ext4_should_journal_data(struct inode *inode) 264 static inline int ext4_should_journal_data(struct inode *inode)
270 { 265 {
271 if (EXT4_JOURNAL(inode) == NULL) 266 if (EXT4_JOURNAL(inode) == NULL)
272 return 0; 267 return 0;
273 if (!S_ISREG(inode->i_mode)) 268 if (!S_ISREG(inode->i_mode))
274 return 1; 269 return 1;
275 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 270 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
276 return 1; 271 return 1;
277 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) 272 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
278 return 1; 273 return 1;
279 return 0; 274 return 0;
280 } 275 }
281 276
282 static inline int ext4_should_order_data(struct inode *inode) 277 static inline int ext4_should_order_data(struct inode *inode)
283 { 278 {
284 if (EXT4_JOURNAL(inode) == NULL) 279 if (EXT4_JOURNAL(inode) == NULL)
285 return 0; 280 return 0;
286 if (!S_ISREG(inode->i_mode)) 281 if (!S_ISREG(inode->i_mode))
287 return 0; 282 return 0;
288 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) 283 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
289 return 0; 284 return 0;
290 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 285 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
291 return 1; 286 return 1;
292 return 0; 287 return 0;
293 } 288 }
294 289
295 static inline int ext4_should_writeback_data(struct inode *inode) 290 static inline int ext4_should_writeback_data(struct inode *inode)
296 { 291 {
297 if (!S_ISREG(inode->i_mode)) 292 if (!S_ISREG(inode->i_mode))
298 return 0; 293 return 0;
299 if (EXT4_JOURNAL(inode) == NULL) 294 if (EXT4_JOURNAL(inode) == NULL)
300 return 1; 295 return 1;
301 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) 296 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
302 return 0; 297 return 0;
303 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 298 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
304 return 1; 299 return 1;
305 return 0; 300 return 0;
306 } 301 }
307 302
308 /* 303 /*
309 * This function controls whether or not we should try to go down the 304 * This function controls whether or not we should try to go down the
310 * dioread_nolock code paths, which makes it safe to avoid taking 305 * dioread_nolock code paths, which makes it safe to avoid taking
311 * i_mutex for direct I/O reads. This only works for extent-based 306 * i_mutex for direct I/O reads. This only works for extent-based
312 * files, and it doesn't work if data journaling is enabled, since the 307 * files, and it doesn't work if data journaling is enabled, since the
313 * dioread_nolock code uses b_private to pass information back to the 308 * dioread_nolock code uses b_private to pass information back to the
314 * I/O completion handler, and this conflicts with the jbd's use of 309 * I/O completion handler, and this conflicts with the jbd's use of
315 * b_private. 310 * b_private.
316 */ 311 */
317 static inline int ext4_should_dioread_nolock(struct inode *inode) 312 static inline int ext4_should_dioread_nolock(struct inode *inode)
318 { 313 {
319 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) 314 if (!test_opt(inode->i_sb, DIOREAD_NOLOCK))
320 return 0; 315 return 0;
321 if (!S_ISREG(inode->i_mode)) 316 if (!S_ISREG(inode->i_mode))
322 return 0; 317 return 0;
323 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 318 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
324 return 0; 319 return 0;
325 if (ext4_should_journal_data(inode)) 320 if (ext4_should_journal_data(inode))
326 return 0; 321 return 0;
327 return 1; 322 return 1;
328 } 323 }
329 324
330 #endif /* _EXT4_JBD2_H */ 325 #endif /* _EXT4_JBD2_H */
331 326
1 /* 1 /*
2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com 2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
3 * Written by Alex Tomas <alex@clusterfs.com> 3 * Written by Alex Tomas <alex@clusterfs.com>
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 as 6 * it under the terms of the GNU General Public License version 2 as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it will be useful, 9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public Licens 14 * You should have received a copy of the GNU General Public Licens
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 */ 17 */
18 18
19 19
20 /* 20 /*
21 * mballoc.c contains the multiblocks allocation routines 21 * mballoc.c contains the multiblocks allocation routines
22 */ 22 */
23 23
24 #include "mballoc.h" 24 #include "mballoc.h"
25 #include <linux/debugfs.h> 25 #include <linux/debugfs.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <trace/events/ext4.h> 27 #include <trace/events/ext4.h>
28 28
29 /* 29 /*
30 * MUSTDO: 30 * MUSTDO:
31 * - test ext4_ext_search_left() and ext4_ext_search_right() 31 * - test ext4_ext_search_left() and ext4_ext_search_right()
32 * - search for metadata in few groups 32 * - search for metadata in few groups
33 * 33 *
34 * TODO v4: 34 * TODO v4:
35 * - normalization should take into account whether file is still open 35 * - normalization should take into account whether file is still open
36 * - discard preallocations if no free space left (policy?) 36 * - discard preallocations if no free space left (policy?)
37 * - don't normalize tails 37 * - don't normalize tails
38 * - quota 38 * - quota
39 * - reservation for superuser 39 * - reservation for superuser
40 * 40 *
41 * TODO v3: 41 * TODO v3:
42 * - bitmap read-ahead (proposed by Oleg Drokin aka green) 42 * - bitmap read-ahead (proposed by Oleg Drokin aka green)
43 * - track min/max extents in each group for better group selection 43 * - track min/max extents in each group for better group selection
44 * - mb_mark_used() may allocate chunk right after splitting buddy 44 * - mb_mark_used() may allocate chunk right after splitting buddy
45 * - tree of groups sorted by number of free blocks 45 * - tree of groups sorted by number of free blocks
46 * - error handling 46 * - error handling
47 */ 47 */
48 48
49 /* 49 /*
50 * The allocation request involve request for multiple number of blocks 50 * The allocation request involve request for multiple number of blocks
51 * near to the goal(block) value specified. 51 * near to the goal(block) value specified.
52 * 52 *
53 * During initialization phase of the allocator we decide to use the 53 * During initialization phase of the allocator we decide to use the
54 * group preallocation or inode preallocation depending on the size of 54 * group preallocation or inode preallocation depending on the size of
55 * the file. The size of the file could be the resulting file size we 55 * the file. The size of the file could be the resulting file size we
56 * would have after allocation, or the current file size, which ever 56 * would have after allocation, or the current file size, which ever
57 * is larger. If the size is less than sbi->s_mb_stream_request we 57 * is larger. If the size is less than sbi->s_mb_stream_request we
58 * select to use the group preallocation. The default value of 58 * select to use the group preallocation. The default value of
59 * s_mb_stream_request is 16 blocks. This can also be tuned via 59 * s_mb_stream_request is 16 blocks. This can also be tuned via
60 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in 60 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
61 * terms of number of blocks. 61 * terms of number of blocks.
62 * 62 *
63 * The main motivation for having small file use group preallocation is to 63 * The main motivation for having small file use group preallocation is to
64 * ensure that we have small files closer together on the disk. 64 * ensure that we have small files closer together on the disk.
65 * 65 *
66 * First stage the allocator looks at the inode prealloc list, 66 * First stage the allocator looks at the inode prealloc list,
67 * ext4_inode_info->i_prealloc_list, which contains list of prealloc 67 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
68 * spaces for this particular inode. The inode prealloc space is 68 * spaces for this particular inode. The inode prealloc space is
69 * represented as: 69 * represented as:
70 * 70 *
71 * pa_lstart -> the logical start block for this prealloc space 71 * pa_lstart -> the logical start block for this prealloc space
72 * pa_pstart -> the physical start block for this prealloc space 72 * pa_pstart -> the physical start block for this prealloc space
73 * pa_len -> length for this prealloc space 73 * pa_len -> length for this prealloc space
74 * pa_free -> free space available in this prealloc space 74 * pa_free -> free space available in this prealloc space
75 * 75 *
76 * The inode preallocation space is used looking at the _logical_ start 76 * The inode preallocation space is used looking at the _logical_ start
77 * block. If only the logical file block falls within the range of prealloc 77 * block. If only the logical file block falls within the range of prealloc
78 * space we will consume the particular prealloc space. This make sure that 78 * space we will consume the particular prealloc space. This make sure that
79 * that the we have contiguous physical blocks representing the file blocks 79 * that the we have contiguous physical blocks representing the file blocks
80 * 80 *
81 * The important thing to be noted in case of inode prealloc space is that 81 * The important thing to be noted in case of inode prealloc space is that
82 * we don't modify the values associated to inode prealloc space except 82 * we don't modify the values associated to inode prealloc space except
83 * pa_free. 83 * pa_free.
84 * 84 *
85 * If we are not able to find blocks in the inode prealloc space and if we 85 * If we are not able to find blocks in the inode prealloc space and if we
86 * have the group allocation flag set then we look at the locality group 86 * have the group allocation flag set then we look at the locality group
87 * prealloc space. These are per CPU prealloc list repreasented as 87 * prealloc space. These are per CPU prealloc list repreasented as
88 * 88 *
89 * ext4_sb_info.s_locality_groups[smp_processor_id()] 89 * ext4_sb_info.s_locality_groups[smp_processor_id()]
90 * 90 *
91 * The reason for having a per cpu locality group is to reduce the contention 91 * The reason for having a per cpu locality group is to reduce the contention
92 * between CPUs. It is possible to get scheduled at this point. 92 * between CPUs. It is possible to get scheduled at this point.
93 * 93 *
94 * The locality group prealloc space is used looking at whether we have 94 * The locality group prealloc space is used looking at whether we have
95 * enough free space (pa_free) within the prealloc space. 95 * enough free space (pa_free) within the prealloc space.
96 * 96 *
97 * If we can't allocate blocks via inode prealloc or/and locality group 97 * If we can't allocate blocks via inode prealloc or/and locality group
98 * prealloc then we look at the buddy cache. The buddy cache is represented 98 * prealloc then we look at the buddy cache. The buddy cache is represented
99 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets 99 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
100 * mapped to the buddy and bitmap information regarding different 100 * mapped to the buddy and bitmap information regarding different
101 * groups. The buddy information is attached to buddy cache inode so that 101 * groups. The buddy information is attached to buddy cache inode so that
102 * we can access them through the page cache. The information regarding 102 * we can access them through the page cache. The information regarding
103 * each group is loaded via ext4_mb_load_buddy. The information involve 103 * each group is loaded via ext4_mb_load_buddy. The information involve
104 * block bitmap and buddy information. The information are stored in the 104 * block bitmap and buddy information. The information are stored in the
105 * inode as: 105 * inode as:
106 * 106 *
107 * { page } 107 * { page }
108 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 108 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
109 * 109 *
110 * 110 *
111 * one block each for bitmap and buddy information. So for each group we 111 * one block each for bitmap and buddy information. So for each group we
112 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE / 112 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
113 * blocksize) blocks. So it can have information regarding groups_per_page 113 * blocksize) blocks. So it can have information regarding groups_per_page
114 * which is blocks_per_page/2 114 * which is blocks_per_page/2
115 * 115 *
116 * The buddy cache inode is not stored on disk. The inode is thrown 116 * The buddy cache inode is not stored on disk. The inode is thrown
117 * away when the filesystem is unmounted. 117 * away when the filesystem is unmounted.
118 * 118 *
119 * We look for count number of blocks in the buddy cache. If we were able 119 * We look for count number of blocks in the buddy cache. If we were able
120 * to locate that many free blocks we return with additional information 120 * to locate that many free blocks we return with additional information
121 * regarding rest of the contiguous physical block available 121 * regarding rest of the contiguous physical block available
122 * 122 *
123 * Before allocating blocks via buddy cache we normalize the request 123 * Before allocating blocks via buddy cache we normalize the request
124 * blocks. This ensure we ask for more blocks that we needed. The extra 124 * blocks. This ensure we ask for more blocks that we needed. The extra
125 * blocks that we get after allocation is added to the respective prealloc 125 * blocks that we get after allocation is added to the respective prealloc
126 * list. In case of inode preallocation we follow a list of heuristics 126 * list. In case of inode preallocation we follow a list of heuristics
127 * based on file size. This can be found in ext4_mb_normalize_request. If 127 * based on file size. This can be found in ext4_mb_normalize_request. If
128 * we are doing a group prealloc we try to normalize the request to 128 * we are doing a group prealloc we try to normalize the request to
129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is 129 * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
130 * 512 blocks. This can be tuned via 130 * 512 blocks. This can be tuned via
131 * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in 131 * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
132 * terms of number of blocks. If we have mounted the file system with -O 132 * terms of number of blocks. If we have mounted the file system with -O
133 * stripe=<value> option the group prealloc request is normalized to the 133 * stripe=<value> option the group prealloc request is normalized to the
134 * stripe value (sbi->s_stripe) 134 * stripe value (sbi->s_stripe)
135 * 135 *
136 * The regular allocator(using the buddy cache) supports few tunables. 136 * The regular allocator(using the buddy cache) supports few tunables.
137 * 137 *
138 * /sys/fs/ext4/<partition>/mb_min_to_scan 138 * /sys/fs/ext4/<partition>/mb_min_to_scan
139 * /sys/fs/ext4/<partition>/mb_max_to_scan 139 * /sys/fs/ext4/<partition>/mb_max_to_scan
140 * /sys/fs/ext4/<partition>/mb_order2_req 140 * /sys/fs/ext4/<partition>/mb_order2_req
141 * 141 *
142 * The regular allocator uses buddy scan only if the request len is power of 142 * The regular allocator uses buddy scan only if the request len is power of
143 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 143 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
144 * value of s_mb_order2_reqs can be tuned via 144 * value of s_mb_order2_reqs can be tuned via
145 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to 145 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
146 * stripe size (sbi->s_stripe), we try to search for contiguous block in 146 * stripe size (sbi->s_stripe), we try to search for contiguous block in
147 * stripe size. This should result in better allocation on RAID setups. If 147 * stripe size. This should result in better allocation on RAID setups. If
148 * not, we search in the specific group using bitmap for best extents. The 148 * not, we search in the specific group using bitmap for best extents. The
149 * tunable min_to_scan and max_to_scan control the behaviour here. 149 * tunable min_to_scan and max_to_scan control the behaviour here.
150 * min_to_scan indicate how long the mballoc __must__ look for a best 150 * min_to_scan indicate how long the mballoc __must__ look for a best
151 * extent and max_to_scan indicates how long the mballoc __can__ look for a 151 * extent and max_to_scan indicates how long the mballoc __can__ look for a
152 * best extent in the found extents. Searching for the blocks starts with 152 * best extent in the found extents. Searching for the blocks starts with
153 * the group specified as the goal value in allocation context via 153 * the group specified as the goal value in allocation context via
154 * ac_g_ex. Each group is first checked based on the criteria whether it 154 * ac_g_ex. Each group is first checked based on the criteria whether it
155 * can used for allocation. ext4_mb_good_group explains how the groups are 155 * can used for allocation. ext4_mb_good_group explains how the groups are
156 * checked. 156 * checked.
157 * 157 *
158 * Both the prealloc space are getting populated as above. So for the first 158 * Both the prealloc space are getting populated as above. So for the first
159 * request we will hit the buddy cache which will result in this prealloc 159 * request we will hit the buddy cache which will result in this prealloc
160 * space getting filled. The prealloc space is then later used for the 160 * space getting filled. The prealloc space is then later used for the
161 * subsequent request. 161 * subsequent request.
162 */ 162 */
163 163
164 /* 164 /*
165 * mballoc operates on the following data: 165 * mballoc operates on the following data:
166 * - on-disk bitmap 166 * - on-disk bitmap
167 * - in-core buddy (actually includes buddy and bitmap) 167 * - in-core buddy (actually includes buddy and bitmap)
168 * - preallocation descriptors (PAs) 168 * - preallocation descriptors (PAs)
169 * 169 *
170 * there are two types of preallocations: 170 * there are two types of preallocations:
171 * - inode 171 * - inode
172 * assiged to specific inode and can be used for this inode only. 172 * assiged to specific inode and can be used for this inode only.
173 * it describes part of inode's space preallocated to specific 173 * it describes part of inode's space preallocated to specific
174 * physical blocks. any block from that preallocated can be used 174 * physical blocks. any block from that preallocated can be used
175 * independent. the descriptor just tracks number of blocks left 175 * independent. the descriptor just tracks number of blocks left
176 * unused. so, before taking some block from descriptor, one must 176 * unused. so, before taking some block from descriptor, one must
177 * make sure corresponded logical block isn't allocated yet. this 177 * make sure corresponded logical block isn't allocated yet. this
178 * also means that freeing any block within descriptor's range 178 * also means that freeing any block within descriptor's range
179 * must discard all preallocated blocks. 179 * must discard all preallocated blocks.
180 * - locality group 180 * - locality group
181 * assigned to specific locality group which does not translate to 181 * assigned to specific locality group which does not translate to
182 * permanent set of inodes: inode can join and leave group. space 182 * permanent set of inodes: inode can join and leave group. space
183 * from this type of preallocation can be used for any inode. thus 183 * from this type of preallocation can be used for any inode. thus
184 * it's consumed from the beginning to the end. 184 * it's consumed from the beginning to the end.
185 * 185 *
186 * relation between them can be expressed as: 186 * relation between them can be expressed as:
187 * in-core buddy = on-disk bitmap + preallocation descriptors 187 * in-core buddy = on-disk bitmap + preallocation descriptors
188 * 188 *
189 * this mean blocks mballoc considers used are: 189 * this mean blocks mballoc considers used are:
190 * - allocated blocks (persistent) 190 * - allocated blocks (persistent)
191 * - preallocated blocks (non-persistent) 191 * - preallocated blocks (non-persistent)
192 * 192 *
193 * consistency in mballoc world means that at any time a block is either 193 * consistency in mballoc world means that at any time a block is either
194 * free or used in ALL structures. notice: "any time" should not be read 194 * free or used in ALL structures. notice: "any time" should not be read
195 * literally -- time is discrete and delimited by locks. 195 * literally -- time is discrete and delimited by locks.
196 * 196 *
197 * to keep it simple, we don't use block numbers, instead we count number of 197 * to keep it simple, we don't use block numbers, instead we count number of
198 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. 198 * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
199 * 199 *
200 * all operations can be expressed as: 200 * all operations can be expressed as:
201 * - init buddy: buddy = on-disk + PAs 201 * - init buddy: buddy = on-disk + PAs
202 * - new PA: buddy += N; PA = N 202 * - new PA: buddy += N; PA = N
203 * - use inode PA: on-disk += N; PA -= N 203 * - use inode PA: on-disk += N; PA -= N
204 * - discard inode PA buddy -= on-disk - PA; PA = 0 204 * - discard inode PA buddy -= on-disk - PA; PA = 0
205 * - use locality group PA on-disk += N; PA -= N 205 * - use locality group PA on-disk += N; PA -= N
206 * - discard locality group PA buddy -= PA; PA = 0 206 * - discard locality group PA buddy -= PA; PA = 0
207 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap 207 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
208 * is used in real operation because we can't know actual used 208 * is used in real operation because we can't know actual used
209 * bits from PA, only from on-disk bitmap 209 * bits from PA, only from on-disk bitmap
210 * 210 *
211 * if we follow this strict logic, then all operations above should be atomic. 211 * if we follow this strict logic, then all operations above should be atomic.
212 * given some of them can block, we'd have to use something like semaphores 212 * given some of them can block, we'd have to use something like semaphores
213 * killing performance on high-end SMP hardware. let's try to relax it using 213 * killing performance on high-end SMP hardware. let's try to relax it using
214 * the following knowledge: 214 * the following knowledge:
215 * 1) if buddy is referenced, it's already initialized 215 * 1) if buddy is referenced, it's already initialized
216 * 2) while block is used in buddy and the buddy is referenced, 216 * 2) while block is used in buddy and the buddy is referenced,
217 * nobody can re-allocate that block 217 * nobody can re-allocate that block
218 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has 218 * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
219 * bit set and PA claims same block, it's OK. IOW, one can set bit in 219 * bit set and PA claims same block, it's OK. IOW, one can set bit in
220 * on-disk bitmap if buddy has same bit set or/and PA covers corresponded 220 * on-disk bitmap if buddy has same bit set or/and PA covers corresponded
221 * block 221 * block
222 * 222 *
223 * so, now we're building a concurrency table: 223 * so, now we're building a concurrency table:
224 * - init buddy vs. 224 * - init buddy vs.
225 * - new PA 225 * - new PA
226 * blocks for PA are allocated in the buddy, buddy must be referenced 226 * blocks for PA are allocated in the buddy, buddy must be referenced
227 * until PA is linked to allocation group to avoid concurrent buddy init 227 * until PA is linked to allocation group to avoid concurrent buddy init
228 * - use inode PA 228 * - use inode PA
229 * we need to make sure that either on-disk bitmap or PA has uptodate data 229 * we need to make sure that either on-disk bitmap or PA has uptodate data
230 * given (3) we care that PA-=N operation doesn't interfere with init 230 * given (3) we care that PA-=N operation doesn't interfere with init
231 * - discard inode PA 231 * - discard inode PA
232 * the simplest way would be to have buddy initialized by the discard 232 * the simplest way would be to have buddy initialized by the discard
233 * - use locality group PA 233 * - use locality group PA
234 * again PA-=N must be serialized with init 234 * again PA-=N must be serialized with init
235 * - discard locality group PA 235 * - discard locality group PA
236 * the simplest way would be to have buddy initialized by the discard 236 * the simplest way would be to have buddy initialized by the discard
237 * - new PA vs. 237 * - new PA vs.
238 * - use inode PA 238 * - use inode PA
239 * i_data_sem serializes them 239 * i_data_sem serializes them
240 * - discard inode PA 240 * - discard inode PA
241 * discard process must wait until PA isn't used by another process 241 * discard process must wait until PA isn't used by another process
242 * - use locality group PA 242 * - use locality group PA
243 * some mutex should serialize them 243 * some mutex should serialize them
244 * - discard locality group PA 244 * - discard locality group PA
245 * discard process must wait until PA isn't used by another process 245 * discard process must wait until PA isn't used by another process
246 * - use inode PA 246 * - use inode PA
247 * - use inode PA 247 * - use inode PA
248 * i_data_sem or another mutex should serializes them 248 * i_data_sem or another mutex should serializes them
249 * - discard inode PA 249 * - discard inode PA
250 * discard process must wait until PA isn't used by another process 250 * discard process must wait until PA isn't used by another process
251 * - use locality group PA 251 * - use locality group PA
252 * nothing wrong here -- they're different PAs covering different blocks 252 * nothing wrong here -- they're different PAs covering different blocks
253 * - discard locality group PA 253 * - discard locality group PA
254 * discard process must wait until PA isn't used by another process 254 * discard process must wait until PA isn't used by another process
255 * 255 *
256 * now we're ready to draw a few conclusions: 256 * now we're ready to draw a few conclusions:
257 * - while a PA is referenced, no discard is possible 257 * - while a PA is referenced, no discard is possible
258 * - a PA stays referenced until the block is marked in the on-disk bitmap 258 * - a PA stays referenced until the block is marked in the on-disk bitmap
259 * - a PA changes only after the on-disk bitmap has been updated 259 * - a PA changes only after the on-disk bitmap has been updated
260 * - discard must not compete with init. either init is done before 260 * - discard must not compete with init. either init is done before
261 * any discard or they're serialized somehow 261 * any discard or they're serialized somehow
262 * - buddy init as sum of on-disk bitmap and PAs is done atomically 262 * - buddy init as sum of on-disk bitmap and PAs is done atomically
263 * 263 *
264 * a special case is when a PA has been consumed completely: no need to modify 264 * a special case is when a PA has been consumed completely: no need to modify
265 * the buddy in this case, but we still have to care about concurrent init 265 * the buddy in this case, but we still have to care about concurrent init
266 * 266 *
267 */ 267 */
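The consequences above amount to a reference-counting discipline on a PA: take a reference before consuming blocks from it, and only discard once the PA has been marked deleted and the last reference is gone. Below is a minimal userspace sketch of that rule using C11 atomics; the names (struct pa, count, deleted, try_use_pa, discard_pa) are illustrative assumptions for this example, not the kernel's API.

/*
 * Illustrative model of the "use vs. discard" rule: a user may only
 * consume from a PA while holding a reference, and discard may only
 * complete once the PA is marked deleted and no references remain.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pa {
	atomic_int  count;	/* references held by users of this PA */
	atomic_bool deleted;	/* set once discard has claimed the PA */
};

/* Take a reference before allocating from the PA. */
static bool try_use_pa(struct pa *pa)
{
	if (atomic_load(&pa->deleted))
		return false;			/* discard already claimed it */
	atomic_fetch_add(&pa->count, 1);
	if (atomic_load(&pa->deleted)) {	/* lost a race with discard */
		atomic_fetch_sub(&pa->count, 1);
		return false;
	}
	return true;
}

static void put_pa(struct pa *pa)
{
	atomic_fetch_sub(&pa->count, 1);
}

/* Mark the PA deleted; it may be freed only once count has dropped to 0. */
static bool discard_pa(struct pa *pa)
{
	atomic_store(&pa->deleted, true);
	return atomic_load(&pa->count) == 0;
}

int main(void)
{
	struct pa pa = { .count = 0, .deleted = false };

	if (try_use_pa(&pa))
		printf("using PA, refs=%d\n", atomic_load(&pa.count));
	printf("discard can complete now? %s\n", discard_pa(&pa) ? "yes" : "no");
	put_pa(&pa);
	printf("discard can complete after put? %s\n",
	       discard_pa(&pa) ? "yes" : "no");
	return 0;
}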
268 268
269 /* 269 /*
270 * Logic in few words: 270 * Logic in few words:
271 * 271 *
272 * - allocation: 272 * - allocation:
273 * load group 273 * load group
274 * find blocks 274 * find blocks
275 * mark bits in on-disk bitmap 275 * mark bits in on-disk bitmap
276 * release group 276 * release group
277 * 277 *
278 * - use preallocation: 278 * - use preallocation:
279 * find proper PA (per-inode or group) 279 * find proper PA (per-inode or group)
280 * load group 280 * load group
281 * mark bits in on-disk bitmap 281 * mark bits in on-disk bitmap
282 * release group 282 * release group
283 * release PA 283 * release PA
284 * 284 *
285 * - free: 285 * - free:
286 * load group 286 * load group
287 * mark bits in on-disk bitmap 287 * mark bits in on-disk bitmap
288 * release group 288 * release group
289 * 289 *
290 * - discard preallocations in group: 290 * - discard preallocations in group:
291 * mark PAs deleted 291 * mark PAs deleted
292 * move them onto local list 292 * move them onto local list
293 * load on-disk bitmap 293 * load on-disk bitmap
294 * load group 294 * load group
295 * remove PA from object (inode or locality group) 295 * remove PA from object (inode or locality group)
296 * mark free blocks in-core 296 * mark free blocks in-core
297 * 297 *
298 * - discard inode's preallocations: 298 * - discard inode's preallocations:
299 */ 299 */
300 300
301 /* 301 /*
302 * Locking rules 302 * Locking rules
303 * 303 *
304 * Locks: 304 * Locks:
305 * - bitlock on a group (group) 305 * - bitlock on a group (group)
306 * - object (inode/locality) (object) 306 * - object (inode/locality) (object)
307 * - per-pa lock (pa) 307 * - per-pa lock (pa)
308 * 308 *
309 * Paths: 309 * Paths:
310 * - new pa 310 * - new pa
311 * object 311 * object
312 * group 312 * group
313 * 313 *
314 * - find and use pa: 314 * - find and use pa:
315 * pa 315 * pa
316 * 316 *
317 * - release consumed pa: 317 * - release consumed pa:
318 * pa 318 * pa
319 * group 319 * group
320 * object 320 * object
321 * 321 *
322 * - generate in-core bitmap: 322 * - generate in-core bitmap:
323 * group 323 * group
324 * pa 324 * pa
325 * 325 *
326 * - discard all for given object (inode, locality group): 326 * - discard all for given object (inode, locality group):
327 * object 327 * object
328 * pa 328 * pa
329 * group 329 * group
330 * 330 *
331 * - discard all for given group: 331 * - discard all for given group:
332 * group 332 * group
333 * pa 333 * pa
334 * group 334 * group
335 * object 335 * object
336 * 336 *
337 */ 337 */
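As one concrete reading of the orderings above, the "discard all for given object" path takes the object lock first, marks each PA deleted under its pa lock, and only afterwards takes the group lock to return the blocks. The pthread-based sketch below models just that nesting for a single path; the mutexes are simplified stand-ins for the group bitlock, object lock and per-pa lock described above, not the actual primitives.

/* Simplified model of the object -> pa -> group ordering used by the
 * "discard all for given object" path.  Single-threaded demo; a second
 * thread following the same order on this path could not deadlock. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t object_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t pa_lock     = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t group_lock  = PTHREAD_MUTEX_INITIALIZER;

static void discard_all_for_object(void)
{
	pthread_mutex_lock(&object_lock);	/* walk the object's PA list */
	pthread_mutex_lock(&pa_lock);		/* mark a PA deleted */
	pthread_mutex_unlock(&pa_lock);
	pthread_mutex_unlock(&object_lock);

	pthread_mutex_lock(&group_lock);	/* return the PA's blocks */
	pthread_mutex_unlock(&group_lock);
	puts("discard-for-object: object, then pa, then group");
}

int main(void)
{
	discard_all_for_object();
	return 0;
}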
338 static struct kmem_cache *ext4_pspace_cachep; 338 static struct kmem_cache *ext4_pspace_cachep;
339 static struct kmem_cache *ext4_ac_cachep; 339 static struct kmem_cache *ext4_ac_cachep;
340 static struct kmem_cache *ext4_free_ext_cachep; 340 static struct kmem_cache *ext4_free_ext_cachep;
341 341
342 /* We create slab caches for groupinfo data structures based on the 342 /* We create slab caches for groupinfo data structures based on the
343 * superblock block size. There is one such cache for each 343 * superblock block size. There is one such cache for each
344 * unique s_blocksize_bits */ 344 * unique s_blocksize_bits */
345 #define NR_GRPINFO_CACHES 8 345 #define NR_GRPINFO_CACHES 8
346 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; 346 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
347 347
348 static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { 348 static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", 349 "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", 350 "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
351 "ext4_groupinfo_64k", "ext4_groupinfo_128k" 351 "ext4_groupinfo_64k", "ext4_groupinfo_128k"
352 }; 352 };
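Which of these caches a mounted filesystem ends up using is a pure function of its block size. Assuming slot 0 corresponds to 1 KiB blocks, i.e. index = s_blocksize_bits - 10 (an assumption made for this illustration), the table maps out as in the standalone snippet below.

/* Illustrative mapping from block size to groupinfo slab-cache name.
 * The "bits - 10" indexing is an assumption made for this example. */
#include <stdio.h>

#define NR_GRPINFO_CACHES 8

static const char *names[NR_GRPINFO_CACHES] = {
	"ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
	"ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
	"ext4_groupinfo_64k", "ext4_groupinfo_128k"
};

int main(void)
{
	for (int bits = 10; bits < 10 + NR_GRPINFO_CACHES; bits++)
		printf("blocksize %6d -> %s\n", 1 << bits, names[bits - 10]);
	return 0;
}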
353 353
354 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 354 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
355 ext4_group_t group); 355 ext4_group_t group);
356 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 356 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
357 ext4_group_t group); 357 ext4_group_t group);
358 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); 358 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
359 359
360 static inline void *mb_correct_addr_and_bit(int *bit, void *addr) 360 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
361 { 361 {
362 #if BITS_PER_LONG == 64 362 #if BITS_PER_LONG == 64
363 *bit += ((unsigned long) addr & 7UL) << 3; 363 *bit += ((unsigned long) addr & 7UL) << 3;
364 addr = (void *) ((unsigned long) addr & ~7UL); 364 addr = (void *) ((unsigned long) addr & ~7UL);
365 #elif BITS_PER_LONG == 32 365 #elif BITS_PER_LONG == 32
366 *bit += ((unsigned long) addr & 3UL) << 3; 366 *bit += ((unsigned long) addr & 3UL) << 3;
367 addr = (void *) ((unsigned long) addr & ~3UL); 367 addr = (void *) ((unsigned long) addr & ~3UL);
368 #else 368 #else
369 #error "how many bits you are?!" 369 #error "how many bits you are?!"
370 #endif 370 #endif
371 return addr; 371 return addr;
372 } 372 }
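In other words, any byte misalignment of the buffer pointer is folded into the bit index and the pointer is rounded down to a long boundary, so the overall bit position is unchanged. A userspace copy of the 64-bit branch, run on a deliberately misaligned pointer, makes the arithmetic visible (this is a standalone demo, not the kernel helper itself).

/* Demo of the 64-bit address/bit correction: 3 stray bytes become 24
 * extra bits, so bit 5 at buf+3 is reported as bit 29 at buf. */
#include <stdio.h>

static void *correct_addr_and_bit(int *bit, void *addr)
{
	*bit += ((unsigned long) addr & 7UL) << 3;	/* stray bytes -> bits */
	return (void *) ((unsigned long) addr & ~7UL);	/* align down to 8 */
}

int main(void)
{
	unsigned char buf[16] __attribute__((aligned(8)));
	void *addr = buf + 3;			/* misaligned by 3 bytes */
	int bit = 5;
	void *aligned = correct_addr_and_bit(&bit, addr);

	printf("pointer moved back %ld bytes, bit index is now %d\n",
	       (long) ((char *) addr - (char *) aligned), bit);
	return 0;
}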
373 373
374 static inline int mb_test_bit(int bit, void *addr) 374 static inline int mb_test_bit(int bit, void *addr)
375 { 375 {
376 /* 376 /*
377 * ext4_test_bit on architectures like powerpc 377 * ext4_test_bit on architectures like powerpc
378 * needs an unsigned long aligned address 378 * needs an unsigned long aligned address
379 */ 379 */
380 addr = mb_correct_addr_and_bit(&bit, addr); 380 addr = mb_correct_addr_and_bit(&bit, addr);
381 return ext4_test_bit(bit, addr); 381 return ext4_test_bit(bit, addr);
382 } 382 }
383 383
384 static inline void mb_set_bit(int bit, void *addr) 384 static inline void mb_set_bit(int bit, void *addr)
385 { 385 {
386 addr = mb_correct_addr_and_bit(&bit, addr); 386 addr = mb_correct_addr_and_bit(&bit, addr);
387 ext4_set_bit(bit, addr); 387 ext4_set_bit(bit, addr);
388 } 388 }
389 389
390 static inline void mb_clear_bit(int bit, void *addr) 390 static inline void mb_clear_bit(int bit, void *addr)
391 { 391 {
392 addr = mb_correct_addr_and_bit(&bit, addr); 392 addr = mb_correct_addr_and_bit(&bit, addr);
393 ext4_clear_bit(bit, addr); 393 ext4_clear_bit(bit, addr);
394 } 394 }
395 395
396 static inline int mb_find_next_zero_bit(void *addr, int max, int start) 396 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
397 { 397 {
398 int fix = 0, ret, tmpmax; 398 int fix = 0, ret, tmpmax;
399 addr = mb_correct_addr_and_bit(&fix, addr); 399 addr = mb_correct_addr_and_bit(&fix, addr);
400 tmpmax = max + fix; 400 tmpmax = max + fix;
401 start += fix; 401 start += fix;
402 402
403 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; 403 ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
404 if (ret > max) 404 if (ret > max)
405 return max; 405 return max;
406 return ret; 406 return ret;
407 } 407 }
408 408
409 static inline int mb_find_next_bit(void *addr, int max, int start) 409 static inline int mb_find_next_bit(void *addr, int max, int start)
410 { 410 {
411 int fix = 0, ret, tmpmax; 411 int fix = 0, ret, tmpmax;
412 addr = mb_correct_addr_and_bit(&fix, addr); 412 addr = mb_correct_addr_and_bit(&fix, addr);
413 tmpmax = max + fix; 413 tmpmax = max + fix;
414 start += fix; 414 start += fix;
415 415
416 ret = ext4_find_next_bit(addr, tmpmax, start) - fix; 416 ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
417 if (ret > max) 417 if (ret > max)
418 return max; 418 return max;
419 return ret; 419 return ret;
420 } 420 }
421 421
422 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) 422 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
423 { 423 {
424 char *bb; 424 char *bb;
425 425
426 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 426 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
427 BUG_ON(max == NULL); 427 BUG_ON(max == NULL);
428 428
429 if (order > e4b->bd_blkbits + 1) { 429 if (order > e4b->bd_blkbits + 1) {
430 *max = 0; 430 *max = 0;
431 return NULL; 431 return NULL;
432 } 432 }
433 433
434 /* at order 0 we see each particular block */ 434 /* at order 0 we see each particular block */
435 if (order == 0) { 435 if (order == 0) {
436 *max = 1 << (e4b->bd_blkbits + 3); 436 *max = 1 << (e4b->bd_blkbits + 3);
437 return EXT4_MB_BITMAP(e4b); 437 return EXT4_MB_BITMAP(e4b);
438 } 438 }
439 439
440 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; 440 bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
441 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; 441 *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
442 442
443 return bb; 443 return bb;
444 } 444 }
445 445
446 #ifdef DOUBLE_CHECK 446 #ifdef DOUBLE_CHECK
447 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, 447 static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
448 int first, int count) 448 int first, int count)
449 { 449 {
450 int i; 450 int i;
451 struct super_block *sb = e4b->bd_sb; 451 struct super_block *sb = e4b->bd_sb;
452 452
453 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 453 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
454 return; 454 return;
455 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 455 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
456 for (i = 0; i < count; i++) { 456 for (i = 0; i < count; i++) {
457 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { 457 if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
458 ext4_fsblk_t blocknr; 458 ext4_fsblk_t blocknr;
459 459
460 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 460 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
461 blocknr += first + i; 461 blocknr += first + i;
462 ext4_grp_locked_error(sb, e4b->bd_group, 462 ext4_grp_locked_error(sb, e4b->bd_group,
463 inode ? inode->i_ino : 0, 463 inode ? inode->i_ino : 0,
464 blocknr, 464 blocknr,
465 "freeing block already freed " 465 "freeing block already freed "
466 "(bit %u)", 466 "(bit %u)",
467 first + i); 467 first + i);
468 } 468 }
469 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); 469 mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
470 } 470 }
471 } 471 }
472 472
473 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) 473 static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
474 { 474 {
475 int i; 475 int i;
476 476
477 if (unlikely(e4b->bd_info->bb_bitmap == NULL)) 477 if (unlikely(e4b->bd_info->bb_bitmap == NULL))
478 return; 478 return;
479 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 479 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
480 for (i = 0; i < count; i++) { 480 for (i = 0; i < count; i++) {
481 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); 481 BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
482 mb_set_bit(first + i, e4b->bd_info->bb_bitmap); 482 mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
483 } 483 }
484 } 484 }
485 485
486 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 486 static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
487 { 487 {
488 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { 488 if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
489 unsigned char *b1, *b2; 489 unsigned char *b1, *b2;
490 int i; 490 int i;
491 b1 = (unsigned char *) e4b->bd_info->bb_bitmap; 491 b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
492 b2 = (unsigned char *) bitmap; 492 b2 = (unsigned char *) bitmap;
493 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { 493 for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
494 if (b1[i] != b2[i]) { 494 if (b1[i] != b2[i]) {
495 printk(KERN_ERR "corruption in group %u " 495 printk(KERN_ERR "corruption in group %u "
496 "at byte %u(%u): %x in copy != %x " 496 "at byte %u(%u): %x in copy != %x "
497 "on disk/prealloc\n", 497 "on disk/prealloc\n",
498 e4b->bd_group, i, i * 8, b1[i], b2[i]); 498 e4b->bd_group, i, i * 8, b1[i], b2[i]);
499 BUG(); 499 BUG();
500 } 500 }
501 } 501 }
502 } 502 }
503 } 503 }
504 504
505 #else 505 #else
506 static inline void mb_free_blocks_double(struct inode *inode, 506 static inline void mb_free_blocks_double(struct inode *inode,
507 struct ext4_buddy *e4b, int first, int count) 507 struct ext4_buddy *e4b, int first, int count)
508 { 508 {
509 return; 509 return;
510 } 510 }
511 static inline void mb_mark_used_double(struct ext4_buddy *e4b, 511 static inline void mb_mark_used_double(struct ext4_buddy *e4b,
512 int first, int count) 512 int first, int count)
513 { 513 {
514 return; 514 return;
515 } 515 }
516 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) 516 static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
517 { 517 {
518 return; 518 return;
519 } 519 }
520 #endif 520 #endif
521 521
522 #ifdef AGGRESSIVE_CHECK 522 #ifdef AGGRESSIVE_CHECK
523 523
524 #define MB_CHECK_ASSERT(assert) \ 524 #define MB_CHECK_ASSERT(assert) \
525 do { \ 525 do { \
526 if (!(assert)) { \ 526 if (!(assert)) { \
527 printk(KERN_EMERG \ 527 printk(KERN_EMERG \
528 "Assertion failure in %s() at %s:%d: \"%s\"\n", \ 528 "Assertion failure in %s() at %s:%d: \"%s\"\n", \
529 function, file, line, # assert); \ 529 function, file, line, # assert); \
530 BUG(); \ 530 BUG(); \
531 } \ 531 } \
532 } while (0) 532 } while (0)
533 533
534 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, 534 static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
535 const char *function, int line) 535 const char *function, int line)
536 { 536 {
537 struct super_block *sb = e4b->bd_sb; 537 struct super_block *sb = e4b->bd_sb;
538 int order = e4b->bd_blkbits + 1; 538 int order = e4b->bd_blkbits + 1;
539 int max; 539 int max;
540 int max2; 540 int max2;
541 int i; 541 int i;
542 int j; 542 int j;
543 int k; 543 int k;
544 int count; 544 int count;
545 struct ext4_group_info *grp; 545 struct ext4_group_info *grp;
546 int fragments = 0; 546 int fragments = 0;
547 int fstart; 547 int fstart;
548 struct list_head *cur; 548 struct list_head *cur;
549 void *buddy; 549 void *buddy;
550 void *buddy2; 550 void *buddy2;
551 551
552 { 552 {
553 static int mb_check_counter; 553 static int mb_check_counter;
554 if (mb_check_counter++ % 100 != 0) 554 if (mb_check_counter++ % 100 != 0)
555 return 0; 555 return 0;
556 } 556 }
557 557
558 while (order > 1) { 558 while (order > 1) {
559 buddy = mb_find_buddy(e4b, order, &max); 559 buddy = mb_find_buddy(e4b, order, &max);
560 MB_CHECK_ASSERT(buddy); 560 MB_CHECK_ASSERT(buddy);
561 buddy2 = mb_find_buddy(e4b, order - 1, &max2); 561 buddy2 = mb_find_buddy(e4b, order - 1, &max2);
562 MB_CHECK_ASSERT(buddy2); 562 MB_CHECK_ASSERT(buddy2);
563 MB_CHECK_ASSERT(buddy != buddy2); 563 MB_CHECK_ASSERT(buddy != buddy2);
564 MB_CHECK_ASSERT(max * 2 == max2); 564 MB_CHECK_ASSERT(max * 2 == max2);
565 565
566 count = 0; 566 count = 0;
567 for (i = 0; i < max; i++) { 567 for (i = 0; i < max; i++) {
568 568
569 if (mb_test_bit(i, buddy)) { 569 if (mb_test_bit(i, buddy)) {
570 /* only single bit in buddy2 may be 1 */ 570 /* only single bit in buddy2 may be 1 */
571 if (!mb_test_bit(i << 1, buddy2)) { 571 if (!mb_test_bit(i << 1, buddy2)) {
572 MB_CHECK_ASSERT( 572 MB_CHECK_ASSERT(
573 mb_test_bit((i<<1)+1, buddy2)); 573 mb_test_bit((i<<1)+1, buddy2));
574 } else if (!mb_test_bit((i << 1) + 1, buddy2)) { 574 } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
575 MB_CHECK_ASSERT( 575 MB_CHECK_ASSERT(
576 mb_test_bit(i << 1, buddy2)); 576 mb_test_bit(i << 1, buddy2));
577 } 577 }
578 continue; 578 continue;
579 } 579 }
580 580
581 /* both bits in buddy2 must be 0 */ 581 /* both bits in buddy2 must be 0 */
582 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 582 MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
583 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 583 MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
584 584
585 for (j = 0; j < (1 << order); j++) { 585 for (j = 0; j < (1 << order); j++) {
586 k = (i * (1 << order)) + j; 586 k = (i * (1 << order)) + j;
587 MB_CHECK_ASSERT( 587 MB_CHECK_ASSERT(
588 !mb_test_bit(k, EXT4_MB_BITMAP(e4b))); 588 !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
589 } 589 }
590 count++; 590 count++;
591 } 591 }
592 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); 592 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
593 order--; 593 order--;
594 } 594 }
595 595
596 fstart = -1; 596 fstart = -1;
597 buddy = mb_find_buddy(e4b, 0, &max); 597 buddy = mb_find_buddy(e4b, 0, &max);
598 for (i = 0; i < max; i++) { 598 for (i = 0; i < max; i++) {
599 if (!mb_test_bit(i, buddy)) { 599 if (!mb_test_bit(i, buddy)) {
600 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); 600 MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
601 if (fstart == -1) { 601 if (fstart == -1) {
602 fragments++; 602 fragments++;
603 fstart = i; 603 fstart = i;
604 } 604 }
605 continue; 605 continue;
606 } 606 }
607 fstart = -1; 607 fstart = -1;
608 /* check used bits only */ 608 /* check used bits only */
609 for (j = 0; j < e4b->bd_blkbits + 1; j++) { 609 for (j = 0; j < e4b->bd_blkbits + 1; j++) {
610 buddy2 = mb_find_buddy(e4b, j, &max2); 610 buddy2 = mb_find_buddy(e4b, j, &max2);
611 k = i >> j; 611 k = i >> j;
612 MB_CHECK_ASSERT(k < max2); 612 MB_CHECK_ASSERT(k < max2);
613 MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 613 MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
614 } 614 }
615 } 615 }
616 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); 616 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
617 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); 617 MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
618 618
619 grp = ext4_get_group_info(sb, e4b->bd_group); 619 grp = ext4_get_group_info(sb, e4b->bd_group);
620 list_for_each(cur, &grp->bb_prealloc_list) { 620 list_for_each(cur, &grp->bb_prealloc_list) {
621 ext4_group_t groupnr; 621 ext4_group_t groupnr;
622 struct ext4_prealloc_space *pa; 622 struct ext4_prealloc_space *pa;
623 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 623 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
624 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 624 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
625 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 625 MB_CHECK_ASSERT(groupnr == e4b->bd_group);
626 for (i = 0; i < pa->pa_len; i++) 626 for (i = 0; i < pa->pa_len; i++)
627 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); 627 MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
628 } 628 }
629 return 0; 629 return 0;
630 } 630 }
631 #undef MB_CHECK_ASSERT 631 #undef MB_CHECK_ASSERT
632 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ 632 #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
633 __FILE__, __func__, __LINE__) 633 __FILE__, __func__, __LINE__)
634 #else 634 #else
635 #define mb_check_buddy(e4b) 635 #define mb_check_buddy(e4b)
636 #endif 636 #endif
637 637
638 /* 638 /*
639 * Divide the blocks starting at @first with length @len into 639 * Divide the blocks starting at @first with length @len into
640 * smaller chunks whose sizes are powers of 2 blocks. 640 * smaller chunks whose sizes are powers of 2 blocks.
641 * Clear the bits in the buddy bitmap that the blocks of each chunk cover, 641 * Clear the bits in the buddy bitmap that the blocks of each chunk cover,
642 * then increase bb_counters[] for the corresponding chunk size. 642 * then increase bb_counters[] for the corresponding chunk size.
643 */ 643 */
644 static void ext4_mb_mark_free_simple(struct super_block *sb, 644 static void ext4_mb_mark_free_simple(struct super_block *sb,
645 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, 645 void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
646 struct ext4_group_info *grp) 646 struct ext4_group_info *grp)
647 { 647 {
648 struct ext4_sb_info *sbi = EXT4_SB(sb); 648 struct ext4_sb_info *sbi = EXT4_SB(sb);
649 ext4_grpblk_t min; 649 ext4_grpblk_t min;
650 ext4_grpblk_t max; 650 ext4_grpblk_t max;
651 ext4_grpblk_t chunk; 651 ext4_grpblk_t chunk;
652 unsigned short border; 652 unsigned short border;
653 653
654 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); 654 BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
655 655
656 border = 2 << sb->s_blocksize_bits; 656 border = 2 << sb->s_blocksize_bits;
657 657
658 while (len > 0) { 658 while (len > 0) {
659 /* find how many blocks can be covered since this position */ 659 /* find how many blocks can be covered since this position */
660 max = ffs(first | border) - 1; 660 max = ffs(first | border) - 1;
661 661
662 /* find how many blocks of power 2 we need to mark */ 662 /* find how many blocks of power 2 we need to mark */
663 min = fls(len) - 1; 663 min = fls(len) - 1;
664 664
665 if (max < min) 665 if (max < min)
666 min = max; 666 min = max;
667 chunk = 1 << min; 667 chunk = 1 << min;
668 668
669 /* mark multiblock chunks only */ 669 /* mark multiblock chunks only */
670 grp->bb_counters[min]++; 670 grp->bb_counters[min]++;
671 if (min > 0) 671 if (min > 0)
672 mb_clear_bit(first >> min, 672 mb_clear_bit(first >> min,
673 buddy + sbi->s_mb_offsets[min]); 673 buddy + sbi->s_mb_offsets[min]);
674 674
675 len -= chunk; 675 len -= chunk;
676 first += chunk; 676 first += chunk;
677 } 677 }
678 } 678 }
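The loop repeatedly carves off the largest power-of-two chunk that is both aligned at the current position (the ffs(first | border) limit) and no larger than the remaining length (the fls(len) limit). The standalone program below reproduces only the carving arithmetic, with the bitmap and counter updates left out; for first=5 and len=13 it yields chunks of 1, 2, 8 and 2 blocks at orders 0, 1, 3 and 1.

/* Userspace reproduction of the power-of-two carving loop above. */
#include <stdio.h>
#include <strings.h>	/* ffs() */

int main(void)
{
	int first = 5, len = 13;
	int border = 2 << 12;	/* 2 << s_blocksize_bits for 4 KiB blocks */

	while (len > 0) {
		int max = ffs(first | border) - 1;	/* alignment limit */
		int min = 31 - __builtin_clz(len);	/* fls(len) - 1: size limit */
		if (max < min)
			min = max;
		int chunk = 1 << min;

		printf("chunk of %2d blocks at %2d (order %d)\n", chunk, first, min);
		len -= chunk;
		first += chunk;
	}
	return 0;
}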
679 679
680 /* 680 /*
681 * Cache the order of the largest free extent we have available in this block 681 * Cache the order of the largest free extent we have available in this block
682 * group. 682 * group.
683 */ 683 */
684 static void 684 static void
685 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) 685 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
686 { 686 {
687 int i; 687 int i;
688 int bits; 688 int bits;
689 689
690 grp->bb_largest_free_order = -1; /* uninit */ 690 grp->bb_largest_free_order = -1; /* uninit */
691 691
692 bits = sb->s_blocksize_bits + 1; 692 bits = sb->s_blocksize_bits + 1;
693 for (i = bits; i >= 0; i--) { 693 for (i = bits; i >= 0; i--) {
694 if (grp->bb_counters[i] > 0) { 694 if (grp->bb_counters[i] > 0) {
695 grp->bb_largest_free_order = i; 695 grp->bb_largest_free_order = i;
696 break; 696 break;
697 } 697 }
698 } 698 }
699 } 699 }
700 700
701 static noinline_for_stack 701 static noinline_for_stack
702 void ext4_mb_generate_buddy(struct super_block *sb, 702 void ext4_mb_generate_buddy(struct super_block *sb,
703 void *buddy, void *bitmap, ext4_group_t group) 703 void *buddy, void *bitmap, ext4_group_t group)
704 { 704 {
705 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 705 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
706 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); 706 ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
707 ext4_grpblk_t i = 0; 707 ext4_grpblk_t i = 0;
708 ext4_grpblk_t first; 708 ext4_grpblk_t first;
709 ext4_grpblk_t len; 709 ext4_grpblk_t len;
710 unsigned free = 0; 710 unsigned free = 0;
711 unsigned fragments = 0; 711 unsigned fragments = 0;
712 unsigned long long period = get_cycles(); 712 unsigned long long period = get_cycles();
713 713
714 /* initialize buddy from bitmap which is aggregation 714 /* initialize buddy from bitmap which is aggregation
715 * of on-disk bitmap and preallocations */ 715 * of on-disk bitmap and preallocations */
716 i = mb_find_next_zero_bit(bitmap, max, 0); 716 i = mb_find_next_zero_bit(bitmap, max, 0);
717 grp->bb_first_free = i; 717 grp->bb_first_free = i;
718 while (i < max) { 718 while (i < max) {
719 fragments++; 719 fragments++;
720 first = i; 720 first = i;
721 i = mb_find_next_bit(bitmap, max, i); 721 i = mb_find_next_bit(bitmap, max, i);
722 len = i - first; 722 len = i - first;
723 free += len; 723 free += len;
724 if (len > 1) 724 if (len > 1)
725 ext4_mb_mark_free_simple(sb, buddy, first, len, grp); 725 ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
726 else 726 else
727 grp->bb_counters[0]++; 727 grp->bb_counters[0]++;
728 if (i < max) 728 if (i < max)
729 i = mb_find_next_zero_bit(bitmap, max, i); 729 i = mb_find_next_zero_bit(bitmap, max, i);
730 } 730 }
731 grp->bb_fragments = fragments; 731 grp->bb_fragments = fragments;
732 732
733 if (free != grp->bb_free) { 733 if (free != grp->bb_free) {
734 ext4_grp_locked_error(sb, group, 0, 0, 734 ext4_grp_locked_error(sb, group, 0, 0,
735 "%u blocks in bitmap, %u in gd", 735 "%u blocks in bitmap, %u in gd",
736 free, grp->bb_free); 736 free, grp->bb_free);
737 /* 737 /*
738 * If we intend to continue, we consider the group descriptor 738 * If we intend to continue, we consider the group descriptor
739 * corrupt and update bb_free using the bitmap value 739 * corrupt and update bb_free using the bitmap value
740 */ 740 */
741 grp->bb_free = free; 741 grp->bb_free = free;
742 } 742 }
743 mb_set_largest_free_order(sb, grp); 743 mb_set_largest_free_order(sb, grp);
744 744
745 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 745 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
746 746
747 period = get_cycles() - period; 747 period = get_cycles() - period;
748 spin_lock(&EXT4_SB(sb)->s_bal_lock); 748 spin_lock(&EXT4_SB(sb)->s_bal_lock);
749 EXT4_SB(sb)->s_mb_buddies_generated++; 749 EXT4_SB(sb)->s_mb_buddies_generated++;
750 EXT4_SB(sb)->s_mb_generation_time += period; 750 EXT4_SB(sb)->s_mb_generation_time += period;
751 spin_unlock(&EXT4_SB(sb)->s_bal_lock); 751 spin_unlock(&EXT4_SB(sb)->s_bal_lock);
752 } 752 }
753 753
754 /* The buddy information is attached to the buddy cache inode 754 /* The buddy information is attached to the buddy cache inode
755 * for convenience. The information regarding each group 755 * for convenience. The information regarding each group
756 * is loaded via ext4_mb_load_buddy. It consists of the 756 * is loaded via ext4_mb_load_buddy. It consists of the
757 * block bitmap and the buddy information, which are 757 * block bitmap and the buddy information, which are
758 * stored in the inode as 758 * stored in the inode as
759 * 759 *
760 * { page } 760 * { page }
761 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 761 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
762 * 762 *
763 * 763 *
764 * one block each for bitmap and buddy information. 764 * one block each for bitmap and buddy information.
765 * So for each group we take up 2 blocks. A page can 765 * So for each group we take up 2 blocks. A page can
766 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks. 766 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
767 * So it can hold information for groups_per_page groups, which 767 * So it can hold information for groups_per_page groups, which
768 * is blocks_per_page/2 768 * is blocks_per_page/2
769 * 769 *
770 * Locking note: This routine takes the block group lock of all groups 770 * Locking note: This routine takes the block group lock of all groups
771 * for this page; do not hold this lock when calling this routine! 771 * for this page; do not hold this lock when calling this routine!
772 */ 772 */
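Since every group contributes exactly two consecutive blocks (bitmap, then buddy) to the buddy cache inode, finding a group's data is simple arithmetic: block = group * 2, page number = block / blocks_per_page, offset within the page = block % blocks_per_page, exactly as the functions below compute. A quick sketch, assuming 4 KiB pages and a 1 KiB block size:

/* Illustrative group -> (page, offset) mapping in the buddy cache inode,
 * assuming PAGE_CACHE_SIZE = 4096 and blocksize = 1024. */
#include <stdio.h>

int main(void)
{
	int page_size = 4096, blocksize = 1024;
	int blocks_per_page = page_size / blocksize;	/* 4, i.e. 2 groups/page */

	for (unsigned group = 0; group < 4; group++) {
		int bitmap_block = group * 2;
		int buddy_block  = group * 2 + 1;

		printf("group %u: bitmap page %d off %d, buddy page %d off %d\n",
		       group,
		       bitmap_block / blocks_per_page, bitmap_block % blocks_per_page,
		       buddy_block / blocks_per_page,  buddy_block % blocks_per_page);
	}
	return 0;
}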
773 773
774 static int ext4_mb_init_cache(struct page *page, char *incore) 774 static int ext4_mb_init_cache(struct page *page, char *incore)
775 { 775 {
776 ext4_group_t ngroups; 776 ext4_group_t ngroups;
777 int blocksize; 777 int blocksize;
778 int blocks_per_page; 778 int blocks_per_page;
779 int groups_per_page; 779 int groups_per_page;
780 int err = 0; 780 int err = 0;
781 int i; 781 int i;
782 ext4_group_t first_group; 782 ext4_group_t first_group;
783 int first_block; 783 int first_block;
784 struct super_block *sb; 784 struct super_block *sb;
785 struct buffer_head *bhs; 785 struct buffer_head *bhs;
786 struct buffer_head **bh; 786 struct buffer_head **bh;
787 struct inode *inode; 787 struct inode *inode;
788 char *data; 788 char *data;
789 char *bitmap; 789 char *bitmap;
790 790
791 mb_debug(1, "init page %lu\n", page->index); 791 mb_debug(1, "init page %lu\n", page->index);
792 792
793 inode = page->mapping->host; 793 inode = page->mapping->host;
794 sb = inode->i_sb; 794 sb = inode->i_sb;
795 ngroups = ext4_get_groups_count(sb); 795 ngroups = ext4_get_groups_count(sb);
796 blocksize = 1 << inode->i_blkbits; 796 blocksize = 1 << inode->i_blkbits;
797 blocks_per_page = PAGE_CACHE_SIZE / blocksize; 797 blocks_per_page = PAGE_CACHE_SIZE / blocksize;
798 798
799 groups_per_page = blocks_per_page >> 1; 799 groups_per_page = blocks_per_page >> 1;
800 if (groups_per_page == 0) 800 if (groups_per_page == 0)
801 groups_per_page = 1; 801 groups_per_page = 1;
802 802
803 /* allocate buffer_heads to read bitmaps */ 803 /* allocate buffer_heads to read bitmaps */
804 if (groups_per_page > 1) { 804 if (groups_per_page > 1) {
805 err = -ENOMEM; 805 err = -ENOMEM;
806 i = sizeof(struct buffer_head *) * groups_per_page; 806 i = sizeof(struct buffer_head *) * groups_per_page;
807 bh = kzalloc(i, GFP_NOFS); 807 bh = kzalloc(i, GFP_NOFS);
808 if (bh == NULL) 808 if (bh == NULL)
809 goto out; 809 goto out;
810 } else 810 } else
811 bh = &bhs; 811 bh = &bhs;
812 812
813 first_group = page->index * blocks_per_page / 2; 813 first_group = page->index * blocks_per_page / 2;
814 814
815 /* read all groups the page covers into the cache */ 815 /* read all groups the page covers into the cache */
816 for (i = 0; i < groups_per_page; i++) { 816 for (i = 0; i < groups_per_page; i++) {
817 struct ext4_group_desc *desc; 817 struct ext4_group_desc *desc;
818 818
819 if (first_group + i >= ngroups) 819 if (first_group + i >= ngroups)
820 break; 820 break;
821 821
822 err = -EIO; 822 err = -EIO;
823 desc = ext4_get_group_desc(sb, first_group + i, NULL); 823 desc = ext4_get_group_desc(sb, first_group + i, NULL);
824 if (desc == NULL) 824 if (desc == NULL)
825 goto out; 825 goto out;
826 826
827 err = -ENOMEM; 827 err = -ENOMEM;
828 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc)); 828 bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
829 if (bh[i] == NULL) 829 if (bh[i] == NULL)
830 goto out; 830 goto out;
831 831
832 if (bitmap_uptodate(bh[i])) 832 if (bitmap_uptodate(bh[i]))
833 continue; 833 continue;
834 834
835 lock_buffer(bh[i]); 835 lock_buffer(bh[i]);
836 if (bitmap_uptodate(bh[i])) { 836 if (bitmap_uptodate(bh[i])) {
837 unlock_buffer(bh[i]); 837 unlock_buffer(bh[i]);
838 continue; 838 continue;
839 } 839 }
840 ext4_lock_group(sb, first_group + i); 840 ext4_lock_group(sb, first_group + i);
841 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 841 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
842 ext4_init_block_bitmap(sb, bh[i], 842 ext4_init_block_bitmap(sb, bh[i],
843 first_group + i, desc); 843 first_group + i, desc);
844 set_bitmap_uptodate(bh[i]); 844 set_bitmap_uptodate(bh[i]);
845 set_buffer_uptodate(bh[i]); 845 set_buffer_uptodate(bh[i]);
846 ext4_unlock_group(sb, first_group + i); 846 ext4_unlock_group(sb, first_group + i);
847 unlock_buffer(bh[i]); 847 unlock_buffer(bh[i]);
848 continue; 848 continue;
849 } 849 }
850 ext4_unlock_group(sb, first_group + i); 850 ext4_unlock_group(sb, first_group + i);
851 if (buffer_uptodate(bh[i])) { 851 if (buffer_uptodate(bh[i])) {
852 /* 852 /*
853 * if the group is not uninit and bh is uptodate, 853 * if the group is not uninit and bh is uptodate,
854 * the bitmap is also uptodate 854 * the bitmap is also uptodate
855 */ 855 */
856 set_bitmap_uptodate(bh[i]); 856 set_bitmap_uptodate(bh[i]);
857 unlock_buffer(bh[i]); 857 unlock_buffer(bh[i]);
858 continue; 858 continue;
859 } 859 }
860 get_bh(bh[i]); 860 get_bh(bh[i]);
861 /* 861 /*
862 * submit the buffer_head for read. We can 862 * submit the buffer_head for read. We can
863 * safely mark the bitmap as uptodate now. 863 * safely mark the bitmap as uptodate now.
864 * We do it here so the bitmap uptodate bit 864 * We do it here so the bitmap uptodate bit
865 * gets set with the buffer lock held. 865 * gets set with the buffer lock held.
866 */ 866 */
867 set_bitmap_uptodate(bh[i]); 867 set_bitmap_uptodate(bh[i]);
868 bh[i]->b_end_io = end_buffer_read_sync; 868 bh[i]->b_end_io = end_buffer_read_sync;
869 submit_bh(READ, bh[i]); 869 submit_bh(READ, bh[i]);
870 mb_debug(1, "read bitmap for group %u\n", first_group + i); 870 mb_debug(1, "read bitmap for group %u\n", first_group + i);
871 } 871 }
872 872
873 /* wait for I/O completion */ 873 /* wait for I/O completion */
874 for (i = 0; i < groups_per_page && bh[i]; i++) 874 for (i = 0; i < groups_per_page && bh[i]; i++)
875 wait_on_buffer(bh[i]); 875 wait_on_buffer(bh[i]);
876 876
877 err = -EIO; 877 err = -EIO;
878 for (i = 0; i < groups_per_page && bh[i]; i++) 878 for (i = 0; i < groups_per_page && bh[i]; i++)
879 if (!buffer_uptodate(bh[i])) 879 if (!buffer_uptodate(bh[i]))
880 goto out; 880 goto out;
881 881
882 err = 0; 882 err = 0;
883 first_block = page->index * blocks_per_page; 883 first_block = page->index * blocks_per_page;
884 /* init the page */ 884 /* init the page */
885 memset(page_address(page), 0xff, PAGE_CACHE_SIZE); 885 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
886 for (i = 0; i < blocks_per_page; i++) { 886 for (i = 0; i < blocks_per_page; i++) {
887 int group; 887 int group;
888 struct ext4_group_info *grinfo; 888 struct ext4_group_info *grinfo;
889 889
890 group = (first_block + i) >> 1; 890 group = (first_block + i) >> 1;
891 if (group >= ngroups) 891 if (group >= ngroups)
892 break; 892 break;
893 893
894 /* 894 /*
895 * data carries information regarding this 895 * data carries information regarding this
896 * particular group in the format specified 896 * particular group in the format specified
897 * above 897 * above
898 * 898 *
899 */ 899 */
900 data = page_address(page) + (i * blocksize); 900 data = page_address(page) + (i * blocksize);
901 bitmap = bh[group - first_group]->b_data; 901 bitmap = bh[group - first_group]->b_data;
902 902
903 /* 903 /*
904 * We place the buddy block and bitmap block 904 * We place the buddy block and bitmap block
905 * close together 905 * close together
906 */ 906 */
907 if ((first_block + i) & 1) { 907 if ((first_block + i) & 1) {
908 /* this is block of buddy */ 908 /* this is block of buddy */
909 BUG_ON(incore == NULL); 909 BUG_ON(incore == NULL);
910 mb_debug(1, "put buddy for group %u in page %lu/%x\n", 910 mb_debug(1, "put buddy for group %u in page %lu/%x\n",
911 group, page->index, i * blocksize); 911 group, page->index, i * blocksize);
912 trace_ext4_mb_buddy_bitmap_load(sb, group); 912 trace_ext4_mb_buddy_bitmap_load(sb, group);
913 grinfo = ext4_get_group_info(sb, group); 913 grinfo = ext4_get_group_info(sb, group);
914 grinfo->bb_fragments = 0; 914 grinfo->bb_fragments = 0;
915 memset(grinfo->bb_counters, 0, 915 memset(grinfo->bb_counters, 0,
916 sizeof(*grinfo->bb_counters) * 916 sizeof(*grinfo->bb_counters) *
917 (sb->s_blocksize_bits+2)); 917 (sb->s_blocksize_bits+2));
918 /* 918 /*
919 * incore got set to the group block bitmap below 919 * incore got set to the group block bitmap below
920 */ 920 */
921 ext4_lock_group(sb, group); 921 ext4_lock_group(sb, group);
922 ext4_mb_generate_buddy(sb, data, incore, group); 922 ext4_mb_generate_buddy(sb, data, incore, group);
923 ext4_unlock_group(sb, group); 923 ext4_unlock_group(sb, group);
924 incore = NULL; 924 incore = NULL;
925 } else { 925 } else {
926 /* this is block of bitmap */ 926 /* this is block of bitmap */
927 BUG_ON(incore != NULL); 927 BUG_ON(incore != NULL);
928 mb_debug(1, "put bitmap for group %u in page %lu/%x\n", 928 mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
929 group, page->index, i * blocksize); 929 group, page->index, i * blocksize);
930 trace_ext4_mb_bitmap_load(sb, group); 930 trace_ext4_mb_bitmap_load(sb, group);
931 931
932 /* see comments in ext4_mb_put_pa() */ 932 /* see comments in ext4_mb_put_pa() */
933 ext4_lock_group(sb, group); 933 ext4_lock_group(sb, group);
934 memcpy(data, bitmap, blocksize); 934 memcpy(data, bitmap, blocksize);
935 935
936 /* mark all preallocated blks used in in-core bitmap */ 936 /* mark all preallocated blks used in in-core bitmap */
937 ext4_mb_generate_from_pa(sb, data, group); 937 ext4_mb_generate_from_pa(sb, data, group);
938 ext4_mb_generate_from_freelist(sb, data, group); 938 ext4_mb_generate_from_freelist(sb, data, group);
939 ext4_unlock_group(sb, group); 939 ext4_unlock_group(sb, group);
940 940
941 /* set incore so that the buddy information can be 941 /* set incore so that the buddy information can be
942 * generated using this 942 * generated using this
943 */ 943 */
944 incore = data; 944 incore = data;
945 } 945 }
946 } 946 }
947 SetPageUptodate(page); 947 SetPageUptodate(page);
948 948
949 out: 949 out:
950 if (bh) { 950 if (bh) {
951 for (i = 0; i < groups_per_page && bh[i]; i++) 951 for (i = 0; i < groups_per_page && bh[i]; i++)
952 brelse(bh[i]); 952 brelse(bh[i]);
953 if (bh != &bhs) 953 if (bh != &bhs)
954 kfree(bh); 954 kfree(bh);
955 } 955 }
956 return err; 956 return err;
957 } 957 }
958 958
959 /* 959 /*
960 * lock the group_info alloc_sem of all the groups 960 * lock the group_info alloc_sem of all the groups
961 * belonging to the same buddy cache page. This 961 * belonging to the same buddy cache page. This
962 * makes sure other parallel operations on the buddy 962 * makes sure other parallel operations on the buddy
963 * cache don't happen while holding the buddy cache 963 * cache don't happen while holding the buddy cache
964 * lock 964 * lock
965 */ 965 */
966 static int ext4_mb_get_buddy_cache_lock(struct super_block *sb, 966 static int ext4_mb_get_buddy_cache_lock(struct super_block *sb,
967 ext4_group_t group) 967 ext4_group_t group)
968 { 968 {
969 int i; 969 int i;
970 int block, pnum; 970 int block, pnum;
971 int blocks_per_page; 971 int blocks_per_page;
972 int groups_per_page; 972 int groups_per_page;
973 ext4_group_t ngroups = ext4_get_groups_count(sb); 973 ext4_group_t ngroups = ext4_get_groups_count(sb);
974 ext4_group_t first_group; 974 ext4_group_t first_group;
975 struct ext4_group_info *grp; 975 struct ext4_group_info *grp;
976 976
977 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 977 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
978 /* 978 /*
979 * the buddy cache inode stores the block bitmap 979 * the buddy cache inode stores the block bitmap
980 * and buddy information in consecutive blocks. 980 * and buddy information in consecutive blocks.
981 * So for each group we need two blocks. 981 * So for each group we need two blocks.
982 */ 982 */
983 block = group * 2; 983 block = group * 2;
984 pnum = block / blocks_per_page; 984 pnum = block / blocks_per_page;
985 first_group = pnum * blocks_per_page / 2; 985 first_group = pnum * blocks_per_page / 2;
986 986
987 groups_per_page = blocks_per_page >> 1; 987 groups_per_page = blocks_per_page >> 1;
988 if (groups_per_page == 0) 988 if (groups_per_page == 0)
989 groups_per_page = 1; 989 groups_per_page = 1;
990 /* read all groups the page covers into the cache */ 990 /* read all groups the page covers into the cache */
991 for (i = 0; i < groups_per_page; i++) { 991 for (i = 0; i < groups_per_page; i++) {
992 992
993 if ((first_group + i) >= ngroups) 993 if ((first_group + i) >= ngroups)
994 break; 994 break;
995 grp = ext4_get_group_info(sb, first_group + i); 995 grp = ext4_get_group_info(sb, first_group + i);
996 /* take every group's write allocation 996 /* take every group's write allocation
997 * semaphore. This makes sure there is 997 * semaphore. This makes sure there is
998 * no block allocation going on in any 998 * no block allocation going on in any
999 * of those groups 999 * of those groups
1000 */ 1000 */
1001 down_write_nested(&grp->alloc_sem, i); 1001 down_write_nested(&grp->alloc_sem, i);
1002 } 1002 }
1003 return i; 1003 return i;
1004 } 1004 }
1005 1005
1006 static void ext4_mb_put_buddy_cache_lock(struct super_block *sb, 1006 static void ext4_mb_put_buddy_cache_lock(struct super_block *sb,
1007 ext4_group_t group, int locked_group) 1007 ext4_group_t group, int locked_group)
1008 { 1008 {
1009 int i; 1009 int i;
1010 int block, pnum; 1010 int block, pnum;
1011 int blocks_per_page; 1011 int blocks_per_page;
1012 ext4_group_t first_group; 1012 ext4_group_t first_group;
1013 struct ext4_group_info *grp; 1013 struct ext4_group_info *grp;
1014 1014
1015 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1015 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1016 /* 1016 /*
1017 * the buddy cache inode stores the block bitmap 1017 * the buddy cache inode stores the block bitmap
1018 * and buddy information in consecutive blocks. 1018 * and buddy information in consecutive blocks.
1019 * So for each group we need two blocks. 1019 * So for each group we need two blocks.
1020 */ 1020 */
1021 block = group * 2; 1021 block = group * 2;
1022 pnum = block / blocks_per_page; 1022 pnum = block / blocks_per_page;
1023 first_group = pnum * blocks_per_page / 2; 1023 first_group = pnum * blocks_per_page / 2;
1024 /* release locks on all the groups */ 1024 /* release locks on all the groups */
1025 for (i = 0; i < locked_group; i++) { 1025 for (i = 0; i < locked_group; i++) {
1026 1026
1027 grp = ext4_get_group_info(sb, first_group + i); 1027 grp = ext4_get_group_info(sb, first_group + i);
1028 /* release each group's write allocation 1028 /* release each group's write allocation
1029 * semaphore taken in ext4_mb_get_buddy_cache_lock, 1029 * semaphore taken in ext4_mb_get_buddy_cache_lock,
1030 * allowing block allocation in those 1030 * allowing block allocation in those
1031 * groups to resume 1031 * groups to resume
1032 */ 1032 */
1033 up_write(&grp->alloc_sem); 1033 up_write(&grp->alloc_sem);
1034 } 1034 }
1035 1035
1036 } 1036 }
1037 1037
1038 /* 1038 /*
1039 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1039 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1040 * block group lock of all groups for this page; do not hold the BG lock when 1040 * block group lock of all groups for this page; do not hold the BG lock when
1041 * calling this routine! 1041 * calling this routine!
1042 */ 1042 */
1043 static noinline_for_stack 1043 static noinline_for_stack
1044 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) 1044 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1045 { 1045 {
1046 1046
1047 int ret = 0; 1047 int ret = 0;
1048 void *bitmap; 1048 void *bitmap;
1049 int blocks_per_page; 1049 int blocks_per_page;
1050 int block, pnum, poff; 1050 int block, pnum, poff;
1051 int num_grp_locked = 0; 1051 int num_grp_locked = 0;
1052 struct ext4_group_info *this_grp; 1052 struct ext4_group_info *this_grp;
1053 struct ext4_sb_info *sbi = EXT4_SB(sb); 1053 struct ext4_sb_info *sbi = EXT4_SB(sb);
1054 struct inode *inode = sbi->s_buddy_cache; 1054 struct inode *inode = sbi->s_buddy_cache;
1055 struct page *page = NULL, *bitmap_page = NULL; 1055 struct page *page = NULL, *bitmap_page = NULL;
1056 1056
1057 mb_debug(1, "init group %u\n", group); 1057 mb_debug(1, "init group %u\n", group);
1058 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1058 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1059 this_grp = ext4_get_group_info(sb, group); 1059 this_grp = ext4_get_group_info(sb, group);
1060 /* 1060 /*
1061 * This ensures that we don't reinit the buddy cache 1061 * This ensures that we don't reinit the buddy cache
1062 * page which map to the group from which we are already 1062 * page which map to the group from which we are already
1063 * allocating. If we are looking at the buddy cache we would 1063 * allocating. If we are looking at the buddy cache we would
1064 * have taken a reference using ext4_mb_load_buddy and that 1064 * have taken a reference using ext4_mb_load_buddy and that
1065 * would have taken the alloc_sem lock. 1065 * would have taken the alloc_sem lock.
1066 */ 1066 */
1067 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); 1067 num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group);
1068 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { 1068 if (!EXT4_MB_GRP_NEED_INIT(this_grp)) {
1069 /* 1069 /*
1070 * somebody initialized the group 1070 * somebody initialized the group
1071 * return without doing anything 1071 * return without doing anything
1072 */ 1072 */
1073 ret = 0; 1073 ret = 0;
1074 goto err; 1074 goto err;
1075 } 1075 }
1076 /* 1076 /*
1077 * the buddy cache inode stores the block bitmap 1077 * the buddy cache inode stores the block bitmap
1078 * and buddy information in consecutive blocks. 1078 * and buddy information in consecutive blocks.
1079 * So for each group we need two blocks. 1079 * So for each group we need two blocks.
1080 */ 1080 */
1081 block = group * 2; 1081 block = group * 2;
1082 pnum = block / blocks_per_page; 1082 pnum = block / blocks_per_page;
1083 poff = block % blocks_per_page; 1083 poff = block % blocks_per_page;
1084 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1084 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1085 if (page) { 1085 if (page) {
1086 BUG_ON(page->mapping != inode->i_mapping); 1086 BUG_ON(page->mapping != inode->i_mapping);
1087 ret = ext4_mb_init_cache(page, NULL); 1087 ret = ext4_mb_init_cache(page, NULL);
1088 if (ret) { 1088 if (ret) {
1089 unlock_page(page); 1089 unlock_page(page);
1090 goto err; 1090 goto err;
1091 } 1091 }
1092 unlock_page(page); 1092 unlock_page(page);
1093 } 1093 }
1094 if (page == NULL || !PageUptodate(page)) { 1094 if (page == NULL || !PageUptodate(page)) {
1095 ret = -EIO; 1095 ret = -EIO;
1096 goto err; 1096 goto err;
1097 } 1097 }
1098 mark_page_accessed(page); 1098 mark_page_accessed(page);
1099 bitmap_page = page; 1099 bitmap_page = page;
1100 bitmap = page_address(page) + (poff * sb->s_blocksize); 1100 bitmap = page_address(page) + (poff * sb->s_blocksize);
1101 1101
1102 /* init buddy cache */ 1102 /* init buddy cache */
1103 block++; 1103 block++;
1104 pnum = block / blocks_per_page; 1104 pnum = block / blocks_per_page;
1105 poff = block % blocks_per_page; 1105 poff = block % blocks_per_page;
1106 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1106 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1107 if (page == bitmap_page) { 1107 if (page == bitmap_page) {
1108 /* 1108 /*
1109 * If both the bitmap and buddy are in 1109 * If both the bitmap and buddy are in
1110 * the same page we don't need to force 1110 * the same page we don't need to force
1111 * init the buddy 1111 * init the buddy
1112 */ 1112 */
1113 unlock_page(page); 1113 unlock_page(page);
1114 } else if (page) { 1114 } else if (page) {
1115 BUG_ON(page->mapping != inode->i_mapping); 1115 BUG_ON(page->mapping != inode->i_mapping);
1116 ret = ext4_mb_init_cache(page, bitmap); 1116 ret = ext4_mb_init_cache(page, bitmap);
1117 if (ret) { 1117 if (ret) {
1118 unlock_page(page); 1118 unlock_page(page);
1119 goto err; 1119 goto err;
1120 } 1120 }
1121 unlock_page(page); 1121 unlock_page(page);
1122 } 1122 }
1123 if (page == NULL || !PageUptodate(page)) { 1123 if (page == NULL || !PageUptodate(page)) {
1124 ret = -EIO; 1124 ret = -EIO;
1125 goto err; 1125 goto err;
1126 } 1126 }
1127 mark_page_accessed(page); 1127 mark_page_accessed(page);
1128 err: 1128 err:
1129 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); 1129 ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked);
1130 if (bitmap_page) 1130 if (bitmap_page)
1131 page_cache_release(bitmap_page); 1131 page_cache_release(bitmap_page);
1132 if (page) 1132 if (page)
1133 page_cache_release(page); 1133 page_cache_release(page);
1134 return ret; 1134 return ret;
1135 } 1135 }
1136 1136
1137 /* 1137 /*
1138 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1138 * Locking note: This routine calls ext4_mb_init_cache(), which takes the
1139 * block group lock of all groups for this page; do not hold the BG lock when 1139 * block group lock of all groups for this page; do not hold the BG lock when
1140 * calling this routine! 1140 * calling this routine!
1141 */ 1141 */
1142 static noinline_for_stack int 1142 static noinline_for_stack int
1143 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, 1143 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1144 struct ext4_buddy *e4b) 1144 struct ext4_buddy *e4b)
1145 { 1145 {
1146 int blocks_per_page; 1146 int blocks_per_page;
1147 int block; 1147 int block;
1148 int pnum; 1148 int pnum;
1149 int poff; 1149 int poff;
1150 struct page *page; 1150 struct page *page;
1151 int ret; 1151 int ret;
1152 struct ext4_group_info *grp; 1152 struct ext4_group_info *grp;
1153 struct ext4_sb_info *sbi = EXT4_SB(sb); 1153 struct ext4_sb_info *sbi = EXT4_SB(sb);
1154 struct inode *inode = sbi->s_buddy_cache; 1154 struct inode *inode = sbi->s_buddy_cache;
1155 1155
1156 mb_debug(1, "load group %u\n", group); 1156 mb_debug(1, "load group %u\n", group);
1157 1157
1158 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; 1158 blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1159 grp = ext4_get_group_info(sb, group); 1159 grp = ext4_get_group_info(sb, group);
1160 1160
1161 e4b->bd_blkbits = sb->s_blocksize_bits; 1161 e4b->bd_blkbits = sb->s_blocksize_bits;
1162 e4b->bd_info = ext4_get_group_info(sb, group); 1162 e4b->bd_info = ext4_get_group_info(sb, group);
1163 e4b->bd_sb = sb; 1163 e4b->bd_sb = sb;
1164 e4b->bd_group = group; 1164 e4b->bd_group = group;
1165 e4b->bd_buddy_page = NULL; 1165 e4b->bd_buddy_page = NULL;
1166 e4b->bd_bitmap_page = NULL; 1166 e4b->bd_bitmap_page = NULL;
1167 e4b->alloc_semp = &grp->alloc_sem; 1167 e4b->alloc_semp = &grp->alloc_sem;
1168 1168
1169 /* Take the read lock on the group alloc 1169 /* Take the read lock on the group alloc
1170 * sem. This makes sure a parallel 1170 * sem. This makes sure a parallel
1171 * ext4_mb_init_group happening on other 1171 * ext4_mb_init_group happening on other
1172 * groups mapped by the page is blocked 1172 * groups mapped by the page is blocked
1173 * till we are done with allocation 1173 * till we are done with allocation
1174 */ 1174 */
1175 repeat_load_buddy: 1175 repeat_load_buddy:
1176 down_read(e4b->alloc_semp); 1176 down_read(e4b->alloc_semp);
1177 1177
1178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1178 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1179 /* we need to check for group need init flag 1179 /* we need to check for group need init flag
1180 * with alloc_semp held so that we can be sure 1180 * with alloc_semp held so that we can be sure
1181 * that new blocks didn't get added to the group 1181 * that new blocks didn't get added to the group
1182 * when we are loading the buddy cache 1182 * when we are loading the buddy cache
1183 */ 1183 */
1184 up_read(e4b->alloc_semp); 1184 up_read(e4b->alloc_semp);
1185 /* 1185 /*
1186 * we need full data about the group 1186 * we need full data about the group
1187 * to make a good selection 1187 * to make a good selection
1188 */ 1188 */
1189 ret = ext4_mb_init_group(sb, group); 1189 ret = ext4_mb_init_group(sb, group);
1190 if (ret) 1190 if (ret)
1191 return ret; 1191 return ret;
1192 goto repeat_load_buddy; 1192 goto repeat_load_buddy;
1193 } 1193 }
1194 1194
1195 /* 1195 /*
1196 * the buddy cache inode stores the block bitmap 1196 * the buddy cache inode stores the block bitmap
1197 * and buddy information in consecutive blocks. 1197 * and buddy information in consecutive blocks.
1198 * So for each group we need two blocks. 1198 * So for each group we need two blocks.
1199 */ 1199 */
1200 block = group * 2; 1200 block = group * 2;
1201 pnum = block / blocks_per_page; 1201 pnum = block / blocks_per_page;
1202 poff = block % blocks_per_page; 1202 poff = block % blocks_per_page;
1203 1203
1204 /* we could use find_or_create_page(), but it locks page 1204 /* we could use find_or_create_page(), but it locks page
1205 * what we'd like to avoid in fast path ... */ 1205 * what we'd like to avoid in fast path ... */
1206 page = find_get_page(inode->i_mapping, pnum); 1206 page = find_get_page(inode->i_mapping, pnum);
1207 if (page == NULL || !PageUptodate(page)) { 1207 if (page == NULL || !PageUptodate(page)) {
1208 if (page) 1208 if (page)
1209 /* 1209 /*
1210 * drop the page reference and try 1210 * drop the page reference and try
1211 * to get the page with lock. If we 1211 * to get the page with lock. If we
1212 * are not uptodate that implies 1212 * are not uptodate that implies
1213 * somebody just created the page but 1213 * somebody just created the page but
1214 * has not yet initialized it. So 1214 * has not yet initialized it. So
1215 * wait for it to initialize. 1215 * wait for it to initialize.
1216 */ 1216 */
1217 page_cache_release(page); 1217 page_cache_release(page);
1218 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1218 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1219 if (page) { 1219 if (page) {
1220 BUG_ON(page->mapping != inode->i_mapping); 1220 BUG_ON(page->mapping != inode->i_mapping);
1221 if (!PageUptodate(page)) { 1221 if (!PageUptodate(page)) {
1222 ret = ext4_mb_init_cache(page, NULL); 1222 ret = ext4_mb_init_cache(page, NULL);
1223 if (ret) { 1223 if (ret) {
1224 unlock_page(page); 1224 unlock_page(page);
1225 goto err; 1225 goto err;
1226 } 1226 }
1227 mb_cmp_bitmaps(e4b, page_address(page) + 1227 mb_cmp_bitmaps(e4b, page_address(page) +
1228 (poff * sb->s_blocksize)); 1228 (poff * sb->s_blocksize));
1229 } 1229 }
1230 unlock_page(page); 1230 unlock_page(page);
1231 } 1231 }
1232 } 1232 }
1233 if (page == NULL || !PageUptodate(page)) { 1233 if (page == NULL || !PageUptodate(page)) {
1234 ret = -EIO; 1234 ret = -EIO;
1235 goto err; 1235 goto err;
1236 } 1236 }
1237 e4b->bd_bitmap_page = page; 1237 e4b->bd_bitmap_page = page;
1238 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); 1238 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1239 mark_page_accessed(page); 1239 mark_page_accessed(page);
1240 1240
1241 block++; 1241 block++;
1242 pnum = block / blocks_per_page; 1242 pnum = block / blocks_per_page;
1243 poff = block % blocks_per_page; 1243 poff = block % blocks_per_page;
1244 1244
1245 page = find_get_page(inode->i_mapping, pnum); 1245 page = find_get_page(inode->i_mapping, pnum);
1246 if (page == NULL || !PageUptodate(page)) { 1246 if (page == NULL || !PageUptodate(page)) {
1247 if (page) 1247 if (page)
1248 page_cache_release(page); 1248 page_cache_release(page);
1249 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); 1249 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1250 if (page) { 1250 if (page) {
1251 BUG_ON(page->mapping != inode->i_mapping); 1251 BUG_ON(page->mapping != inode->i_mapping);
1252 if (!PageUptodate(page)) { 1252 if (!PageUptodate(page)) {
1253 ret = ext4_mb_init_cache(page, e4b->bd_bitmap); 1253 ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
1254 if (ret) { 1254 if (ret) {
1255 unlock_page(page); 1255 unlock_page(page);
1256 goto err; 1256 goto err;
1257 } 1257 }
1258 } 1258 }
1259 unlock_page(page); 1259 unlock_page(page);
1260 } 1260 }
1261 } 1261 }
1262 if (page == NULL || !PageUptodate(page)) { 1262 if (page == NULL || !PageUptodate(page)) {
1263 ret = -EIO; 1263 ret = -EIO;
1264 goto err; 1264 goto err;
1265 } 1265 }
1266 e4b->bd_buddy_page = page; 1266 e4b->bd_buddy_page = page;
1267 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); 1267 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1268 mark_page_accessed(page); 1268 mark_page_accessed(page);
1269 1269
1270 BUG_ON(e4b->bd_bitmap_page == NULL); 1270 BUG_ON(e4b->bd_bitmap_page == NULL);
1271 BUG_ON(e4b->bd_buddy_page == NULL); 1271 BUG_ON(e4b->bd_buddy_page == NULL);
1272 1272
1273 return 0; 1273 return 0;
1274 1274
1275 err: 1275 err:
1276 if (page) 1276 if (page)
1277 page_cache_release(page); 1277 page_cache_release(page);
1278 if (e4b->bd_bitmap_page) 1278 if (e4b->bd_bitmap_page)
1279 page_cache_release(e4b->bd_bitmap_page); 1279 page_cache_release(e4b->bd_bitmap_page);
1280 if (e4b->bd_buddy_page) 1280 if (e4b->bd_buddy_page)
1281 page_cache_release(e4b->bd_buddy_page); 1281 page_cache_release(e4b->bd_buddy_page);
1282 e4b->bd_buddy = NULL; 1282 e4b->bd_buddy = NULL;
1283 e4b->bd_bitmap = NULL; 1283 e4b->bd_bitmap = NULL;
1284 1284
1285 /* Done with the buddy cache */ 1285 /* Done with the buddy cache */
1286 up_read(e4b->alloc_semp); 1286 up_read(e4b->alloc_semp);
1287 return ret; 1287 return ret;
1288 } 1288 }
1289 1289
1290 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) 1290 static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1291 { 1291 {
1292 if (e4b->bd_bitmap_page) 1292 if (e4b->bd_bitmap_page)
1293 page_cache_release(e4b->bd_bitmap_page); 1293 page_cache_release(e4b->bd_bitmap_page);
1294 if (e4b->bd_buddy_page) 1294 if (e4b->bd_buddy_page)
1295 page_cache_release(e4b->bd_buddy_page); 1295 page_cache_release(e4b->bd_buddy_page);
1296 /* Done with the buddy cache */ 1296 /* Done with the buddy cache */
1297 if (e4b->alloc_semp) 1297 if (e4b->alloc_semp)
1298 up_read(e4b->alloc_semp); 1298 up_read(e4b->alloc_semp);
1299 } 1299 }
1300 1300
1301 1301
1302 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) 1302 static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1303 { 1303 {
1304 int order = 1; 1304 int order = 1;
1305 void *bb; 1305 void *bb;
1306 1306
1307 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b)); 1307 BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
1308 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); 1308 BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1309 1309
1310 bb = EXT4_MB_BUDDY(e4b); 1310 bb = EXT4_MB_BUDDY(e4b);
1311 while (order <= e4b->bd_blkbits + 1) { 1311 while (order <= e4b->bd_blkbits + 1) {
1312 block = block >> 1; 1312 block = block >> 1;
1313 if (!mb_test_bit(block, bb)) { 1313 if (!mb_test_bit(block, bb)) {
1314 /* this block is part of buddy of order 'order' */ 1314 /* this block is part of buddy of order 'order' */
1315 return order; 1315 return order;
1316 } 1316 }
1317 bb += 1 << (e4b->bd_blkbits - order); 1317 bb += 1 << (e4b->bd_blkbits - order);
1318 order++; 1318 order++;
1319 } 1319 }
1320 return 0; 1320 return 0;
1321 } 1321 }
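A note on the loop above: it walks the per-order buddy maps packed back to back in the buddy block, stepping forward by 1 << (bd_blkbits - order) bytes after each order. The sketch below is a hedged, user-space illustration of only that pointer arithmetic; buddy_map_offset() is a made-up helper (not a kernel function) and the layout is inferred solely from the loop it mirrors.

    /* Hedged sketch: byte offset of the order-k buddy map, relative to the
     * base pointer the loop above starts from.  It just accumulates the
     * "bb += 1 << (blkbits - order)" steps for orders 1 .. k-1. */
    #include <stdio.h>

    static unsigned long buddy_map_offset(int blkbits, int k)
    {
            unsigned long off = 0;
            int order;

            for (order = 1; order < k; order++)
                    off += 1UL << (blkbits - order);
            return off;
    }

    int main(void)
    {
            int k;

            for (k = 1; k <= 4; k++)        /* 4 KiB block: 0, 2048, 3072, 3584 */
                    printf("order %d map at byte %lu\n", k, buddy_map_offset(12, k));
            return 0;
    }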
1322 1322
1323 static void mb_clear_bits(void *bm, int cur, int len) 1323 static void mb_clear_bits(void *bm, int cur, int len)
1324 { 1324 {
1325 __u32 *addr; 1325 __u32 *addr;
1326 1326
1327 len = cur + len; 1327 len = cur + len;
1328 while (cur < len) { 1328 while (cur < len) {
1329 if ((cur & 31) == 0 && (len - cur) >= 32) { 1329 if ((cur & 31) == 0 && (len - cur) >= 32) {
1330 /* fast path: clear whole word at once */ 1330 /* fast path: clear whole word at once */
1331 addr = bm + (cur >> 3); 1331 addr = bm + (cur >> 3);
1332 *addr = 0; 1332 *addr = 0;
1333 cur += 32; 1333 cur += 32;
1334 continue; 1334 continue;
1335 } 1335 }
1336 mb_clear_bit(cur, bm); 1336 mb_clear_bit(cur, bm);
1337 cur++; 1337 cur++;
1338 } 1338 }
1339 } 1339 }
1340 1340
1341 static void mb_set_bits(void *bm, int cur, int len) 1341 static void mb_set_bits(void *bm, int cur, int len)
1342 { 1342 {
1343 __u32 *addr; 1343 __u32 *addr;
1344 1344
1345 len = cur + len; 1345 len = cur + len;
1346 while (cur < len) { 1346 while (cur < len) {
1347 if ((cur & 31) == 0 && (len - cur) >= 32) { 1347 if ((cur & 31) == 0 && (len - cur) >= 32) {
1348 /* fast path: set whole word at once */ 1348 /* fast path: set whole word at once */
1349 addr = bm + (cur >> 3); 1349 addr = bm + (cur >> 3);
1350 *addr = 0xffffffff; 1350 *addr = 0xffffffff;
1351 cur += 32; 1351 cur += 32;
1352 continue; 1352 continue;
1353 } 1353 }
1354 mb_set_bit(cur, bm); 1354 mb_set_bit(cur, bm);
1355 cur++; 1355 cur++;
1356 } 1356 }
1357 } 1357 }
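Both bit-range helpers above take the word-at-a-time fast path only when the current bit index is 32-bit aligned and at least 32 bits remain; everything else falls through to the per-bit call. The hedged, user-space worked example below (count_path() is an invented helper; no kernel API is used) only counts which path each part of a range would take.

    /* Counts how many whole 32-bit words vs. single bits mb_set_bits() /
     * mb_clear_bits() would touch for a given (cur, len), using the same
     * alignment test as the kernel loops above. */
    #include <stdio.h>

    static void count_path(int cur, int len, int *words, int *bits)
    {
            *words = *bits = 0;
            len = cur + len;
            while (cur < len) {
                    if ((cur & 31) == 0 && (len - cur) >= 32) {
                            (*words)++;
                            cur += 32;
                            continue;
                    }
                    (*bits)++;
                    cur++;
            }
    }

    int main(void)
    {
            int w, b;

            count_path(32, 96, &w, &b);
            printf("aligned:   %d words, %d bits\n", w, b);    /* 3 words, 0 bits */
            count_path(5, 60, &w, &b);
            printf("unaligned: %d words, %d bits\n", w, b);    /* 1 word, 28 bits */
            return 0;
    }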
1358 1358
1359 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, 1359 static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1360 int first, int count) 1360 int first, int count)
1361 { 1361 {
1362 int block = 0; 1362 int block = 0;
1363 int max = 0; 1363 int max = 0;
1364 int order; 1364 int order;
1365 void *buddy; 1365 void *buddy;
1366 void *buddy2; 1366 void *buddy2;
1367 struct super_block *sb = e4b->bd_sb; 1367 struct super_block *sb = e4b->bd_sb;
1368 1368
1369 BUG_ON(first + count > (sb->s_blocksize << 3)); 1369 BUG_ON(first + count > (sb->s_blocksize << 3));
1370 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); 1370 assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1371 mb_check_buddy(e4b); 1371 mb_check_buddy(e4b);
1372 mb_free_blocks_double(inode, e4b, first, count); 1372 mb_free_blocks_double(inode, e4b, first, count);
1373 1373
1374 e4b->bd_info->bb_free += count; 1374 e4b->bd_info->bb_free += count;
1375 if (first < e4b->bd_info->bb_first_free) 1375 if (first < e4b->bd_info->bb_first_free)
1376 e4b->bd_info->bb_first_free = first; 1376 e4b->bd_info->bb_first_free = first;
1377 1377
1378 /* let's maintain fragments counter */ 1378 /* let's maintain fragments counter */
1379 if (first != 0) 1379 if (first != 0)
1380 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b)); 1380 block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
1381 if (first + count < EXT4_SB(sb)->s_mb_maxs[0]) 1381 if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1382 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b)); 1382 max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
1383 if (block && max) 1383 if (block && max)
1384 e4b->bd_info->bb_fragments--; 1384 e4b->bd_info->bb_fragments--;
1385 else if (!block && !max) 1385 else if (!block && !max)
1386 e4b->bd_info->bb_fragments++; 1386 e4b->bd_info->bb_fragments++;
1387 1387
1388 /* let's maintain buddy itself */ 1388 /* let's maintain buddy itself */
1389 while (count-- > 0) { 1389 while (count-- > 0) {
1390 block = first++; 1390 block = first++;
1391 order = 0; 1391 order = 0;
1392 1392
1393 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) { 1393 if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1394 ext4_fsblk_t blocknr; 1394 ext4_fsblk_t blocknr;
1395 1395
1396 blocknr = ext4_group_first_block_no(sb, e4b->bd_group); 1396 blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1397 blocknr += block; 1397 blocknr += block;
1398 ext4_grp_locked_error(sb, e4b->bd_group, 1398 ext4_grp_locked_error(sb, e4b->bd_group,
1399 inode ? inode->i_ino : 0, 1399 inode ? inode->i_ino : 0,
1400 blocknr, 1400 blocknr,
1401 "freeing already freed block " 1401 "freeing already freed block "
1402 "(bit %u)", block); 1402 "(bit %u)", block);
1403 } 1403 }
1404 mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); 1404 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1405 e4b->bd_info->bb_counters[order]++; 1405 e4b->bd_info->bb_counters[order]++;
1406 1406
1407 /* start of the buddy */ 1407 /* start of the buddy */
1408 buddy = mb_find_buddy(e4b, order, &max); 1408 buddy = mb_find_buddy(e4b, order, &max);
1409 1409
1410 do { 1410 do {
1411 block &= ~1UL; 1411 block &= ~1UL;
1412 if (mb_test_bit(block, buddy) || 1412 if (mb_test_bit(block, buddy) ||
1413 mb_test_bit(block + 1, buddy)) 1413 mb_test_bit(block + 1, buddy))
1414 break; 1414 break;
1415 1415
1416 /* both the buddies are free, try to coalesce them */ 1416 /* both the buddies are free, try to coalesce them */
1417 buddy2 = mb_find_buddy(e4b, order + 1, &max); 1417 buddy2 = mb_find_buddy(e4b, order + 1, &max);
1418 1418
1419 if (!buddy2) 1419 if (!buddy2)
1420 break; 1420 break;
1421 1421
1422 if (order > 0) { 1422 if (order > 0) {
1423 /* for special purposes, we don't set 1423 /* for special purposes, we don't set
1424 * free bits in bitmap */ 1424 * free bits in bitmap */
1425 mb_set_bit(block, buddy); 1425 mb_set_bit(block, buddy);
1426 mb_set_bit(block + 1, buddy); 1426 mb_set_bit(block + 1, buddy);
1427 } 1427 }
1428 e4b->bd_info->bb_counters[order]--; 1428 e4b->bd_info->bb_counters[order]--;
1429 e4b->bd_info->bb_counters[order]--; 1429 e4b->bd_info->bb_counters[order]--;
1430 1430
1431 block = block >> 1; 1431 block = block >> 1;
1432 order++; 1432 order++;
1433 e4b->bd_info->bb_counters[order]++; 1433 e4b->bd_info->bb_counters[order]++;
1434 1434
1435 mb_clear_bit(block, buddy2); 1435 mb_clear_bit(block, buddy2);
1436 buddy = buddy2; 1436 buddy = buddy2;
1437 } while (1); 1437 } while (1);
1438 } 1438 }
1439 mb_set_largest_free_order(sb, e4b->bd_info); 1439 mb_set_largest_free_order(sb, e4b->bd_info);
1440 mb_check_buddy(e4b); 1440 mb_check_buddy(e4b);
1441 } 1441 }
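The do/while loop above coalesces a freed block upwards: block &= ~1 pins the start of the buddy pair at the current order, and after a successful merge the index is halved and the order incremented. The toy trace below (stand-alone C, not kernel code) prints the (order, pair) sequence a freed bit would visit if every merge succeeded.

    /* Hedged sketch of the coalescing index arithmetic in mb_free_blocks(). */
    #include <stdio.h>

    int main(void)
    {
            int block = 13, order, max_order = 4;

            for (order = 0; order < max_order; order++) {
                    block &= ~1;            /* start of the buddy pair */
                    printf("order %d: pair (%d,%d)\n", order, block, block + 1);
                    block >>= 1;            /* index at the next order */
            }
            /* prints (12,13), (6,7), (2,3), (0,1) for orders 0..3 */
            return 0;
    }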
1442 1442
1443 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, 1443 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1444 int needed, struct ext4_free_extent *ex) 1444 int needed, struct ext4_free_extent *ex)
1445 { 1445 {
1446 int next = block; 1446 int next = block;
1447 int max; 1447 int max;
1448 int ord; 1448 int ord;
1449 void *buddy; 1449 void *buddy;
1450 1450
1451 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1451 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1452 BUG_ON(ex == NULL); 1452 BUG_ON(ex == NULL);
1453 1453
1454 buddy = mb_find_buddy(e4b, order, &max); 1454 buddy = mb_find_buddy(e4b, order, &max);
1455 BUG_ON(buddy == NULL); 1455 BUG_ON(buddy == NULL);
1456 BUG_ON(block >= max); 1456 BUG_ON(block >= max);
1457 if (mb_test_bit(block, buddy)) { 1457 if (mb_test_bit(block, buddy)) {
1458 ex->fe_len = 0; 1458 ex->fe_len = 0;
1459 ex->fe_start = 0; 1459 ex->fe_start = 0;
1460 ex->fe_group = 0; 1460 ex->fe_group = 0;
1461 return 0; 1461 return 0;
1462 } 1462 }
1463 1463
1464 /* FIXME drop order completely ? */ 1464 /* FIXME drop order completely ? */
1465 if (likely(order == 0)) { 1465 if (likely(order == 0)) {
1466 /* find actual order */ 1466 /* find actual order */
1467 order = mb_find_order_for_block(e4b, block); 1467 order = mb_find_order_for_block(e4b, block);
1468 block = block >> order; 1468 block = block >> order;
1469 } 1469 }
1470 1470
1471 ex->fe_len = 1 << order; 1471 ex->fe_len = 1 << order;
1472 ex->fe_start = block << order; 1472 ex->fe_start = block << order;
1473 ex->fe_group = e4b->bd_group; 1473 ex->fe_group = e4b->bd_group;
1474 1474
1475 /* calc difference from given start */ 1475 /* calc difference from given start */
1476 next = next - ex->fe_start; 1476 next = next - ex->fe_start;
1477 ex->fe_len -= next; 1477 ex->fe_len -= next;
1478 ex->fe_start += next; 1478 ex->fe_start += next;
1479 1479
1480 while (needed > ex->fe_len && 1480 while (needed > ex->fe_len &&
1481 (buddy = mb_find_buddy(e4b, order, &max))) { 1481 (buddy = mb_find_buddy(e4b, order, &max))) {
1482 1482
1483 if (block + 1 >= max) 1483 if (block + 1 >= max)
1484 break; 1484 break;
1485 1485
1486 next = (block + 1) * (1 << order); 1486 next = (block + 1) * (1 << order);
1487 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b))) 1487 if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
1488 break; 1488 break;
1489 1489
1490 ord = mb_find_order_for_block(e4b, next); 1490 ord = mb_find_order_for_block(e4b, next);
1491 1491
1492 order = ord; 1492 order = ord;
1493 block = next >> order; 1493 block = next >> order;
1494 ex->fe_len += 1 << order; 1494 ex->fe_len += 1 << order;
1495 } 1495 }
1496 1496
1497 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))); 1497 BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1498 return ex->fe_len; 1498 return ex->fe_len;
1499 } 1499 }
1500 1500
1501 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) 1501 static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1502 { 1502 {
1503 int ord; 1503 int ord;
1504 int mlen = 0; 1504 int mlen = 0;
1505 int max = 0; 1505 int max = 0;
1506 int cur; 1506 int cur;
1507 int start = ex->fe_start; 1507 int start = ex->fe_start;
1508 int len = ex->fe_len; 1508 int len = ex->fe_len;
1509 unsigned ret = 0; 1509 unsigned ret = 0;
1510 int len0 = len; 1510 int len0 = len;
1511 void *buddy; 1511 void *buddy;
1512 1512
1513 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); 1513 BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1514 BUG_ON(e4b->bd_group != ex->fe_group); 1514 BUG_ON(e4b->bd_group != ex->fe_group);
1515 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); 1515 assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1516 mb_check_buddy(e4b); 1516 mb_check_buddy(e4b);
1517 mb_mark_used_double(e4b, start, len); 1517 mb_mark_used_double(e4b, start, len);
1518 1518
1519 e4b->bd_info->bb_free -= len; 1519 e4b->bd_info->bb_free -= len;
1520 if (e4b->bd_info->bb_first_free == start) 1520 if (e4b->bd_info->bb_first_free == start)
1521 e4b->bd_info->bb_first_free += len; 1521 e4b->bd_info->bb_first_free += len;
1522 1522
1523 /* let's maintain fragments counter */ 1523 /* let's maintain fragments counter */
1524 if (start != 0) 1524 if (start != 0)
1525 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b)); 1525 mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
1526 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) 1526 if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1527 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b)); 1527 max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
1528 if (mlen && max) 1528 if (mlen && max)
1529 e4b->bd_info->bb_fragments++; 1529 e4b->bd_info->bb_fragments++;
1530 else if (!mlen && !max) 1530 else if (!mlen && !max)
1531 e4b->bd_info->bb_fragments--; 1531 e4b->bd_info->bb_fragments--;
1532 1532
1533 /* let's maintain buddy itself */ 1533 /* let's maintain buddy itself */
1534 while (len) { 1534 while (len) {
1535 ord = mb_find_order_for_block(e4b, start); 1535 ord = mb_find_order_for_block(e4b, start);
1536 1536
1537 if (((start >> ord) << ord) == start && len >= (1 << ord)) { 1537 if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1538 /* the whole chunk may be allocated at once! */ 1538 /* the whole chunk may be allocated at once! */
1539 mlen = 1 << ord; 1539 mlen = 1 << ord;
1540 buddy = mb_find_buddy(e4b, ord, &max); 1540 buddy = mb_find_buddy(e4b, ord, &max);
1541 BUG_ON((start >> ord) >= max); 1541 BUG_ON((start >> ord) >= max);
1542 mb_set_bit(start >> ord, buddy); 1542 mb_set_bit(start >> ord, buddy);
1543 e4b->bd_info->bb_counters[ord]--; 1543 e4b->bd_info->bb_counters[ord]--;
1544 start += mlen; 1544 start += mlen;
1545 len -= mlen; 1545 len -= mlen;
1546 BUG_ON(len < 0); 1546 BUG_ON(len < 0);
1547 continue; 1547 continue;
1548 } 1548 }
1549 1549
1550 /* store for history */ 1550 /* store for history */
1551 if (ret == 0) 1551 if (ret == 0)
1552 ret = len | (ord << 16); 1552 ret = len | (ord << 16);
1553 1553
1554 /* we have to split large buddy */ 1554 /* we have to split large buddy */
1555 BUG_ON(ord <= 0); 1555 BUG_ON(ord <= 0);
1556 buddy = mb_find_buddy(e4b, ord, &max); 1556 buddy = mb_find_buddy(e4b, ord, &max);
1557 mb_set_bit(start >> ord, buddy); 1557 mb_set_bit(start >> ord, buddy);
1558 e4b->bd_info->bb_counters[ord]--; 1558 e4b->bd_info->bb_counters[ord]--;
1559 1559
1560 ord--; 1560 ord--;
1561 cur = (start >> ord) & ~1U; 1561 cur = (start >> ord) & ~1U;
1562 buddy = mb_find_buddy(e4b, ord, &max); 1562 buddy = mb_find_buddy(e4b, ord, &max);
1563 mb_clear_bit(cur, buddy); 1563 mb_clear_bit(cur, buddy);
1564 mb_clear_bit(cur + 1, buddy); 1564 mb_clear_bit(cur + 1, buddy);
1565 e4b->bd_info->bb_counters[ord]++; 1565 e4b->bd_info->bb_counters[ord]++;
1566 e4b->bd_info->bb_counters[ord]++; 1566 e4b->bd_info->bb_counters[ord]++;
1567 } 1567 }
1568 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); 1568 mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1569 1569
1570 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); 1570 mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1571 mb_check_buddy(e4b); 1571 mb_check_buddy(e4b);
1572 1572
1573 return ret; 1573 return ret;
1574 } 1574 }
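The unsigned value returned above packs two facts about the allocation: the low 16 bits carry the remaining length at the moment the first buddy split was needed, and the high bits carry the order being split (ret = len | (ord << 16)); it stays 0 when no split was required. ext4_mb_use_best_found() below unpacks it into ac_tail and ac_buddy. A hedged stand-alone sketch of the encode/decode with invented numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned len = 37, ord = 3;
            unsigned ret = len | (ord << 16);       /* as stored "for history" */

            unsigned tail  = ret & 0xffff;          /* -> ac->ac_tail  */
            unsigned buddy = ret >> 16;             /* -> ac->ac_buddy */

            printf("tail=%u buddy=%u\n", tail, buddy);      /* tail=37 buddy=3 */
            return 0;
    }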
1575 1575
1576 /* 1576 /*
1577 * Must be called under group lock! 1577 * Must be called under group lock!
1578 */ 1578 */
1579 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, 1579 static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1580 struct ext4_buddy *e4b) 1580 struct ext4_buddy *e4b)
1581 { 1581 {
1582 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1582 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1583 int ret; 1583 int ret;
1584 1584
1585 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); 1585 BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1586 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 1586 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1587 1587
1588 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); 1588 ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1589 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; 1589 ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1590 ret = mb_mark_used(e4b, &ac->ac_b_ex); 1590 ret = mb_mark_used(e4b, &ac->ac_b_ex);
1591 1591
1592 /* preallocation can change ac_b_ex, thus we store actually 1592 /* preallocation can change ac_b_ex, thus we store actually
1593 * allocated blocks for history */ 1593 * allocated blocks for history */
1594 ac->ac_f_ex = ac->ac_b_ex; 1594 ac->ac_f_ex = ac->ac_b_ex;
1595 1595
1596 ac->ac_status = AC_STATUS_FOUND; 1596 ac->ac_status = AC_STATUS_FOUND;
1597 ac->ac_tail = ret & 0xffff; 1597 ac->ac_tail = ret & 0xffff;
1598 ac->ac_buddy = ret >> 16; 1598 ac->ac_buddy = ret >> 16;
1599 1599
1600 /* 1600 /*
1601 * take the page reference. We want the page to be pinned 1601 * take the page reference. We want the page to be pinned
1602 * so that we don't get an ext4_mb_init_cache call for this 1602 * so that we don't get an ext4_mb_init_cache call for this
1603 * group until we update the bitmap. That would mean we 1603 * group until we update the bitmap. That would mean we
1604 * double allocate blocks. The reference is dropped 1604 * double allocate blocks. The reference is dropped
1605 * in ext4_mb_release_context 1605 * in ext4_mb_release_context
1606 */ 1606 */
1607 ac->ac_bitmap_page = e4b->bd_bitmap_page; 1607 ac->ac_bitmap_page = e4b->bd_bitmap_page;
1608 get_page(ac->ac_bitmap_page); 1608 get_page(ac->ac_bitmap_page);
1609 ac->ac_buddy_page = e4b->bd_buddy_page; 1609 ac->ac_buddy_page = e4b->bd_buddy_page;
1610 get_page(ac->ac_buddy_page); 1610 get_page(ac->ac_buddy_page);
1611 /* on allocation we use ac to track the held semaphore */ 1611 /* on allocation we use ac to track the held semaphore */
1612 ac->alloc_semp = e4b->alloc_semp; 1612 ac->alloc_semp = e4b->alloc_semp;
1613 e4b->alloc_semp = NULL; 1613 e4b->alloc_semp = NULL;
1614 /* store last allocated for subsequent stream allocation */ 1614 /* store last allocated for subsequent stream allocation */
1615 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 1615 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1616 spin_lock(&sbi->s_md_lock); 1616 spin_lock(&sbi->s_md_lock);
1617 sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 1617 sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1618 sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 1618 sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1619 spin_unlock(&sbi->s_md_lock); 1619 spin_unlock(&sbi->s_md_lock);
1620 } 1620 }
1621 } 1621 }
1622 1622
1623 /* 1623 /*
1624 * regular allocator, for general purpose allocation 1624 * regular allocator, for general purpose allocation
1625 */ 1625 */
1626 1626
1627 static void ext4_mb_check_limits(struct ext4_allocation_context *ac, 1627 static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1628 struct ext4_buddy *e4b, 1628 struct ext4_buddy *e4b,
1629 int finish_group) 1629 int finish_group)
1630 { 1630 {
1631 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1631 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1632 struct ext4_free_extent *bex = &ac->ac_b_ex; 1632 struct ext4_free_extent *bex = &ac->ac_b_ex;
1633 struct ext4_free_extent *gex = &ac->ac_g_ex; 1633 struct ext4_free_extent *gex = &ac->ac_g_ex;
1634 struct ext4_free_extent ex; 1634 struct ext4_free_extent ex;
1635 int max; 1635 int max;
1636 1636
1637 if (ac->ac_status == AC_STATUS_FOUND) 1637 if (ac->ac_status == AC_STATUS_FOUND)
1638 return; 1638 return;
1639 /* 1639 /*
1640 * We don't want to scan for a whole year 1640 * We don't want to scan for a whole year
1641 */ 1641 */
1642 if (ac->ac_found > sbi->s_mb_max_to_scan && 1642 if (ac->ac_found > sbi->s_mb_max_to_scan &&
1643 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1643 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1644 ac->ac_status = AC_STATUS_BREAK; 1644 ac->ac_status = AC_STATUS_BREAK;
1645 return; 1645 return;
1646 } 1646 }
1647 1647
1648 /* 1648 /*
1649 * Haven't found a good chunk so far, let's continue 1649 * Haven't found a good chunk so far, let's continue
1650 */ 1650 */
1651 if (bex->fe_len < gex->fe_len) 1651 if (bex->fe_len < gex->fe_len)
1652 return; 1652 return;
1653 1653
1654 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan) 1654 if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1655 && bex->fe_group == e4b->bd_group) { 1655 && bex->fe_group == e4b->bd_group) {
1656 /* recheck chunk's availability - we don't know 1656 /* recheck chunk's availability - we don't know
1657 * when it was found (within this lock-unlock 1657 * when it was found (within this lock-unlock
1658 * period or not) */ 1658 * period or not) */
1659 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex); 1659 max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
1660 if (max >= gex->fe_len) { 1660 if (max >= gex->fe_len) {
1661 ext4_mb_use_best_found(ac, e4b); 1661 ext4_mb_use_best_found(ac, e4b);
1662 return; 1662 return;
1663 } 1663 }
1664 } 1664 }
1665 } 1665 }
1666 1666
1667 /* 1667 /*
1668 * The routine checks whether the found extent is good enough. If it is, 1668 * The routine checks whether the found extent is good enough. If it is,
1669 * then the extent gets marked used and a flag is set in the context 1669 * then the extent gets marked used and a flag is set in the context
1670 * to stop scanning. Otherwise, the extent is compared with the 1670 * to stop scanning. Otherwise, the extent is compared with the
1671 * previously found extent and, if the new one is better, it is stored 1671 * previously found extent and, if the new one is better, it is stored
1672 * in the context. Later, the best found extent will be used, if 1672 * in the context. Later, the best found extent will be used, if
1673 * mballoc can't find a good enough extent. 1673 * mballoc can't find a good enough extent.
1674 * 1674 *
1675 * FIXME: real allocation policy is to be designed yet! 1675 * FIXME: real allocation policy is to be designed yet!
1676 */ 1676 */
1677 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, 1677 static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1678 struct ext4_free_extent *ex, 1678 struct ext4_free_extent *ex,
1679 struct ext4_buddy *e4b) 1679 struct ext4_buddy *e4b)
1680 { 1680 {
1681 struct ext4_free_extent *bex = &ac->ac_b_ex; 1681 struct ext4_free_extent *bex = &ac->ac_b_ex;
1682 struct ext4_free_extent *gex = &ac->ac_g_ex; 1682 struct ext4_free_extent *gex = &ac->ac_g_ex;
1683 1683
1684 BUG_ON(ex->fe_len <= 0); 1684 BUG_ON(ex->fe_len <= 0);
1685 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1685 BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1686 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 1686 BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1687 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); 1687 BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1688 1688
1689 ac->ac_found++; 1689 ac->ac_found++;
1690 1690
1691 /* 1691 /*
1692 * The special case - take what you catch first 1692 * The special case - take what you catch first
1693 */ 1693 */
1694 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 1694 if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1695 *bex = *ex; 1695 *bex = *ex;
1696 ext4_mb_use_best_found(ac, e4b); 1696 ext4_mb_use_best_found(ac, e4b);
1697 return; 1697 return;
1698 } 1698 }
1699 1699
1700 /* 1700 /*
1701 * Let's check whether the chunk is good enough 1701 * Let's check whether the chunk is good enough
1702 */ 1702 */
1703 if (ex->fe_len == gex->fe_len) { 1703 if (ex->fe_len == gex->fe_len) {
1704 *bex = *ex; 1704 *bex = *ex;
1705 ext4_mb_use_best_found(ac, e4b); 1705 ext4_mb_use_best_found(ac, e4b);
1706 return; 1706 return;
1707 } 1707 }
1708 1708
1709 /* 1709 /*
1710 * If this is the first found extent, just store it in the context 1710 * If this is the first found extent, just store it in the context
1711 */ 1711 */
1712 if (bex->fe_len == 0) { 1712 if (bex->fe_len == 0) {
1713 *bex = *ex; 1713 *bex = *ex;
1714 return; 1714 return;
1715 } 1715 }
1716 1716
1717 /* 1717 /*
1718 * If new found extent is better, store it in the context 1718 * If new found extent is better, store it in the context
1719 */ 1719 */
1720 if (bex->fe_len < gex->fe_len) { 1720 if (bex->fe_len < gex->fe_len) {
1721 /* if the request isn't satisfied, any found extent 1721 /* if the request isn't satisfied, any found extent
1722 * larger than the previous best one is better */ 1722 * larger than the previous best one is better */
1723 if (ex->fe_len > bex->fe_len) 1723 if (ex->fe_len > bex->fe_len)
1724 *bex = *ex; 1724 *bex = *ex;
1725 } else if (ex->fe_len > gex->fe_len) { 1725 } else if (ex->fe_len > gex->fe_len) {
1726 /* if the request is satisfied, then we try to find 1726 /* if the request is satisfied, then we try to find
1727 * an extent that still satisfies the request, but is 1727 * an extent that still satisfies the request, but is
1728 * smaller than the previous one */ 1728 * smaller than the previous one */
1729 if (ex->fe_len < bex->fe_len) 1729 if (ex->fe_len < bex->fe_len)
1730 *bex = *ex; 1730 *bex = *ex;
1731 } 1731 }
1732 1732
1733 ext4_mb_check_limits(ac, e4b, 0); 1733 ext4_mb_check_limits(ac, e4b, 0);
1734 } 1734 }
1735 1735
1736 static noinline_for_stack 1736 static noinline_for_stack
1737 int ext4_mb_try_best_found(struct ext4_allocation_context *ac, 1737 int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1738 struct ext4_buddy *e4b) 1738 struct ext4_buddy *e4b)
1739 { 1739 {
1740 struct ext4_free_extent ex = ac->ac_b_ex; 1740 struct ext4_free_extent ex = ac->ac_b_ex;
1741 ext4_group_t group = ex.fe_group; 1741 ext4_group_t group = ex.fe_group;
1742 int max; 1742 int max;
1743 int err; 1743 int err;
1744 1744
1745 BUG_ON(ex.fe_len <= 0); 1745 BUG_ON(ex.fe_len <= 0);
1746 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1746 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1747 if (err) 1747 if (err)
1748 return err; 1748 return err;
1749 1749
1750 ext4_lock_group(ac->ac_sb, group); 1750 ext4_lock_group(ac->ac_sb, group);
1751 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex); 1751 max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
1752 1752
1753 if (max > 0) { 1753 if (max > 0) {
1754 ac->ac_b_ex = ex; 1754 ac->ac_b_ex = ex;
1755 ext4_mb_use_best_found(ac, e4b); 1755 ext4_mb_use_best_found(ac, e4b);
1756 } 1756 }
1757 1757
1758 ext4_unlock_group(ac->ac_sb, group); 1758 ext4_unlock_group(ac->ac_sb, group);
1759 ext4_mb_unload_buddy(e4b); 1759 ext4_mb_unload_buddy(e4b);
1760 1760
1761 return 0; 1761 return 0;
1762 } 1762 }
1763 1763
1764 static noinline_for_stack 1764 static noinline_for_stack
1765 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, 1765 int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1766 struct ext4_buddy *e4b) 1766 struct ext4_buddy *e4b)
1767 { 1767 {
1768 ext4_group_t group = ac->ac_g_ex.fe_group; 1768 ext4_group_t group = ac->ac_g_ex.fe_group;
1769 int max; 1769 int max;
1770 int err; 1770 int err;
1771 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1771 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1772 struct ext4_free_extent ex; 1772 struct ext4_free_extent ex;
1773 1773
1774 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) 1774 if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1775 return 0; 1775 return 0;
1776 1776
1777 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); 1777 err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1778 if (err) 1778 if (err)
1779 return err; 1779 return err;
1780 1780
1781 ext4_lock_group(ac->ac_sb, group); 1781 ext4_lock_group(ac->ac_sb, group);
1782 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start, 1782 max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
1783 ac->ac_g_ex.fe_len, &ex); 1783 ac->ac_g_ex.fe_len, &ex);
1784 1784
1785 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) { 1785 if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1786 ext4_fsblk_t start; 1786 ext4_fsblk_t start;
1787 1787
1788 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) + 1788 start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1789 ex.fe_start; 1789 ex.fe_start;
1790 /* use do_div to get remainder (would be 64-bit modulo) */ 1790 /* use do_div to get remainder (would be 64-bit modulo) */
1791 if (do_div(start, sbi->s_stripe) == 0) { 1791 if (do_div(start, sbi->s_stripe) == 0) {
1792 ac->ac_found++; 1792 ac->ac_found++;
1793 ac->ac_b_ex = ex; 1793 ac->ac_b_ex = ex;
1794 ext4_mb_use_best_found(ac, e4b); 1794 ext4_mb_use_best_found(ac, e4b);
1795 } 1795 }
1796 } else if (max >= ac->ac_g_ex.fe_len) { 1796 } else if (max >= ac->ac_g_ex.fe_len) {
1797 BUG_ON(ex.fe_len <= 0); 1797 BUG_ON(ex.fe_len <= 0);
1798 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1798 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1799 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1799 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1800 ac->ac_found++; 1800 ac->ac_found++;
1801 ac->ac_b_ex = ex; 1801 ac->ac_b_ex = ex;
1802 ext4_mb_use_best_found(ac, e4b); 1802 ext4_mb_use_best_found(ac, e4b);
1803 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { 1803 } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1804 /* Sometimes, caller may want to merge even small 1804 /* Sometimes, caller may want to merge even small
1805 * number of blocks to an existing extent */ 1805 * number of blocks to an existing extent */
1806 BUG_ON(ex.fe_len <= 0); 1806 BUG_ON(ex.fe_len <= 0);
1807 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); 1807 BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1808 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); 1808 BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1809 ac->ac_found++; 1809 ac->ac_found++;
1810 ac->ac_b_ex = ex; 1810 ac->ac_b_ex = ex;
1811 ext4_mb_use_best_found(ac, e4b); 1811 ext4_mb_use_best_found(ac, e4b);
1812 } 1812 }
1813 ext4_unlock_group(ac->ac_sb, group); 1813 ext4_unlock_group(ac->ac_sb, group);
1814 ext4_mb_unload_buddy(e4b); 1814 ext4_mb_unload_buddy(e4b);
1815 1815
1816 return 0; 1816 return 0;
1817 } 1817 }
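In the stripe branch above, do_div() is used because start is a 64-bit block number: it divides in place and returns the remainder, so a zero remainder means the found extent starts on a stripe boundary. A hedged user-space equivalent of that test (starts_on_stripe() is an invented helper; plain 64-bit modulo stands in for do_div()):

    #include <stdio.h>
    #include <stdint.h>
    #include <stdbool.h>

    static bool starts_on_stripe(uint64_t group_first_block, uint32_t fe_start,
                                 uint32_t stripe)
    {
            uint64_t start = group_first_block + fe_start;

            return (start % stripe) == 0;   /* do_div(start, stripe) == 0 */
    }

    int main(void)
    {
            printf("%d %d\n", starts_on_stripe(32768, 64, 16),   /* 1: aligned   */
                              starts_on_stripe(32768, 13, 16));  /* 0: unaligned */
            return 0;
    }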
1818 1818
1819 /* 1819 /*
1820 * The routine scans buddy structures (not bitmap!) from the given order 1820 * The routine scans buddy structures (not bitmap!) from the given order
1821 * to the max order and tries to find a big enough chunk to satisfy the request 1821 * to the max order and tries to find a big enough chunk to satisfy the request
1822 */ 1822 */
1823 static noinline_for_stack 1823 static noinline_for_stack
1824 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, 1824 void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1825 struct ext4_buddy *e4b) 1825 struct ext4_buddy *e4b)
1826 { 1826 {
1827 struct super_block *sb = ac->ac_sb; 1827 struct super_block *sb = ac->ac_sb;
1828 struct ext4_group_info *grp = e4b->bd_info; 1828 struct ext4_group_info *grp = e4b->bd_info;
1829 void *buddy; 1829 void *buddy;
1830 int i; 1830 int i;
1831 int k; 1831 int k;
1832 int max; 1832 int max;
1833 1833
1834 BUG_ON(ac->ac_2order <= 0); 1834 BUG_ON(ac->ac_2order <= 0);
1835 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { 1835 for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1836 if (grp->bb_counters[i] == 0) 1836 if (grp->bb_counters[i] == 0)
1837 continue; 1837 continue;
1838 1838
1839 buddy = mb_find_buddy(e4b, i, &max); 1839 buddy = mb_find_buddy(e4b, i, &max);
1840 BUG_ON(buddy == NULL); 1840 BUG_ON(buddy == NULL);
1841 1841
1842 k = mb_find_next_zero_bit(buddy, max, 0); 1842 k = mb_find_next_zero_bit(buddy, max, 0);
1843 BUG_ON(k >= max); 1843 BUG_ON(k >= max);
1844 1844
1845 ac->ac_found++; 1845 ac->ac_found++;
1846 1846
1847 ac->ac_b_ex.fe_len = 1 << i; 1847 ac->ac_b_ex.fe_len = 1 << i;
1848 ac->ac_b_ex.fe_start = k << i; 1848 ac->ac_b_ex.fe_start = k << i;
1849 ac->ac_b_ex.fe_group = e4b->bd_group; 1849 ac->ac_b_ex.fe_group = e4b->bd_group;
1850 1850
1851 ext4_mb_use_best_found(ac, e4b); 1851 ext4_mb_use_best_found(ac, e4b);
1852 1852
1853 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); 1853 BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1854 1854
1855 if (EXT4_SB(sb)->s_mb_stats) 1855 if (EXT4_SB(sb)->s_mb_stats)
1856 atomic_inc(&EXT4_SB(sb)->s_bal_2orders); 1856 atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1857 1857
1858 break; 1858 break;
1859 } 1859 }
1860 } 1860 }
1861 1861
1862 /* 1862 /*
1863 * The routine scans the group and measures all found extents. 1863 * The routine scans the group and measures all found extents.
1864 * In order to optimize scanning, the caller must pass the number of 1864 * In order to optimize scanning, the caller must pass the number of
1865 * free blocks in the group, so the routine knows the upper limit. 1865 * free blocks in the group, so the routine knows the upper limit.
1866 */ 1866 */
1867 static noinline_for_stack 1867 static noinline_for_stack
1868 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, 1868 void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1869 struct ext4_buddy *e4b) 1869 struct ext4_buddy *e4b)
1870 { 1870 {
1871 struct super_block *sb = ac->ac_sb; 1871 struct super_block *sb = ac->ac_sb;
1872 void *bitmap = EXT4_MB_BITMAP(e4b); 1872 void *bitmap = EXT4_MB_BITMAP(e4b);
1873 struct ext4_free_extent ex; 1873 struct ext4_free_extent ex;
1874 int i; 1874 int i;
1875 int free; 1875 int free;
1876 1876
1877 free = e4b->bd_info->bb_free; 1877 free = e4b->bd_info->bb_free;
1878 BUG_ON(free <= 0); 1878 BUG_ON(free <= 0);
1879 1879
1880 i = e4b->bd_info->bb_first_free; 1880 i = e4b->bd_info->bb_first_free;
1881 1881
1882 while (free && ac->ac_status == AC_STATUS_CONTINUE) { 1882 while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1883 i = mb_find_next_zero_bit(bitmap, 1883 i = mb_find_next_zero_bit(bitmap,
1884 EXT4_BLOCKS_PER_GROUP(sb), i); 1884 EXT4_BLOCKS_PER_GROUP(sb), i);
1885 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) { 1885 if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
1886 /* 1886 /*
1887 * If we have a corrupt bitmap, we won't find any 1887 * If we have a corrupt bitmap, we won't find any
1888 * free blocks even though group info says we 1888 * free blocks even though group info says we
1889 * have free blocks 1889 * have free blocks
1890 */ 1890 */
1891 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1891 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1892 "%d free blocks as per " 1892 "%d free blocks as per "
1893 "group info. But bitmap says 0", 1893 "group info. But bitmap says 0",
1894 free); 1894 free);
1895 break; 1895 break;
1896 } 1896 }
1897 1897
1898 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); 1898 mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1899 BUG_ON(ex.fe_len <= 0); 1899 BUG_ON(ex.fe_len <= 0);
1900 if (free < ex.fe_len) { 1900 if (free < ex.fe_len) {
1901 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, 1901 ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1902 "%d free blocks as per " 1902 "%d free blocks as per "
1903 "group info. But got %d blocks", 1903 "group info. But got %d blocks",
1904 free, ex.fe_len); 1904 free, ex.fe_len);
1905 /* 1905 /*
1906 * The number of free blocks differs. This mostly 1906 * The number of free blocks differs. This mostly
1907 * indicates that the bitmap is corrupt. So exit 1907 * indicates that the bitmap is corrupt. So exit
1908 * without claiming the space. 1908 * without claiming the space.
1909 */ 1909 */
1910 break; 1910 break;
1911 } 1911 }
1912 1912
1913 ext4_mb_measure_extent(ac, &ex, e4b); 1913 ext4_mb_measure_extent(ac, &ex, e4b);
1914 1914
1915 i += ex.fe_len; 1915 i += ex.fe_len;
1916 free -= ex.fe_len; 1916 free -= ex.fe_len;
1917 } 1917 }
1918 1918
1919 ext4_mb_check_limits(ac, e4b, 1); 1919 ext4_mb_check_limits(ac, e4b, 1);
1920 } 1920 }
1921 1921
1922 /* 1922 /*
1923 * This is a special case for storage devices like raid5; 1923 * This is a special case for storage devices like raid5;
1924 * we try to find stripe-aligned chunks for stripe-size-multiple requests 1924 * we try to find stripe-aligned chunks for stripe-size-multiple requests
1925 */ 1925 */
1926 static noinline_for_stack 1926 static noinline_for_stack
1927 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, 1927 void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1928 struct ext4_buddy *e4b) 1928 struct ext4_buddy *e4b)
1929 { 1929 {
1930 struct super_block *sb = ac->ac_sb; 1930 struct super_block *sb = ac->ac_sb;
1931 struct ext4_sb_info *sbi = EXT4_SB(sb); 1931 struct ext4_sb_info *sbi = EXT4_SB(sb);
1932 void *bitmap = EXT4_MB_BITMAP(e4b); 1932 void *bitmap = EXT4_MB_BITMAP(e4b);
1933 struct ext4_free_extent ex; 1933 struct ext4_free_extent ex;
1934 ext4_fsblk_t first_group_block; 1934 ext4_fsblk_t first_group_block;
1935 ext4_fsblk_t a; 1935 ext4_fsblk_t a;
1936 ext4_grpblk_t i; 1936 ext4_grpblk_t i;
1937 int max; 1937 int max;
1938 1938
1939 BUG_ON(sbi->s_stripe == 0); 1939 BUG_ON(sbi->s_stripe == 0);
1940 1940
1941 /* find first stripe-aligned block in group */ 1941 /* find first stripe-aligned block in group */
1942 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); 1942 first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1943 1943
1944 a = first_group_block + sbi->s_stripe - 1; 1944 a = first_group_block + sbi->s_stripe - 1;
1945 do_div(a, sbi->s_stripe); 1945 do_div(a, sbi->s_stripe);
1946 i = (a * sbi->s_stripe) - first_group_block; 1946 i = (a * sbi->s_stripe) - first_group_block;
1947 1947
1948 while (i < EXT4_BLOCKS_PER_GROUP(sb)) { 1948 while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
1949 if (!mb_test_bit(i, bitmap)) { 1949 if (!mb_test_bit(i, bitmap)) {
1950 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex); 1950 max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1951 if (max >= sbi->s_stripe) { 1951 if (max >= sbi->s_stripe) {
1952 ac->ac_found++; 1952 ac->ac_found++;
1953 ac->ac_b_ex = ex; 1953 ac->ac_b_ex = ex;
1954 ext4_mb_use_best_found(ac, e4b); 1954 ext4_mb_use_best_found(ac, e4b);
1955 break; 1955 break;
1956 } 1956 }
1957 } 1957 }
1958 i += sbi->s_stripe; 1958 i += sbi->s_stripe;
1959 } 1959 }
1960 } 1960 }
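The three lines computing i above round the group's first block number up to the next multiple of s_stripe and convert it back to a group-relative offset. A hedged user-space sketch of the same arithmetic (first_aligned_offset() is an invented helper; plain 64-bit division replaces do_div()):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t first_aligned_offset(uint64_t first_group_block,
                                         uint32_t stripe)
    {
            uint64_t a = first_group_block + stripe - 1;

            a /= stripe;                            /* do_div(a, stripe) */
            return (uint32_t)(a * stripe - first_group_block);
    }

    int main(void)
    {
            /* group starts at block 100, stripe 16 -> first aligned offset 12,
             * i.e. in-group bit 12 corresponds to block 112 = 7 * 16 */
            printf("%u\n", first_aligned_offset(100, 16));
            return 0;
    }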
1961 1961
1962 /* This is now called BEFORE we load the buddy bitmap. */ 1962 /* This is now called BEFORE we load the buddy bitmap. */
1963 static int ext4_mb_good_group(struct ext4_allocation_context *ac, 1963 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1964 ext4_group_t group, int cr) 1964 ext4_group_t group, int cr)
1965 { 1965 {
1966 unsigned free, fragments; 1966 unsigned free, fragments;
1967 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); 1967 int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1968 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); 1968 struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1969 1969
1970 BUG_ON(cr < 0 || cr >= 4); 1970 BUG_ON(cr < 0 || cr >= 4);
1971 1971
1972 /* We only do this if the grp has never been initialized */ 1972 /* We only do this if the grp has never been initialized */
1973 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { 1973 if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1974 int ret = ext4_mb_init_group(ac->ac_sb, group); 1974 int ret = ext4_mb_init_group(ac->ac_sb, group);
1975 if (ret) 1975 if (ret)
1976 return 0; 1976 return 0;
1977 } 1977 }
1978 1978
1979 free = grp->bb_free; 1979 free = grp->bb_free;
1980 fragments = grp->bb_fragments; 1980 fragments = grp->bb_fragments;
1981 if (free == 0) 1981 if (free == 0)
1982 return 0; 1982 return 0;
1983 if (fragments == 0) 1983 if (fragments == 0)
1984 return 0; 1984 return 0;
1985 1985
1986 switch (cr) { 1986 switch (cr) {
1987 case 0: 1987 case 0:
1988 BUG_ON(ac->ac_2order == 0); 1988 BUG_ON(ac->ac_2order == 0);
1989 1989
1990 if (grp->bb_largest_free_order < ac->ac_2order) 1990 if (grp->bb_largest_free_order < ac->ac_2order)
1991 return 0; 1991 return 0;
1992 1992
1993 /* Avoid using the first bg of a flexgroup for data files */ 1993 /* Avoid using the first bg of a flexgroup for data files */
1994 if ((ac->ac_flags & EXT4_MB_HINT_DATA) && 1994 if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1995 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && 1995 (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1996 ((group % flex_size) == 0)) 1996 ((group % flex_size) == 0))
1997 return 0; 1997 return 0;
1998 1998
1999 return 1; 1999 return 1;
2000 case 1: 2000 case 1:
2001 if ((free / fragments) >= ac->ac_g_ex.fe_len) 2001 if ((free / fragments) >= ac->ac_g_ex.fe_len)
2002 return 1; 2002 return 1;
2003 break; 2003 break;
2004 case 2: 2004 case 2:
2005 if (free >= ac->ac_g_ex.fe_len) 2005 if (free >= ac->ac_g_ex.fe_len)
2006 return 1; 2006 return 1;
2007 break; 2007 break;
2008 case 3: 2008 case 3:
2009 return 1; 2009 return 1;
2010 default: 2010 default:
2011 BUG(); 2011 BUG();
2012 } 2012 }
2013 2013
2014 return 0; 2014 return 0;
2015 } 2015 }
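At cr == 1 the check above uses free / fragments, i.e. the average size of a free extent in the group, and accepts the group only when that average covers the goal length. A hedged worked example with invented numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned free = 1000, fragments = 8, goal_len = 64;

            /* average free extent is 125 blocks >= 64, so cr == 1 accepts it */
            printf("avg %u vs goal %u -> %s\n", free / fragments, goal_len,
                   (free / fragments) >= goal_len ? "good" : "skip");
            return 0;
    }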
2016 2016
2017 static noinline_for_stack int 2017 static noinline_for_stack int
2018 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2018 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2019 { 2019 {
2020 ext4_group_t ngroups, group, i; 2020 ext4_group_t ngroups, group, i;
2021 int cr; 2021 int cr;
2022 int err = 0; 2022 int err = 0;
2023 struct ext4_sb_info *sbi; 2023 struct ext4_sb_info *sbi;
2024 struct super_block *sb; 2024 struct super_block *sb;
2025 struct ext4_buddy e4b; 2025 struct ext4_buddy e4b;
2026 2026
2027 sb = ac->ac_sb; 2027 sb = ac->ac_sb;
2028 sbi = EXT4_SB(sb); 2028 sbi = EXT4_SB(sb);
2029 ngroups = ext4_get_groups_count(sb); 2029 ngroups = ext4_get_groups_count(sb);
2030 /* non-extent files are limited to low blocks/groups */ 2030 /* non-extent files are limited to low blocks/groups */
2031 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) 2031 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2032 ngroups = sbi->s_blockfile_groups; 2032 ngroups = sbi->s_blockfile_groups;
2033 2033
2034 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2034 BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2035 2035
2036 /* first, try the goal */ 2036 /* first, try the goal */
2037 err = ext4_mb_find_by_goal(ac, &e4b); 2037 err = ext4_mb_find_by_goal(ac, &e4b);
2038 if (err || ac->ac_status == AC_STATUS_FOUND) 2038 if (err || ac->ac_status == AC_STATUS_FOUND)
2039 goto out; 2039 goto out;
2040 2040
2041 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 2041 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2042 goto out; 2042 goto out;
2043 2043
2044 /* 2044 /*
2045 * ac->ac_2order is set only if the fe_len is a power of 2 2045 * ac->ac_2order is set only if the fe_len is a power of 2
2046 * if ac_2order is set we also set the criteria to 0 so that we 2046 * if ac_2order is set we also set the criteria to 0 so that we
2047 * try exact allocation using buddy. 2047 * try exact allocation using buddy.
2048 */ 2048 */
2049 i = fls(ac->ac_g_ex.fe_len); 2049 i = fls(ac->ac_g_ex.fe_len);
2050 ac->ac_2order = 0; 2050 ac->ac_2order = 0;
2051 /* 2051 /*
2052 * We search using buddy data only if the order of the request 2052 * We search using buddy data only if the order of the request
2053 * is greater than or equal to sbi->s_mb_order2_reqs 2053 * is greater than or equal to sbi->s_mb_order2_reqs
2054 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req 2054 * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
2055 */ 2055 */
2056 if (i >= sbi->s_mb_order2_reqs) { 2056 if (i >= sbi->s_mb_order2_reqs) {
2057 /* 2057 /*
2058 * This should tell if fe_len is exactly a power of 2 2058 * This should tell if fe_len is exactly a power of 2
2059 */ 2059 */
2060 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) 2060 if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2061 ac->ac_2order = i - 1; 2061 ac->ac_2order = i - 1;
2062 } 2062 }
2063 2063
2064 /* if stream allocation is enabled, use global goal */ 2064 /* if stream allocation is enabled, use global goal */
2065 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2065 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2066 /* TBD: may be hot point */ 2066 /* TBD: may be hot point */
2067 spin_lock(&sbi->s_md_lock); 2067 spin_lock(&sbi->s_md_lock);
2068 ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 2068 ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2069 ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 2069 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2070 spin_unlock(&sbi->s_md_lock); 2070 spin_unlock(&sbi->s_md_lock);
2071 } 2071 }
2072 2072
2073 /* Let's just scan groups to find more or less suitable blocks */ 2073 /* Let's just scan groups to find more or less suitable blocks */
2074 cr = ac->ac_2order ? 0 : 1; 2074 cr = ac->ac_2order ? 0 : 1;
2075 /* 2075 /*
2076 * cr == 0 try to get exact allocation, 2076 * cr == 0 try to get exact allocation,
2077 * cr == 3 try to get anything 2077 * cr == 3 try to get anything
2078 */ 2078 */
2079 repeat: 2079 repeat:
2080 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 2080 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2081 ac->ac_criteria = cr; 2081 ac->ac_criteria = cr;
2082 /* 2082 /*
2083 * searching for the right group, starting 2083 * searching for the right group, starting
2084 * from the goal value specified 2084 * from the goal value specified
2085 */ 2085 */
2086 group = ac->ac_g_ex.fe_group; 2086 group = ac->ac_g_ex.fe_group;
2087 2087
2088 for (i = 0; i < ngroups; group++, i++) { 2088 for (i = 0; i < ngroups; group++, i++) {
2089 if (group == ngroups) 2089 if (group == ngroups)
2090 group = 0; 2090 group = 0;
2091 2091
2092 /* This now checks without needing the buddy page */ 2092 /* This now checks without needing the buddy page */
2093 if (!ext4_mb_good_group(ac, group, cr)) 2093 if (!ext4_mb_good_group(ac, group, cr))
2094 continue; 2094 continue;
2095 2095
2096 err = ext4_mb_load_buddy(sb, group, &e4b); 2096 err = ext4_mb_load_buddy(sb, group, &e4b);
2097 if (err) 2097 if (err)
2098 goto out; 2098 goto out;
2099 2099
2100 ext4_lock_group(sb, group); 2100 ext4_lock_group(sb, group);
2101 2101
2102 /* 2102 /*
2103 * We need to check again after locking the 2103 * We need to check again after locking the
2104 * block group 2104 * block group
2105 */ 2105 */
2106 if (!ext4_mb_good_group(ac, group, cr)) { 2106 if (!ext4_mb_good_group(ac, group, cr)) {
2107 ext4_unlock_group(sb, group); 2107 ext4_unlock_group(sb, group);
2108 ext4_mb_unload_buddy(&e4b); 2108 ext4_mb_unload_buddy(&e4b);
2109 continue; 2109 continue;
2110 } 2110 }
2111 2111
2112 ac->ac_groups_scanned++; 2112 ac->ac_groups_scanned++;
2113 if (cr == 0) 2113 if (cr == 0)
2114 ext4_mb_simple_scan_group(ac, &e4b); 2114 ext4_mb_simple_scan_group(ac, &e4b);
2115 else if (cr == 1 && sbi->s_stripe && 2115 else if (cr == 1 && sbi->s_stripe &&
2116 !(ac->ac_g_ex.fe_len % sbi->s_stripe)) 2116 !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2117 ext4_mb_scan_aligned(ac, &e4b); 2117 ext4_mb_scan_aligned(ac, &e4b);
2118 else 2118 else
2119 ext4_mb_complex_scan_group(ac, &e4b); 2119 ext4_mb_complex_scan_group(ac, &e4b);
2120 2120
2121 ext4_unlock_group(sb, group); 2121 ext4_unlock_group(sb, group);
2122 ext4_mb_unload_buddy(&e4b); 2122 ext4_mb_unload_buddy(&e4b);
2123 2123
2124 if (ac->ac_status != AC_STATUS_CONTINUE) 2124 if (ac->ac_status != AC_STATUS_CONTINUE)
2125 break; 2125 break;
2126 } 2126 }
2127 } 2127 }
2128 2128
2129 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && 2129 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2130 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { 2130 !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2131 /* 2131 /*
2132 * We've been searching too long. Let's try to allocate 2132 * We've been searching too long. Let's try to allocate
2133 * the best chunk we've found so far 2133 * the best chunk we've found so far
2134 */ 2134 */
2135 2135
2136 ext4_mb_try_best_found(ac, &e4b); 2136 ext4_mb_try_best_found(ac, &e4b);
2137 if (ac->ac_status != AC_STATUS_FOUND) { 2137 if (ac->ac_status != AC_STATUS_FOUND) {
2138 /* 2138 /*
2139 * Someone luckier has already allocated it. 2139 * Someone luckier has already allocated it.
2140 * The only thing we can do is just take first 2140 * The only thing we can do is just take first
2141 * found block(s) 2141 * found block(s)
2142 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n"); 2142 printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2143 */ 2143 */
2144 ac->ac_b_ex.fe_group = 0; 2144 ac->ac_b_ex.fe_group = 0;
2145 ac->ac_b_ex.fe_start = 0; 2145 ac->ac_b_ex.fe_start = 0;
2146 ac->ac_b_ex.fe_len = 0; 2146 ac->ac_b_ex.fe_len = 0;
2147 ac->ac_status = AC_STATUS_CONTINUE; 2147 ac->ac_status = AC_STATUS_CONTINUE;
2148 ac->ac_flags |= EXT4_MB_HINT_FIRST; 2148 ac->ac_flags |= EXT4_MB_HINT_FIRST;
2149 cr = 3; 2149 cr = 3;
2150 atomic_inc(&sbi->s_mb_lost_chunks); 2150 atomic_inc(&sbi->s_mb_lost_chunks);
2151 goto repeat; 2151 goto repeat;
2152 } 2152 }
2153 } 2153 }
2154 out: 2154 out:
2155 return err; 2155 return err;
2156 } 2156 }
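Near the top of this function, ac_2order is derived with fls(): for a goal length that is an exact power of two, fls(len) returns i such that len == 1 << (i - 1), so masking that bit off leaves zero. The stand-alone sketch below reproduces only that test (my_fls() is a portable stand-in for the kernel's fls(); the s_mb_order2_reqs threshold is ignored here):

    #include <stdio.h>

    static int my_fls(unsigned x)
    {
            int i = 0;

            while (x) {             /* 1-based index of the highest set bit */
                    i++;
                    x >>= 1;
            }
            return i;
    }

    int main(void)
    {
            unsigned lens[] = { 8, 12, 64, 100 };
            unsigned n;

            for (n = 0; n < sizeof(lens) / sizeof(lens[0]); n++) {
                    unsigned len = lens[n];
                    int i = my_fls(len);
                    int order = (len & ~(1u << (i - 1))) ? 0 : i - 1;

                    /* 8 -> 3, 12 -> 0, 64 -> 6, 100 -> 0 */
                    printf("len %3u -> ac_2order %d\n", len, order);
            }
            return 0;
    }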
2157 2157
2158 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) 2158 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2159 { 2159 {
2160 struct super_block *sb = seq->private; 2160 struct super_block *sb = seq->private;
2161 ext4_group_t group; 2161 ext4_group_t group;
2162 2162
2163 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2163 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2164 return NULL; 2164 return NULL;
2165 group = *pos + 1; 2165 group = *pos + 1;
2166 return (void *) ((unsigned long) group); 2166 return (void *) ((unsigned long) group);
2167 } 2167 }
2168 2168
2169 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) 2169 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2170 { 2170 {
2171 struct super_block *sb = seq->private; 2171 struct super_block *sb = seq->private;
2172 ext4_group_t group; 2172 ext4_group_t group;
2173 2173
2174 ++*pos; 2174 ++*pos;
2175 if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) 2175 if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2176 return NULL; 2176 return NULL;
2177 group = *pos + 1; 2177 group = *pos + 1;
2178 return (void *) ((unsigned long) group); 2178 return (void *) ((unsigned long) group);
2179 } 2179 }
2180 2180
2181 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) 2181 static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2182 { 2182 {
2183 struct super_block *sb = seq->private; 2183 struct super_block *sb = seq->private;
2184 ext4_group_t group = (ext4_group_t) ((unsigned long) v); 2184 ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2185 int i; 2185 int i;
2186 int err; 2186 int err;
2187 struct ext4_buddy e4b; 2187 struct ext4_buddy e4b;
2188 struct sg { 2188 struct sg {
2189 struct ext4_group_info info; 2189 struct ext4_group_info info;
2190 ext4_grpblk_t counters[16]; 2190 ext4_grpblk_t counters[16];
2191 } sg; 2191 } sg;
2192 2192
2193 group--; 2193 group--;
2194 if (group == 0) 2194 if (group == 0)
2195 seq_printf(seq, "#%-5s: %-5s %-5s %-5s " 2195 seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2196 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " 2196 "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2197 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", 2197 "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2198 "group", "free", "frags", "first", 2198 "group", "free", "frags", "first",
2199 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", 2199 "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2200 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); 2200 "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2201 2201
2202 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + 2202 i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2203 sizeof(struct ext4_group_info); 2203 sizeof(struct ext4_group_info);
2204 err = ext4_mb_load_buddy(sb, group, &e4b); 2204 err = ext4_mb_load_buddy(sb, group, &e4b);
2205 if (err) { 2205 if (err) {
2206 seq_printf(seq, "#%-5u: I/O error\n", group); 2206 seq_printf(seq, "#%-5u: I/O error\n", group);
2207 return 0; 2207 return 0;
2208 } 2208 }
2209 ext4_lock_group(sb, group); 2209 ext4_lock_group(sb, group);
2210 memcpy(&sg, ext4_get_group_info(sb, group), i); 2210 memcpy(&sg, ext4_get_group_info(sb, group), i);
2211 ext4_unlock_group(sb, group); 2211 ext4_unlock_group(sb, group);
2212 ext4_mb_unload_buddy(&e4b); 2212 ext4_mb_unload_buddy(&e4b);
2213 2213
2214 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, 2214 seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2215 sg.info.bb_fragments, sg.info.bb_first_free); 2215 sg.info.bb_fragments, sg.info.bb_first_free);
2216 for (i = 0; i <= 13; i++) 2216 for (i = 0; i <= 13; i++)
2217 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? 2217 seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2218 sg.info.bb_counters[i] : 0); 2218 sg.info.bb_counters[i] : 0);
2219 seq_printf(seq, " ]\n"); 2219 seq_printf(seq, " ]\n");
2220 2220
2221 return 0; 2221 return 0;
2222 } 2222 }
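For reference, the seq_file handlers above emit one header line (when showing group 0) followed by one line per group: group number, free block count, fragment count, first free bit, and the fourteen per-order buddy counters. The two lines below are a purely hypothetical sample with invented values and approximate column widths, shown only to illustrate the shape of the output:

    #group: free  frags first [ 2^0   2^1   2^2   2^3   2^4   2^5   2^6   2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]
    #0    : 43    3     100   [ 1     1     0     1     0     1     0     0     0     0     0     0     0     0     ]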
2223 2223
2224 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) 2224 static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2225 { 2225 {
2226 } 2226 }
2227 2227
2228 static const struct seq_operations ext4_mb_seq_groups_ops = { 2228 static const struct seq_operations ext4_mb_seq_groups_ops = {
2229 .start = ext4_mb_seq_groups_start, 2229 .start = ext4_mb_seq_groups_start,
2230 .next = ext4_mb_seq_groups_next, 2230 .next = ext4_mb_seq_groups_next,
2231 .stop = ext4_mb_seq_groups_stop, 2231 .stop = ext4_mb_seq_groups_stop,
2232 .show = ext4_mb_seq_groups_show, 2232 .show = ext4_mb_seq_groups_show,
2233 }; 2233 };
2234 2234
2235 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) 2235 static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2236 { 2236 {
2237 struct super_block *sb = PDE(inode)->data; 2237 struct super_block *sb = PDE(inode)->data;
2238 int rc; 2238 int rc;
2239 2239
2240 rc = seq_open(file, &ext4_mb_seq_groups_ops); 2240 rc = seq_open(file, &ext4_mb_seq_groups_ops);
2241 if (rc == 0) { 2241 if (rc == 0) {
2242 struct seq_file *m = file->private_data; 2242 struct seq_file *m = file->private_data;
2243 m->private = sb; 2243 m->private = sb;
2244 } 2244 }
2245 return rc; 2245 return rc;
2246 2246
2247 } 2247 }
2248 2248
2249 static const struct file_operations ext4_mb_seq_groups_fops = { 2249 static const struct file_operations ext4_mb_seq_groups_fops = {
2250 .owner = THIS_MODULE, 2250 .owner = THIS_MODULE,
2251 .open = ext4_mb_seq_groups_open, 2251 .open = ext4_mb_seq_groups_open,
2252 .read = seq_read, 2252 .read = seq_read,
2253 .llseek = seq_lseek, 2253 .llseek = seq_lseek,
2254 .release = seq_release, 2254 .release = seq_release,
2255 }; 2255 };
2256 2256
2257 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) 2257 static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2258 { 2258 {
2259 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2259 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2260 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; 2260 struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2261 2261
2262 BUG_ON(!cachep); 2262 BUG_ON(!cachep);
2263 return cachep; 2263 return cachep;
2264 } 2264 }
2265 2265
2266 /* Create and initialize ext4_group_info data for the given group. */ 2266 /* Create and initialize ext4_group_info data for the given group. */
2267 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, 2267 int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2268 struct ext4_group_desc *desc) 2268 struct ext4_group_desc *desc)
2269 { 2269 {
2270 int i; 2270 int i;
2271 int metalen = 0; 2271 int metalen = 0;
2272 struct ext4_sb_info *sbi = EXT4_SB(sb); 2272 struct ext4_sb_info *sbi = EXT4_SB(sb);
2273 struct ext4_group_info **meta_group_info; 2273 struct ext4_group_info **meta_group_info;
2274 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2274 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2275 2275
2276 /* 2276 /*
2277 * First check if this group is the first of a reserved block. 2277 * First check if this group is the first of a reserved block.
2278 * If it's true, we have to allocate a new table of pointers 2278 * If it's true, we have to allocate a new table of pointers
2279 * to ext4_group_info structures 2279 * to ext4_group_info structures
2280 */ 2280 */
2281 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { 2281 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2282 metalen = sizeof(*meta_group_info) << 2282 metalen = sizeof(*meta_group_info) <<
2283 EXT4_DESC_PER_BLOCK_BITS(sb); 2283 EXT4_DESC_PER_BLOCK_BITS(sb);
2284 meta_group_info = kmalloc(metalen, GFP_KERNEL); 2284 meta_group_info = kmalloc(metalen, GFP_KERNEL);
2285 if (meta_group_info == NULL) { 2285 if (meta_group_info == NULL) {
2286 printk(KERN_ERR "EXT4-fs: can't allocate mem for a " 2286 printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2287 "buddy group\n"); 2287 "buddy group\n");
2288 goto exit_meta_group_info; 2288 goto exit_meta_group_info;
2289 } 2289 }
2290 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = 2290 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2291 meta_group_info; 2291 meta_group_info;
2292 } 2292 }
2293 2293
2294 meta_group_info = 2294 meta_group_info =
2295 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; 2295 sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2296 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); 2296 i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2297 2297
2298 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL); 2298 meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2299 if (meta_group_info[i] == NULL) { 2299 if (meta_group_info[i] == NULL) {
2300 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); 2300 printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2301 goto exit_group_info; 2301 goto exit_group_info;
2302 } 2302 }
2303 memset(meta_group_info[i], 0, kmem_cache_size(cachep)); 2303 memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2304 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, 2304 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2305 &(meta_group_info[i]->bb_state)); 2305 &(meta_group_info[i]->bb_state));
2306 2306
2307 /* 2307 /*
2308 * initialize bb_free to be able to skip 2308 * initialize bb_free to be able to skip
2309 * empty groups without initialization 2309 * empty groups without initialization
2310 */ 2310 */
2311 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2311 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2312 meta_group_info[i]->bb_free = 2312 meta_group_info[i]->bb_free =
2313 ext4_free_blocks_after_init(sb, group, desc); 2313 ext4_free_blocks_after_init(sb, group, desc);
2314 } else { 2314 } else {
2315 meta_group_info[i]->bb_free = 2315 meta_group_info[i]->bb_free =
2316 ext4_free_blks_count(sb, desc); 2316 ext4_free_blks_count(sb, desc);
2317 } 2317 }
2318 2318
2319 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 2319 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2320 init_rwsem(&meta_group_info[i]->alloc_sem); 2320 init_rwsem(&meta_group_info[i]->alloc_sem);
2321 meta_group_info[i]->bb_free_root = RB_ROOT; 2321 meta_group_info[i]->bb_free_root = RB_ROOT;
2322 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 2322 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */
2323 2323
2324 #ifdef DOUBLE_CHECK 2324 #ifdef DOUBLE_CHECK
2325 { 2325 {
2326 struct buffer_head *bh; 2326 struct buffer_head *bh;
2327 meta_group_info[i]->bb_bitmap = 2327 meta_group_info[i]->bb_bitmap =
2328 kmalloc(sb->s_blocksize, GFP_KERNEL); 2328 kmalloc(sb->s_blocksize, GFP_KERNEL);
2329 BUG_ON(meta_group_info[i]->bb_bitmap == NULL); 2329 BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2330 bh = ext4_read_block_bitmap(sb, group); 2330 bh = ext4_read_block_bitmap(sb, group);
2331 BUG_ON(bh == NULL); 2331 BUG_ON(bh == NULL);
2332 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, 2332 memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2333 sb->s_blocksize); 2333 sb->s_blocksize);
2334 put_bh(bh); 2334 put_bh(bh);
2335 } 2335 }
2336 #endif 2336 #endif
2337 2337
2338 return 0; 2338 return 0;
2339 2339
2340 exit_group_info: 2340 exit_group_info:
2341 /* If a meta_group_info table has been allocated, release it now */ 2341 /* If a meta_group_info table has been allocated, release it now */
2342 if (group % EXT4_DESC_PER_BLOCK(sb) == 0) 2342 if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
2343 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); 2343 kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2344 exit_meta_group_info: 2344 exit_meta_group_info:
2345 return -ENOMEM; 2345 return -ENOMEM;
2346 } /* ext4_mb_add_groupinfo */ 2346 } /* ext4_mb_add_groupinfo */
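The function above fills in a two-level table: sbi->s_group_info holds one pointer per descriptor block, and each entry points at an array of EXT4_DESC_PER_BLOCK(sb) ext4_group_info pointers. As a hedged illustration of the lookup arithmetic only (example_get_group_info is a made-up helper, not the kernel's ext4_get_group_info()):

	/* Illustrative two-level lookup mirroring the indexing used above. */
	static struct ext4_group_info *
	example_get_group_info(struct super_block *sb, ext4_group_t group)
	{
		struct ext4_sb_info *sbi = EXT4_SB(sb);
		ext4_group_t table = group >> EXT4_DESC_PER_BLOCK_BITS(sb); /* which pointer table */
		ext4_group_t slot  = group & (EXT4_DESC_PER_BLOCK(sb) - 1); /* slot within it */

		return sbi->s_group_info[table][slot];
	}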
2347 2347
2348 static int ext4_mb_init_backend(struct super_block *sb) 2348 static int ext4_mb_init_backend(struct super_block *sb)
2349 { 2349 {
2350 ext4_group_t ngroups = ext4_get_groups_count(sb); 2350 ext4_group_t ngroups = ext4_get_groups_count(sb);
2351 ext4_group_t i; 2351 ext4_group_t i;
2352 struct ext4_sb_info *sbi = EXT4_SB(sb); 2352 struct ext4_sb_info *sbi = EXT4_SB(sb);
2353 struct ext4_super_block *es = sbi->s_es; 2353 struct ext4_super_block *es = sbi->s_es;
2354 int num_meta_group_infos; 2354 int num_meta_group_infos;
2355 int num_meta_group_infos_max; 2355 int num_meta_group_infos_max;
2356 int array_size; 2356 int array_size;
2357 struct ext4_group_desc *desc; 2357 struct ext4_group_desc *desc;
2358 struct kmem_cache *cachep; 2358 struct kmem_cache *cachep;
2359 2359
2360 /* This is the number of blocks used by GDT */ 2360 /* This is the number of blocks used by GDT */
2361 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 2361 num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
2362 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); 2362 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2363 2363
2364 /* 2364 /*
2365 * This is the total number of blocks used by GDT including 2365 * This is the total number of blocks used by GDT including
2366 * the number of reserved blocks for GDT. 2366 * the number of reserved blocks for GDT.
2367 * The s_group_info array is allocated with this value 2367 * The s_group_info array is allocated with this value
2368 * to allow a clean online resize without a complex 2368 * to allow a clean online resize without a complex
2369 * manipulation of pointers. 2369 * manipulation of pointers.
2370 * The drawback is the unused memory when no resize 2370 * The drawback is the unused memory when no resize
2371 * occurs but it's very low in terms of pages 2371 * occurs but it's very low in terms of pages
2372 * (see comments below) 2372 * (see comments below)
2373 * Need to handle this properly when META_BG resizing is allowed 2373 * Need to handle this properly when META_BG resizing is allowed
2374 */ 2374 */
2375 num_meta_group_infos_max = num_meta_group_infos + 2375 num_meta_group_infos_max = num_meta_group_infos +
2376 le16_to_cpu(es->s_reserved_gdt_blocks); 2376 le16_to_cpu(es->s_reserved_gdt_blocks);
2377 2377
2378 /* 2378 /*
2379 * array_size is the size of s_group_info array. We round it 2379 * array_size is the size of s_group_info array. We round it
2380 * to the next power of two because this approximation is done 2380 * to the next power of two because this approximation is done
2381 * internally by kmalloc so we can have some more memory 2381 * internally by kmalloc so we can have some more memory
2382 * for free here (e.g. may be used for META_BG resize). 2382 * for free here (e.g. may be used for META_BG resize).
2383 */ 2383 */
2384 array_size = 1; 2384 array_size = 1;
2385 while (array_size < sizeof(*sbi->s_group_info) * 2385 while (array_size < sizeof(*sbi->s_group_info) *
2386 num_meta_group_infos_max) 2386 num_meta_group_infos_max)
2387 array_size = array_size << 1; 2387 array_size = array_size << 1;
2388 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte 2388 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2389 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. 2389 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2390 * So a two level scheme suffices for now. */ 2390 * So a two level scheme suffices for now. */
2391 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL); 2391 sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
2392 if (sbi->s_group_info == NULL) { 2392 if (sbi->s_group_info == NULL) {
2393 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); 2393 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2394 return -ENOMEM; 2394 return -ENOMEM;
2395 } 2395 }
2396 sbi->s_buddy_cache = new_inode(sb); 2396 sbi->s_buddy_cache = new_inode(sb);
2397 if (sbi->s_buddy_cache == NULL) { 2397 if (sbi->s_buddy_cache == NULL) {
2398 printk(KERN_ERR "EXT4-fs: can't get new inode\n"); 2398 printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2399 goto err_freesgi; 2399 goto err_freesgi;
2400 } 2400 }
2401 sbi->s_buddy_cache->i_ino = get_next_ino(); 2401 sbi->s_buddy_cache->i_ino = get_next_ino();
2402 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 2402 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2403 for (i = 0; i < ngroups; i++) { 2403 for (i = 0; i < ngroups; i++) {
2404 desc = ext4_get_group_desc(sb, i, NULL); 2404 desc = ext4_get_group_desc(sb, i, NULL);
2405 if (desc == NULL) { 2405 if (desc == NULL) {
2406 printk(KERN_ERR 2406 printk(KERN_ERR
2407 "EXT4-fs: can't read descriptor %u\n", i); 2407 "EXT4-fs: can't read descriptor %u\n", i);
2408 goto err_freebuddy; 2408 goto err_freebuddy;
2409 } 2409 }
2410 if (ext4_mb_add_groupinfo(sb, i, desc) != 0) 2410 if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2411 goto err_freebuddy; 2411 goto err_freebuddy;
2412 } 2412 }
2413 2413
2414 return 0; 2414 return 0;
2415 2415
2416 err_freebuddy: 2416 err_freebuddy:
2417 cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2417 cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2418 while (i-- > 0) 2418 while (i-- > 0)
2419 kmem_cache_free(cachep, ext4_get_group_info(sb, i)); 2419 kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2420 i = num_meta_group_infos; 2420 i = num_meta_group_infos;
2421 while (i-- > 0) 2421 while (i-- > 0)
2422 kfree(sbi->s_group_info[i]); 2422 kfree(sbi->s_group_info[i]);
2423 iput(sbi->s_buddy_cache); 2423 iput(sbi->s_buddy_cache);
2424 err_freesgi: 2424 err_freesgi:
2425 kfree(sbi->s_group_info); 2425 kfree(sbi->s_group_info);
2426 return -ENOMEM; 2426 return -ENOMEM;
2427 } 2427 }
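To make the sizing above concrete, here is a stand-alone sketch of the same arithmetic with assumed numbers (32768 groups, 128 descriptors per block, and 256 reserved GDT blocks are illustrative values, not read from any real superblock):

	#include <stdio.h>

	int main(void)
	{
		unsigned long ngroups = 32768;		/* assumed number of block groups */
		unsigned long desc_per_block = 128;	/* assumed descriptors per block */
		unsigned long reserved_gdt = 256;	/* assumed s_reserved_gdt_blocks */

		unsigned long num_meta = (ngroups + desc_per_block - 1) / desc_per_block;
		unsigned long num_meta_max = num_meta + reserved_gdt;

		/* round the pointer-array size up to a power of two, as the code above does */
		unsigned long array_size = 1;
		while (array_size < sizeof(void *) * num_meta_max)
			array_size <<= 1;

		printf("num_meta_group_infos=%lu array_size=%lu bytes\n",
		       num_meta, array_size);
		return 0;
	}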
2428 2428
2429 static void ext4_groupinfo_destroy_slabs(void) 2429 static void ext4_groupinfo_destroy_slabs(void)
2430 { 2430 {
2431 int i; 2431 int i;
2432 2432
2433 for (i = 0; i < NR_GRPINFO_CACHES; i++) { 2433 for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2434 if (ext4_groupinfo_caches[i]) 2434 if (ext4_groupinfo_caches[i])
2435 kmem_cache_destroy(ext4_groupinfo_caches[i]); 2435 kmem_cache_destroy(ext4_groupinfo_caches[i]);
2436 ext4_groupinfo_caches[i] = NULL; 2436 ext4_groupinfo_caches[i] = NULL;
2437 } 2437 }
2438 } 2438 }
2439 2439
2440 static int ext4_groupinfo_create_slab(size_t size) 2440 static int ext4_groupinfo_create_slab(size_t size)
2441 { 2441 {
2442 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); 2442 static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2443 int slab_size; 2443 int slab_size;
2444 int blocksize_bits = order_base_2(size); 2444 int blocksize_bits = order_base_2(size);
2445 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; 2445 int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2446 struct kmem_cache *cachep; 2446 struct kmem_cache *cachep;
2447 2447
2448 if (cache_index >= NR_GRPINFO_CACHES) 2448 if (cache_index >= NR_GRPINFO_CACHES)
2449 return -EINVAL; 2449 return -EINVAL;
2450 2450
2451 if (unlikely(cache_index < 0)) 2451 if (unlikely(cache_index < 0))
2452 cache_index = 0; 2452 cache_index = 0;
2453 2453
2454 mutex_lock(&ext4_grpinfo_slab_create_mutex); 2454 mutex_lock(&ext4_grpinfo_slab_create_mutex);
2455 if (ext4_groupinfo_caches[cache_index]) { 2455 if (ext4_groupinfo_caches[cache_index]) {
2456 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2456 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2457 return 0; /* Already created */ 2457 return 0; /* Already created */
2458 } 2458 }
2459 2459
2460 slab_size = offsetof(struct ext4_group_info, 2460 slab_size = offsetof(struct ext4_group_info,
2461 bb_counters[blocksize_bits + 2]); 2461 bb_counters[blocksize_bits + 2]);
2462 2462
2463 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], 2463 cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2464 slab_size, 0, SLAB_RECLAIM_ACCOUNT, 2464 slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2465 NULL); 2465 NULL);
2466 2466
2467 mutex_unlock(&ext4_grpinfo_slab_create_mutex); 2467 mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2468 if (!cachep) { 2468 if (!cachep) {
2469 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n"); 2469 printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2470 return -ENOMEM; 2470 return -ENOMEM;
2471 } 2471 }
2472 2472
2473 ext4_groupinfo_caches[cache_index] = cachep; 2473 ext4_groupinfo_caches[cache_index] = cachep;
2474 2474
2475 return 0; 2475 return 0;
2476 } 2476 }
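The slab object size above is blocksize-dependent because struct ext4_group_info ends in a flexible bb_counters[] array with one slot per buddy order (0 through blocksize_bits + 1). A hedged sketch of the same offsetof() sizing idiom applied to a made-up structure:

	#include <stddef.h>

	struct example_group_info {
		unsigned long state;
		unsigned short counters[];	/* one counter per buddy order */
	};

	/* object size for a given block size; mirrors
	 * offsetof(struct ext4_group_info, bb_counters[blocksize_bits + 2]) */
	static size_t example_slab_size(int blocksize_bits)
	{
		return offsetof(struct example_group_info,
				counters[blocksize_bits + 2]);
	}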
2477 2477
2478 int ext4_mb_init(struct super_block *sb, int needs_recovery) 2478 int ext4_mb_init(struct super_block *sb, int needs_recovery)
2479 { 2479 {
2480 struct ext4_sb_info *sbi = EXT4_SB(sb); 2480 struct ext4_sb_info *sbi = EXT4_SB(sb);
2481 unsigned i, j; 2481 unsigned i, j;
2482 unsigned offset; 2482 unsigned offset;
2483 unsigned max; 2483 unsigned max;
2484 int ret; 2484 int ret;
2485 2485
2486 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); 2486 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2487 2487
2488 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); 2488 sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2489 if (sbi->s_mb_offsets == NULL) { 2489 if (sbi->s_mb_offsets == NULL) {
2490 ret = -ENOMEM; 2490 ret = -ENOMEM;
2491 goto out; 2491 goto out;
2492 } 2492 }
2493 2493
2494 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); 2494 i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2495 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); 2495 sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2496 if (sbi->s_mb_maxs == NULL) { 2496 if (sbi->s_mb_maxs == NULL) {
2497 ret = -ENOMEM; 2497 ret = -ENOMEM;
2498 goto out; 2498 goto out;
2499 } 2499 }
2500 2500
2501 ret = ext4_groupinfo_create_slab(sb->s_blocksize); 2501 ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2502 if (ret < 0) 2502 if (ret < 0)
2503 goto out; 2503 goto out;
2504 2504
2505 /* order 0 is regular bitmap */ 2505 /* order 0 is regular bitmap */
2506 sbi->s_mb_maxs[0] = sb->s_blocksize << 3; 2506 sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2507 sbi->s_mb_offsets[0] = 0; 2507 sbi->s_mb_offsets[0] = 0;
2508 2508
2509 i = 1; 2509 i = 1;
2510 offset = 0; 2510 offset = 0;
2511 max = sb->s_blocksize << 2; 2511 max = sb->s_blocksize << 2;
2512 do { 2512 do {
2513 sbi->s_mb_offsets[i] = offset; 2513 sbi->s_mb_offsets[i] = offset;
2514 sbi->s_mb_maxs[i] = max; 2514 sbi->s_mb_maxs[i] = max;
2515 offset += 1 << (sb->s_blocksize_bits - i); 2515 offset += 1 << (sb->s_blocksize_bits - i);
2516 max = max >> 1; 2516 max = max >> 1;
2517 i++; 2517 i++;
2518 } while (i <= sb->s_blocksize_bits + 1); 2518 } while (i <= sb->s_blocksize_bits + 1);
2519 2519
2520 /* init file for buddy data */ 2520 /* init file for buddy data */
2521 ret = ext4_mb_init_backend(sb); 2521 ret = ext4_mb_init_backend(sb);
2522 if (ret != 0) { 2522 if (ret != 0) {
2523 goto out; 2523 goto out;
2524 } 2524 }
2525 2525
2526 spin_lock_init(&sbi->s_md_lock); 2526 spin_lock_init(&sbi->s_md_lock);
2527 spin_lock_init(&sbi->s_bal_lock); 2527 spin_lock_init(&sbi->s_bal_lock);
2528 2528
2529 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; 2529 sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2530 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; 2530 sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2531 sbi->s_mb_stats = MB_DEFAULT_STATS; 2531 sbi->s_mb_stats = MB_DEFAULT_STATS;
2532 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; 2532 sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2533 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; 2533 sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2534 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; 2534 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2535 2535
2536 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 2536 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2537 if (sbi->s_locality_groups == NULL) { 2537 if (sbi->s_locality_groups == NULL) {
2538 ret = -ENOMEM; 2538 ret = -ENOMEM;
2539 goto out; 2539 goto out;
2540 } 2540 }
2541 for_each_possible_cpu(i) { 2541 for_each_possible_cpu(i) {
2542 struct ext4_locality_group *lg; 2542 struct ext4_locality_group *lg;
2543 lg = per_cpu_ptr(sbi->s_locality_groups, i); 2543 lg = per_cpu_ptr(sbi->s_locality_groups, i);
2544 mutex_init(&lg->lg_mutex); 2544 mutex_init(&lg->lg_mutex);
2545 for (j = 0; j < PREALLOC_TB_SIZE; j++) 2545 for (j = 0; j < PREALLOC_TB_SIZE; j++)
2546 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); 2546 INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2547 spin_lock_init(&lg->lg_prealloc_lock); 2547 spin_lock_init(&lg->lg_prealloc_lock);
2548 } 2548 }
2549 2549
2550 if (sbi->s_proc) 2550 if (sbi->s_proc)
2551 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, 2551 proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2552 &ext4_mb_seq_groups_fops, sb); 2552 &ext4_mb_seq_groups_fops, sb);
2553 2553
2554 if (sbi->s_journal) 2554 if (sbi->s_journal)
2555 sbi->s_journal->j_commit_callback = release_blocks_on_commit; 2555 sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2556 out: 2556 out:
2557 if (ret) { 2557 if (ret) {
2558 kfree(sbi->s_mb_offsets); 2558 kfree(sbi->s_mb_offsets);
2559 kfree(sbi->s_mb_maxs); 2559 kfree(sbi->s_mb_maxs);
2560 } 2560 }
2561 return ret; 2561 return ret;
2562 } 2562 }
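The do/while loop above packs one buddy bitmap per order after the regular block bitmap: each higher order has half as many bits and starts 1 << (blocksize_bits - order) bytes after the previous one. A user-space sketch of the resulting offsets/maxs tables for an assumed 4 KiB block size:

	#include <stdio.h>

	int main(void)
	{
		int bits = 12;				/* assumed 4 KiB block size */
		unsigned blocksize = 1U << bits;
		unsigned offsets[16], maxs[16];
		unsigned offset = 0, max = blocksize << 2;
		int i;

		maxs[0] = blocksize << 3;		/* order 0 is the real block bitmap */
		offsets[0] = 0;
		for (i = 1; i <= bits + 1; i++) {
			offsets[i] = offset;
			maxs[i] = max;
			offset += 1U << (bits - i);	/* bytes consumed by this order */
			max >>= 1;
		}
		for (i = 0; i <= bits + 1; i++)
			printf("order %2d: offset=%5u max=%6u\n", i, offsets[i], maxs[i]);
		return 0;
	}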
2563 2563
2564 /* needs to be called with the ext4 group lock held */ 2564 /* needs to be called with the ext4 group lock held */
2565 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) 2565 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2566 { 2566 {
2567 struct ext4_prealloc_space *pa; 2567 struct ext4_prealloc_space *pa;
2568 struct list_head *cur, *tmp; 2568 struct list_head *cur, *tmp;
2569 int count = 0; 2569 int count = 0;
2570 2570
2571 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { 2571 list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2572 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 2572 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2573 list_del(&pa->pa_group_list); 2573 list_del(&pa->pa_group_list);
2574 count++; 2574 count++;
2575 kmem_cache_free(ext4_pspace_cachep, pa); 2575 kmem_cache_free(ext4_pspace_cachep, pa);
2576 } 2576 }
2577 if (count) 2577 if (count)
2578 mb_debug(1, "mballoc: %u PAs left\n", count); 2578 mb_debug(1, "mballoc: %u PAs left\n", count);
2579 2579
2580 } 2580 }
2581 2581
2582 int ext4_mb_release(struct super_block *sb) 2582 int ext4_mb_release(struct super_block *sb)
2583 { 2583 {
2584 ext4_group_t ngroups = ext4_get_groups_count(sb); 2584 ext4_group_t ngroups = ext4_get_groups_count(sb);
2585 ext4_group_t i; 2585 ext4_group_t i;
2586 int num_meta_group_infos; 2586 int num_meta_group_infos;
2587 struct ext4_group_info *grinfo; 2587 struct ext4_group_info *grinfo;
2588 struct ext4_sb_info *sbi = EXT4_SB(sb); 2588 struct ext4_sb_info *sbi = EXT4_SB(sb);
2589 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); 2589 struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2590 2590
2591 if (sbi->s_group_info) { 2591 if (sbi->s_group_info) {
2592 for (i = 0; i < ngroups; i++) { 2592 for (i = 0; i < ngroups; i++) {
2593 grinfo = ext4_get_group_info(sb, i); 2593 grinfo = ext4_get_group_info(sb, i);
2594 #ifdef DOUBLE_CHECK 2594 #ifdef DOUBLE_CHECK
2595 kfree(grinfo->bb_bitmap); 2595 kfree(grinfo->bb_bitmap);
2596 #endif 2596 #endif
2597 ext4_lock_group(sb, i); 2597 ext4_lock_group(sb, i);
2598 ext4_mb_cleanup_pa(grinfo); 2598 ext4_mb_cleanup_pa(grinfo);
2599 ext4_unlock_group(sb, i); 2599 ext4_unlock_group(sb, i);
2600 kmem_cache_free(cachep, grinfo); 2600 kmem_cache_free(cachep, grinfo);
2601 } 2601 }
2602 num_meta_group_infos = (ngroups + 2602 num_meta_group_infos = (ngroups +
2603 EXT4_DESC_PER_BLOCK(sb) - 1) >> 2603 EXT4_DESC_PER_BLOCK(sb) - 1) >>
2604 EXT4_DESC_PER_BLOCK_BITS(sb); 2604 EXT4_DESC_PER_BLOCK_BITS(sb);
2605 for (i = 0; i < num_meta_group_infos; i++) 2605 for (i = 0; i < num_meta_group_infos; i++)
2606 kfree(sbi->s_group_info[i]); 2606 kfree(sbi->s_group_info[i]);
2607 kfree(sbi->s_group_info); 2607 kfree(sbi->s_group_info);
2608 } 2608 }
2609 kfree(sbi->s_mb_offsets); 2609 kfree(sbi->s_mb_offsets);
2610 kfree(sbi->s_mb_maxs); 2610 kfree(sbi->s_mb_maxs);
2611 if (sbi->s_buddy_cache) 2611 if (sbi->s_buddy_cache)
2612 iput(sbi->s_buddy_cache); 2612 iput(sbi->s_buddy_cache);
2613 if (sbi->s_mb_stats) { 2613 if (sbi->s_mb_stats) {
2614 printk(KERN_INFO 2614 printk(KERN_INFO
2615 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n", 2615 "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
2616 atomic_read(&sbi->s_bal_allocated), 2616 atomic_read(&sbi->s_bal_allocated),
2617 atomic_read(&sbi->s_bal_reqs), 2617 atomic_read(&sbi->s_bal_reqs),
2618 atomic_read(&sbi->s_bal_success)); 2618 atomic_read(&sbi->s_bal_success));
2619 printk(KERN_INFO 2619 printk(KERN_INFO
2620 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, " 2620 "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
2621 "%u 2^N hits, %u breaks, %u lost\n", 2621 "%u 2^N hits, %u breaks, %u lost\n",
2622 atomic_read(&sbi->s_bal_ex_scanned), 2622 atomic_read(&sbi->s_bal_ex_scanned),
2623 atomic_read(&sbi->s_bal_goals), 2623 atomic_read(&sbi->s_bal_goals),
2624 atomic_read(&sbi->s_bal_2orders), 2624 atomic_read(&sbi->s_bal_2orders),
2625 atomic_read(&sbi->s_bal_breaks), 2625 atomic_read(&sbi->s_bal_breaks),
2626 atomic_read(&sbi->s_mb_lost_chunks)); 2626 atomic_read(&sbi->s_mb_lost_chunks));
2627 printk(KERN_INFO 2627 printk(KERN_INFO
2628 "EXT4-fs: mballoc: %lu generated and it took %Lu\n", 2628 "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
2629 sbi->s_mb_buddies_generated++, 2629 sbi->s_mb_buddies_generated++,
2630 sbi->s_mb_generation_time); 2630 sbi->s_mb_generation_time);
2631 printk(KERN_INFO 2631 printk(KERN_INFO
2632 "EXT4-fs: mballoc: %u preallocated, %u discarded\n", 2632 "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
2633 atomic_read(&sbi->s_mb_preallocated), 2633 atomic_read(&sbi->s_mb_preallocated),
2634 atomic_read(&sbi->s_mb_discarded)); 2634 atomic_read(&sbi->s_mb_discarded));
2635 } 2635 }
2636 2636
2637 free_percpu(sbi->s_locality_groups); 2637 free_percpu(sbi->s_locality_groups);
2638 if (sbi->s_proc) 2638 if (sbi->s_proc)
2639 remove_proc_entry("mb_groups", sbi->s_proc); 2639 remove_proc_entry("mb_groups", sbi->s_proc);
2640 2640
2641 return 0; 2641 return 0;
2642 } 2642 }
2643 2643
2644 static inline int ext4_issue_discard(struct super_block *sb, 2644 static inline int ext4_issue_discard(struct super_block *sb,
2645 ext4_group_t block_group, ext4_grpblk_t block, int count) 2645 ext4_group_t block_group, ext4_grpblk_t block, int count)
2646 { 2646 {
2647 ext4_fsblk_t discard_block; 2647 ext4_fsblk_t discard_block;
2648 2648
2649 discard_block = block + ext4_group_first_block_no(sb, block_group); 2649 discard_block = block + ext4_group_first_block_no(sb, block_group);
2650 trace_ext4_discard_blocks(sb, 2650 trace_ext4_discard_blocks(sb,
2651 (unsigned long long) discard_block, count); 2651 (unsigned long long) discard_block, count);
2652 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); 2652 return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2653 } 2653 }
2654 2654
2655 /* 2655 /*
2656 * This function is called by the jbd2 layer once the commit has finished, 2656 * This function is called by the jbd2 layer once the commit has finished,
2657 * so we know we can free the blocks that were released with that commit. 2657 * so we know we can free the blocks that were released with that commit.
2658 */ 2658 */
2659 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) 2659 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2660 { 2660 {
2661 struct super_block *sb = journal->j_private; 2661 struct super_block *sb = journal->j_private;
2662 struct ext4_buddy e4b; 2662 struct ext4_buddy e4b;
2663 struct ext4_group_info *db; 2663 struct ext4_group_info *db;
2664 int err, count = 0, count2 = 0; 2664 int err, count = 0, count2 = 0;
2665 struct ext4_free_data *entry; 2665 struct ext4_free_data *entry;
2666 struct list_head *l, *ltmp; 2666 struct list_head *l, *ltmp;
2667 2667
2668 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2668 list_for_each_safe(l, ltmp, &txn->t_private_list) {
2669 entry = list_entry(l, struct ext4_free_data, list); 2669 entry = list_entry(l, struct ext4_free_data, list);
2670 2670
2671 mb_debug(1, "gonna free %u blocks in group %u (0x%p):", 2671 mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2672 entry->count, entry->group, entry); 2672 entry->count, entry->group, entry);
2673 2673
2674 if (test_opt(sb, DISCARD)) 2674 if (test_opt(sb, DISCARD))
2675 ext4_issue_discard(sb, entry->group, 2675 ext4_issue_discard(sb, entry->group,
2676 entry->start_blk, entry->count); 2676 entry->start_blk, entry->count);
2677 2677
2678 err = ext4_mb_load_buddy(sb, entry->group, &e4b); 2678 err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2679 /* we expect to find existing buddy because it's pinned */ 2679 /* we expect to find existing buddy because it's pinned */
2680 BUG_ON(err != 0); 2680 BUG_ON(err != 0);
2681 2681
2682 db = e4b.bd_info; 2682 db = e4b.bd_info;
2683 /* there are blocks to put in buddy to make them really free */ 2683 /* there are blocks to put in buddy to make them really free */
2684 count += entry->count; 2684 count += entry->count;
2685 count2++; 2685 count2++;
2686 ext4_lock_group(sb, entry->group); 2686 ext4_lock_group(sb, entry->group);
2687 /* Take it out of per group rb tree */ 2687 /* Take it out of per group rb tree */
2688 rb_erase(&entry->node, &(db->bb_free_root)); 2688 rb_erase(&entry->node, &(db->bb_free_root));
2689 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); 2689 mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
2690 2690
2691 if (!db->bb_free_root.rb_node) { 2691 if (!db->bb_free_root.rb_node) {
2692 /* No more items in the per group rb tree 2692 /* No more items in the per group rb tree
2693 * balance refcounts from ext4_mb_free_metadata() 2693 * balance refcounts from ext4_mb_free_metadata()
2694 */ 2694 */
2695 page_cache_release(e4b.bd_buddy_page); 2695 page_cache_release(e4b.bd_buddy_page);
2696 page_cache_release(e4b.bd_bitmap_page); 2696 page_cache_release(e4b.bd_bitmap_page);
2697 } 2697 }
2698 ext4_unlock_group(sb, entry->group); 2698 ext4_unlock_group(sb, entry->group);
2699 kmem_cache_free(ext4_free_ext_cachep, entry); 2699 kmem_cache_free(ext4_free_ext_cachep, entry);
2700 ext4_mb_unload_buddy(&e4b); 2700 ext4_mb_unload_buddy(&e4b);
2701 } 2701 }
2702 2702
2703 mb_debug(1, "freed %u blocks in %u structures\n", count, count2); 2703 mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2704 } 2704 }
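This callback is the mechanism the commit message relies on: freed extents sit on the transaction's t_private_list and are only folded back into the buddy bitmaps after the commit completes, which is why undo access is no longer needed. As a hedged sketch of the same pattern (struct my_free_entry and reclaim_blocks() are invented stand-ins, not kernel symbols):

	/* Hypothetical jbd2 commit callback draining a per-transaction private list. */
	static void example_commit_callback(journal_t *journal, transaction_t *txn)
	{
		struct list_head *pos, *tmp;

		list_for_each_safe(pos, tmp, &txn->t_private_list) {
			struct my_free_entry *entry =
				list_entry(pos, struct my_free_entry, list);

			/* the commit is durable now, so these blocks may be reused */
			reclaim_blocks(entry);		/* assumed helper */
			list_del(&entry->list);
			kfree(entry);
		}
	}

It would be hooked up the same way ext4_mb_init() does above, by assigning it to journal->j_commit_callback.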
2705 2705
2706 #ifdef CONFIG_EXT4_DEBUG 2706 #ifdef CONFIG_EXT4_DEBUG
2707 u8 mb_enable_debug __read_mostly; 2707 u8 mb_enable_debug __read_mostly;
2708 2708
2709 static struct dentry *debugfs_dir; 2709 static struct dentry *debugfs_dir;
2710 static struct dentry *debugfs_debug; 2710 static struct dentry *debugfs_debug;
2711 2711
2712 static void __init ext4_create_debugfs_entry(void) 2712 static void __init ext4_create_debugfs_entry(void)
2713 { 2713 {
2714 debugfs_dir = debugfs_create_dir("ext4", NULL); 2714 debugfs_dir = debugfs_create_dir("ext4", NULL);
2715 if (debugfs_dir) 2715 if (debugfs_dir)
2716 debugfs_debug = debugfs_create_u8("mballoc-debug", 2716 debugfs_debug = debugfs_create_u8("mballoc-debug",
2717 S_IRUGO | S_IWUSR, 2717 S_IRUGO | S_IWUSR,
2718 debugfs_dir, 2718 debugfs_dir,
2719 &mb_enable_debug); 2719 &mb_enable_debug);
2720 } 2720 }
2721 2721
2722 static void ext4_remove_debugfs_entry(void) 2722 static void ext4_remove_debugfs_entry(void)
2723 { 2723 {
2724 debugfs_remove(debugfs_debug); 2724 debugfs_remove(debugfs_debug);
2725 debugfs_remove(debugfs_dir); 2725 debugfs_remove(debugfs_dir);
2726 } 2726 }
2727 2727
2728 #else 2728 #else
2729 2729
2730 static void __init ext4_create_debugfs_entry(void) 2730 static void __init ext4_create_debugfs_entry(void)
2731 { 2731 {
2732 } 2732 }
2733 2733
2734 static void ext4_remove_debugfs_entry(void) 2734 static void ext4_remove_debugfs_entry(void)
2735 { 2735 {
2736 } 2736 }
2737 2737
2738 #endif 2738 #endif
2739 2739
2740 int __init ext4_init_mballoc(void) 2740 int __init ext4_init_mballoc(void)
2741 { 2741 {
2742 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, 2742 ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2743 SLAB_RECLAIM_ACCOUNT); 2743 SLAB_RECLAIM_ACCOUNT);
2744 if (ext4_pspace_cachep == NULL) 2744 if (ext4_pspace_cachep == NULL)
2745 return -ENOMEM; 2745 return -ENOMEM;
2746 2746
2747 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, 2747 ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2748 SLAB_RECLAIM_ACCOUNT); 2748 SLAB_RECLAIM_ACCOUNT);
2749 if (ext4_ac_cachep == NULL) { 2749 if (ext4_ac_cachep == NULL) {
2750 kmem_cache_destroy(ext4_pspace_cachep); 2750 kmem_cache_destroy(ext4_pspace_cachep);
2751 return -ENOMEM; 2751 return -ENOMEM;
2752 } 2752 }
2753 2753
2754 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data, 2754 ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
2755 SLAB_RECLAIM_ACCOUNT); 2755 SLAB_RECLAIM_ACCOUNT);
2756 if (ext4_free_ext_cachep == NULL) { 2756 if (ext4_free_ext_cachep == NULL) {
2757 kmem_cache_destroy(ext4_pspace_cachep); 2757 kmem_cache_destroy(ext4_pspace_cachep);
2758 kmem_cache_destroy(ext4_ac_cachep); 2758 kmem_cache_destroy(ext4_ac_cachep);
2759 return -ENOMEM; 2759 return -ENOMEM;
2760 } 2760 }
2761 ext4_create_debugfs_entry(); 2761 ext4_create_debugfs_entry();
2762 return 0; 2762 return 0;
2763 } 2763 }
2764 2764
2765 void ext4_exit_mballoc(void) 2765 void ext4_exit_mballoc(void)
2766 { 2766 {
2767 /* 2767 /*
2768 * Wait for completion of call_rcu()'s on ext4_pspace_cachep 2768 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2769 * before destroying the slab cache. 2769 * before destroying the slab cache.
2770 */ 2770 */
2771 rcu_barrier(); 2771 rcu_barrier();
2772 kmem_cache_destroy(ext4_pspace_cachep); 2772 kmem_cache_destroy(ext4_pspace_cachep);
2773 kmem_cache_destroy(ext4_ac_cachep); 2773 kmem_cache_destroy(ext4_ac_cachep);
2774 kmem_cache_destroy(ext4_free_ext_cachep); 2774 kmem_cache_destroy(ext4_free_ext_cachep);
2775 ext4_groupinfo_destroy_slabs(); 2775 ext4_groupinfo_destroy_slabs();
2776 ext4_remove_debugfs_entry(); 2776 ext4_remove_debugfs_entry();
2777 } 2777 }
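The rcu_barrier() above matters because preallocation objects are freed through call_rcu(); destroying the slab first could let a still-pending callback free into a dead cache. A minimal sketch of that teardown ordering, assuming an existing cache pointer:

	/* General pattern: flush pending call_rcu() callbacks, then destroy the cache. */
	static void example_cache_exit(struct kmem_cache *cachep)
	{
		rcu_barrier();			/* wait for every queued RCU callback */
		kmem_cache_destroy(cachep);	/* safe: nothing can free into it anymore */
	}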
2778 2778
2779 2779
2780 /* 2780 /*
2781 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps 2781 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2782 * Returns 0 if success or error code 2782 * Returns 0 if success or error code
2783 */ 2783 */
2784 static noinline_for_stack int 2784 static noinline_for_stack int
2785 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 2785 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2786 handle_t *handle, unsigned int reserv_blks) 2786 handle_t *handle, unsigned int reserv_blks)
2787 { 2787 {
2788 struct buffer_head *bitmap_bh = NULL; 2788 struct buffer_head *bitmap_bh = NULL;
2789 struct ext4_group_desc *gdp; 2789 struct ext4_group_desc *gdp;
2790 struct buffer_head *gdp_bh; 2790 struct buffer_head *gdp_bh;
2791 struct ext4_sb_info *sbi; 2791 struct ext4_sb_info *sbi;
2792 struct super_block *sb; 2792 struct super_block *sb;
2793 ext4_fsblk_t block; 2793 ext4_fsblk_t block;
2794 int err, len; 2794 int err, len;
2795 2795
2796 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 2796 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2797 BUG_ON(ac->ac_b_ex.fe_len <= 0); 2797 BUG_ON(ac->ac_b_ex.fe_len <= 0);
2798 2798
2799 sb = ac->ac_sb; 2799 sb = ac->ac_sb;
2800 sbi = EXT4_SB(sb); 2800 sbi = EXT4_SB(sb);
2801 2801
2802 err = -EIO; 2802 err = -EIO;
2803 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 2803 bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2804 if (!bitmap_bh) 2804 if (!bitmap_bh)
2805 goto out_err; 2805 goto out_err;
2806 2806
2807 err = ext4_journal_get_write_access(handle, bitmap_bh); 2807 err = ext4_journal_get_write_access(handle, bitmap_bh);
2808 if (err) 2808 if (err)
2809 goto out_err; 2809 goto out_err;
2810 2810
2811 err = -EIO; 2811 err = -EIO;
2812 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 2812 gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2813 if (!gdp) 2813 if (!gdp)
2814 goto out_err; 2814 goto out_err;
2815 2815
2816 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 2816 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2817 ext4_free_blks_count(sb, gdp)); 2817 ext4_free_blks_count(sb, gdp));
2818 2818
2819 err = ext4_journal_get_write_access(handle, gdp_bh); 2819 err = ext4_journal_get_write_access(handle, gdp_bh);
2820 if (err) 2820 if (err)
2821 goto out_err; 2821 goto out_err;
2822 2822
2823 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 2823 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2824 2824
2825 len = ac->ac_b_ex.fe_len; 2825 len = ac->ac_b_ex.fe_len;
2826 if (!ext4_data_block_valid(sbi, block, len)) { 2826 if (!ext4_data_block_valid(sbi, block, len)) {
2827 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " 2827 ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2828 "fs metadata\n", block, block+len); 2828 "fs metadata\n", block, block+len);
2829 /* File system mounted not to panic on error 2829 /* File system mounted not to panic on error
2830 * Fix the bitmap and repeat the block allocation 2830 * Fix the bitmap and repeat the block allocation
2831 * We leak some of the blocks here. 2831 * We leak some of the blocks here.
2832 */ 2832 */
2833 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2833 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2834 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 2834 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2835 ac->ac_b_ex.fe_len); 2835 ac->ac_b_ex.fe_len);
2836 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2836 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2837 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2837 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2838 if (!err) 2838 if (!err)
2839 err = -EAGAIN; 2839 err = -EAGAIN;
2840 goto out_err; 2840 goto out_err;
2841 } 2841 }
2842 2842
2843 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 2843 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2844 #ifdef AGGRESSIVE_CHECK 2844 #ifdef AGGRESSIVE_CHECK
2845 { 2845 {
2846 int i; 2846 int i;
2847 for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 2847 for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2848 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 2848 BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2849 bitmap_bh->b_data)); 2849 bitmap_bh->b_data));
2850 } 2850 }
2851 } 2851 }
2852 #endif 2852 #endif
2853 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); 2853 mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
2854 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { 2854 if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2855 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 2855 gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2856 ext4_free_blks_set(sb, gdp, 2856 ext4_free_blks_set(sb, gdp,
2857 ext4_free_blocks_after_init(sb, 2857 ext4_free_blocks_after_init(sb,
2858 ac->ac_b_ex.fe_group, gdp)); 2858 ac->ac_b_ex.fe_group, gdp));
2859 } 2859 }
2860 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; 2860 len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
2861 ext4_free_blks_set(sb, gdp, len); 2861 ext4_free_blks_set(sb, gdp, len);
2862 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); 2862 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2863 2863
2864 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 2864 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2865 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); 2865 percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
2866 /* 2866 /*
2867 * Now reduce the dirty block count also. Should not go negative 2867 * Now reduce the dirty block count also. Should not go negative
2868 */ 2868 */
2869 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) 2869 if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2870 /* release all the reserved blocks if non delalloc */ 2870 /* release all the reserved blocks if non delalloc */
2871 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); 2871 percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2872 2872
2873 if (sbi->s_log_groups_per_flex) { 2873 if (sbi->s_log_groups_per_flex) {
2874 ext4_group_t flex_group = ext4_flex_group(sbi, 2874 ext4_group_t flex_group = ext4_flex_group(sbi,
2875 ac->ac_b_ex.fe_group); 2875 ac->ac_b_ex.fe_group);
2876 atomic_sub(ac->ac_b_ex.fe_len, 2876 atomic_sub(ac->ac_b_ex.fe_len,
2877 &sbi->s_flex_groups[flex_group].free_blocks); 2877 &sbi->s_flex_groups[flex_group].free_blocks);
2878 } 2878 }
2879 2879
2880 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 2880 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2881 if (err) 2881 if (err)
2882 goto out_err; 2882 goto out_err;
2883 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 2883 err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2884 2884
2885 out_err: 2885 out_err:
2886 ext4_mark_super_dirty(sb); 2886 ext4_mark_super_dirty(sb);
2887 brelse(bitmap_bh); 2887 brelse(bitmap_bh);
2888 return err; 2888 return err;
2889 } 2889 }
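ext4_mb_mark_diskspace_used() follows the plain jbd2 write-access pattern this commit standardizes on: get write access to the buffer, modify it under the group lock, then mark it dirty in the handle. Stripped to a skeleton (example_modify() is an assumed stand-in for the bitmap/descriptor update):

	/* Hedged skeleton of the get-write-access / modify / dirty-metadata sequence. */
	static int example_update_buffer(handle_t *handle, struct buffer_head *bh)
	{
		int err;

		err = ext4_journal_get_write_access(handle, bh);
		if (err)
			return err;

		example_modify(bh->b_data);	/* assumed helper that edits the buffer */

		return ext4_handle_dirty_metadata(handle, NULL, bh);
	}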
2890 2890
2891 /* 2891 /*
2892 * here we normalize a request for the locality group 2892 * here we normalize a request for the locality group
2893 * Group requests are normalized to the s_stripe size if it is set via the mount 2893 * Group requests are normalized to the s_stripe size if it is set via the mount
2894 * option. If not, we set it to s_mb_group_prealloc, which can be configured via 2894 * option. If not, we set it to s_mb_group_prealloc, which can be configured via
2895 * /sys/fs/ext4/<partition>/mb_group_prealloc 2895 * /sys/fs/ext4/<partition>/mb_group_prealloc
2896 * 2896 *
2897 * XXX: should we try to preallocate more than the group has now? 2897 * XXX: should we try to preallocate more than the group has now?
2898 */ 2898 */
2899 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) 2899 static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2900 { 2900 {
2901 struct super_block *sb = ac->ac_sb; 2901 struct super_block *sb = ac->ac_sb;
2902 struct ext4_locality_group *lg = ac->ac_lg; 2902 struct ext4_locality_group *lg = ac->ac_lg;
2903 2903
2904 BUG_ON(lg == NULL); 2904 BUG_ON(lg == NULL);
2905 if (EXT4_SB(sb)->s_stripe) 2905 if (EXT4_SB(sb)->s_stripe)
2906 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; 2906 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
2907 else 2907 else
2908 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; 2908 ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2909 mb_debug(1, "#%u: goal %u blocks for locality group\n", 2909 mb_debug(1, "#%u: goal %u blocks for locality group\n",
2910 current->pid, ac->ac_g_ex.fe_len); 2910 current->pid, ac->ac_g_ex.fe_len);
2911 } 2911 }
2912 2912
2913 /* 2913 /*
2914 * Normalization means making request better in terms of 2914 * Normalization means making request better in terms of
2915 * size and alignment 2915 * size and alignment
2916 */ 2916 */
2917 static noinline_for_stack void 2917 static noinline_for_stack void
2918 ext4_mb_normalize_request(struct ext4_allocation_context *ac, 2918 ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2919 struct ext4_allocation_request *ar) 2919 struct ext4_allocation_request *ar)
2920 { 2920 {
2921 int bsbits, max; 2921 int bsbits, max;
2922 ext4_lblk_t end; 2922 ext4_lblk_t end;
2923 loff_t size, orig_size, start_off; 2923 loff_t size, orig_size, start_off;
2924 ext4_lblk_t start; 2924 ext4_lblk_t start;
2925 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 2925 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2926 struct ext4_prealloc_space *pa; 2926 struct ext4_prealloc_space *pa;
2927 2927
2928 /* only normalize data requests; metadata requests 2928 /* only normalize data requests; metadata requests
2929 do not need preallocation */ 2929 do not need preallocation */
2930 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 2930 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
2931 return; 2931 return;
2932 2932
2933 /* sometimes the caller may want exact blocks */ 2933 /* sometimes the caller may want exact blocks */
2934 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 2934 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2935 return; 2935 return;
2936 2936
2937 /* caller may indicate that preallocation isn't 2937 /* caller may indicate that preallocation isn't
2938 * required (it's a tail, for example) */ 2938 * required (it's a tail, for example) */
2939 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) 2939 if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
2940 return; 2940 return;
2941 2941
2942 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { 2942 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
2943 ext4_mb_normalize_group_request(ac); 2943 ext4_mb_normalize_group_request(ac);
2944 return ; 2944 return ;
2945 } 2945 }
2946 2946
2947 bsbits = ac->ac_sb->s_blocksize_bits; 2947 bsbits = ac->ac_sb->s_blocksize_bits;
2948 2948
2949 /* first, determine the actual file size 2949 /* first, determine the actual file size
2950 * assuming the current request is allocated */ 2950 * assuming the current request is allocated */
2951 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 2951 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
2952 size = size << bsbits; 2952 size = size << bsbits;
2953 if (size < i_size_read(ac->ac_inode)) 2953 if (size < i_size_read(ac->ac_inode))
2954 size = i_size_read(ac->ac_inode); 2954 size = i_size_read(ac->ac_inode);
2955 orig_size = size; 2955 orig_size = size;
2956 2956
2957 /* max size of free chunks */ 2957 /* max size of free chunks */
2958 max = 2 << bsbits; 2958 max = 2 << bsbits;
2959 2959
2960 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ 2960 #define NRL_CHECK_SIZE(req, size, max, chunk_size) \
2961 (req <= (size) || max <= (chunk_size)) 2961 (req <= (size) || max <= (chunk_size))
2962 2962
2963 /* first, try to predict filesize */ 2963 /* first, try to predict filesize */
2964 /* XXX: should this table be tunable? */ 2964 /* XXX: should this table be tunable? */
2965 start_off = 0; 2965 start_off = 0;
2966 if (size <= 16 * 1024) { 2966 if (size <= 16 * 1024) {
2967 size = 16 * 1024; 2967 size = 16 * 1024;
2968 } else if (size <= 32 * 1024) { 2968 } else if (size <= 32 * 1024) {
2969 size = 32 * 1024; 2969 size = 32 * 1024;
2970 } else if (size <= 64 * 1024) { 2970 } else if (size <= 64 * 1024) {
2971 size = 64 * 1024; 2971 size = 64 * 1024;
2972 } else if (size <= 128 * 1024) { 2972 } else if (size <= 128 * 1024) {
2973 size = 128 * 1024; 2973 size = 128 * 1024;
2974 } else if (size <= 256 * 1024) { 2974 } else if (size <= 256 * 1024) {
2975 size = 256 * 1024; 2975 size = 256 * 1024;
2976 } else if (size <= 512 * 1024) { 2976 } else if (size <= 512 * 1024) {
2977 size = 512 * 1024; 2977 size = 512 * 1024;
2978 } else if (size <= 1024 * 1024) { 2978 } else if (size <= 1024 * 1024) {
2979 size = 1024 * 1024; 2979 size = 1024 * 1024;
2980 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 2980 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
2981 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 2981 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
2982 (21 - bsbits)) << 21; 2982 (21 - bsbits)) << 21;
2983 size = 2 * 1024 * 1024; 2983 size = 2 * 1024 * 1024;
2984 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { 2984 } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
2985 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 2985 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
2986 (22 - bsbits)) << 22; 2986 (22 - bsbits)) << 22;
2987 size = 4 * 1024 * 1024; 2987 size = 4 * 1024 * 1024;
2988 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len, 2988 } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
2989 (8<<20)>>bsbits, max, 8 * 1024)) { 2989 (8<<20)>>bsbits, max, 8 * 1024)) {
2990 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 2990 start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
2991 (23 - bsbits)) << 23; 2991 (23 - bsbits)) << 23;
2992 size = 8 * 1024 * 1024; 2992 size = 8 * 1024 * 1024;
2993 } else { 2993 } else {
2994 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; 2994 start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
2995 size = ac->ac_o_ex.fe_len << bsbits; 2995 size = ac->ac_o_ex.fe_len << bsbits;
2996 } 2996 }
2997 size = size >> bsbits; 2997 size = size >> bsbits;
2998 start = start_off >> bsbits; 2998 start = start_off >> bsbits;
2999 2999
3000 /* don't cover already allocated blocks in selected range */ 3000 /* don't cover already allocated blocks in selected range */
3001 if (ar->pleft && start <= ar->lleft) { 3001 if (ar->pleft && start <= ar->lleft) {
3002 size -= ar->lleft + 1 - start; 3002 size -= ar->lleft + 1 - start;
3003 start = ar->lleft + 1; 3003 start = ar->lleft + 1;
3004 } 3004 }
3005 if (ar->pright && start + size - 1 >= ar->lright) 3005 if (ar->pright && start + size - 1 >= ar->lright)
3006 size -= start + size - ar->lright; 3006 size -= start + size - ar->lright;
3007 3007
3008 end = start + size; 3008 end = start + size;
3009 3009
3010 /* check we don't cross already preallocated blocks */ 3010 /* check we don't cross already preallocated blocks */
3011 rcu_read_lock(); 3011 rcu_read_lock();
3012 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3012 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3013 ext4_lblk_t pa_end; 3013 ext4_lblk_t pa_end;
3014 3014
3015 if (pa->pa_deleted) 3015 if (pa->pa_deleted)
3016 continue; 3016 continue;
3017 spin_lock(&pa->pa_lock); 3017 spin_lock(&pa->pa_lock);
3018 if (pa->pa_deleted) { 3018 if (pa->pa_deleted) {
3019 spin_unlock(&pa->pa_lock); 3019 spin_unlock(&pa->pa_lock);
3020 continue; 3020 continue;
3021 } 3021 }
3022 3022
3023 pa_end = pa->pa_lstart + pa->pa_len; 3023 pa_end = pa->pa_lstart + pa->pa_len;
3024 3024
3025 /* PA must not overlap original request */ 3025 /* PA must not overlap original request */
3026 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || 3026 BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3027 ac->ac_o_ex.fe_logical < pa->pa_lstart)); 3027 ac->ac_o_ex.fe_logical < pa->pa_lstart));
3028 3028
3029 /* skip PAs this normalized request doesn't overlap with */ 3029 /* skip PAs this normalized request doesn't overlap with */
3030 if (pa->pa_lstart >= end || pa_end <= start) { 3030 if (pa->pa_lstart >= end || pa_end <= start) {
3031 spin_unlock(&pa->pa_lock); 3031 spin_unlock(&pa->pa_lock);
3032 continue; 3032 continue;
3033 } 3033 }
3034 BUG_ON(pa->pa_lstart <= start && pa_end >= end); 3034 BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3035 3035
3036 /* adjust start or end to be adjacent to this pa */ 3036 /* adjust start or end to be adjacent to this pa */
3037 if (pa_end <= ac->ac_o_ex.fe_logical) { 3037 if (pa_end <= ac->ac_o_ex.fe_logical) {
3038 BUG_ON(pa_end < start); 3038 BUG_ON(pa_end < start);
3039 start = pa_end; 3039 start = pa_end;
3040 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { 3040 } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3041 BUG_ON(pa->pa_lstart > end); 3041 BUG_ON(pa->pa_lstart > end);
3042 end = pa->pa_lstart; 3042 end = pa->pa_lstart;
3043 } 3043 }
3044 spin_unlock(&pa->pa_lock); 3044 spin_unlock(&pa->pa_lock);
3045 } 3045 }
3046 rcu_read_unlock(); 3046 rcu_read_unlock();
3047 size = end - start; 3047 size = end - start;
3048 3048
3049 /* XXX: extra loop to check we really don't overlap preallocations */ 3049 /* XXX: extra loop to check we really don't overlap preallocations */
3050 rcu_read_lock(); 3050 rcu_read_lock();
3051 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3051 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3052 ext4_lblk_t pa_end; 3052 ext4_lblk_t pa_end;
3053 spin_lock(&pa->pa_lock); 3053 spin_lock(&pa->pa_lock);
3054 if (pa->pa_deleted == 0) { 3054 if (pa->pa_deleted == 0) {
3055 pa_end = pa->pa_lstart + pa->pa_len; 3055 pa_end = pa->pa_lstart + pa->pa_len;
3056 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart)); 3056 BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3057 } 3057 }
3058 spin_unlock(&pa->pa_lock); 3058 spin_unlock(&pa->pa_lock);
3059 } 3059 }
3060 rcu_read_unlock(); 3060 rcu_read_unlock();
3061 3061
3062 if (start + size <= ac->ac_o_ex.fe_logical && 3062 if (start + size <= ac->ac_o_ex.fe_logical &&
3063 start > ac->ac_o_ex.fe_logical) { 3063 start > ac->ac_o_ex.fe_logical) {
3064 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n", 3064 printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
3065 (unsigned long) start, (unsigned long) size, 3065 (unsigned long) start, (unsigned long) size,
3066 (unsigned long) ac->ac_o_ex.fe_logical); 3066 (unsigned long) ac->ac_o_ex.fe_logical);
3067 } 3067 }
3068 BUG_ON(start + size <= ac->ac_o_ex.fe_logical && 3068 BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3069 start > ac->ac_o_ex.fe_logical); 3069 start > ac->ac_o_ex.fe_logical);
3070 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); 3070 BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3071 3071
3072 /* now prepare goal request */ 3072 /* now prepare goal request */
3073 3073
3074 /* XXX: is it better to align blocks WRT to logical 3074 /* XXX: is it better to align blocks WRT to logical
3075 * placement or satisfy big request as is */ 3075 * placement or satisfy big request as is */
3076 ac->ac_g_ex.fe_logical = start; 3076 ac->ac_g_ex.fe_logical = start;
3077 ac->ac_g_ex.fe_len = size; 3077 ac->ac_g_ex.fe_len = size;
3078 3078
3079 /* define goal start in order to merge */ 3079 /* define goal start in order to merge */
3080 if (ar->pright && (ar->lright == (start + size))) { 3080 if (ar->pright && (ar->lright == (start + size))) {
3081 /* merge to the right */ 3081 /* merge to the right */
3082 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, 3082 ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3083 &ac->ac_f_ex.fe_group, 3083 &ac->ac_f_ex.fe_group,
3084 &ac->ac_f_ex.fe_start); 3084 &ac->ac_f_ex.fe_start);
3085 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3085 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3086 } 3086 }
3087 if (ar->pleft && (ar->lleft + 1 == start)) { 3087 if (ar->pleft && (ar->lleft + 1 == start)) {
3088 /* merge to the left */ 3088 /* merge to the left */
3089 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, 3089 ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3090 &ac->ac_f_ex.fe_group, 3090 &ac->ac_f_ex.fe_group,
3091 &ac->ac_f_ex.fe_start); 3091 &ac->ac_f_ex.fe_start);
3092 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; 3092 ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3093 } 3093 }
3094 3094
3095 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, 3095 mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3096 (unsigned) orig_size, (unsigned) start); 3096 (unsigned) orig_size, (unsigned) start);
3097 } 3097 }
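The ladder of size checks above rounds the predicted file size up to a preallocation bucket: fixed steps up to 1 MiB, then aligned 2/4/8 MiB chunks, and finally the raw request length. A compressed, hedged sketch of just the fixed-bucket part of that policy (the bucket values mirror the ones visible above; everything else is simplified):

	/* Illustrative rounding of a predicted size in bytes to a preallocation bucket. */
	static long long example_round_size(long long size)
	{
		static const long long buckets[] = {
			16 << 10, 32 << 10, 64 << 10, 128 << 10,
			256 << 10, 512 << 10, 1024 << 10,
		};
		unsigned int i;

		for (i = 0; i < sizeof(buckets) / sizeof(buckets[0]); i++)
			if (size <= buckets[i])
				return buckets[i];
		return size;	/* larger requests fall through to the 2/4/8 MiB rules */
	}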
3098 3098
3099 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) 3099 static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3100 { 3100 {
3101 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3101 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3102 3102
3103 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { 3103 if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3104 atomic_inc(&sbi->s_bal_reqs); 3104 atomic_inc(&sbi->s_bal_reqs);
3105 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); 3105 atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3106 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) 3106 if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3107 atomic_inc(&sbi->s_bal_success); 3107 atomic_inc(&sbi->s_bal_success);
3108 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); 3108 atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3109 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && 3109 if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3110 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) 3110 ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3111 atomic_inc(&sbi->s_bal_goals); 3111 atomic_inc(&sbi->s_bal_goals);
3112 if (ac->ac_found > sbi->s_mb_max_to_scan) 3112 if (ac->ac_found > sbi->s_mb_max_to_scan)
3113 atomic_inc(&sbi->s_bal_breaks); 3113 atomic_inc(&sbi->s_bal_breaks);
3114 } 3114 }
3115 3115
3116 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) 3116 if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3117 trace_ext4_mballoc_alloc(ac); 3117 trace_ext4_mballoc_alloc(ac);
3118 else 3118 else
3119 trace_ext4_mballoc_prealloc(ac); 3119 trace_ext4_mballoc_prealloc(ac);
3120 } 3120 }
3121 3121
3122 /* 3122 /*
3123 * Called on failure; free up any blocks from the inode PA for this 3123 * Called on failure; free up any blocks from the inode PA for this
3124 * context. We don't need this for MB_GROUP_PA because we only change 3124 * context. We don't need this for MB_GROUP_PA because we only change
3125 * pa_free in ext4_mb_release_context(), but on failure, we've already 3125 * pa_free in ext4_mb_release_context(), but on failure, we've already
3126 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. 3126 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3127 */ 3127 */
3128 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) 3128 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3129 { 3129 {
3130 struct ext4_prealloc_space *pa = ac->ac_pa; 3130 struct ext4_prealloc_space *pa = ac->ac_pa;
3131 int len; 3131 int len;
3132 3132
3133 if (pa && pa->pa_type == MB_INODE_PA) { 3133 if (pa && pa->pa_type == MB_INODE_PA) {
3134 len = ac->ac_b_ex.fe_len; 3134 len = ac->ac_b_ex.fe_len;
3135 pa->pa_free += len; 3135 pa->pa_free += len;
3136 } 3136 }
3137 3137
3138 } 3138 }
3139 3139
3140 /* 3140 /*
3141 * use blocks preallocated to inode 3141 * use blocks preallocated to inode
3142 */ 3142 */
3143 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3143 static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3144 struct ext4_prealloc_space *pa) 3144 struct ext4_prealloc_space *pa)
3145 { 3145 {
3146 ext4_fsblk_t start; 3146 ext4_fsblk_t start;
3147 ext4_fsblk_t end; 3147 ext4_fsblk_t end;
3148 int len; 3148 int len;
3149 3149
3150 /* found preallocated blocks, use them */ 3150 /* found preallocated blocks, use them */
3151 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); 3151 start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3152 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len); 3152 end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
3153 len = end - start; 3153 len = end - start;
3154 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, 3154 ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3155 &ac->ac_b_ex.fe_start); 3155 &ac->ac_b_ex.fe_start);
3156 ac->ac_b_ex.fe_len = len; 3156 ac->ac_b_ex.fe_len = len;
3157 ac->ac_status = AC_STATUS_FOUND; 3157 ac->ac_status = AC_STATUS_FOUND;
3158 ac->ac_pa = pa; 3158 ac->ac_pa = pa;
3159 3159
3160 BUG_ON(start < pa->pa_pstart); 3160 BUG_ON(start < pa->pa_pstart);
3161 BUG_ON(start + len > pa->pa_pstart + pa->pa_len); 3161 BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
3162 BUG_ON(pa->pa_free < len); 3162 BUG_ON(pa->pa_free < len);
3163 pa->pa_free -= len; 3163 pa->pa_free -= len;
3164 3164
3165 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); 3165 mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3166 } 3166 }
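Carving an allocation out of an inode PA is interval arithmetic: the physical start is the PA's start plus the logical offset into it, and the length is clipped to what remains of the PA. A small hedged sketch with plain integers (struct example_pa is invented for illustration):

	/* Illustrative carve-out from a preallocated range; all values are in blocks. */
	struct example_pa { unsigned long long pstart, lstart; unsigned int len; };

	static unsigned int example_carve(const struct example_pa *pa,
					  unsigned long long logical,
					  unsigned int want,
					  unsigned long long *phys_start)
	{
		unsigned long long start = pa->pstart + (logical - pa->lstart);
		unsigned long long end = pa->pstart + pa->len;

		if (start + want < end)
			end = start + want;	/* the request fits inside the PA */
		*phys_start = start;
		return (unsigned int)(end - start);
	}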
3167 3167
3168 /* 3168 /*
3169 * use blocks preallocated to locality group 3169 * use blocks preallocated to locality group
3170 */ 3170 */
3171 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, 3171 static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3172 struct ext4_prealloc_space *pa) 3172 struct ext4_prealloc_space *pa)
3173 { 3173 {
3174 unsigned int len = ac->ac_o_ex.fe_len; 3174 unsigned int len = ac->ac_o_ex.fe_len;
3175 3175
3176 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, 3176 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3177 &ac->ac_b_ex.fe_group, 3177 &ac->ac_b_ex.fe_group,
3178 &ac->ac_b_ex.fe_start); 3178 &ac->ac_b_ex.fe_start);
3179 ac->ac_b_ex.fe_len = len; 3179 ac->ac_b_ex.fe_len = len;
3180 ac->ac_status = AC_STATUS_FOUND; 3180 ac->ac_status = AC_STATUS_FOUND;
3181 ac->ac_pa = pa; 3181 ac->ac_pa = pa;
3182 3182
3183 /* we don't correct pa_pstart or pa_plen here to avoid 3183 /* we don't correct pa_pstart or pa_plen here to avoid
3184 * possible race when the group is being loaded concurrently 3184 * possible race when the group is being loaded concurrently
3185 * instead we correct pa later, after blocks are marked 3185 * instead we correct pa later, after blocks are marked
3186 * in on-disk bitmap -- see ext4_mb_release_context() 3186 * in on-disk bitmap -- see ext4_mb_release_context()
3187 * Other CPUs are prevented from allocating from this pa by lg_mutex 3187 * Other CPUs are prevented from allocating from this pa by lg_mutex
3188 */ 3188 */
3189 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); 3189 mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3190 } 3190 }
3191 3191
3192 /* 3192 /*
3193 * Return the prealloc space that has minimal distance 3193 * Return the prealloc space that has minimal distance
3194 * from the goal block. @cpa is the prealloc 3194 * from the goal block. @cpa is the prealloc
3195 * space that has the currently known minimal distance 3195 * space that has the currently known minimal distance
3196 * from the goal block. 3196 * from the goal block.
3197 */ 3197 */
3198 static struct ext4_prealloc_space * 3198 static struct ext4_prealloc_space *
3199 ext4_mb_check_group_pa(ext4_fsblk_t goal_block, 3199 ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3200 struct ext4_prealloc_space *pa, 3200 struct ext4_prealloc_space *pa,
3201 struct ext4_prealloc_space *cpa) 3201 struct ext4_prealloc_space *cpa)
3202 { 3202 {
3203 ext4_fsblk_t cur_distance, new_distance; 3203 ext4_fsblk_t cur_distance, new_distance;
3204 3204
3205 if (cpa == NULL) { 3205 if (cpa == NULL) {
3206 atomic_inc(&pa->pa_count); 3206 atomic_inc(&pa->pa_count);
3207 return pa; 3207 return pa;
3208 } 3208 }
3209 cur_distance = abs(goal_block - cpa->pa_pstart); 3209 cur_distance = abs(goal_block - cpa->pa_pstart);
3210 new_distance = abs(goal_block - pa->pa_pstart); 3210 new_distance = abs(goal_block - pa->pa_pstart);
3211 3211
3212 if (cur_distance <= new_distance) 3212 if (cur_distance <= new_distance)
3213 return cpa; 3213 return cpa;
3214 3214
3215 /* drop the previous reference */ 3215 /* drop the previous reference */
3216 atomic_dec(&cpa->pa_count); 3216 atomic_dec(&cpa->pa_count);
3217 atomic_inc(&pa->pa_count); 3217 atomic_inc(&pa->pa_count);
3218 return pa; 3218 return pa;
3219 } 3219 }
3220 3220
3221 /* 3221 /*
3222 * search goal blocks in preallocated space 3222 * search goal blocks in preallocated space
3223 */ 3223 */
3224 static noinline_for_stack int 3224 static noinline_for_stack int
3225 ext4_mb_use_preallocated(struct ext4_allocation_context *ac) 3225 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3226 { 3226 {
3227 int order, i; 3227 int order, i;
3228 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); 3228 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3229 struct ext4_locality_group *lg; 3229 struct ext4_locality_group *lg;
3230 struct ext4_prealloc_space *pa, *cpa = NULL; 3230 struct ext4_prealloc_space *pa, *cpa = NULL;
3231 ext4_fsblk_t goal_block; 3231 ext4_fsblk_t goal_block;
3232 3232
3233 /* only data can be preallocated */ 3233 /* only data can be preallocated */
3234 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3234 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3235 return 0; 3235 return 0;
3236 3236
3237 /* first, try per-file preallocation */ 3237 /* first, try per-file preallocation */
3238 rcu_read_lock(); 3238 rcu_read_lock();
3239 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { 3239 list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3240 3240
3241 /* all fields in this condition don't change, 3241 /* all fields in this condition don't change,
3242 * so we can skip locking for them */ 3242 * so we can skip locking for them */
3243 if (ac->ac_o_ex.fe_logical < pa->pa_lstart || 3243 if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3244 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) 3244 ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3245 continue; 3245 continue;
3246 3246
3247 /* non-extent files can't have physical blocks past 2^32 */ 3247 /* non-extent files can't have physical blocks past 2^32 */
3248 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && 3248 if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3249 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) 3249 pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3250 continue; 3250 continue;
3251 3251
3252 /* found preallocated blocks, use them */ 3252 /* found preallocated blocks, use them */
3253 spin_lock(&pa->pa_lock); 3253 spin_lock(&pa->pa_lock);
3254 if (pa->pa_deleted == 0 && pa->pa_free) { 3254 if (pa->pa_deleted == 0 && pa->pa_free) {
3255 atomic_inc(&pa->pa_count); 3255 atomic_inc(&pa->pa_count);
3256 ext4_mb_use_inode_pa(ac, pa); 3256 ext4_mb_use_inode_pa(ac, pa);
3257 spin_unlock(&pa->pa_lock); 3257 spin_unlock(&pa->pa_lock);
3258 ac->ac_criteria = 10; 3258 ac->ac_criteria = 10;
3259 rcu_read_unlock(); 3259 rcu_read_unlock();
3260 return 1; 3260 return 1;
3261 } 3261 }
3262 spin_unlock(&pa->pa_lock); 3262 spin_unlock(&pa->pa_lock);
3263 } 3263 }
3264 rcu_read_unlock(); 3264 rcu_read_unlock();
3265 3265
3266 /* can we use group allocation? */ 3266 /* can we use group allocation? */
3267 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) 3267 if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3268 return 0; 3268 return 0;
3269 3269
3270 /* inode may have no locality group for some reason */ 3270 /* inode may have no locality group for some reason */
3271 lg = ac->ac_lg; 3271 lg = ac->ac_lg;
3272 if (lg == NULL) 3272 if (lg == NULL)
3273 return 0; 3273 return 0;
3274 order = fls(ac->ac_o_ex.fe_len) - 1; 3274 order = fls(ac->ac_o_ex.fe_len) - 1;
3275 if (order > PREALLOC_TB_SIZE - 1) 3275 if (order > PREALLOC_TB_SIZE - 1)
3276 /* The max size of hash table is PREALLOC_TB_SIZE */ 3276 /* The max size of hash table is PREALLOC_TB_SIZE */
3277 order = PREALLOC_TB_SIZE - 1; 3277 order = PREALLOC_TB_SIZE - 1;
3278 3278
3279 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); 3279 goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3280 /* 3280 /*
3281 * search for the prealloc space that is having 3281 * search for the prealloc space that is having
3282 * minimal distance from the goal block. 3282 * minimal distance from the goal block.
3283 */ 3283 */
3284 for (i = order; i < PREALLOC_TB_SIZE; i++) { 3284 for (i = order; i < PREALLOC_TB_SIZE; i++) {
3285 rcu_read_lock(); 3285 rcu_read_lock();
3286 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], 3286 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3287 pa_inode_list) { 3287 pa_inode_list) {
3288 spin_lock(&pa->pa_lock); 3288 spin_lock(&pa->pa_lock);
3289 if (pa->pa_deleted == 0 && 3289 if (pa->pa_deleted == 0 &&
3290 pa->pa_free >= ac->ac_o_ex.fe_len) { 3290 pa->pa_free >= ac->ac_o_ex.fe_len) {
3291 3291
3292 cpa = ext4_mb_check_group_pa(goal_block, 3292 cpa = ext4_mb_check_group_pa(goal_block,
3293 pa, cpa); 3293 pa, cpa);
3294 } 3294 }
3295 spin_unlock(&pa->pa_lock); 3295 spin_unlock(&pa->pa_lock);
3296 } 3296 }
3297 rcu_read_unlock(); 3297 rcu_read_unlock();
3298 } 3298 }
3299 if (cpa) { 3299 if (cpa) {
3300 ext4_mb_use_group_pa(ac, cpa); 3300 ext4_mb_use_group_pa(ac, cpa);
3301 ac->ac_criteria = 20; 3301 ac->ac_criteria = 20;
3302 return 1; 3302 return 1;
3303 } 3303 }
3304 return 0; 3304 return 0;
3305 } 3305 }
3306 3306
3307 /* 3307 /*
3308 * the function goes through all blocks freed in the group 3308 * the function goes through all blocks freed in the group
3309 * but not yet committed and marks them used in the in-core bitmap. 3309 * but not yet committed and marks them used in the in-core bitmap.
3310 * buddy must be generated from this bitmap 3310 * buddy must be generated from this bitmap
3311 * Needs to be called with the ext4 group lock held 3311 * Needs to be called with the ext4 group lock held
3312 */ 3312 */
3313 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 3313 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3314 ext4_group_t group) 3314 ext4_group_t group)
3315 { 3315 {
3316 struct rb_node *n; 3316 struct rb_node *n;
3317 struct ext4_group_info *grp; 3317 struct ext4_group_info *grp;
3318 struct ext4_free_data *entry; 3318 struct ext4_free_data *entry;
3319 3319
3320 grp = ext4_get_group_info(sb, group); 3320 grp = ext4_get_group_info(sb, group);
3321 n = rb_first(&(grp->bb_free_root)); 3321 n = rb_first(&(grp->bb_free_root));
3322 3322
3323 while (n) { 3323 while (n) {
3324 entry = rb_entry(n, struct ext4_free_data, node); 3324 entry = rb_entry(n, struct ext4_free_data, node);
3325 mb_set_bits(bitmap, entry->start_blk, entry->count); 3325 mb_set_bits(bitmap, entry->start_blk, entry->count);
3326 n = rb_next(n); 3326 n = rb_next(n);
3327 } 3327 }
3328 return; 3328 return;
3329 } 3329 }
3330 3330
3331 /* 3331 /*
3332 * the function goes through all preallocations in this group and marks them 3332 * the function goes through all preallocations in this group and marks them
3333 * used in the in-core bitmap. buddy must be generated from this bitmap 3333 * used in the in-core bitmap. buddy must be generated from this bitmap
3334 * Needs to be called with the ext4 group lock held 3334 * Needs to be called with the ext4 group lock held
3335 */ 3335 */
3336 static noinline_for_stack 3336 static noinline_for_stack
3337 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 3337 void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3338 ext4_group_t group) 3338 ext4_group_t group)
3339 { 3339 {
3340 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3340 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3341 struct ext4_prealloc_space *pa; 3341 struct ext4_prealloc_space *pa;
3342 struct list_head *cur; 3342 struct list_head *cur;
3343 ext4_group_t groupnr; 3343 ext4_group_t groupnr;
3344 ext4_grpblk_t start; 3344 ext4_grpblk_t start;
3345 int preallocated = 0; 3345 int preallocated = 0;
3346 int count = 0; 3346 int count = 0;
3347 int len; 3347 int len;
3348 3348
3349 /* all forms of preallocation discard first load the group, 3349 /* all forms of preallocation discard first load the group,
3350 * so the only competing code is preallocation use. 3350 * so the only competing code is preallocation use.
3351 * we don't need any locking here 3351 * we don't need any locking here
3352 * notice we do NOT ignore preallocations with pa_deleted 3352 * notice we do NOT ignore preallocations with pa_deleted
3353 * otherwise we could leave used blocks available for 3353 * otherwise we could leave used blocks available for
3354 * allocation in buddy when concurrent ext4_mb_put_pa() 3354 * allocation in buddy when concurrent ext4_mb_put_pa()
3355 * is dropping preallocation 3355 * is dropping preallocation
3356 */ 3356 */
3357 list_for_each(cur, &grp->bb_prealloc_list) { 3357 list_for_each(cur, &grp->bb_prealloc_list) {
3358 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 3358 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3359 spin_lock(&pa->pa_lock); 3359 spin_lock(&pa->pa_lock);
3360 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3360 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3361 &groupnr, &start); 3361 &groupnr, &start);
3362 len = pa->pa_len; 3362 len = pa->pa_len;
3363 spin_unlock(&pa->pa_lock); 3363 spin_unlock(&pa->pa_lock);
3364 if (unlikely(len == 0)) 3364 if (unlikely(len == 0))
3365 continue; 3365 continue;
3366 BUG_ON(groupnr != group); 3366 BUG_ON(groupnr != group);
3367 mb_set_bits(bitmap, start, len); 3367 mb_set_bits(bitmap, start, len);
3368 preallocated += len; 3368 preallocated += len;
3369 count++; 3369 count++;
3370 } 3370 }
3371 mb_debug(1, "prellocated %u for group %u\n", preallocated, group); 3371 mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3372 } 3372 }
3373 3373
3374 static void ext4_mb_pa_callback(struct rcu_head *head) 3374 static void ext4_mb_pa_callback(struct rcu_head *head)
3375 { 3375 {
3376 struct ext4_prealloc_space *pa; 3376 struct ext4_prealloc_space *pa;
3377 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); 3377 pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3378 kmem_cache_free(ext4_pspace_cachep, pa); 3378 kmem_cache_free(ext4_pspace_cachep, pa);
3379 } 3379 }
3380 3380
3381 /* 3381 /*
3382 * drops a reference to preallocated space descriptor 3382 * drops a reference to preallocated space descriptor
3383 * if this was the last reference and the space is consumed 3383 * if this was the last reference and the space is consumed
3384 */ 3384 */
3385 static void ext4_mb_put_pa(struct ext4_allocation_context *ac, 3385 static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3386 struct super_block *sb, struct ext4_prealloc_space *pa) 3386 struct super_block *sb, struct ext4_prealloc_space *pa)
3387 { 3387 {
3388 ext4_group_t grp; 3388 ext4_group_t grp;
3389 ext4_fsblk_t grp_blk; 3389 ext4_fsblk_t grp_blk;
3390 3390
3391 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) 3391 if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3392 return; 3392 return;
3393 3393
3394 /* in this short window concurrent discard can set pa_deleted */ 3394 /* in this short window concurrent discard can set pa_deleted */
3395 spin_lock(&pa->pa_lock); 3395 spin_lock(&pa->pa_lock);
3396 if (pa->pa_deleted == 1) { 3396 if (pa->pa_deleted == 1) {
3397 spin_unlock(&pa->pa_lock); 3397 spin_unlock(&pa->pa_lock);
3398 return; 3398 return;
3399 } 3399 }
3400 3400
3401 pa->pa_deleted = 1; 3401 pa->pa_deleted = 1;
3402 spin_unlock(&pa->pa_lock); 3402 spin_unlock(&pa->pa_lock);
3403 3403
3404 grp_blk = pa->pa_pstart; 3404 grp_blk = pa->pa_pstart;
3405 /* 3405 /*
3406 * If doing group-based preallocation, pa_pstart may be in the 3406 * If doing group-based preallocation, pa_pstart may be in the
3407 * next group when pa is used up 3407 * next group when pa is used up
3408 */ 3408 */
3409 if (pa->pa_type == MB_GROUP_PA) 3409 if (pa->pa_type == MB_GROUP_PA)
3410 grp_blk--; 3410 grp_blk--;
3411 3411
3412 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); 3412 ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
3413 3413
3414 /* 3414 /*
3415 * possible race: 3415 * possible race:
3416 * 3416 *
3417 * P1 (buddy init) P2 (regular allocation) 3417 * P1 (buddy init) P2 (regular allocation)
3418 * find block B in PA 3418 * find block B in PA
3419 * copy on-disk bitmap to buddy 3419 * copy on-disk bitmap to buddy
3420 * mark B in on-disk bitmap 3420 * mark B in on-disk bitmap
3421 * drop PA from group 3421 * drop PA from group
3422 * mark all PAs in buddy 3422 * mark all PAs in buddy
3423 * 3423 *
3424 * thus, P1 initializes buddy with B available. to prevent this 3424 * thus, P1 initializes buddy with B available. to prevent this
3425 * we make "copy" and "mark all PAs" atomic and serialize "drop PA" 3425 * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3426 * against that pair 3426 * against that pair
3427 */ 3427 */
3428 ext4_lock_group(sb, grp); 3428 ext4_lock_group(sb, grp);
3429 list_del(&pa->pa_group_list); 3429 list_del(&pa->pa_group_list);
3430 ext4_unlock_group(sb, grp); 3430 ext4_unlock_group(sb, grp);
3431 3431
3432 spin_lock(pa->pa_obj_lock); 3432 spin_lock(pa->pa_obj_lock);
3433 list_del_rcu(&pa->pa_inode_list); 3433 list_del_rcu(&pa->pa_inode_list);
3434 spin_unlock(pa->pa_obj_lock); 3434 spin_unlock(pa->pa_obj_lock);
3435 3435
3436 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3436 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3437 } 3437 }
3438 3438
3439 /* 3439 /*
3440 * creates new preallocated space for given inode 3440 * creates new preallocated space for given inode
3441 */ 3441 */
3442 static noinline_for_stack int 3442 static noinline_for_stack int
3443 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) 3443 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3444 { 3444 {
3445 struct super_block *sb = ac->ac_sb; 3445 struct super_block *sb = ac->ac_sb;
3446 struct ext4_prealloc_space *pa; 3446 struct ext4_prealloc_space *pa;
3447 struct ext4_group_info *grp; 3447 struct ext4_group_info *grp;
3448 struct ext4_inode_info *ei; 3448 struct ext4_inode_info *ei;
3449 3449
3450 /* preallocate only when found space is larger than requested */ 3450 /* preallocate only when found space is larger than requested */
3451 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3451 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3452 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3452 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3453 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3453 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3454 3454
3455 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3455 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3456 if (pa == NULL) 3456 if (pa == NULL)
3457 return -ENOMEM; 3457 return -ENOMEM;
3458 3458
3459 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { 3459 if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3460 int winl; 3460 int winl;
3461 int wins; 3461 int wins;
3462 int win; 3462 int win;
3463 int offs; 3463 int offs;
3464 3464
3465 /* we can't allocate as much as normalizer wants. 3465 /* we can't allocate as much as normalizer wants.
3466 * so, found space must get proper lstart 3466 * so, found space must get proper lstart
3467 * to cover original request */ 3467 * to cover original request */
3468 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); 3468 BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3469 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); 3469 BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3470 3470
3471 /* we're limited by original request in that 3471 /* we're limited by original request in that
3472 * logical block must be covered any way 3472 * logical block must be covered any way
3473 * winl is window we can move our chunk within */ 3473 * winl is window we can move our chunk within */
3474 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; 3474 winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3475 3475
3476 /* also, we should cover whole original request */ 3476 /* also, we should cover whole original request */
3477 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len; 3477 wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
3478 3478
3479 /* the smallest one defines real window */ 3479 /* the smallest one defines real window */
3480 win = min(winl, wins); 3480 win = min(winl, wins);
3481 3481
3482 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len; 3482 offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
3483 if (offs && offs < win) 3483 if (offs && offs < win)
3484 win = offs; 3484 win = offs;
3485 3485
3486 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win; 3486 ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
3487 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); 3487 BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3488 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); 3488 BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3489 } 3489 }
3490 3490
3491 /* preallocation can change ac_b_ex, thus we store actually 3491 /* preallocation can change ac_b_ex, thus we store actually
3492 * allocated blocks for history */ 3492 * allocated blocks for history */
3493 ac->ac_f_ex = ac->ac_b_ex; 3493 ac->ac_f_ex = ac->ac_b_ex;
3494 3494
3495 pa->pa_lstart = ac->ac_b_ex.fe_logical; 3495 pa->pa_lstart = ac->ac_b_ex.fe_logical;
3496 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3496 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3497 pa->pa_len = ac->ac_b_ex.fe_len; 3497 pa->pa_len = ac->ac_b_ex.fe_len;
3498 pa->pa_free = pa->pa_len; 3498 pa->pa_free = pa->pa_len;
3499 atomic_set(&pa->pa_count, 1); 3499 atomic_set(&pa->pa_count, 1);
3500 spin_lock_init(&pa->pa_lock); 3500 spin_lock_init(&pa->pa_lock);
3501 INIT_LIST_HEAD(&pa->pa_inode_list); 3501 INIT_LIST_HEAD(&pa->pa_inode_list);
3502 INIT_LIST_HEAD(&pa->pa_group_list); 3502 INIT_LIST_HEAD(&pa->pa_group_list);
3503 pa->pa_deleted = 0; 3503 pa->pa_deleted = 0;
3504 pa->pa_type = MB_INODE_PA; 3504 pa->pa_type = MB_INODE_PA;
3505 3505
3506 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, 3506 mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3507 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3507 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3508 trace_ext4_mb_new_inode_pa(ac, pa); 3508 trace_ext4_mb_new_inode_pa(ac, pa);
3509 3509
3510 ext4_mb_use_inode_pa(ac, pa); 3510 ext4_mb_use_inode_pa(ac, pa);
3511 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3511 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3512 3512
3513 ei = EXT4_I(ac->ac_inode); 3513 ei = EXT4_I(ac->ac_inode);
3514 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3514 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3515 3515
3516 pa->pa_obj_lock = &ei->i_prealloc_lock; 3516 pa->pa_obj_lock = &ei->i_prealloc_lock;
3517 pa->pa_inode = ac->ac_inode; 3517 pa->pa_inode = ac->ac_inode;
3518 3518
3519 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3519 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3520 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3520 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3521 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3521 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3522 3522
3523 spin_lock(pa->pa_obj_lock); 3523 spin_lock(pa->pa_obj_lock);
3524 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); 3524 list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3525 spin_unlock(pa->pa_obj_lock); 3525 spin_unlock(pa->pa_obj_lock);
3526 3526
3527 return 0; 3527 return 0;
3528 } 3528 }
3529 3529
3530 /* 3530 /*
3531 * creates new preallocated space for the locality group the inode belongs to 3531 * creates new preallocated space for the locality group the inode belongs to
3532 */ 3532 */
3533 static noinline_for_stack int 3533 static noinline_for_stack int
3534 ext4_mb_new_group_pa(struct ext4_allocation_context *ac) 3534 ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3535 { 3535 {
3536 struct super_block *sb = ac->ac_sb; 3536 struct super_block *sb = ac->ac_sb;
3537 struct ext4_locality_group *lg; 3537 struct ext4_locality_group *lg;
3538 struct ext4_prealloc_space *pa; 3538 struct ext4_prealloc_space *pa;
3539 struct ext4_group_info *grp; 3539 struct ext4_group_info *grp;
3540 3540
3541 /* preallocate only when found space is larger than requested */ 3541 /* preallocate only when found space is larger than requested */
3542 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); 3542 BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3543 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 3543 BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3544 BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); 3544 BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3545 3545
3546 BUG_ON(ext4_pspace_cachep == NULL); 3546 BUG_ON(ext4_pspace_cachep == NULL);
3547 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); 3547 pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3548 if (pa == NULL) 3548 if (pa == NULL)
3549 return -ENOMEM; 3549 return -ENOMEM;
3550 3550
3551 /* preallocation can change ac_b_ex, thus we store actually 3551 /* preallocation can change ac_b_ex, thus we store actually
3552 * allocated blocks for history */ 3552 * allocated blocks for history */
3553 ac->ac_f_ex = ac->ac_b_ex; 3553 ac->ac_f_ex = ac->ac_b_ex;
3554 3554
3555 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 3555 pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3556 pa->pa_lstart = pa->pa_pstart; 3556 pa->pa_lstart = pa->pa_pstart;
3557 pa->pa_len = ac->ac_b_ex.fe_len; 3557 pa->pa_len = ac->ac_b_ex.fe_len;
3558 pa->pa_free = pa->pa_len; 3558 pa->pa_free = pa->pa_len;
3559 atomic_set(&pa->pa_count, 1); 3559 atomic_set(&pa->pa_count, 1);
3560 spin_lock_init(&pa->pa_lock); 3560 spin_lock_init(&pa->pa_lock);
3561 INIT_LIST_HEAD(&pa->pa_inode_list); 3561 INIT_LIST_HEAD(&pa->pa_inode_list);
3562 INIT_LIST_HEAD(&pa->pa_group_list); 3562 INIT_LIST_HEAD(&pa->pa_group_list);
3563 pa->pa_deleted = 0; 3563 pa->pa_deleted = 0;
3564 pa->pa_type = MB_GROUP_PA; 3564 pa->pa_type = MB_GROUP_PA;
3565 3565
3566 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, 3566 mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3567 pa->pa_pstart, pa->pa_len, pa->pa_lstart); 3567 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3568 trace_ext4_mb_new_group_pa(ac, pa); 3568 trace_ext4_mb_new_group_pa(ac, pa);
3569 3569
3570 ext4_mb_use_group_pa(ac, pa); 3570 ext4_mb_use_group_pa(ac, pa);
3571 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); 3571 atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3572 3572
3573 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); 3573 grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3574 lg = ac->ac_lg; 3574 lg = ac->ac_lg;
3575 BUG_ON(lg == NULL); 3575 BUG_ON(lg == NULL);
3576 3576
3577 pa->pa_obj_lock = &lg->lg_prealloc_lock; 3577 pa->pa_obj_lock = &lg->lg_prealloc_lock;
3578 pa->pa_inode = NULL; 3578 pa->pa_inode = NULL;
3579 3579
3580 ext4_lock_group(sb, ac->ac_b_ex.fe_group); 3580 ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3581 list_add(&pa->pa_group_list, &grp->bb_prealloc_list); 3581 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3582 ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 3582 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3583 3583
3584 /* 3584 /*
3585 * We will later add the new pa to the right bucket 3585 * We will later add the new pa to the right bucket
3586 * after updating the pa_free in ext4_mb_release_context 3586 * after updating the pa_free in ext4_mb_release_context
3587 */ 3587 */
3588 return 0; 3588 return 0;
3589 } 3589 }
3590 3590
3591 static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) 3591 static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3592 { 3592 {
3593 int err; 3593 int err;
3594 3594
3595 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 3595 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3596 err = ext4_mb_new_group_pa(ac); 3596 err = ext4_mb_new_group_pa(ac);
3597 else 3597 else
3598 err = ext4_mb_new_inode_pa(ac); 3598 err = ext4_mb_new_inode_pa(ac);
3599 return err; 3599 return err;
3600 } 3600 }
3601 3601
3602 /* 3602 /*
3603 * finds all unused blocks in on-disk bitmap, frees them in 3603 * finds all unused blocks in on-disk bitmap, frees them in
3604 * in-core bitmap and buddy. 3604 * in-core bitmap and buddy.
3605 * @pa must be unlinked from inode and group lists, so that 3605 * @pa must be unlinked from inode and group lists, so that
3606 * nobody else can find/use it. 3606 * nobody else can find/use it.
3607 * the caller MUST hold group/inode locks. 3607 * the caller MUST hold group/inode locks.
3608 * TODO: optimize the case when there are no in-core structures yet 3608 * TODO: optimize the case when there are no in-core structures yet
3609 */ 3609 */
3610 static noinline_for_stack int 3610 static noinline_for_stack int
3611 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, 3611 ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3612 struct ext4_prealloc_space *pa) 3612 struct ext4_prealloc_space *pa)
3613 { 3613 {
3614 struct super_block *sb = e4b->bd_sb; 3614 struct super_block *sb = e4b->bd_sb;
3615 struct ext4_sb_info *sbi = EXT4_SB(sb); 3615 struct ext4_sb_info *sbi = EXT4_SB(sb);
3616 unsigned int end; 3616 unsigned int end;
3617 unsigned int next; 3617 unsigned int next;
3618 ext4_group_t group; 3618 ext4_group_t group;
3619 ext4_grpblk_t bit; 3619 ext4_grpblk_t bit;
3620 unsigned long long grp_blk_start; 3620 unsigned long long grp_blk_start;
3621 int err = 0; 3621 int err = 0;
3622 int free = 0; 3622 int free = 0;
3623 3623
3624 BUG_ON(pa->pa_deleted == 0); 3624 BUG_ON(pa->pa_deleted == 0);
3625 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3625 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3626 grp_blk_start = pa->pa_pstart - bit; 3626 grp_blk_start = pa->pa_pstart - bit;
3627 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3627 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3628 end = bit + pa->pa_len; 3628 end = bit + pa->pa_len;
3629 3629
3630 while (bit < end) { 3630 while (bit < end) {
3631 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); 3631 bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3632 if (bit >= end) 3632 if (bit >= end)
3633 break; 3633 break;
3634 next = mb_find_next_bit(bitmap_bh->b_data, end, bit); 3634 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3635 mb_debug(1, " free preallocated %u/%u in group %u\n", 3635 mb_debug(1, " free preallocated %u/%u in group %u\n",
3636 (unsigned) ext4_group_first_block_no(sb, group) + bit, 3636 (unsigned) ext4_group_first_block_no(sb, group) + bit,
3637 (unsigned) next - bit, (unsigned) group); 3637 (unsigned) next - bit, (unsigned) group);
3638 free += next - bit; 3638 free += next - bit;
3639 3639
3640 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); 3640 trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3641 trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa, 3641 trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa,
3642 grp_blk_start + bit, next - bit); 3642 grp_blk_start + bit, next - bit);
3643 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); 3643 mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3644 bit = next + 1; 3644 bit = next + 1;
3645 } 3645 }
3646 if (free != pa->pa_free) { 3646 if (free != pa->pa_free) {
3647 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n", 3647 printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
3648 pa, (unsigned long) pa->pa_lstart, 3648 pa, (unsigned long) pa->pa_lstart,
3649 (unsigned long) pa->pa_pstart, 3649 (unsigned long) pa->pa_pstart,
3650 (unsigned long) pa->pa_len); 3650 (unsigned long) pa->pa_len);
3651 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", 3651 ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3652 free, pa->pa_free); 3652 free, pa->pa_free);
3653 /* 3653 /*
3654 * pa is already deleted so we use the value obtained 3654 * pa is already deleted so we use the value obtained
3655 * from the bitmap and continue. 3655 * from the bitmap and continue.
3656 */ 3656 */
3657 } 3657 }
3658 atomic_add(free, &sbi->s_mb_discarded); 3658 atomic_add(free, &sbi->s_mb_discarded);
3659 3659
3660 return err; 3660 return err;
3661 } 3661 }
3662 3662
3663 static noinline_for_stack int 3663 static noinline_for_stack int
3664 ext4_mb_release_group_pa(struct ext4_buddy *e4b, 3664 ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3665 struct ext4_prealloc_space *pa) 3665 struct ext4_prealloc_space *pa)
3666 { 3666 {
3667 struct super_block *sb = e4b->bd_sb; 3667 struct super_block *sb = e4b->bd_sb;
3668 ext4_group_t group; 3668 ext4_group_t group;
3669 ext4_grpblk_t bit; 3669 ext4_grpblk_t bit;
3670 3670
3671 trace_ext4_mb_release_group_pa(sb, pa); 3671 trace_ext4_mb_release_group_pa(sb, pa);
3672 BUG_ON(pa->pa_deleted == 0); 3672 BUG_ON(pa->pa_deleted == 0);
3673 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); 3673 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3674 BUG_ON(group != e4b->bd_group && pa->pa_len != 0); 3674 BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3675 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); 3675 mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3676 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); 3676 atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3677 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); 3677 trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3678 3678
3679 return 0; 3679 return 0;
3680 } 3680 }
3681 3681
3682 /* 3682 /*
3683 * releases all preallocations in given group 3683 * releases all preallocations in given group
3684 * 3684 *
3685 * first, we need to decide discard policy: 3685 * first, we need to decide discard policy:
3686 * - when do we discard 3686 * - when do we discard
3687 * 1) ENOSPC 3687 * 1) ENOSPC
3688 * - how many do we discard 3688 * - how many do we discard
3689 * 1) how many requested 3689 * 1) how many requested
3690 */ 3690 */
3691 static noinline_for_stack int 3691 static noinline_for_stack int
3692 ext4_mb_discard_group_preallocations(struct super_block *sb, 3692 ext4_mb_discard_group_preallocations(struct super_block *sb,
3693 ext4_group_t group, int needed) 3693 ext4_group_t group, int needed)
3694 { 3694 {
3695 struct ext4_group_info *grp = ext4_get_group_info(sb, group); 3695 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3696 struct buffer_head *bitmap_bh = NULL; 3696 struct buffer_head *bitmap_bh = NULL;
3697 struct ext4_prealloc_space *pa, *tmp; 3697 struct ext4_prealloc_space *pa, *tmp;
3698 struct list_head list; 3698 struct list_head list;
3699 struct ext4_buddy e4b; 3699 struct ext4_buddy e4b;
3700 int err; 3700 int err;
3701 int busy = 0; 3701 int busy = 0;
3702 int free = 0; 3702 int free = 0;
3703 3703
3704 mb_debug(1, "discard preallocation for group %u\n", group); 3704 mb_debug(1, "discard preallocation for group %u\n", group);
3705 3705
3706 if (list_empty(&grp->bb_prealloc_list)) 3706 if (list_empty(&grp->bb_prealloc_list))
3707 return 0; 3707 return 0;
3708 3708
3709 bitmap_bh = ext4_read_block_bitmap(sb, group); 3709 bitmap_bh = ext4_read_block_bitmap(sb, group);
3710 if (bitmap_bh == NULL) { 3710 if (bitmap_bh == NULL) {
3711 ext4_error(sb, "Error reading block bitmap for %u", group); 3711 ext4_error(sb, "Error reading block bitmap for %u", group);
3712 return 0; 3712 return 0;
3713 } 3713 }
3714 3714
3715 err = ext4_mb_load_buddy(sb, group, &e4b); 3715 err = ext4_mb_load_buddy(sb, group, &e4b);
3716 if (err) { 3716 if (err) {
3717 ext4_error(sb, "Error loading buddy information for %u", group); 3717 ext4_error(sb, "Error loading buddy information for %u", group);
3718 put_bh(bitmap_bh); 3718 put_bh(bitmap_bh);
3719 return 0; 3719 return 0;
3720 } 3720 }
3721 3721
3722 if (needed == 0) 3722 if (needed == 0)
3723 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; 3723 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3724 3724
3725 INIT_LIST_HEAD(&list); 3725 INIT_LIST_HEAD(&list);
3726 repeat: 3726 repeat:
3727 ext4_lock_group(sb, group); 3727 ext4_lock_group(sb, group);
3728 list_for_each_entry_safe(pa, tmp, 3728 list_for_each_entry_safe(pa, tmp,
3729 &grp->bb_prealloc_list, pa_group_list) { 3729 &grp->bb_prealloc_list, pa_group_list) {
3730 spin_lock(&pa->pa_lock); 3730 spin_lock(&pa->pa_lock);
3731 if (atomic_read(&pa->pa_count)) { 3731 if (atomic_read(&pa->pa_count)) {
3732 spin_unlock(&pa->pa_lock); 3732 spin_unlock(&pa->pa_lock);
3733 busy = 1; 3733 busy = 1;
3734 continue; 3734 continue;
3735 } 3735 }
3736 if (pa->pa_deleted) { 3736 if (pa->pa_deleted) {
3737 spin_unlock(&pa->pa_lock); 3737 spin_unlock(&pa->pa_lock);
3738 continue; 3738 continue;
3739 } 3739 }
3740 3740
3741 /* seems this one can be freed ... */ 3741 /* seems this one can be freed ... */
3742 pa->pa_deleted = 1; 3742 pa->pa_deleted = 1;
3743 3743
3744 /* we can trust pa_free ... */ 3744 /* we can trust pa_free ... */
3745 free += pa->pa_free; 3745 free += pa->pa_free;
3746 3746
3747 spin_unlock(&pa->pa_lock); 3747 spin_unlock(&pa->pa_lock);
3748 3748
3749 list_del(&pa->pa_group_list); 3749 list_del(&pa->pa_group_list);
3750 list_add(&pa->u.pa_tmp_list, &list); 3750 list_add(&pa->u.pa_tmp_list, &list);
3751 } 3751 }
3752 3752
3753 /* if we still need more blocks and some PAs were used, try again */ 3753 /* if we still need more blocks and some PAs were used, try again */
3754 if (free < needed && busy) { 3754 if (free < needed && busy) {
3755 busy = 0; 3755 busy = 0;
3756 ext4_unlock_group(sb, group); 3756 ext4_unlock_group(sb, group);
3757 /* 3757 /*
3758 * Yield the CPU here so that we don't get soft lockup 3758 * Yield the CPU here so that we don't get soft lockup
3759 * in non preempt case. 3759 * in non preempt case.
3760 */ 3760 */
3761 yield(); 3761 yield();
3762 goto repeat; 3762 goto repeat;
3763 } 3763 }
3764 3764
3765 /* found anything to free? */ 3765 /* found anything to free? */
3766 if (list_empty(&list)) { 3766 if (list_empty(&list)) {
3767 BUG_ON(free != 0); 3767 BUG_ON(free != 0);
3768 goto out; 3768 goto out;
3769 } 3769 }
3770 3770
3771 /* now free all selected PAs */ 3771 /* now free all selected PAs */
3772 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3772 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3773 3773
3774 /* remove from object (inode or locality group) */ 3774 /* remove from object (inode or locality group) */
3775 spin_lock(pa->pa_obj_lock); 3775 spin_lock(pa->pa_obj_lock);
3776 list_del_rcu(&pa->pa_inode_list); 3776 list_del_rcu(&pa->pa_inode_list);
3777 spin_unlock(pa->pa_obj_lock); 3777 spin_unlock(pa->pa_obj_lock);
3778 3778
3779 if (pa->pa_type == MB_GROUP_PA) 3779 if (pa->pa_type == MB_GROUP_PA)
3780 ext4_mb_release_group_pa(&e4b, pa); 3780 ext4_mb_release_group_pa(&e4b, pa);
3781 else 3781 else
3782 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 3782 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3783 3783
3784 list_del(&pa->u.pa_tmp_list); 3784 list_del(&pa->u.pa_tmp_list);
3785 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3785 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3786 } 3786 }
3787 3787
3788 out: 3788 out:
3789 ext4_unlock_group(sb, group); 3789 ext4_unlock_group(sb, group);
3790 ext4_mb_unload_buddy(&e4b); 3790 ext4_mb_unload_buddy(&e4b);
3791 put_bh(bitmap_bh); 3791 put_bh(bitmap_bh);
3792 return free; 3792 return free;
3793 } 3793 }
3794 3794
3795 /* 3795 /*
3796 * releases all non-used preallocated blocks for given inode 3796 * releases all non-used preallocated blocks for given inode
3797 * 3797 *
3798 * It's important to discard preallocations under i_data_sem 3798 * It's important to discard preallocations under i_data_sem
3799 * We don't want another block to be served from the prealloc 3799 * We don't want another block to be served from the prealloc
3800 * space when we are discarding the inode prealloc space. 3800 * space when we are discarding the inode prealloc space.
3801 * 3801 *
3802 * FIXME!! Make sure it is valid at all the call sites 3802 * FIXME!! Make sure it is valid at all the call sites
3803 */ 3803 */
3804 void ext4_discard_preallocations(struct inode *inode) 3804 void ext4_discard_preallocations(struct inode *inode)
3805 { 3805 {
3806 struct ext4_inode_info *ei = EXT4_I(inode); 3806 struct ext4_inode_info *ei = EXT4_I(inode);
3807 struct super_block *sb = inode->i_sb; 3807 struct super_block *sb = inode->i_sb;
3808 struct buffer_head *bitmap_bh = NULL; 3808 struct buffer_head *bitmap_bh = NULL;
3809 struct ext4_prealloc_space *pa, *tmp; 3809 struct ext4_prealloc_space *pa, *tmp;
3810 ext4_group_t group = 0; 3810 ext4_group_t group = 0;
3811 struct list_head list; 3811 struct list_head list;
3812 struct ext4_buddy e4b; 3812 struct ext4_buddy e4b;
3813 int err; 3813 int err;
3814 3814
3815 if (!S_ISREG(inode->i_mode)) { 3815 if (!S_ISREG(inode->i_mode)) {
3816 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ 3816 /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3817 return; 3817 return;
3818 } 3818 }
3819 3819
3820 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); 3820 mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3821 trace_ext4_discard_preallocations(inode); 3821 trace_ext4_discard_preallocations(inode);
3822 3822
3823 INIT_LIST_HEAD(&list); 3823 INIT_LIST_HEAD(&list);
3824 3824
3825 repeat: 3825 repeat:
3826 /* first, collect all pa's in the inode */ 3826 /* first, collect all pa's in the inode */
3827 spin_lock(&ei->i_prealloc_lock); 3827 spin_lock(&ei->i_prealloc_lock);
3828 while (!list_empty(&ei->i_prealloc_list)) { 3828 while (!list_empty(&ei->i_prealloc_list)) {
3829 pa = list_entry(ei->i_prealloc_list.next, 3829 pa = list_entry(ei->i_prealloc_list.next,
3830 struct ext4_prealloc_space, pa_inode_list); 3830 struct ext4_prealloc_space, pa_inode_list);
3831 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); 3831 BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3832 spin_lock(&pa->pa_lock); 3832 spin_lock(&pa->pa_lock);
3833 if (atomic_read(&pa->pa_count)) { 3833 if (atomic_read(&pa->pa_count)) {
3834 /* this shouldn't happen often - nobody should 3834 /* this shouldn't happen often - nobody should
3835 * use preallocation while we're discarding it */ 3835 * use preallocation while we're discarding it */
3836 spin_unlock(&pa->pa_lock); 3836 spin_unlock(&pa->pa_lock);
3837 spin_unlock(&ei->i_prealloc_lock); 3837 spin_unlock(&ei->i_prealloc_lock);
3838 printk(KERN_ERR "uh-oh! used pa while discarding\n"); 3838 printk(KERN_ERR "uh-oh! used pa while discarding\n");
3839 WARN_ON(1); 3839 WARN_ON(1);
3840 schedule_timeout_uninterruptible(HZ); 3840 schedule_timeout_uninterruptible(HZ);
3841 goto repeat; 3841 goto repeat;
3842 3842
3843 } 3843 }
3844 if (pa->pa_deleted == 0) { 3844 if (pa->pa_deleted == 0) {
3845 pa->pa_deleted = 1; 3845 pa->pa_deleted = 1;
3846 spin_unlock(&pa->pa_lock); 3846 spin_unlock(&pa->pa_lock);
3847 list_del_rcu(&pa->pa_inode_list); 3847 list_del_rcu(&pa->pa_inode_list);
3848 list_add(&pa->u.pa_tmp_list, &list); 3848 list_add(&pa->u.pa_tmp_list, &list);
3849 continue; 3849 continue;
3850 } 3850 }
3851 3851
3852 /* someone is deleting pa right now */ 3852 /* someone is deleting pa right now */
3853 spin_unlock(&pa->pa_lock); 3853 spin_unlock(&pa->pa_lock);
3854 spin_unlock(&ei->i_prealloc_lock); 3854 spin_unlock(&ei->i_prealloc_lock);
3855 3855
3856 /* we have to wait here because pa_deleted 3856 /* we have to wait here because pa_deleted
3857 * doesn't mean pa is already unlinked from 3857 * doesn't mean pa is already unlinked from
3858 * the list. as we might be called from 3858 * the list. as we might be called from
3859 * ->clear_inode() the inode will get freed 3859 * ->clear_inode() the inode will get freed
3860 * and concurrent thread which is unlinking 3860 * and concurrent thread which is unlinking
3861 * pa from inode's list may access already 3861 * pa from inode's list may access already
3862 * freed memory, bad-bad-bad */ 3862 * freed memory, bad-bad-bad */
3863 3863
3864 /* XXX: if this happens too often, we can 3864 /* XXX: if this happens too often, we can
3865 * add a flag to force wait only in case 3865 * add a flag to force wait only in case
3866 * of ->clear_inode(), but not in case of 3866 * of ->clear_inode(), but not in case of
3867 * regular truncate */ 3867 * regular truncate */
3868 schedule_timeout_uninterruptible(HZ); 3868 schedule_timeout_uninterruptible(HZ);
3869 goto repeat; 3869 goto repeat;
3870 } 3870 }
3871 spin_unlock(&ei->i_prealloc_lock); 3871 spin_unlock(&ei->i_prealloc_lock);
3872 3872
3873 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { 3873 list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3874 BUG_ON(pa->pa_type != MB_INODE_PA); 3874 BUG_ON(pa->pa_type != MB_INODE_PA);
3875 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 3875 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
3876 3876
3877 err = ext4_mb_load_buddy(sb, group, &e4b); 3877 err = ext4_mb_load_buddy(sb, group, &e4b);
3878 if (err) { 3878 if (err) {
3879 ext4_error(sb, "Error loading buddy information for %u", 3879 ext4_error(sb, "Error loading buddy information for %u",
3880 group); 3880 group);
3881 continue; 3881 continue;
3882 } 3882 }
3883 3883
3884 bitmap_bh = ext4_read_block_bitmap(sb, group); 3884 bitmap_bh = ext4_read_block_bitmap(sb, group);
3885 if (bitmap_bh == NULL) { 3885 if (bitmap_bh == NULL) {
3886 ext4_error(sb, "Error reading block bitmap for %u", 3886 ext4_error(sb, "Error reading block bitmap for %u",
3887 group); 3887 group);
3888 ext4_mb_unload_buddy(&e4b); 3888 ext4_mb_unload_buddy(&e4b);
3889 continue; 3889 continue;
3890 } 3890 }
3891 3891
3892 ext4_lock_group(sb, group); 3892 ext4_lock_group(sb, group);
3893 list_del(&pa->pa_group_list); 3893 list_del(&pa->pa_group_list);
3894 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); 3894 ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3895 ext4_unlock_group(sb, group); 3895 ext4_unlock_group(sb, group);
3896 3896
3897 ext4_mb_unload_buddy(&e4b); 3897 ext4_mb_unload_buddy(&e4b);
3898 put_bh(bitmap_bh); 3898 put_bh(bitmap_bh);
3899 3899
3900 list_del(&pa->u.pa_tmp_list); 3900 list_del(&pa->u.pa_tmp_list);
3901 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 3901 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3902 } 3902 }
3903 } 3903 }
3904 3904
3905 #ifdef CONFIG_EXT4_DEBUG 3905 #ifdef CONFIG_EXT4_DEBUG
3906 static void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3906 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3907 { 3907 {
3908 struct super_block *sb = ac->ac_sb; 3908 struct super_block *sb = ac->ac_sb;
3909 ext4_group_t ngroups, i; 3909 ext4_group_t ngroups, i;
3910 3910
3911 if (!mb_enable_debug || 3911 if (!mb_enable_debug ||
3912 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) 3912 (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3913 return; 3913 return;
3914 3914
3915 printk(KERN_ERR "EXT4-fs: Can't allocate:" 3915 printk(KERN_ERR "EXT4-fs: Can't allocate:"
3916 " Allocation context details:\n"); 3916 " Allocation context details:\n");
3917 printk(KERN_ERR "EXT4-fs: status %d flags %d\n", 3917 printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
3918 ac->ac_status, ac->ac_flags); 3918 ac->ac_status, ac->ac_flags);
3919 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, " 3919 printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
3920 "best %lu/%lu/%lu@%lu cr %d\n", 3920 "best %lu/%lu/%lu@%lu cr %d\n",
3921 (unsigned long)ac->ac_o_ex.fe_group, 3921 (unsigned long)ac->ac_o_ex.fe_group,
3922 (unsigned long)ac->ac_o_ex.fe_start, 3922 (unsigned long)ac->ac_o_ex.fe_start,
3923 (unsigned long)ac->ac_o_ex.fe_len, 3923 (unsigned long)ac->ac_o_ex.fe_len,
3924 (unsigned long)ac->ac_o_ex.fe_logical, 3924 (unsigned long)ac->ac_o_ex.fe_logical,
3925 (unsigned long)ac->ac_g_ex.fe_group, 3925 (unsigned long)ac->ac_g_ex.fe_group,
3926 (unsigned long)ac->ac_g_ex.fe_start, 3926 (unsigned long)ac->ac_g_ex.fe_start,
3927 (unsigned long)ac->ac_g_ex.fe_len, 3927 (unsigned long)ac->ac_g_ex.fe_len,
3928 (unsigned long)ac->ac_g_ex.fe_logical, 3928 (unsigned long)ac->ac_g_ex.fe_logical,
3929 (unsigned long)ac->ac_b_ex.fe_group, 3929 (unsigned long)ac->ac_b_ex.fe_group,
3930 (unsigned long)ac->ac_b_ex.fe_start, 3930 (unsigned long)ac->ac_b_ex.fe_start,
3931 (unsigned long)ac->ac_b_ex.fe_len, 3931 (unsigned long)ac->ac_b_ex.fe_len,
3932 (unsigned long)ac->ac_b_ex.fe_logical, 3932 (unsigned long)ac->ac_b_ex.fe_logical,
3933 (int)ac->ac_criteria); 3933 (int)ac->ac_criteria);
3934 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, 3934 printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
3935 ac->ac_found); 3935 ac->ac_found);
3936 printk(KERN_ERR "EXT4-fs: groups: \n"); 3936 printk(KERN_ERR "EXT4-fs: groups: \n");
3937 ngroups = ext4_get_groups_count(sb); 3937 ngroups = ext4_get_groups_count(sb);
3938 for (i = 0; i < ngroups; i++) { 3938 for (i = 0; i < ngroups; i++) {
3939 struct ext4_group_info *grp = ext4_get_group_info(sb, i); 3939 struct ext4_group_info *grp = ext4_get_group_info(sb, i);
3940 struct ext4_prealloc_space *pa; 3940 struct ext4_prealloc_space *pa;
3941 ext4_grpblk_t start; 3941 ext4_grpblk_t start;
3942 struct list_head *cur; 3942 struct list_head *cur;
3943 ext4_lock_group(sb, i); 3943 ext4_lock_group(sb, i);
3944 list_for_each(cur, &grp->bb_prealloc_list) { 3944 list_for_each(cur, &grp->bb_prealloc_list) {
3945 pa = list_entry(cur, struct ext4_prealloc_space, 3945 pa = list_entry(cur, struct ext4_prealloc_space,
3946 pa_group_list); 3946 pa_group_list);
3947 spin_lock(&pa->pa_lock); 3947 spin_lock(&pa->pa_lock);
3948 ext4_get_group_no_and_offset(sb, pa->pa_pstart, 3948 ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3949 NULL, &start); 3949 NULL, &start);
3950 spin_unlock(&pa->pa_lock); 3950 spin_unlock(&pa->pa_lock);
3951 printk(KERN_ERR "PA:%u:%d:%u \n", i, 3951 printk(KERN_ERR "PA:%u:%d:%u \n", i,
3952 start, pa->pa_len); 3952 start, pa->pa_len);
3953 } 3953 }
3954 ext4_unlock_group(sb, i); 3954 ext4_unlock_group(sb, i);
3955 3955
3956 if (grp->bb_free == 0) 3956 if (grp->bb_free == 0)
3957 continue; 3957 continue;
3958 printk(KERN_ERR "%u: %d/%d \n", 3958 printk(KERN_ERR "%u: %d/%d \n",
3959 i, grp->bb_free, grp->bb_fragments); 3959 i, grp->bb_free, grp->bb_fragments);
3960 } 3960 }
3961 printk(KERN_ERR "\n"); 3961 printk(KERN_ERR "\n");
3962 } 3962 }
3963 #else 3963 #else
3964 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) 3964 static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3965 { 3965 {
3966 return; 3966 return;
3967 } 3967 }
3968 #endif 3968 #endif
3969 3969
3970 /* 3970 /*
3971 * We use locality group preallocation for small files. The size of the 3971 * We use locality group preallocation for small files. The size of the
3972 * file is determined by the current size or the resulting size after 3972 * file is determined by the current size or the resulting size after
3973 * allocation, whichever is larger 3973 * allocation, whichever is larger
3974 * 3974 *
3975 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req 3975 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
3976 */ 3976 */
3977 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) 3977 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3978 { 3978 {
3979 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 3979 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3980 int bsbits = ac->ac_sb->s_blocksize_bits; 3980 int bsbits = ac->ac_sb->s_blocksize_bits;
3981 loff_t size, isize; 3981 loff_t size, isize;
3982 3982
3983 if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) 3983 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3984 return; 3984 return;
3985 3985
3986 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) 3986 if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3987 return; 3987 return;
3988 3988
3989 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; 3989 size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
3990 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) 3990 isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
3991 >> bsbits; 3991 >> bsbits;
3992 3992
3993 if ((size == isize) && 3993 if ((size == isize) &&
3994 !ext4_fs_is_busy(sbi) && 3994 !ext4_fs_is_busy(sbi) &&
3995 (atomic_read(&ac->ac_inode->i_writecount) == 0)) { 3995 (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
3996 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; 3996 ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
3997 return; 3997 return;
3998 } 3998 }
3999 3999
4000 /* don't use group allocation for large files */ 4000 /* don't use group allocation for large files */
4001 size = max(size, isize); 4001 size = max(size, isize);
4002 if (size > sbi->s_mb_stream_request) { 4002 if (size > sbi->s_mb_stream_request) {
4003 ac->ac_flags |= EXT4_MB_STREAM_ALLOC; 4003 ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4004 return; 4004 return;
4005 } 4005 }
4006 4006
4007 BUG_ON(ac->ac_lg != NULL); 4007 BUG_ON(ac->ac_lg != NULL);
4008 /* 4008 /*
4009 * locality group prealloc space is per cpu. The reason for having 4009 * locality group prealloc space is per cpu. The reason for having
4010 * per cpu locality groups is to reduce the contention between block 4010 * per cpu locality groups is to reduce the contention between block
4011 * requests from multiple CPUs. 4011 * requests from multiple CPUs.
4012 */ 4012 */
4013 ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); 4013 ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
4014 4014
4015 /* we're going to use group allocation */ 4015 /* we're going to use group allocation */
4016 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; 4016 ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4017 4017
4018 /* serialize all allocations in the group */ 4018 /* serialize all allocations in the group */
4019 mutex_lock(&ac->ac_lg->lg_mutex); 4019 mutex_lock(&ac->ac_lg->lg_mutex);
4020 } 4020 }
4021 4021
4022 static noinline_for_stack int 4022 static noinline_for_stack int
4023 ext4_mb_initialize_context(struct ext4_allocation_context *ac, 4023 ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4024 struct ext4_allocation_request *ar) 4024 struct ext4_allocation_request *ar)
4025 { 4025 {
4026 struct super_block *sb = ar->inode->i_sb; 4026 struct super_block *sb = ar->inode->i_sb;
4027 struct ext4_sb_info *sbi = EXT4_SB(sb); 4027 struct ext4_sb_info *sbi = EXT4_SB(sb);
4028 struct ext4_super_block *es = sbi->s_es; 4028 struct ext4_super_block *es = sbi->s_es;
4029 ext4_group_t group; 4029 ext4_group_t group;
4030 unsigned int len; 4030 unsigned int len;
4031 ext4_fsblk_t goal; 4031 ext4_fsblk_t goal;
4032 ext4_grpblk_t block; 4032 ext4_grpblk_t block;
4033 4033
4034 /* we can't allocate > group size */ 4034 /* we can't allocate > group size */
4035 len = ar->len; 4035 len = ar->len;
4036 4036
4037 /* just a dirty hack to filter too big requests */ 4037 /* just a dirty hack to filter too big requests */
4038 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10) 4038 if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
4039 len = EXT4_BLOCKS_PER_GROUP(sb) - 10; 4039 len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
4040 4040
4041 /* start searching from the goal */ 4041 /* start searching from the goal */
4042 goal = ar->goal; 4042 goal = ar->goal;
4043 if (goal < le32_to_cpu(es->s_first_data_block) || 4043 if (goal < le32_to_cpu(es->s_first_data_block) ||
4044 goal >= ext4_blocks_count(es)) 4044 goal >= ext4_blocks_count(es))
4045 goal = le32_to_cpu(es->s_first_data_block); 4045 goal = le32_to_cpu(es->s_first_data_block);
4046 ext4_get_group_no_and_offset(sb, goal, &group, &block); 4046 ext4_get_group_no_and_offset(sb, goal, &group, &block);
4047 4047
4048 /* set up allocation goals */ 4048 /* set up allocation goals */
4049 memset(ac, 0, sizeof(struct ext4_allocation_context)); 4049 memset(ac, 0, sizeof(struct ext4_allocation_context));
4050 ac->ac_b_ex.fe_logical = ar->logical; 4050 ac->ac_b_ex.fe_logical = ar->logical;
4051 ac->ac_status = AC_STATUS_CONTINUE; 4051 ac->ac_status = AC_STATUS_CONTINUE;
4052 ac->ac_sb = sb; 4052 ac->ac_sb = sb;
4053 ac->ac_inode = ar->inode; 4053 ac->ac_inode = ar->inode;
4054 ac->ac_o_ex.fe_logical = ar->logical; 4054 ac->ac_o_ex.fe_logical = ar->logical;
4055 ac->ac_o_ex.fe_group = group; 4055 ac->ac_o_ex.fe_group = group;
4056 ac->ac_o_ex.fe_start = block; 4056 ac->ac_o_ex.fe_start = block;
4057 ac->ac_o_ex.fe_len = len; 4057 ac->ac_o_ex.fe_len = len;
4058 ac->ac_g_ex.fe_logical = ar->logical; 4058 ac->ac_g_ex.fe_logical = ar->logical;
4059 ac->ac_g_ex.fe_group = group; 4059 ac->ac_g_ex.fe_group = group;
4060 ac->ac_g_ex.fe_start = block; 4060 ac->ac_g_ex.fe_start = block;
4061 ac->ac_g_ex.fe_len = len; 4061 ac->ac_g_ex.fe_len = len;
4062 ac->ac_flags = ar->flags; 4062 ac->ac_flags = ar->flags;
4063 4063
4064 /* we have to define context: will we work with a file or 4064 /* we have to define context: will we work with a file or
4065 * locality group. this is a policy, actually */ 4065 * locality group. this is a policy, actually */
4066 ext4_mb_group_or_file(ac); 4066 ext4_mb_group_or_file(ac);
4067 4067
4068 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " 4068 mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4069 "left: %u/%u, right %u/%u to %swritable\n", 4069 "left: %u/%u, right %u/%u to %swritable\n",
4070 (unsigned) ar->len, (unsigned) ar->logical, 4070 (unsigned) ar->len, (unsigned) ar->logical,
4071 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, 4071 (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4072 (unsigned) ar->lleft, (unsigned) ar->pleft, 4072 (unsigned) ar->lleft, (unsigned) ar->pleft,
4073 (unsigned) ar->lright, (unsigned) ar->pright, 4073 (unsigned) ar->lright, (unsigned) ar->pright,
4074 atomic_read(&ar->inode->i_writecount) ? "" : "non-"); 4074 atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4075 return 0; 4075 return 0;
4076 4076
4077 } 4077 }
4078 4078
4079 static noinline_for_stack void 4079 static noinline_for_stack void
4080 ext4_mb_discard_lg_preallocations(struct super_block *sb, 4080 ext4_mb_discard_lg_preallocations(struct super_block *sb,
4081 struct ext4_locality_group *lg, 4081 struct ext4_locality_group *lg,
4082 int order, int total_entries) 4082 int order, int total_entries)
4083 { 4083 {
4084 ext4_group_t group = 0; 4084 ext4_group_t group = 0;
4085 struct ext4_buddy e4b; 4085 struct ext4_buddy e4b;
4086 struct list_head discard_list; 4086 struct list_head discard_list;
4087 struct ext4_prealloc_space *pa, *tmp; 4087 struct ext4_prealloc_space *pa, *tmp;
4088 4088
4089 mb_debug(1, "discard locality group preallocation\n"); 4089 mb_debug(1, "discard locality group preallocation\n");
4090 4090
4091 INIT_LIST_HEAD(&discard_list); 4091 INIT_LIST_HEAD(&discard_list);
4092 4092
4093 spin_lock(&lg->lg_prealloc_lock); 4093 spin_lock(&lg->lg_prealloc_lock);
4094 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], 4094 list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4095 pa_inode_list) { 4095 pa_inode_list) {
4096 spin_lock(&pa->pa_lock); 4096 spin_lock(&pa->pa_lock);
4097 if (atomic_read(&pa->pa_count)) { 4097 if (atomic_read(&pa->pa_count)) {
4098 /* 4098 /*
4099 * This is the pa that we just used 4099 * This is the pa that we just used
4100 * for block allocation. So don't 4100 * for block allocation. So don't
4101 * free that 4101 * free that
4102 */ 4102 */
4103 spin_unlock(&pa->pa_lock); 4103 spin_unlock(&pa->pa_lock);
4104 continue; 4104 continue;
4105 } 4105 }
4106 if (pa->pa_deleted) { 4106 if (pa->pa_deleted) {
4107 spin_unlock(&pa->pa_lock); 4107 spin_unlock(&pa->pa_lock);
4108 continue; 4108 continue;
4109 } 4109 }
4110 /* only lg prealloc space */ 4110 /* only lg prealloc space */
4111 BUG_ON(pa->pa_type != MB_GROUP_PA); 4111 BUG_ON(pa->pa_type != MB_GROUP_PA);
4112 4112
4113 /* seems this one can be freed ... */ 4113 /* seems this one can be freed ... */
4114 pa->pa_deleted = 1; 4114 pa->pa_deleted = 1;
4115 spin_unlock(&pa->pa_lock); 4115 spin_unlock(&pa->pa_lock);
4116 4116
4117 list_del_rcu(&pa->pa_inode_list); 4117 list_del_rcu(&pa->pa_inode_list);
4118 list_add(&pa->u.pa_tmp_list, &discard_list); 4118 list_add(&pa->u.pa_tmp_list, &discard_list);
4119 4119
4120 total_entries--; 4120 total_entries--;
4121 if (total_entries <= 5) { 4121 if (total_entries <= 5) {
4122 /* 4122 /*
4123 * we want to keep only 5 entries 4123 * we want to keep only 5 entries
4124 * allowing it to grow to 8. This 4124 * allowing it to grow to 8. This
4125 * makes sure we don't call discard 4125 * makes sure we don't call discard
4126 * soon for this list. 4126 * soon for this list.
4127 */ 4127 */
4128 break; 4128 break;
4129 } 4129 }
4130 } 4130 }
4131 spin_unlock(&lg->lg_prealloc_lock); 4131 spin_unlock(&lg->lg_prealloc_lock);
4132 4132
4133 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { 4133 list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4134 4134
4135 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); 4135 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4136 if (ext4_mb_load_buddy(sb, group, &e4b)) { 4136 if (ext4_mb_load_buddy(sb, group, &e4b)) {
4137 ext4_error(sb, "Error loading buddy information for %u", 4137 ext4_error(sb, "Error loading buddy information for %u",
4138 group); 4138 group);
4139 continue; 4139 continue;
4140 } 4140 }
4141 ext4_lock_group(sb, group); 4141 ext4_lock_group(sb, group);
4142 list_del(&pa->pa_group_list); 4142 list_del(&pa->pa_group_list);
4143 ext4_mb_release_group_pa(&e4b, pa); 4143 ext4_mb_release_group_pa(&e4b, pa);
4144 ext4_unlock_group(sb, group); 4144 ext4_unlock_group(sb, group);
4145 4145
4146 ext4_mb_unload_buddy(&e4b); 4146 ext4_mb_unload_buddy(&e4b);
4147 list_del(&pa->u.pa_tmp_list); 4147 list_del(&pa->u.pa_tmp_list);
4148 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); 4148 call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4149 } 4149 }
4150 } 4150 }
4151 4151
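Taken together with the 8-entry check in ext4_mb_add_n_trim() further down, the early break above (stop discarding once only 5 entries remain) gives each per-order list simple hysteresis: it may grow to 8 preallocations, and a discard pulls it back to 5 so the next few additions do not immediately trigger another discard. A toy standalone model of that high/low watermark behaviour, using made-up counts:

#include <stdio.h>

#define HIGH_WATER 8	/* trigger a discard when the list exceeds this */
#define LOW_WATER  5	/* discard entries until only this many remain */

int main(void)
{
	int entries = 0;

	for (int added = 1; added <= 20; added++) {
		entries++;			/* a new PA was added to the list */
		if (entries > HIGH_WATER) {
			int discarded = entries - LOW_WATER;

			entries = LOW_WATER;	/* keep 5, release the rest */
			printf("add #%2d: discarded %d, %d left\n",
			       added, discarded, entries);
		}
	}
	printf("final list length: %d\n", entries);
	return 0;
}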
4152 /* 4152 /*
4153 * We have incremented pa_count. So it cannot be freed at this 4153 * We have incremented pa_count. So it cannot be freed at this
4154 * point. Also we hold lg_mutex. So no parallel allocation is 4154 * point. Also we hold lg_mutex. So no parallel allocation is
4155 * possible from this lg. That means pa_free cannot be updated. 4155 * possible from this lg. That means pa_free cannot be updated.
4156 * 4156 *
4157 * A parallel ext4_mb_discard_group_preallocations is possible. 4157 * A parallel ext4_mb_discard_group_preallocations is possible.
4158 * which can cause the lg_prealloc_list to be updated. 4158 * which can cause the lg_prealloc_list to be updated.
4159 */ 4159 */
4160 4160
4161 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) 4161 static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4162 { 4162 {
4163 int order, added = 0, lg_prealloc_count = 1; 4163 int order, added = 0, lg_prealloc_count = 1;
4164 struct super_block *sb = ac->ac_sb; 4164 struct super_block *sb = ac->ac_sb;
4165 struct ext4_locality_group *lg = ac->ac_lg; 4165 struct ext4_locality_group *lg = ac->ac_lg;
4166 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; 4166 struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4167 4167
4168 order = fls(pa->pa_free) - 1; 4168 order = fls(pa->pa_free) - 1;
4169 if (order > PREALLOC_TB_SIZE - 1) 4169 if (order > PREALLOC_TB_SIZE - 1)
4170 /* The max size of hash table is PREALLOC_TB_SIZE */ 4170 /* The max size of hash table is PREALLOC_TB_SIZE */
4171 order = PREALLOC_TB_SIZE - 1; 4171 order = PREALLOC_TB_SIZE - 1;
4172 /* Add the prealloc space to lg */ 4172 /* Add the prealloc space to lg */
4173 rcu_read_lock(); 4173 rcu_read_lock();
4174 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], 4174 list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4175 pa_inode_list) { 4175 pa_inode_list) {
4176 spin_lock(&tmp_pa->pa_lock); 4176 spin_lock(&tmp_pa->pa_lock);
4177 if (tmp_pa->pa_deleted) { 4177 if (tmp_pa->pa_deleted) {
4178 spin_unlock(&tmp_pa->pa_lock); 4178 spin_unlock(&tmp_pa->pa_lock);
4179 continue; 4179 continue;
4180 } 4180 }
4181 if (!added && pa->pa_free < tmp_pa->pa_free) { 4181 if (!added && pa->pa_free < tmp_pa->pa_free) {
4182 /* Add to the tail of the previous entry */ 4182 /* Add to the tail of the previous entry */
4183 list_add_tail_rcu(&pa->pa_inode_list, 4183 list_add_tail_rcu(&pa->pa_inode_list,
4184 &tmp_pa->pa_inode_list); 4184 &tmp_pa->pa_inode_list);
4185 added = 1; 4185 added = 1;
4186 /* 4186 /*
4187 * we want to count the total 4187 * we want to count the total
4188 * number of entries in the list 4188 * number of entries in the list
4189 */ 4189 */
4190 } 4190 }
4191 spin_unlock(&tmp_pa->pa_lock); 4191 spin_unlock(&tmp_pa->pa_lock);
4192 lg_prealloc_count++; 4192 lg_prealloc_count++;
4193 } 4193 }
4194 if (!added) 4194 if (!added)
4195 list_add_tail_rcu(&pa->pa_inode_list, 4195 list_add_tail_rcu(&pa->pa_inode_list,
4196 &lg->lg_prealloc_list[order]); 4196 &lg->lg_prealloc_list[order]);
4197 rcu_read_unlock(); 4197 rcu_read_unlock();
4198 4198
4199 /* Now trim the list to be not more than 8 elements */ 4199 /* Now trim the list to be not more than 8 elements */
4200 if (lg_prealloc_count > 8) { 4200 if (lg_prealloc_count > 8) {
4201 ext4_mb_discard_lg_preallocations(sb, lg, 4201 ext4_mb_discard_lg_preallocations(sb, lg,
4202 order, lg_prealloc_count); 4202 order, lg_prealloc_count);
4203 return; 4203 return;
4204 } 4204 }
4205 return ; 4205 return ;
4206 } 4206 }
4207 4207
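ext4_mb_add_n_trim() above picks the lg_prealloc_list bucket from fls(pa_free) - 1, capped at PREALLOC_TB_SIZE - 1, so preallocations with a similar amount of free space left end up on the same list. A standalone sketch of that bucketing; the fls_() helper and the table size used here are stand-ins for the kernel's fls() and PREALLOC_TB_SIZE.

#include <stdio.h>

#define PREALLOC_TB_SIZE 10	/* assumed table size, for illustration only */

/* Highest set bit position, 1-based; fls_(0) == 0, like the kernel's fls(). */
static int fls_(unsigned int x)
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

/* Which lg_prealloc_list index a PA with 'pa_free' blocks left would use. */
static int lg_order(unsigned int pa_free)
{
	int order = fls_(pa_free) - 1;

	if (order > PREALLOC_TB_SIZE - 1)
		order = PREALLOC_TB_SIZE - 1;
	return order;
}

int main(void)
{
	unsigned int sizes[] = { 1, 7, 8, 500, 100000 };

	for (int i = 0; i < 5; i++)
		printf("pa_free=%6u -> list %d\n", sizes[i], lg_order(sizes[i]));
	return 0;
}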
4208 /* 4208 /*
4209 * release all resources we used in allocation 4209 * release all resources we used in allocation
4210 */ 4210 */
4211 static int ext4_mb_release_context(struct ext4_allocation_context *ac) 4211 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4212 { 4212 {
4213 struct ext4_prealloc_space *pa = ac->ac_pa; 4213 struct ext4_prealloc_space *pa = ac->ac_pa;
4214 if (pa) { 4214 if (pa) {
4215 if (pa->pa_type == MB_GROUP_PA) { 4215 if (pa->pa_type == MB_GROUP_PA) {
4216 /* see comment in ext4_mb_use_group_pa() */ 4216 /* see comment in ext4_mb_use_group_pa() */
4217 spin_lock(&pa->pa_lock); 4217 spin_lock(&pa->pa_lock);
4218 pa->pa_pstart += ac->ac_b_ex.fe_len; 4218 pa->pa_pstart += ac->ac_b_ex.fe_len;
4219 pa->pa_lstart += ac->ac_b_ex.fe_len; 4219 pa->pa_lstart += ac->ac_b_ex.fe_len;
4220 pa->pa_free -= ac->ac_b_ex.fe_len; 4220 pa->pa_free -= ac->ac_b_ex.fe_len;
4221 pa->pa_len -= ac->ac_b_ex.fe_len; 4221 pa->pa_len -= ac->ac_b_ex.fe_len;
4222 spin_unlock(&pa->pa_lock); 4222 spin_unlock(&pa->pa_lock);
4223 } 4223 }
4224 } 4224 }
4225 if (ac->alloc_semp) 4225 if (ac->alloc_semp)
4226 up_read(ac->alloc_semp); 4226 up_read(ac->alloc_semp);
4227 if (pa) { 4227 if (pa) {
4228 /* 4228 /*
4229 * We want to add the pa to the right bucket. 4229 * We want to add the pa to the right bucket.
4230 * Remove it from the list and while adding 4230 * Remove it from the list and while adding
4231 * make sure the list to which we are adding 4231 * make sure the list to which we are adding
4232 * doesn't grow big. We need to release 4232 * doesn't grow big. We need to release
4233 * alloc_semp before calling ext4_mb_add_n_trim() 4233 * alloc_semp before calling ext4_mb_add_n_trim()
4234 */ 4234 */
4235 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) { 4235 if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4236 spin_lock(pa->pa_obj_lock); 4236 spin_lock(pa->pa_obj_lock);
4237 list_del_rcu(&pa->pa_inode_list); 4237 list_del_rcu(&pa->pa_inode_list);
4238 spin_unlock(pa->pa_obj_lock); 4238 spin_unlock(pa->pa_obj_lock);
4239 ext4_mb_add_n_trim(ac); 4239 ext4_mb_add_n_trim(ac);
4240 } 4240 }
4241 ext4_mb_put_pa(ac, ac->ac_sb, pa); 4241 ext4_mb_put_pa(ac, ac->ac_sb, pa);
4242 } 4242 }
4243 if (ac->ac_bitmap_page) 4243 if (ac->ac_bitmap_page)
4244 page_cache_release(ac->ac_bitmap_page); 4244 page_cache_release(ac->ac_bitmap_page);
4245 if (ac->ac_buddy_page) 4245 if (ac->ac_buddy_page)
4246 page_cache_release(ac->ac_buddy_page); 4246 page_cache_release(ac->ac_buddy_page);
4247 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) 4247 if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4248 mutex_unlock(&ac->ac_lg->lg_mutex); 4248 mutex_unlock(&ac->ac_lg->lg_mutex);
4249 ext4_mb_collect_stats(ac); 4249 ext4_mb_collect_stats(ac);
4250 return 0; 4250 return 0;
4251 } 4251 }
4252 4252
4253 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) 4253 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4254 { 4254 {
4255 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 4255 ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4256 int ret; 4256 int ret;
4257 int freed = 0; 4257 int freed = 0;
4258 4258
4259 trace_ext4_mb_discard_preallocations(sb, needed); 4259 trace_ext4_mb_discard_preallocations(sb, needed);
4260 for (i = 0; i < ngroups && needed > 0; i++) { 4260 for (i = 0; i < ngroups && needed > 0; i++) {
4261 ret = ext4_mb_discard_group_preallocations(sb, i, needed); 4261 ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4262 freed += ret; 4262 freed += ret;
4263 needed -= ret; 4263 needed -= ret;
4264 } 4264 }
4265 4265
4266 return freed; 4266 return freed;
4267 } 4267 }
4268 4268
4269 /* 4269 /*
4270 * Main entry point into mballoc to allocate blocks; 4270 * Main entry point into mballoc to allocate blocks;
4271 * it tries to use preallocation first, then falls back 4271 * it tries to use preallocation first, then falls back
4272 * to usual allocation 4272 * to usual allocation
4273 */ 4273 */
4274 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, 4274 ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4275 struct ext4_allocation_request *ar, int *errp) 4275 struct ext4_allocation_request *ar, int *errp)
4276 { 4276 {
4277 int freed; 4277 int freed;
4278 struct ext4_allocation_context *ac = NULL; 4278 struct ext4_allocation_context *ac = NULL;
4279 struct ext4_sb_info *sbi; 4279 struct ext4_sb_info *sbi;
4280 struct super_block *sb; 4280 struct super_block *sb;
4281 ext4_fsblk_t block = 0; 4281 ext4_fsblk_t block = 0;
4282 unsigned int inquota = 0; 4282 unsigned int inquota = 0;
4283 unsigned int reserv_blks = 0; 4283 unsigned int reserv_blks = 0;
4284 4284
4285 sb = ar->inode->i_sb; 4285 sb = ar->inode->i_sb;
4286 sbi = EXT4_SB(sb); 4286 sbi = EXT4_SB(sb);
4287 4287
4288 trace_ext4_request_blocks(ar); 4288 trace_ext4_request_blocks(ar);
4289 4289
4290 /* 4290 /*
4291 * For delayed allocation, we could skip the ENOSPC and 4291 * For delayed allocation, we could skip the ENOSPC and
4292 * EDQUOT check, as blocks and quotas have been already 4292 * EDQUOT check, as blocks and quotas have been already
4293 * reserved when data being copied into pagecache. 4293 * reserved when data being copied into pagecache.
4294 */ 4294 */
4295 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) 4295 if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4296 ar->flags |= EXT4_MB_DELALLOC_RESERVED; 4296 ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4297 else { 4297 else {
4298 /* Without delayed allocation we need to verify 4298 /* Without delayed allocation we need to verify
4299 * there are enough free blocks to do block allocation 4299 * there are enough free blocks to do block allocation
4300 * and verify allocation doesn't exceed the quota limits. 4300 * and verify allocation doesn't exceed the quota limits.
4301 */ 4301 */
4302 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { 4302 while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) {
4303 /* let others free the space */ 4303 /* let others free the space */
4304 yield(); 4304 yield();
4305 ar->len = ar->len >> 1; 4305 ar->len = ar->len >> 1;
4306 } 4306 }
4307 if (!ar->len) { 4307 if (!ar->len) {
4308 *errp = -ENOSPC; 4308 *errp = -ENOSPC;
4309 return 0; 4309 return 0;
4310 } 4310 }
4311 reserv_blks = ar->len; 4311 reserv_blks = ar->len;
4312 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) { 4312 while (ar->len && dquot_alloc_block(ar->inode, ar->len)) {
4313 ar->flags |= EXT4_MB_HINT_NOPREALLOC; 4313 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4314 ar->len--; 4314 ar->len--;
4315 } 4315 }
4316 inquota = ar->len; 4316 inquota = ar->len;
4317 if (ar->len == 0) { 4317 if (ar->len == 0) {
4318 *errp = -EDQUOT; 4318 *errp = -EDQUOT;
4319 goto out; 4319 goto out;
4320 } 4320 }
4321 } 4321 }
4322 4322
4323 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4323 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4324 if (!ac) { 4324 if (!ac) {
4325 ar->len = 0; 4325 ar->len = 0;
4326 *errp = -ENOMEM; 4326 *errp = -ENOMEM;
4327 goto out; 4327 goto out;
4328 } 4328 }
4329 4329
4330 *errp = ext4_mb_initialize_context(ac, ar); 4330 *errp = ext4_mb_initialize_context(ac, ar);
4331 if (*errp) { 4331 if (*errp) {
4332 ar->len = 0; 4332 ar->len = 0;
4333 goto out; 4333 goto out;
4334 } 4334 }
4335 4335
4336 ac->ac_op = EXT4_MB_HISTORY_PREALLOC; 4336 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4337 if (!ext4_mb_use_preallocated(ac)) { 4337 if (!ext4_mb_use_preallocated(ac)) {
4338 ac->ac_op = EXT4_MB_HISTORY_ALLOC; 4338 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4339 ext4_mb_normalize_request(ac, ar); 4339 ext4_mb_normalize_request(ac, ar);
4340 repeat: 4340 repeat:
4341 /* allocate space in core */ 4341 /* allocate space in core */
4342 *errp = ext4_mb_regular_allocator(ac); 4342 *errp = ext4_mb_regular_allocator(ac);
4343 if (*errp) 4343 if (*errp)
4344 goto errout; 4344 goto errout;
4345 4345
4346 /* as we've just preallocated more space than 4346 /* as we've just preallocated more space than
4347 * user originally requested, we store allocated 4347 * user originally requested, we store allocated
4348 * space in a special descriptor */ 4348 * space in a special descriptor */
4349 if (ac->ac_status == AC_STATUS_FOUND && 4349 if (ac->ac_status == AC_STATUS_FOUND &&
4350 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4350 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4351 ext4_mb_new_preallocation(ac); 4351 ext4_mb_new_preallocation(ac);
4352 } 4352 }
4353 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4353 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4354 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); 4354 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4355 if (*errp == -EAGAIN) { 4355 if (*errp == -EAGAIN) {
4356 /* 4356 /*
4357 * drop the reference that we took 4357 * drop the reference that we took
4358 * in ext4_mb_use_best_found 4358 * in ext4_mb_use_best_found
4359 */ 4359 */
4360 ext4_mb_release_context(ac); 4360 ext4_mb_release_context(ac);
4361 ac->ac_b_ex.fe_group = 0; 4361 ac->ac_b_ex.fe_group = 0;
4362 ac->ac_b_ex.fe_start = 0; 4362 ac->ac_b_ex.fe_start = 0;
4363 ac->ac_b_ex.fe_len = 0; 4363 ac->ac_b_ex.fe_len = 0;
4364 ac->ac_status = AC_STATUS_CONTINUE; 4364 ac->ac_status = AC_STATUS_CONTINUE;
4365 goto repeat; 4365 goto repeat;
4366 } else if (*errp) 4366 } else if (*errp)
4367 errout: 4367 errout:
4368 ext4_discard_allocated_blocks(ac); 4368 ext4_discard_allocated_blocks(ac);
4369 else { 4369 else {
4370 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4370 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4371 ar->len = ac->ac_b_ex.fe_len; 4371 ar->len = ac->ac_b_ex.fe_len;
4372 } 4372 }
4373 } else { 4373 } else {
4374 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); 4374 freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4375 if (freed) 4375 if (freed)
4376 goto repeat; 4376 goto repeat;
4377 *errp = -ENOSPC; 4377 *errp = -ENOSPC;
4378 } 4378 }
4379 4379
4380 if (*errp) { 4380 if (*errp) {
4381 ac->ac_b_ex.fe_len = 0; 4381 ac->ac_b_ex.fe_len = 0;
4382 ar->len = 0; 4382 ar->len = 0;
4383 ext4_mb_show_ac(ac); 4383 ext4_mb_show_ac(ac);
4384 } 4384 }
4385 ext4_mb_release_context(ac); 4385 ext4_mb_release_context(ac);
4386 out: 4386 out:
4387 if (ac) 4387 if (ac)
4388 kmem_cache_free(ext4_ac_cachep, ac); 4388 kmem_cache_free(ext4_ac_cachep, ac);
4389 if (inquota && ar->len < inquota) 4389 if (inquota && ar->len < inquota)
4390 dquot_free_block(ar->inode, inquota - ar->len); 4390 dquot_free_block(ar->inode, inquota - ar->len);
4391 if (!ar->len) { 4391 if (!ar->len) {
4392 if (!ext4_test_inode_state(ar->inode, 4392 if (!ext4_test_inode_state(ar->inode,
4393 EXT4_STATE_DELALLOC_RESERVED)) 4393 EXT4_STATE_DELALLOC_RESERVED))
4394 /* release all the reserved blocks if non delalloc */ 4394 /* release all the reserved blocks if non delalloc */
4395 percpu_counter_sub(&sbi->s_dirtyblocks_counter, 4395 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4396 reserv_blks); 4396 reserv_blks);
4397 } 4397 }
4398 4398
4399 trace_ext4_allocate_blocks(ar, (unsigned long long)block); 4399 trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4400 4400
4401 return block; 4401 return block;
4402 } 4402 }
4403 4403
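In the non-delalloc path above, ext4_mb_new_blocks() repeatedly halves the requested length until ext4_claim_free_blocks() succeeds, and only returns ENOSPC when the request shrinks to zero. A toy standalone model of that back-off, with a fake block pool standing in for the filesystem's free-block counter:

#include <stdio.h>

static unsigned long free_blocks = 100;	/* pretend pool, for illustration */

/* Stand-in for ext4_claim_free_blocks(): 0 on success, nonzero on failure. */
static int claim_free_blocks(unsigned long nblocks)
{
	if (nblocks > free_blocks)
		return -1;
	free_blocks -= nblocks;
	return 0;
}

int main(void)
{
	unsigned long len = 512;	/* requested allocation length */

	while (len && claim_free_blocks(len)) {
		/* let others free space, then retry with half the request */
		len >>= 1;
	}
	if (!len)
		printf("ENOSPC\n");
	else
		printf("claimed %lu blocks, %lu left in pool\n",
		       len, free_blocks);
	return 0;
}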
4404 /* 4404 /*
4405 * We can merge two free data extents only if the physical blocks 4405 * We can merge two free data extents only if the physical blocks
4406 * are contiguous, AND the extents were freed by the same transaction, 4406 * are contiguous, AND the extents were freed by the same transaction,
4407 * AND the blocks are associated with the same group. 4407 * AND the blocks are associated with the same group.
4408 */ 4408 */
4409 static int can_merge(struct ext4_free_data *entry1, 4409 static int can_merge(struct ext4_free_data *entry1,
4410 struct ext4_free_data *entry2) 4410 struct ext4_free_data *entry2)
4411 { 4411 {
4412 if ((entry1->t_tid == entry2->t_tid) && 4412 if ((entry1->t_tid == entry2->t_tid) &&
4413 (entry1->group == entry2->group) && 4413 (entry1->group == entry2->group) &&
4414 ((entry1->start_blk + entry1->count) == entry2->start_blk)) 4414 ((entry1->start_blk + entry1->count) == entry2->start_blk))
4415 return 1; 4415 return 1;
4416 return 0; 4416 return 0;
4417 } 4417 }
4418 4418
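can_merge() above only coalesces two pending free extents when they were freed by the same transaction, belong to the same group, and are physically back-to-back. A standalone illustration of the same predicate on a minimal stand-in struct:

#include <stdio.h>

struct free_data {
	unsigned int t_tid;	/* transaction that freed the extent */
	unsigned int group;	/* block group */
	unsigned int start_blk;	/* first block (group-relative) */
	unsigned int count;	/* number of blocks */
};

static int can_merge(const struct free_data *e1, const struct free_data *e2)
{
	return e1->t_tid == e2->t_tid &&
	       e1->group == e2->group &&
	       e1->start_blk + e1->count == e2->start_blk;
}

int main(void)
{
	struct free_data a = { .t_tid = 7, .group = 3, .start_blk = 100, .count = 50 };
	struct free_data b = { .t_tid = 7, .group = 3, .start_blk = 150, .count = 20 };
	struct free_data c = { .t_tid = 8, .group = 3, .start_blk = 170, .count = 10 };

	printf("a+b mergeable: %d\n", can_merge(&a, &b));	/* 1: adjacent, same tid/group */
	printf("b+c mergeable: %d\n", can_merge(&b, &c));	/* 0: different transaction */
	return 0;
}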
4419 static noinline_for_stack int 4419 static noinline_for_stack int
4420 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, 4420 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4421 struct ext4_free_data *new_entry) 4421 struct ext4_free_data *new_entry)
4422 { 4422 {
4423 ext4_group_t group = e4b->bd_group; 4423 ext4_group_t group = e4b->bd_group;
4424 ext4_grpblk_t block; 4424 ext4_grpblk_t block;
4425 struct ext4_free_data *entry; 4425 struct ext4_free_data *entry;
4426 struct ext4_group_info *db = e4b->bd_info; 4426 struct ext4_group_info *db = e4b->bd_info;
4427 struct super_block *sb = e4b->bd_sb; 4427 struct super_block *sb = e4b->bd_sb;
4428 struct ext4_sb_info *sbi = EXT4_SB(sb); 4428 struct ext4_sb_info *sbi = EXT4_SB(sb);
4429 struct rb_node **n = &db->bb_free_root.rb_node, *node; 4429 struct rb_node **n = &db->bb_free_root.rb_node, *node;
4430 struct rb_node *parent = NULL, *new_node; 4430 struct rb_node *parent = NULL, *new_node;
4431 4431
4432 BUG_ON(!ext4_handle_valid(handle)); 4432 BUG_ON(!ext4_handle_valid(handle));
4433 BUG_ON(e4b->bd_bitmap_page == NULL); 4433 BUG_ON(e4b->bd_bitmap_page == NULL);
4434 BUG_ON(e4b->bd_buddy_page == NULL); 4434 BUG_ON(e4b->bd_buddy_page == NULL);
4435 4435
4436 new_node = &new_entry->node; 4436 new_node = &new_entry->node;
4437 block = new_entry->start_blk; 4437 block = new_entry->start_blk;
4438 4438
4439 if (!*n) { 4439 if (!*n) {
4440 /* first free block extent. We need to 4440 /* first free block extent. We need to
4441 * protect buddy cache from being freed, 4441 * protect buddy cache from being freed,
4442 * otherwise we'll refresh it from 4442 * otherwise we'll refresh it from
4443 * on-disk bitmap and lose not-yet-available 4443 * on-disk bitmap and lose not-yet-available
4444 * blocks */ 4444 * blocks */
4445 page_cache_get(e4b->bd_buddy_page); 4445 page_cache_get(e4b->bd_buddy_page);
4446 page_cache_get(e4b->bd_bitmap_page); 4446 page_cache_get(e4b->bd_bitmap_page);
4447 } 4447 }
4448 while (*n) { 4448 while (*n) {
4449 parent = *n; 4449 parent = *n;
4450 entry = rb_entry(parent, struct ext4_free_data, node); 4450 entry = rb_entry(parent, struct ext4_free_data, node);
4451 if (block < entry->start_blk) 4451 if (block < entry->start_blk)
4452 n = &(*n)->rb_left; 4452 n = &(*n)->rb_left;
4453 else if (block >= (entry->start_blk + entry->count)) 4453 else if (block >= (entry->start_blk + entry->count))
4454 n = &(*n)->rb_right; 4454 n = &(*n)->rb_right;
4455 else { 4455 else {
4456 ext4_grp_locked_error(sb, group, 0, 4456 ext4_grp_locked_error(sb, group, 0,
4457 ext4_group_first_block_no(sb, group) + block, 4457 ext4_group_first_block_no(sb, group) + block,
4458 "Block already on to-be-freed list"); 4458 "Block already on to-be-freed list");
4459 return 0; 4459 return 0;
4460 } 4460 }
4461 } 4461 }
4462 4462
4463 rb_link_node(new_node, parent, n); 4463 rb_link_node(new_node, parent, n);
4464 rb_insert_color(new_node, &db->bb_free_root); 4464 rb_insert_color(new_node, &db->bb_free_root);
4465 4465
4466 /* Now try to see if the extent can be merged to the left and right */ 4466 /* Now try to see if the extent can be merged to the left and right */
4467 node = rb_prev(new_node); 4467 node = rb_prev(new_node);
4468 if (node) { 4468 if (node) {
4469 entry = rb_entry(node, struct ext4_free_data, node); 4469 entry = rb_entry(node, struct ext4_free_data, node);
4470 if (can_merge(entry, new_entry)) { 4470 if (can_merge(entry, new_entry)) {
4471 new_entry->start_blk = entry->start_blk; 4471 new_entry->start_blk = entry->start_blk;
4472 new_entry->count += entry->count; 4472 new_entry->count += entry->count;
4473 rb_erase(node, &(db->bb_free_root)); 4473 rb_erase(node, &(db->bb_free_root));
4474 spin_lock(&sbi->s_md_lock); 4474 spin_lock(&sbi->s_md_lock);
4475 list_del(&entry->list); 4475 list_del(&entry->list);
4476 spin_unlock(&sbi->s_md_lock); 4476 spin_unlock(&sbi->s_md_lock);
4477 kmem_cache_free(ext4_free_ext_cachep, entry); 4477 kmem_cache_free(ext4_free_ext_cachep, entry);
4478 } 4478 }
4479 } 4479 }
4480 4480
4481 node = rb_next(new_node); 4481 node = rb_next(new_node);
4482 if (node) { 4482 if (node) {
4483 entry = rb_entry(node, struct ext4_free_data, node); 4483 entry = rb_entry(node, struct ext4_free_data, node);
4484 if (can_merge(new_entry, entry)) { 4484 if (can_merge(new_entry, entry)) {
4485 new_entry->count += entry->count; 4485 new_entry->count += entry->count;
4486 rb_erase(node, &(db->bb_free_root)); 4486 rb_erase(node, &(db->bb_free_root));
4487 spin_lock(&sbi->s_md_lock); 4487 spin_lock(&sbi->s_md_lock);
4488 list_del(&entry->list); 4488 list_del(&entry->list);
4489 spin_unlock(&sbi->s_md_lock); 4489 spin_unlock(&sbi->s_md_lock);
4490 kmem_cache_free(ext4_free_ext_cachep, entry); 4490 kmem_cache_free(ext4_free_ext_cachep, entry);
4491 } 4491 }
4492 } 4492 }
4493 /* Add the extent to transaction's private list */ 4493 /* Add the extent to transaction's private list */
4494 spin_lock(&sbi->s_md_lock); 4494 spin_lock(&sbi->s_md_lock);
4495 list_add(&new_entry->list, &handle->h_transaction->t_private_list); 4495 list_add(&new_entry->list, &handle->h_transaction->t_private_list);
4496 spin_unlock(&sbi->s_md_lock); 4496 spin_unlock(&sbi->s_md_lock);
4497 return 0; 4497 return 0;
4498 } 4498 }
4499 4499
4500 /** 4500 /**
4501 * ext4_free_blocks() -- Free given blocks and update quota 4501 * ext4_free_blocks() -- Free given blocks and update quota
4502 * @handle: handle for this transaction 4502 * @handle: handle for this transaction
4503 * @inode: inode 4503 * @inode: inode
4504 * @block: start physical block to free 4504 * @block: start physical block to free
4505 * @count: number of blocks to free 4505 * @count: number of blocks to free
4506 * @metadata: Are these metadata blocks 4506 * @metadata: Are these metadata blocks
4507 */ 4507 */
4508 void ext4_free_blocks(handle_t *handle, struct inode *inode, 4508 void ext4_free_blocks(handle_t *handle, struct inode *inode,
4509 struct buffer_head *bh, ext4_fsblk_t block, 4509 struct buffer_head *bh, ext4_fsblk_t block,
4510 unsigned long count, int flags) 4510 unsigned long count, int flags)
4511 { 4511 {
4512 struct buffer_head *bitmap_bh = NULL; 4512 struct buffer_head *bitmap_bh = NULL;
4513 struct super_block *sb = inode->i_sb; 4513 struct super_block *sb = inode->i_sb;
4514 struct ext4_group_desc *gdp; 4514 struct ext4_group_desc *gdp;
4515 unsigned long freed = 0; 4515 unsigned long freed = 0;
4516 unsigned int overflow; 4516 unsigned int overflow;
4517 ext4_grpblk_t bit; 4517 ext4_grpblk_t bit;
4518 struct buffer_head *gd_bh; 4518 struct buffer_head *gd_bh;
4519 ext4_group_t block_group; 4519 ext4_group_t block_group;
4520 struct ext4_sb_info *sbi; 4520 struct ext4_sb_info *sbi;
4521 struct ext4_buddy e4b; 4521 struct ext4_buddy e4b;
4522 int err = 0; 4522 int err = 0;
4523 int ret; 4523 int ret;
4524 4524
4525 if (bh) { 4525 if (bh) {
4526 if (block) 4526 if (block)
4527 BUG_ON(block != bh->b_blocknr); 4527 BUG_ON(block != bh->b_blocknr);
4528 else 4528 else
4529 block = bh->b_blocknr; 4529 block = bh->b_blocknr;
4530 } 4530 }
4531 4531
4532 sbi = EXT4_SB(sb); 4532 sbi = EXT4_SB(sb);
4533 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 4533 if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4534 !ext4_data_block_valid(sbi, block, count)) { 4534 !ext4_data_block_valid(sbi, block, count)) {
4535 ext4_error(sb, "Freeing blocks not in datazone - " 4535 ext4_error(sb, "Freeing blocks not in datazone - "
4536 "block = %llu, count = %lu", block, count); 4536 "block = %llu, count = %lu", block, count);
4537 goto error_return; 4537 goto error_return;
4538 } 4538 }
4539 4539
4540 ext4_debug("freeing block %llu\n", block); 4540 ext4_debug("freeing block %llu\n", block);
4541 trace_ext4_free_blocks(inode, block, count, flags); 4541 trace_ext4_free_blocks(inode, block, count, flags);
4542 4542
4543 if (flags & EXT4_FREE_BLOCKS_FORGET) { 4543 if (flags & EXT4_FREE_BLOCKS_FORGET) {
4544 struct buffer_head *tbh = bh; 4544 struct buffer_head *tbh = bh;
4545 int i; 4545 int i;
4546 4546
4547 BUG_ON(bh && (count > 1)); 4547 BUG_ON(bh && (count > 1));
4548 4548
4549 for (i = 0; i < count; i++) { 4549 for (i = 0; i < count; i++) {
4550 if (!bh) 4550 if (!bh)
4551 tbh = sb_find_get_block(inode->i_sb, 4551 tbh = sb_find_get_block(inode->i_sb,
4552 block + i); 4552 block + i);
4553 if (unlikely(!tbh)) 4553 if (unlikely(!tbh))
4554 continue; 4554 continue;
4555 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4555 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4556 inode, tbh, block + i); 4556 inode, tbh, block + i);
4557 } 4557 }
4558 } 4558 }
4559 4559
4560 /* 4560 /*
4561 * We need to make sure we don't reuse the freed block until 4561 * We need to make sure we don't reuse the freed block until
4562 * after the transaction is committed, which we can do by 4562 * after the transaction is committed, which we can do by
4563 * treating the block as metadata, below. We make an 4563 * treating the block as metadata, below. We make an
4564 * exception if the inode is to be written in writeback mode 4564 * exception if the inode is to be written in writeback mode
4565 * since writeback mode has weak data consistency guarantees. 4565 * since writeback mode has weak data consistency guarantees.
4566 */ 4566 */
4567 if (!ext4_should_writeback_data(inode)) 4567 if (!ext4_should_writeback_data(inode))
4568 flags |= EXT4_FREE_BLOCKS_METADATA; 4568 flags |= EXT4_FREE_BLOCKS_METADATA;
4569 4569
4570 do_more: 4570 do_more:
4571 overflow = 0; 4571 overflow = 0;
4572 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4572 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4573 4573
4574 /* 4574 /*
4575 * Check to see if we are freeing blocks across a group 4575 * Check to see if we are freeing blocks across a group
4576 * boundary. 4576 * boundary.
4577 */ 4577 */
4578 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4578 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4579 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); 4579 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4580 count -= overflow; 4580 count -= overflow;
4581 } 4581 }
4582 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4582 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4583 if (!bitmap_bh) { 4583 if (!bitmap_bh) {
4584 err = -EIO; 4584 err = -EIO;
4585 goto error_return; 4585 goto error_return;
4586 } 4586 }
4587 gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 4587 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4588 if (!gdp) { 4588 if (!gdp) {
4589 err = -EIO; 4589 err = -EIO;
4590 goto error_return; 4590 goto error_return;
4591 } 4591 }
4592 4592
4593 if (in_range(ext4_block_bitmap(sb, gdp), block, count) || 4593 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4594 in_range(ext4_inode_bitmap(sb, gdp), block, count) || 4594 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4595 in_range(block, ext4_inode_table(sb, gdp), 4595 in_range(block, ext4_inode_table(sb, gdp),
4596 EXT4_SB(sb)->s_itb_per_group) || 4596 EXT4_SB(sb)->s_itb_per_group) ||
4597 in_range(block + count - 1, ext4_inode_table(sb, gdp), 4597 in_range(block + count - 1, ext4_inode_table(sb, gdp),
4598 EXT4_SB(sb)->s_itb_per_group)) { 4598 EXT4_SB(sb)->s_itb_per_group)) {
4599 4599
4600 ext4_error(sb, "Freeing blocks in system zone - " 4600 ext4_error(sb, "Freeing blocks in system zone - "
4601 "Block = %llu, count = %lu", block, count); 4601 "Block = %llu, count = %lu", block, count);
4602 /* err = 0. ext4_std_error should be a no op */ 4602 /* err = 0. ext4_std_error should be a no op */
4603 goto error_return; 4603 goto error_return;
4604 } 4604 }
4605 4605
4606 BUFFER_TRACE(bitmap_bh, "getting write access"); 4606 BUFFER_TRACE(bitmap_bh, "getting write access");
4607 err = ext4_journal_get_write_access(handle, bitmap_bh); 4607 err = ext4_journal_get_write_access(handle, bitmap_bh);
4608 if (err) 4608 if (err)
4609 goto error_return; 4609 goto error_return;
4610 4610
4611 /* 4611 /*
4612 * We are about to modify some metadata. Call the journal APIs 4612 * We are about to modify some metadata. Call the journal APIs
4613 * to unshare ->b_data if a currently-committing transaction is 4613 * to unshare ->b_data if a currently-committing transaction is
4614 * using it 4614 * using it
4615 */ 4615 */
4616 BUFFER_TRACE(gd_bh, "get_write_access"); 4616 BUFFER_TRACE(gd_bh, "get_write_access");
4617 err = ext4_journal_get_write_access(handle, gd_bh); 4617 err = ext4_journal_get_write_access(handle, gd_bh);
4618 if (err) 4618 if (err)
4619 goto error_return; 4619 goto error_return;
4620 #ifdef AGGRESSIVE_CHECK 4620 #ifdef AGGRESSIVE_CHECK
4621 { 4621 {
4622 int i; 4622 int i;
4623 for (i = 0; i < count; i++) 4623 for (i = 0; i < count; i++)
4624 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 4624 BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4625 } 4625 }
4626 #endif 4626 #endif
4627 trace_ext4_mballoc_free(sb, inode, block_group, bit, count); 4627 trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
4628 4628
4629 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4629 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4630 if (err) 4630 if (err)
4631 goto error_return; 4631 goto error_return;
4632 4632
4633 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) { 4633 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4634 struct ext4_free_data *new_entry; 4634 struct ext4_free_data *new_entry;
4635 /* 4635 /*
4636 * blocks being freed are metadata. these blocks shouldn't 4636 * blocks being freed are metadata. these blocks shouldn't
4637 * be used until this transaction is committed 4637 * be used until this transaction is committed
4638 */ 4638 */
4639 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); 4639 new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4640 if (!new_entry) { 4640 if (!new_entry) {
4641 err = -ENOMEM; 4641 err = -ENOMEM;
4642 goto error_return; 4642 goto error_return;
4643 } 4643 }
4644 new_entry->start_blk = bit; 4644 new_entry->start_blk = bit;
4645 new_entry->group = block_group; 4645 new_entry->group = block_group;
4646 new_entry->count = count; 4646 new_entry->count = count;
4647 new_entry->t_tid = handle->h_transaction->t_tid; 4647 new_entry->t_tid = handle->h_transaction->t_tid;
4648 4648
4649 ext4_lock_group(sb, block_group); 4649 ext4_lock_group(sb, block_group);
4650 mb_clear_bits(bitmap_bh->b_data, bit, count); 4650 mb_clear_bits(bitmap_bh->b_data, bit, count);
4651 ext4_mb_free_metadata(handle, &e4b, new_entry); 4651 ext4_mb_free_metadata(handle, &e4b, new_entry);
4652 } else { 4652 } else {
4653 /* need to update group_info->bb_free and bitmap 4653 /* need to update group_info->bb_free and bitmap
4654 * with group lock held. generate_buddy looks at 4654 * with group lock held. generate_buddy looks at
4655 * them with the group lock held 4655 * them with the group lock held
4656 */ 4656 */
4657 ext4_lock_group(sb, block_group); 4657 ext4_lock_group(sb, block_group);
4658 mb_clear_bits(bitmap_bh->b_data, bit, count); 4658 mb_clear_bits(bitmap_bh->b_data, bit, count);
4659 mb_free_blocks(inode, &e4b, bit, count); 4659 mb_free_blocks(inode, &e4b, bit, count);
4660 } 4660 }
4661 4661
4662 ret = ext4_free_blks_count(sb, gdp) + count; 4662 ret = ext4_free_blks_count(sb, gdp) + count;
4663 ext4_free_blks_set(sb, gdp, ret); 4663 ext4_free_blks_set(sb, gdp, ret);
4664 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); 4664 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4665 ext4_unlock_group(sb, block_group); 4665 ext4_unlock_group(sb, block_group);
4666 percpu_counter_add(&sbi->s_freeblocks_counter, count); 4666 percpu_counter_add(&sbi->s_freeblocks_counter, count);
4667 4667
4668 if (sbi->s_log_groups_per_flex) { 4668 if (sbi->s_log_groups_per_flex) {
4669 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4669 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4670 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks); 4670 atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4671 } 4671 }
4672 4672
4673 ext4_mb_unload_buddy(&e4b); 4673 ext4_mb_unload_buddy(&e4b);
4674 4674
4675 freed += count; 4675 freed += count;
4676 4676
4677 /* We dirtied the bitmap block */ 4677 /* We dirtied the bitmap block */
4678 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4678 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4679 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4679 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4680 4680
4681 /* And the group descriptor block */ 4681 /* And the group descriptor block */
4682 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4682 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4683 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 4683 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4684 if (!err) 4684 if (!err)
4685 err = ret; 4685 err = ret;
4686 4686
4687 if (overflow && !err) { 4687 if (overflow && !err) {
4688 block += count; 4688 block += count;
4689 count = overflow; 4689 count = overflow;
4690 put_bh(bitmap_bh); 4690 put_bh(bitmap_bh);
4691 goto do_more; 4691 goto do_more;
4692 } 4692 }
4693 ext4_mark_super_dirty(sb); 4693 ext4_mark_super_dirty(sb);
4694 error_return: 4694 error_return:
4695 if (freed) 4695 if (freed)
4696 dquot_free_block(inode, freed); 4696 dquot_free_block(inode, freed);
4697 brelse(bitmap_bh); 4697 brelse(bitmap_bh);
4698 ext4_std_error(sb, err); 4698 ext4_std_error(sb, err);
4699 return; 4699 return;
4700 } 4700 }
4701 4701
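The do_more loop above frees at most one group's worth of blocks per pass: when bit + count crosses EXT4_BLOCKS_PER_GROUP(sb), the overflow is held back and the loop restarts in the next group. A standalone sketch of that splitting, with illustrative constants rather than the real superblock fields:

#include <stdio.h>

#define FIRST_DATA_BLOCK 1UL	/* illustrative, as in the sketch further up */
#define BLOCKS_PER_GROUP 8192UL	/* assumed EXT4_BLOCKS_PER_GROUP(sb) */

int main(void)
{
	unsigned long block = 8000 + FIRST_DATA_BLOCK;	/* start of the range to free */
	unsigned long count = 1000;			/* spans a group boundary */

	while (count) {
		unsigned long group = (block - FIRST_DATA_BLOCK) / BLOCKS_PER_GROUP;
		unsigned long bit   = (block - FIRST_DATA_BLOCK) % BLOCKS_PER_GROUP;
		unsigned long overflow = 0;
		unsigned long chunk = count;

		/* Clamp this pass to the end of the current group. */
		if (bit + chunk > BLOCKS_PER_GROUP) {
			overflow = bit + chunk - BLOCKS_PER_GROUP;
			chunk -= overflow;
		}
		printf("free group %lu, bits [%lu, %lu)\n", group, bit, bit + chunk);

		block += chunk;		/* the "goto do_more" with the remainder */
		count = overflow;
	}
	return 0;
}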
4702 /** 4702 /**
4703 * ext4_add_groupblocks() -- Add given blocks to an existing group 4703 * ext4_add_groupblocks() -- Add given blocks to an existing group
4704 * @handle: handle to this transaction 4704 * @handle: handle to this transaction
4705 * @sb: super block 4705 * @sb: super block
4706 * @block: start physical block to add to the block group 4706 * @block: start physical block to add to the block group
4707 * @count: number of blocks to free 4707 * @count: number of blocks to free
4708 * 4708 *
4709 * This marks the blocks as free in the bitmap. We ask the 4709 * This marks the blocks as free in the bitmap. We ask the
4710 * mballoc to reload the buddy after this by setting group 4710 * mballoc to reload the buddy after this by setting group
4711 * EXT4_GROUP_INFO_NEED_INIT_BIT flag 4711 * EXT4_GROUP_INFO_NEED_INIT_BIT flag
4712 */ 4712 */
4713 void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 4713 void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4714 ext4_fsblk_t block, unsigned long count) 4714 ext4_fsblk_t block, unsigned long count)
4715 { 4715 {
4716 struct buffer_head *bitmap_bh = NULL; 4716 struct buffer_head *bitmap_bh = NULL;
4717 struct buffer_head *gd_bh; 4717 struct buffer_head *gd_bh;
4718 ext4_group_t block_group; 4718 ext4_group_t block_group;
4719 ext4_grpblk_t bit; 4719 ext4_grpblk_t bit;
4720 unsigned int i; 4720 unsigned int i;
4721 struct ext4_group_desc *desc; 4721 struct ext4_group_desc *desc;
4722 struct ext4_sb_info *sbi = EXT4_SB(sb); 4722 struct ext4_sb_info *sbi = EXT4_SB(sb);
4723 int err = 0, ret, blk_free_count; 4723 int err = 0, ret, blk_free_count;
4724 ext4_grpblk_t blocks_freed; 4724 ext4_grpblk_t blocks_freed;
4725 struct ext4_group_info *grp; 4725 struct ext4_group_info *grp;
4726 4726
4727 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 4727 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4728 4728
4729 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); 4729 ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4730 grp = ext4_get_group_info(sb, block_group); 4730 grp = ext4_get_group_info(sb, block_group);
4731 /* 4731 /*
4732 * Check to see if we are freeing blocks across a group 4732 * Check to see if we are freeing blocks across a group
4733 * boundary. 4733 * boundary.
4734 */ 4734 */
4735 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { 4735 if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4736 goto error_return; 4736 goto error_return;
4737 } 4737
4738 bitmap_bh = ext4_read_block_bitmap(sb, block_group); 4738 bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4739 if (!bitmap_bh) 4739 if (!bitmap_bh)
4740 goto error_return; 4740 goto error_return;
4741 desc = ext4_get_group_desc(sb, block_group, &gd_bh); 4741 desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4742 if (!desc) 4742 if (!desc)
4743 goto error_return; 4743 goto error_return;
4744 4744
4745 if (in_range(ext4_block_bitmap(sb, desc), block, count) || 4745 if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4746 in_range(ext4_inode_bitmap(sb, desc), block, count) || 4746 in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4747 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || 4747 in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4748 in_range(block + count - 1, ext4_inode_table(sb, desc), 4748 in_range(block + count - 1, ext4_inode_table(sb, desc),
4749 sbi->s_itb_per_group)) { 4749 sbi->s_itb_per_group)) {
4750 ext4_error(sb, "Adding blocks in system zones - " 4750 ext4_error(sb, "Adding blocks in system zones - "
4751 "Block = %llu, count = %lu", 4751 "Block = %llu, count = %lu",
4752 block, count); 4752 block, count);
4753 goto error_return; 4753 goto error_return;
4754 } 4754 }
4755 4755
4756 /* 4756 BUFFER_TRACE(bitmap_bh, "getting write access");
4757 * We are about to add blocks to the bitmap, 4757 err = ext4_journal_get_write_access(handle, bitmap_bh);
4758 * so we need undo access.
4759 */
4760 BUFFER_TRACE(bitmap_bh, "getting undo access");
4761 err = ext4_journal_get_undo_access(handle, bitmap_bh);
4762 if (err) 4758 if (err)
4763 goto error_return; 4759 goto error_return;
4764 4760
4765 /* 4761 /*
4766 * We are about to modify some metadata. Call the journal APIs 4762 * We are about to modify some metadata. Call the journal APIs
4767 * to unshare ->b_data if a currently-committing transaction is 4763 * to unshare ->b_data if a currently-committing transaction is
4768 * using it 4764 * using it
4769 */ 4765 */
4770 BUFFER_TRACE(gd_bh, "get_write_access"); 4766 BUFFER_TRACE(gd_bh, "get_write_access");
4771 err = ext4_journal_get_write_access(handle, gd_bh); 4767 err = ext4_journal_get_write_access(handle, gd_bh);
4772 if (err) 4768 if (err)
4773 goto error_return; 4769 goto error_return;
4774 /* 4770 /*
4775 * make sure we don't allow a parallel init on other groups in the 4771 * make sure we don't allow a parallel init on other groups in the
4776 * same buddy cache 4772 * same buddy cache
4777 */ 4773 */
4778 down_write(&grp->alloc_sem); 4774 down_write(&grp->alloc_sem);
4779 for (i = 0, blocks_freed = 0; i < count; i++) { 4775 for (i = 0, blocks_freed = 0; i < count; i++) {
4780 BUFFER_TRACE(bitmap_bh, "clear bit"); 4776 BUFFER_TRACE(bitmap_bh, "clear bit");
4781 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 4777 if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
4782 bit + i, bitmap_bh->b_data)) { 4778 bit + i, bitmap_bh->b_data)) {
4783 ext4_error(sb, "bit already cleared for block %llu", 4779 ext4_error(sb, "bit already cleared for block %llu",
4784 (ext4_fsblk_t)(block + i)); 4780 (ext4_fsblk_t)(block + i));
4785 BUFFER_TRACE(bitmap_bh, "bit already cleared"); 4781 BUFFER_TRACE(bitmap_bh, "bit already cleared");
4786 } else { 4782 } else {
4787 blocks_freed++; 4783 blocks_freed++;
4788 } 4784 }
4789 } 4785 }
4790 ext4_lock_group(sb, block_group); 4786 ext4_lock_group(sb, block_group);
4791 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); 4787 blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4792 ext4_free_blks_set(sb, desc, blk_free_count); 4788 ext4_free_blks_set(sb, desc, blk_free_count);
4793 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); 4789 desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4794 ext4_unlock_group(sb, block_group); 4790 ext4_unlock_group(sb, block_group);
4795 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); 4791 percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4796 4792
4797 if (sbi->s_log_groups_per_flex) { 4793 if (sbi->s_log_groups_per_flex) {
4798 ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 4794 ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4799 atomic_add(blocks_freed, 4795 atomic_add(blocks_freed,
4800 &sbi->s_flex_groups[flex_group].free_blocks); 4796 &sbi->s_flex_groups[flex_group].free_blocks);
4801 } 4797 }
4802 /* 4798 /*
4803 * request to reload the buddy with the 4799 * request to reload the buddy with the
4804 * new bitmap information 4800 * new bitmap information
4805 */ 4801 */
4806 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 4802 set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
4807 grp->bb_free += blocks_freed; 4803 grp->bb_free += blocks_freed;
4808 up_write(&grp->alloc_sem); 4804 up_write(&grp->alloc_sem);
4809 4805
4810 /* We dirtied the bitmap block */ 4806 /* We dirtied the bitmap block */
4811 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4807 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4812 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4808 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4813 4809
4814 /* And the group descriptor block */ 4810 /* And the group descriptor block */
4815 BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 4811 BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4816 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 4812 ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4817 if (!err) 4813 if (!err)
4818 err = ret; 4814 err = ret;
4819 4815
4820 error_return: 4816 error_return:
4821 brelse(bitmap_bh); 4817 brelse(bitmap_bh);
4822 ext4_std_error(sb, err); 4818 ext4_std_error(sb, err);
4823 return; 4819 return;
4824 } 4820 }
4825 4821
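The per-block loop above only counts a block as freed when its bitmap bit was actually set; an already-clear bit is reported instead of inflating blocks_freed. A standalone model of that accounting over a plain byte-array bitmap (ext4 itself uses ext4_clear_bit_atomic() under the group lock):

#include <stdio.h>
#include <string.h>

#define NBITS 64

/* Clear bit 'nr'; return its previous value (1 if it was set). */
static int test_and_clear_bit(unsigned int nr, unsigned char *map)
{
	int old = (map[nr / 8] >> (nr % 8)) & 1;

	map[nr / 8] &= ~(1u << (nr % 8));
	return old;
}

int main(void)
{
	unsigned char bitmap[NBITS / 8];
	unsigned int bit = 10, count = 5, blocks_freed = 0;

	memset(bitmap, 0xff, sizeof(bitmap));	/* all blocks marked in use */
	bitmap[1] &= ~(1u << 4);		/* bit 12 already clear: simulated double free */

	for (unsigned int i = 0; i < count; i++) {
		if (!test_and_clear_bit(bit + i, bitmap))
			printf("bit already cleared for block %u\n", bit + i);
		else
			blocks_freed++;
	}
	printf("blocks_freed = %u of %u\n", blocks_freed, count);	/* 4 of 5 */
	return 0;
}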
4826 /** 4822 /**
4827 * ext4_trim_extent -- function to TRIM one single free extent in the group 4823 * ext4_trim_extent -- function to TRIM one single free extent in the group
4828 * @sb: super block for the file system 4824 * @sb: super block for the file system
4829 * @start: starting block of the free extent in the alloc. group 4825 * @start: starting block of the free extent in the alloc. group
4830 * @count: number of blocks to TRIM 4826 * @count: number of blocks to TRIM
4831 * @group: alloc. group we are working with 4827 * @group: alloc. group we are working with
4832 * @e4b: ext4 buddy for the group 4828 * @e4b: ext4 buddy for the group
4833 * 4829 *
4834 * Trim "count" blocks starting at "start" in the "group". To assure that no 4830 * Trim "count" blocks starting at "start" in the "group". To assure that no
4835 * one will allocate those blocks, mark them as used in the buddy bitmap. This must 4831 * one will allocate those blocks, mark them as used in the buddy bitmap. This must
4836 * be called under the group lock. 4832 * be called under the group lock.
4837 */ 4833 */
4838 static void ext4_trim_extent(struct super_block *sb, int start, int count, 4834 static void ext4_trim_extent(struct super_block *sb, int start, int count,
4839 ext4_group_t group, struct ext4_buddy *e4b) 4835 ext4_group_t group, struct ext4_buddy *e4b)
4840 { 4836 {
4841 struct ext4_free_extent ex; 4837 struct ext4_free_extent ex;
4842 4838
4843 assert_spin_locked(ext4_group_lock_ptr(sb, group)); 4839 assert_spin_locked(ext4_group_lock_ptr(sb, group));
4844 4840
4845 ex.fe_start = start; 4841 ex.fe_start = start;
4846 ex.fe_group = group; 4842 ex.fe_group = group;
4847 ex.fe_len = count; 4843 ex.fe_len = count;
4848 4844
4849 /* 4845 /*
4850 * Mark blocks used, so no one can reuse them while 4846 * Mark blocks used, so no one can reuse them while
4851 * being trimmed. 4847 * being trimmed.
4852 */ 4848 */
4853 mb_mark_used(e4b, &ex); 4849 mb_mark_used(e4b, &ex);
4854 ext4_unlock_group(sb, group); 4850 ext4_unlock_group(sb, group);
4855 ext4_issue_discard(sb, group, start, count); 4851 ext4_issue_discard(sb, group, start, count);
4856 ext4_lock_group(sb, group); 4852 ext4_lock_group(sb, group);
4857 mb_free_blocks(NULL, e4b, start, ex.fe_len); 4853 mb_free_blocks(NULL, e4b, start, ex.fe_len);
4858 } 4854 }
4859 4855
4860 /** 4856 /**
4861 * ext4_trim_all_free -- function to trim all free space in alloc. group 4857 * ext4_trim_all_free -- function to trim all free space in alloc. group
4862 * @sb: super block for file system 4858 * @sb: super block for file system
4863 * @e4b: ext4 buddy 4859 * @e4b: ext4 buddy
4864 * @start: first group block to examine 4860 * @start: first group block to examine
4865 * @max: last group block to examine 4861 * @max: last group block to examine
4866 * @minblocks: minimum extent block count 4862 * @minblocks: minimum extent block count
4867 * 4863 *
4868 * ext4_trim_all_free walks through group's buddy bitmap searching for free 4864 * ext4_trim_all_free walks through group's buddy bitmap searching for free
4869 * extents. When the free block is found, ext4_trim_extent is called to TRIM 4865 * extents. When the free block is found, ext4_trim_extent is called to TRIM
4870 * the extent. 4866 * the extent.
4871 * 4867 *
4872 * 4868 *
4873 * ext4_trim_all_free walks through group's block bitmap searching for free 4869 * ext4_trim_all_free walks through group's block bitmap searching for free
4874 * extents. When the free extent is found, mark it as used in group buddy 4870 * extents. When the free extent is found, mark it as used in group buddy
4875 * bitmap. Then issue a TRIM command on this extent and free the extent in 4871 * bitmap. Then issue a TRIM command on this extent and free the extent in
4876 * the group buddy bitmap. This is done until whole group is scanned. 4872 * the group buddy bitmap. This is done until whole group is scanned.
4877 */ 4873 */
4878 static ext4_grpblk_t 4874 static ext4_grpblk_t
4879 ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b, 4875 ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
4880 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) 4876 ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
4881 { 4877 {
4882 void *bitmap; 4878 void *bitmap;
4883 ext4_grpblk_t next, count = 0; 4879 ext4_grpblk_t next, count = 0;
4884 ext4_group_t group; 4880 ext4_group_t group;
4885 4881
4886 BUG_ON(e4b == NULL); 4882 BUG_ON(e4b == NULL);
4887 4883
4888 bitmap = e4b->bd_bitmap; 4884 bitmap = e4b->bd_bitmap;
4889 group = e4b->bd_group; 4885 group = e4b->bd_group;
4890 start = (e4b->bd_info->bb_first_free > start) ? 4886 start = (e4b->bd_info->bb_first_free > start) ?
4891 e4b->bd_info->bb_first_free : start; 4887 e4b->bd_info->bb_first_free : start;
4892 ext4_lock_group(sb, group); 4888 ext4_lock_group(sb, group);
4893 4889
4894 while (start < max) { 4890 while (start < max) {
4895 start = mb_find_next_zero_bit(bitmap, max, start); 4891 start = mb_find_next_zero_bit(bitmap, max, start);
4896 if (start >= max) 4892 if (start >= max)
4897 break; 4893 break;
4898 next = mb_find_next_bit(bitmap, max, start); 4894 next = mb_find_next_bit(bitmap, max, start);
4899 4895
4900 if ((next - start) >= minblocks) { 4896 if ((next - start) >= minblocks) {
4901 ext4_trim_extent(sb, start, 4897 ext4_trim_extent(sb, start,
4902 next - start, group, e4b); 4898 next - start, group, e4b);
4903 count += next - start; 4899 count += next - start;
4904 } 4900 }
4905 start = next + 1; 4901 start = next + 1;
4906 4902
4907 if (fatal_signal_pending(current)) { 4903 if (fatal_signal_pending(current)) {
4908 count = -ERESTARTSYS; 4904 count = -ERESTARTSYS;
4909 break; 4905 break;
4910 } 4906 }
4911 4907
4912 if (need_resched()) { 4908 if (need_resched()) {
4913 ext4_unlock_group(sb, group); 4909 ext4_unlock_group(sb, group);
4914 cond_resched(); 4910 cond_resched();
4915 ext4_lock_group(sb, group); 4911 ext4_lock_group(sb, group);
4916 } 4912 }
4917 4913
4918 if ((e4b->bd_info->bb_free - count) < minblocks) 4914 if ((e4b->bd_info->bb_free - count) < minblocks)
4919 break; 4915 break;
4920 } 4916 }
4921 ext4_unlock_group(sb, group); 4917 ext4_unlock_group(sb, group);
4922 4918
4923 ext4_debug("trimmed %d blocks in the group %d\n", 4919 ext4_debug("trimmed %d blocks in the group %d\n",
4924 count, group); 4920 count, group);
4925 4921
4926 return count; 4922 return count;
4927 } 4923 }
4928 4924
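The scan in ext4_trim_all_free() alternates between "next zero bit" (start of a free run) and "next set bit" (end of that run), trimming only runs of at least minblocks. A standalone sketch of the same scan over a small byte-array bitmap, with a linear bit search standing in for mb_find_next_zero_bit()/mb_find_next_bit():

#include <stdio.h>

#define MAX_BITS 32

static int get_bit(const unsigned char *map, int nr)
{
	return (map[nr / 8] >> (nr % 8)) & 1;
}

/* Return the first bit >= start whose value is 'val', or max if none. */
static int find_next_bit_val(const unsigned char *map, int max, int start, int val)
{
	while (start < max && get_bit(map, start) != val)
		start++;
	return start;
}

int main(void)
{
	/* 1 = in use, 0 = free; bits 0..31, low bit first within each byte */
	unsigned char bitmap[MAX_BITS / 8] = { 0x0f, 0x00, 0xff, 0xf0 };
	int start = 0, minblocks = 4, trimmed = 0;

	while (start < MAX_BITS) {
		int run_start = find_next_bit_val(bitmap, MAX_BITS, start, 0);
		int run_end;

		if (run_start >= MAX_BITS)
			break;
		run_end = find_next_bit_val(bitmap, MAX_BITS, run_start, 1);
		if (run_end - run_start >= minblocks) {
			printf("trim free run [%d, %d)\n", run_start, run_end);
			trimmed += run_end - run_start;
		}
		start = run_end + 1;
	}
	printf("trimmed %d blocks\n", trimmed);
	return 0;
}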
4929 /** 4925 /**
4930 * ext4_trim_fs() -- trim ioctl handle function 4926 * ext4_trim_fs() -- trim ioctl handle function
4931 * @sb: superblock for filesystem 4927 * @sb: superblock for filesystem
4932 * @range: fstrim_range structure 4928 * @range: fstrim_range structure
4933 * 4929 *
4934 * start: First Byte to trim 4930 * start: First Byte to trim
4935 * len: number of Bytes to trim from start 4931 * len: number of Bytes to trim from start
4936 * minlen: minimum extent length in Bytes 4932 * minlen: minimum extent length in Bytes
4937 * ext4_trim_fs goes through all allocation groups containing Bytes from 4933 * ext4_trim_fs goes through all allocation groups containing Bytes from
4938 * start to start+len. For each such group, the ext4_trim_all_free function 4934 * start to start+len. For each such group, the ext4_trim_all_free function
4939 * is invoked to trim all free space. 4935 * is invoked to trim all free space.
4940 */ 4936 */
4941 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) 4937 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4942 { 4938 {
4943 struct ext4_buddy e4b; 4939 struct ext4_buddy e4b;
4944 ext4_group_t first_group, last_group; 4940 ext4_group_t first_group, last_group;
4945 ext4_group_t group, ngroups = ext4_get_groups_count(sb); 4941 ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4946 ext4_grpblk_t cnt = 0, first_block, last_block; 4942 ext4_grpblk_t cnt = 0, first_block, last_block;
4947 uint64_t start, len, minlen, trimmed; 4943 uint64_t start, len, minlen, trimmed;
4948 ext4_fsblk_t first_data_blk = 4944 ext4_fsblk_t first_data_blk =
4949 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 4945 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4950 int ret = 0; 4946 int ret = 0;
4951 4947
4952 start = range->start >> sb->s_blocksize_bits; 4948 start = range->start >> sb->s_blocksize_bits;
4953 len = range->len >> sb->s_blocksize_bits; 4949 len = range->len >> sb->s_blocksize_bits;
4954 minlen = range->minlen >> sb->s_blocksize_bits; 4950 minlen = range->minlen >> sb->s_blocksize_bits;
4955 trimmed = 0; 4951 trimmed = 0;
4956 4952
4957 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb))) 4953 if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4958 return -EINVAL; 4954 return -EINVAL;
4959 if (start < first_data_blk) { 4955 if (start < first_data_blk) {
4960 len -= first_data_blk - start; 4956 len -= first_data_blk - start;
4961 start = first_data_blk; 4957 start = first_data_blk;
4962 } 4958 }
4963 4959
4964 /* Determine first and last group to examine based on start and len */ 4960 /* Determine first and last group to examine based on start and len */
4965 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, 4961 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4966 &first_group, &first_block); 4962 &first_group, &first_block);
4967 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len), 4963 ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4968 &last_group, &last_block); 4964 &last_group, &last_block);
4969 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group; 4965 last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4970 last_block = EXT4_BLOCKS_PER_GROUP(sb); 4966 last_block = EXT4_BLOCKS_PER_GROUP(sb);
4971 4967
4972 if (first_group > last_group) 4968 if (first_group > last_group)
4973 return -EINVAL; 4969 return -EINVAL;
4974 4970
4975 for (group = first_group; group <= last_group; group++) { 4971 for (group = first_group; group <= last_group; group++) {
4976 ret = ext4_mb_load_buddy(sb, group, &e4b); 4972 ret = ext4_mb_load_buddy(sb, group, &e4b);
4977 if (ret) { 4973 if (ret) {
4978 ext4_error(sb, "Error in loading buddy " 4974 ext4_error(sb, "Error in loading buddy "
4979 "information for %u", group); 4975 "information for %u", group);
4980 break; 4976 break;
4981 } 4977 }
4982 4978
4983 /* 4979 /*
4984 * For all the groups except the last one, last block will 4980 * For all the groups except the last one, last block will
4985 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to 4981 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
4986 * change it for the last group in which case start + 4982 * change it for the last group in which case start +
4987 * len < EXT4_BLOCKS_PER_GROUP(sb). 4983 * len < EXT4_BLOCKS_PER_GROUP(sb).
4988 */ 4984 */
4989 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb)) 4985 if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
4990 last_block = first_block + len; 4986 last_block = first_block + len;
4991 len -= last_block - first_block; 4987 len -= last_block - first_block;
4992 4988
4993 if (e4b.bd_info->bb_free >= minlen) { 4989 if (e4b.bd_info->bb_free >= minlen) {
4994 cnt = ext4_trim_all_free(sb, &e4b, first_block, 4990 cnt = ext4_trim_all_free(sb, &e4b, first_block,
4995 last_block, minlen); 4991 last_block, minlen);
4996 if (cnt < 0) { 4992 if (cnt < 0) {
4997 ret = cnt; 4993 ret = cnt;
4998 ext4_mb_unload_buddy(&e4b); 4994 ext4_mb_unload_buddy(&e4b);
4999 break; 4995 break;
5000 } 4996 }
5001 } 4997 }
5002 ext4_mb_unload_buddy(&e4b); 4998 ext4_mb_unload_buddy(&e4b);
5003 trimmed += cnt; 4999 trimmed += cnt;
5004 first_block = 0; 5000 first_block = 0;
5005 } 5001 }
5006 range->len = trimmed * sb->s_blocksize; 5002 range->len = trimmed * sb->s_blocksize;
5007 5003
5008 return ret; 5004 return ret;
5009 } 5005 }
5010 5006
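ext4_trim_fs() is reached through the FITRIM ioctl, whose byte-based start/len/minlen fields are exactly what the conversions at the top of the function operate on. A minimal userspace caller, roughly what fstrim(8) does:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fstrim_range range;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&range, 0, sizeof(range));
	range.start = 0;		/* first byte to trim */
	range.len = (__u64)-1;		/* whole filesystem */
	range.minlen = 0;		/* no minimum extent length */

	if (ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		close(fd);
		return 1;
	}
	/* On return the kernel reports how many bytes were actually trimmed. */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}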