Commit 6b82021b9e91cd689fdffadbcdb9a42597bbe764
Committed by
Joel Becker
1 parent
73c8a80003
Exists in
master
and in
4 other branches
ocfs2: increase the default size of local alloc windows
I have observed that the current size of 8M gives us pretty poor fragmentation on multi-threaded workloads which do lots of writes. Generally, I can increase the size of local alloc windows and observe a marked decrease in fragmentation, even up to and beyond window sizes of 512 megabytes. This makes sense for a couple of reasons: a larger local alloc means more room for reservation windows, and on multi-node workloads the larger local alloc helps as well because we don't have to do window slides as often.

Also, I removed the OCFS2_DEFAULT_LOCAL_ALLOC_SIZE constant as it is no longer used and the comment above it was out of date.

To test fragmentation, I used a workload which launched 4 threads that did 4k writes into a series of about 140 alternating files. With resv_level=2, and a 4k/4k file system I observed the following average fragmentation for various localalloc= parameters:

localalloc=    avg. fragmentation
8              48
32             16
64             10
120            7

On larger cluster sizes, the difference is more dramatic.

The new default sizes top out at 256M, which we'll only get for cluster sizes of 32K and above.

Signed-off-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
Showing 5 changed files with 118 additions and 11 deletions Side-by-side Diff
fs/ocfs2/localalloc.c
... | ... | @@ -75,10 +75,120 @@ |
75 | 75 | static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, |
76 | 76 | struct inode *local_alloc_inode); |
77 | 77 | |
78 | +/* | |
79 | + * ocfs2_la_default_mb() - determine a default size, in megabytes, of | |
80 | + * the local alloc. Returns that default size in megabytes. | |
81 | + * | |
82 | + * Generally, we'd like to pick as large a local alloc as | |
83 | + * possible. Performance on large workloads tends to scale | |
84 | + * proportionally to la size. In addition to that, the reservations | |
85 | + * code functions more efficiently as it can reserve more windows for | |
86 | + * write. | |
87 | + * | |
88 | + * Some things work against us when trying to choose a large local alloc: | |
89 | + * | |
90 | + * - We need to ensure our sizing is picked to leave enough space in | |
91 | + * group descriptors for other allocations (such as block groups, | |
92 | + * etc). Picking default sizes which are a multiple of 4 could help | |
93 | + * - block groups are allocated in 2mb and 4mb chunks. | |
94 | + * | |
95 | + * - Likewise, we don't want to starve other nodes of bits on small | |
96 | + * file systems. This can easily be taken care of by limiting our | |
97 | + * default to a reasonable size (256M) on larger cluster sizes. | |
98 | + * | |
99 | + * - Some file systems can't support very large sizes - 4k and 8k in | |
100 | + * particular are limited to less than 128 and 256 megabytes respectively. | |
101 | + * | |
102 | + * The following reference table shows group descriptor and local | |
103 | + * alloc maximums at various cluster sizes (4k blocksize) | |
104 | + * | |
105 | + * csize: 4K group: 126M la: 121M | |
106 | + * csize: 8K group: 252M la: 243M | |
107 | + * csize: 16K group: 504M la: 486M | |
108 | + * csize: 32K group: 1008M la: 972M | |
109 | + * csize: 64K group: 2016M la: 1944M | |
110 | + * csize: 128K group: 4032M la: 3888M | |
111 | + * csize: 256K group: 8064M la: 7776M | |
112 | + * csize: 512K group: 16128M la: 15552M | |
113 | + * csize: 1024K group: 32256M la: 31104M | |
114 | + */ | |
115 | +#define OCFS2_LA_MAX_DEFAULT_MB 256 | |
116 | +#define OCFS2_LA_OLD_DEFAULT 8 | |
117 | +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb) | |
118 | +{ | |
119 | + unsigned int la_mb; | |
120 | + unsigned int gd_mb; | |
121 | + unsigned int megs_per_slot; | |
122 | + struct super_block *sb = osb->sb; | |
123 | + | |
124 | + gd_mb = ocfs2_clusters_to_megabytes(sb, | |
125 | + 8 * ocfs2_group_bitmap_size(sb)); | |
126 | + | |
127 | + /* | |
128 | + * This takes care of file systems with very small group | |
129 | + * descriptors - 512 byte blocksize at cluster sizes lower | |
130 | + * than 16K and also 1k blocksize with 4k cluster size. | |
131 | + */ | |
132 | + if ((sb->s_blocksize == 512 && osb->s_clustersize <= 8192) | |
133 | + || (sb->s_blocksize == 1024 && osb->s_clustersize == 4096)) | |
134 | + return OCFS2_LA_OLD_DEFAULT; | |
135 | + | |
136 | + /* | |
137 | + * Leave enough room for some block groups and make the final | |
138 | + * value we work from a multiple of 4 (clear the low two bits). | |
139 | + */ | |
140 | + gd_mb -= 16; | |
141 | + gd_mb &= 0xFFFFFFFC; | |
142 | + | |
143 | + la_mb = gd_mb; | |
144 | + | |
145 | + /* | |
146 | + * Keep window sizes down to a reasonable default | |
147 | + */ | |
148 | + if (la_mb > OCFS2_LA_MAX_DEFAULT_MB) { | |
149 | + /* | |
150 | + * Some clustersize / blocksize combinations will have | |
151 | + * given us a larger than OCFS2_LA_MAX_DEFAULT_MB | |
152 | + * default size, but get poor distribution when | |
153 | + * limited to exactly 256 megabytes. | |
154 | + * | |
155 | + * As an example, 16K clustersize at 4K blocksize | |
156 | + * gives us a cluster group size of 504M. Paring the | |
157 | + * local alloc size down to 256 however, would give us | |
158 | + * only one window and around 200MB left in the | |
159 | + * cluster group. Instead, find the first size below | |
160 | + * 256 which would give us an even distribution. | |
161 | + * | |
162 | + * Larger cluster group sizes actually work out pretty | |
163 | + * well when pared to 256, so we don't have to do this | |
164 | + * for any group that fits more than two | |
165 | + * OCFS2_LA_MAX_DEFAULT_MB windows. | |
166 | + */ | |
167 | + if (gd_mb > (2 * OCFS2_LA_MAX_DEFAULT_MB)) | |
168 | + la_mb = OCFS2_LA_MAX_DEFAULT_MB; | |
169 | + else { | |
170 | + unsigned int gd_mult = gd_mb; | |
171 | + | |
172 | + while (gd_mult > OCFS2_LA_MAX_DEFAULT_MB) | |
173 | + gd_mult = gd_mult >> 1; | |
174 | + | |
175 | + la_mb = gd_mult; | |
176 | + } | |
177 | + } | |
178 | + | |
179 | + megs_per_slot = osb->osb_clusters_at_boot / osb->max_slots; | |
180 | + megs_per_slot = ocfs2_clusters_to_megabytes(sb, megs_per_slot); | |
181 | + /* Too many nodes, too few disk clusters. */ | |
182 | + if (megs_per_slot < la_mb) | |
183 | + la_mb = megs_per_slot; | |
184 | + | |
185 | + return la_mb; | |
186 | +} | |
187 | + | |
78 | 188 | void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb) |
79 | 189 | { |
80 | 190 | struct super_block *sb = osb->sb; |
81 | - unsigned int la_default_mb = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; | |
191 | + unsigned int la_default_mb = ocfs2_la_default_mb(osb); | |
82 | 192 | unsigned int la_max_mb; |
83 | 193 | |
84 | 194 | la_max_mb = ocfs2_clusters_to_megabytes(sb, |
... | ... | @@ -185,7 +295,7 @@ |
185 | 295 | osb->local_alloc_bits, (osb->bitmap_cpg - 1)); |
186 | 296 | osb->local_alloc_bits = |
187 | 297 | ocfs2_megabytes_to_clusters(osb->sb, |
188 | - OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); | |
298 | + ocfs2_la_default_mb(osb)); | |
189 | 299 | } |
190 | 300 | |
191 | 301 | /* read the alloc off disk */ |
fs/ocfs2/localalloc.h
... | ... | @@ -31,6 +31,7 @@ |
31 | 31 | void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb); |
32 | 32 | |
33 | 33 | void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb); |
34 | +unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb); | |
34 | 35 | |
35 | 36 | int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, |
36 | 37 | int node_num, |
fs/ocfs2/ocfs2.h
... | ... | @@ -342,6 +342,9 @@ |
342 | 342 | */ |
343 | 343 | unsigned int local_alloc_bits; |
344 | 344 | unsigned int local_alloc_default_bits; |
345 | + /* osb_clusters_at_boot can become stale! Do not trust it to | |
346 | + * be up to date. */ | |
347 | + unsigned int osb_clusters_at_boot; | |
345 | 348 | |
346 | 349 | enum ocfs2_local_alloc_state local_alloc_state; /* protected |
347 | 350 | * by osb_lock */ |
fs/ocfs2/ocfs2_fs.h
... | ... | @@ -283,14 +283,6 @@ |
283 | 283 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) |
284 | 284 | |
285 | 285 | /* |
286 | - * Default local alloc size (in megabytes) | |
287 | - * | |
288 | - * The value chosen should be such that most allocations, including new | |
289 | - * block groups, use local alloc. | |
290 | - */ | |
291 | -#define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 | |
292 | - | |
293 | -/* | |
294 | 286 | * Inline extended attribute size (in bytes) |
295 | 287 | * The value chosen should be aligned to 16 byte boundaries. |
296 | 288 | */ |
fs/ocfs2/super.c
... | ... | @@ -1503,7 +1503,7 @@ |
1503 | 1503 | (unsigned) (osb->osb_commit_interval / HZ)); |
1504 | 1504 | |
1505 | 1505 | local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); |
1506 | - if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) | |
1506 | + if (local_alloc_megs != ocfs2_la_default_mb(osb)) | |
1507 | 1507 | seq_printf(s, ",localalloc=%d", local_alloc_megs); |
1508 | 1508 | |
1509 | 1509 | if (opts & OCFS2_MOUNT_LOCALFLOCKS) |
... | ... | @@ -2251,6 +2251,7 @@ |
2251 | 2251 | } |
2252 | 2252 | |
2253 | 2253 | osb->bitmap_blkno = OCFS2_I(inode)->ip_blkno; |
2254 | + osb->osb_clusters_at_boot = OCFS2_I(inode)->ip_clusters; | |
2254 | 2255 | iput(inode); |
2255 | 2256 | |
2256 | 2257 | osb->bitmap_cpg = ocfs2_group_bitmap_size(sb) * 8; |