Commit d1f5273e9adb40724a85272f248f210dc4ce919a
Committed by
Theodore Ts'o
1 parent
6a8a13e038
Exists in
master
and in
20 other branches
ext4: return 32/64-bit dir name hash according to usage type
Traditionally ext2/3/4 has returned a 32-bit hash value from llseek() to appease NFSv2, which can only handle a 32-bit cookie for seekdir() and telldir(). However, this causes problems if there are 32-bit hash collisions, since the NFSv2 server can get stuck resending the same entries from the directory repeatedly.

Allow ext4 to return a full 64-bit hash (both major and minor) for telldir to decrease the chance of hash collisions. This still needs integration on the NFS side.

Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
(blame me if something is not correct)

Signed-off-by: Fan Yong <yong.fan@whamcloud.com>
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Showing 3 changed files with 176 additions and 48 deletions Side-by-side Diff
fs/ext4/dir.c
... | ... | @@ -32,25 +32,9 @@ |
32 | 32 | DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK |
33 | 33 | }; |
34 | 34 | |
35 | -static int ext4_readdir(struct file *, void *, filldir_t); | |
36 | 35 | static int ext4_dx_readdir(struct file *filp, |
37 | 36 | void *dirent, filldir_t filldir); |
38 | -static int ext4_release_dir(struct inode *inode, | |
39 | - struct file *filp); | |
40 | 37 | |
41 | -const struct file_operations ext4_dir_operations = { | |
42 | - .llseek = ext4_llseek, | |
43 | - .read = generic_read_dir, | |
44 | - .readdir = ext4_readdir, /* we take BKL. needed?*/ | |
45 | - .unlocked_ioctl = ext4_ioctl, | |
46 | -#ifdef CONFIG_COMPAT | |
47 | - .compat_ioctl = ext4_compat_ioctl, | |
48 | -#endif | |
49 | - .fsync = ext4_sync_file, | |
50 | - .release = ext4_release_dir, | |
51 | -}; | |
52 | - | |
53 | - | |
54 | 38 | static unsigned char get_dtype(struct super_block *sb, int filetype) |
55 | 39 | { |
56 | 40 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) || |
... | ... | @@ -60,6 +44,26 @@ |
60 | 44 | return (ext4_filetype_table[filetype]); |
61 | 45 | } |
62 | 46 | |
47 | +/** | |
48 | + * Check if the given dir-inode refers to an htree-indexed directory | |
49 | + * (or a directory which could potentially get converted to use htree | |
50 | + * indexing). | |
51 | + * | |
52 | + * Return 1 if it is a dx dir, 0 if not | |
53 | + */ | |
54 | +static int is_dx_dir(struct inode *inode) | |
55 | +{ | |
56 | + struct super_block *sb = inode->i_sb; | |
57 | + | |
58 | + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, | |
59 | + EXT4_FEATURE_COMPAT_DIR_INDEX) && | |
60 | + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || | |
61 | + ((inode->i_size >> sb->s_blocksize_bits) == 1))) | |
62 | + return 1; | |
63 | + | |
64 | + return 0; | |
65 | +} | |
66 | + | |
63 | 67 | /* |
64 | 68 | * Return 0 if the directory entry is OK, and 1 if there is a problem |
65 | 69 | * |
66 | 70 | |
67 | 71 | |
... | ... | @@ -115,18 +119,13 @@ |
115 | 119 | unsigned int offset; |
116 | 120 | int i, stored; |
117 | 121 | struct ext4_dir_entry_2 *de; |
118 | - struct super_block *sb; | |
119 | 122 | int err; |
120 | 123 | struct inode *inode = filp->f_path.dentry->d_inode; |
124 | + struct super_block *sb = inode->i_sb; | |
121 | 125 | int ret = 0; |
122 | 126 | int dir_has_error = 0; |
123 | 127 | |
124 | - sb = inode->i_sb; | |
125 | - | |
126 | - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, | |
127 | - EXT4_FEATURE_COMPAT_DIR_INDEX) && | |
128 | - ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) || | |
129 | - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { | |
128 | + if (is_dx_dir(inode)) { | |
130 | 129 | err = ext4_dx_readdir(filp, dirent, filldir); |
131 | 130 | if (err != ERR_BAD_DX_DIR) { |
132 | 131 | ret = err; |
133 | 132 | |
134 | 133 | |
135 | 134 | |
136 | 135 | |
137 | 136 | |
... | ... | @@ -254,24 +253,136 @@ |
254 | 253 | return ret; |
255 | 254 | } |
256 | 255 | |
256 | +static inline int is_32bit_api(void) | |
257 | +{ | |
258 | +#ifdef CONFIG_COMPAT | |
259 | + return is_compat_task(); | |
260 | +#else | |
261 | + return (BITS_PER_LONG == 32); | |
262 | +#endif | |
263 | +} | |
264 | + | |
257 | 265 | /* |
258 | 266 | * These functions convert from the major/minor hash to an f_pos |
259 | - * value. | |
267 | + * value for dx directories | |
260 | 268 | * |
261 | - * Currently we only use major hash numer. This is unfortunate, but | |
262 | - * on 32-bit machines, the same VFS interface is used for lseek and | |
263 | - * llseek, so if we use the 64 bit offset, then the 32-bit versions of | |
264 | - * lseek/telldir/seekdir will blow out spectacularly, and from within | |
265 | - * the ext2 low-level routine, we don't know if we're being called by | |
266 | - * a 64-bit version of the system call or the 32-bit version of the | |
267 | - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir | |
268 | - * cookie. Sigh. | |
269 | + * Upper layer (for example NFS) should specify FMODE_32BITHASH or | |
270 | + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted | |
271 | + * directly on both 32-bit and 64-bit nodes, under such case, neither | |
272 | + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. | |
269 | 273 | */ |
270 | -#define hash2pos(major, minor) (major >> 1) | |
271 | -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) | |
272 | -#define pos2min_hash(pos) (0) | |
274 | +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) | |
275 | +{ | |
276 | + if ((filp->f_mode & FMODE_32BITHASH) || | |
277 | + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) | |
278 | + return major >> 1; | |
279 | + else | |
280 | + return ((__u64)(major >> 1) << 32) | (__u64)minor; | |
281 | +} | |
273 | 282 | |
283 | +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) | |
284 | +{ | |
285 | + if ((filp->f_mode & FMODE_32BITHASH) || | |
286 | + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) | |
287 | + return (pos << 1) & 0xffffffff; | |
288 | + else | |
289 | + return ((pos >> 32) << 1) & 0xffffffff; | |
290 | +} | |
291 | + | |
292 | +static inline __u32 pos2min_hash(struct file *filp, loff_t pos) | |
293 | +{ | |
294 | + if ((filp->f_mode & FMODE_32BITHASH) || | |
295 | + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) | |
296 | + return 0; | |
297 | + else | |
298 | + return pos & 0xffffffff; | |
299 | +} | |
300 | + | |
274 | 301 | /* |
302 | + * Return 32- or 64-bit end-of-file for dx directories | |
303 | + */ | |
304 | +static inline loff_t ext4_get_htree_eof(struct file *filp) | |
305 | +{ | |
306 | + if ((filp->f_mode & FMODE_32BITHASH) || | |
307 | + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) | |
308 | + return EXT4_HTREE_EOF_32BIT; | |
309 | + else | |
310 | + return EXT4_HTREE_EOF_64BIT; | |
311 | +} | |
312 | + | |
313 | + | |
314 | +/* | |
315 | + * ext4_dir_llseek() based on generic_file_llseek() to handle both | |
316 | + * non-htree and htree directories, where the "offset" is in terms | |
317 | + * of the filename hash value instead of the byte offset. | |
318 | + * | |
319 | + * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX) | |
320 | + * will be invalid once the directory was converted into a dx directory | |
321 | + */ | |
322 | +loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin) | |
323 | +{ | |
324 | + struct inode *inode = file->f_mapping->host; | |
325 | + loff_t ret = -EINVAL; | |
326 | + int dx_dir = is_dx_dir(inode); | |
327 | + | |
328 | + mutex_lock(&inode->i_mutex); | |
329 | + | |
330 | + /* NOTE: relative offsets with dx directories might not work | |
331 | + * as expected, as it is difficult to figure out the | |
332 | + * correct offset between dx hashes */ | |
333 | + | |
334 | + switch (origin) { | |
335 | + case SEEK_END: | |
336 | + if (unlikely(offset > 0)) | |
337 | + goto out_err; /* not supported for directories */ | |
338 | + | |
339 | + /* so only negative offsets are left, does that have a | |
340 | + * meaning for directories at all? */ | |
341 | + if (dx_dir) | |
342 | + offset += ext4_get_htree_eof(file); | |
343 | + else | |
344 | + offset += inode->i_size; | |
345 | + break; | |
346 | + case SEEK_CUR: | |
347 | + /* | |
348 | + * Here we special-case the lseek(fd, 0, SEEK_CUR) | |
349 | + * position-querying operation. Avoid rewriting the "same" | |
350 | + * f_pos value back to the file because a concurrent read(), | |
351 | + * write() or lseek() might have altered it | |
352 | + */ | |
353 | + if (offset == 0) { | |
354 | + offset = file->f_pos; | |
355 | + goto out_ok; | |
356 | + } | |
357 | + | |
358 | + offset += file->f_pos; | |
359 | + break; | |
360 | + } | |
361 | + | |
362 | + if (unlikely(offset < 0)) | |
363 | + goto out_err; | |
364 | + | |
365 | + if (!dx_dir) { | |
366 | + if (offset > inode->i_sb->s_maxbytes) | |
367 | + goto out_err; | |
368 | + } else if (offset > ext4_get_htree_eof(file)) | |
369 | + goto out_err; | |
370 | + | |
371 | + /* Special lock needed here? */ | |
372 | + if (offset != file->f_pos) { | |
373 | + file->f_pos = offset; | |
374 | + file->f_version = 0; | |
375 | + } | |
376 | + | |
377 | +out_ok: | |
378 | + ret = offset; | |
379 | +out_err: | |
380 | + mutex_unlock(&inode->i_mutex); | |
381 | + | |
382 | + return ret; | |
383 | +} | |
384 | + | |
385 | +/* | |
275 | 386 | * This structure holds the nodes of the red-black tree used to store |
276 | 387 | * the directory entry in hash order. |
277 | 388 | */ |
278 | 389 | |
... | ... | @@ -330,15 +441,16 @@ |
330 | 441 | } |
331 | 442 | |
332 | 443 | |
333 | -static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) | |
444 | +static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, | |
445 | + loff_t pos) | |
334 | 446 | { |
335 | 447 | struct dir_private_info *p; |
336 | 448 | |
337 | 449 | p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); |
338 | 450 | if (!p) |
339 | 451 | return NULL; |
340 | - p->curr_hash = pos2maj_hash(pos); | |
341 | - p->curr_minor_hash = pos2min_hash(pos); | |
452 | + p->curr_hash = pos2maj_hash(filp, pos); | |
453 | + p->curr_minor_hash = pos2min_hash(filp, pos); | |
342 | 454 | return p; |
343 | 455 | } |
344 | 456 | |
... | ... | @@ -429,7 +541,7 @@ |
429 | 541 | "null fname?!?\n"); |
430 | 542 | return 0; |
431 | 543 | } |
432 | - curr_pos = hash2pos(fname->hash, fname->minor_hash); | |
544 | + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash); | |
433 | 545 | while (fname) { |
434 | 546 | error = filldir(dirent, fname->name, |
435 | 547 | fname->name_len, curr_pos, |
436 | 548 | |
... | ... | @@ -454,13 +566,13 @@ |
454 | 566 | int ret; |
455 | 567 | |
456 | 568 | if (!info) { |
457 | - info = ext4_htree_create_dir_info(filp->f_pos); | |
569 | + info = ext4_htree_create_dir_info(filp, filp->f_pos); | |
458 | 570 | if (!info) |
459 | 571 | return -ENOMEM; |
460 | 572 | filp->private_data = info; |
461 | 573 | } |
462 | 574 | |
463 | - if (filp->f_pos == EXT4_HTREE_EOF) | |
575 | + if (filp->f_pos == ext4_get_htree_eof(filp)) | |
464 | 576 | return 0; /* EOF */ |
465 | 577 | |
466 | 578 | /* Some one has messed with f_pos; reset the world */ |
... | ... | @@ -468,8 +580,8 @@ |
468 | 580 | free_rb_tree_fname(&info->root); |
469 | 581 | info->curr_node = NULL; |
470 | 582 | info->extra_fname = NULL; |
471 | - info->curr_hash = pos2maj_hash(filp->f_pos); | |
472 | - info->curr_minor_hash = pos2min_hash(filp->f_pos); | |
583 | + info->curr_hash = pos2maj_hash(filp, filp->f_pos); | |
584 | + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); | |
473 | 585 | } |
474 | 586 | |
475 | 587 | /* |
... | ... | @@ -501,7 +613,7 @@ |
501 | 613 | if (ret < 0) |
502 | 614 | return ret; |
503 | 615 | if (ret == 0) { |
504 | - filp->f_pos = EXT4_HTREE_EOF; | |
616 | + filp->f_pos = ext4_get_htree_eof(filp); | |
505 | 617 | break; |
506 | 618 | } |
507 | 619 | info->curr_node = rb_first(&info->root); |
... | ... | @@ -521,7 +633,7 @@ |
521 | 633 | info->curr_minor_hash = fname->minor_hash; |
522 | 634 | } else { |
523 | 635 | if (info->next_hash == ~0) { |
524 | - filp->f_pos = EXT4_HTREE_EOF; | |
636 | + filp->f_pos = ext4_get_htree_eof(filp); | |
525 | 637 | break; |
526 | 638 | } |
527 | 639 | info->curr_hash = info->next_hash; |
... | ... | @@ -540,4 +652,16 @@ |
540 | 652 | |
541 | 653 | return 0; |
542 | 654 | } |
655 | + | |
656 | +const struct file_operations ext4_dir_operations = { | |
657 | + .llseek = ext4_dir_llseek, | |
658 | + .read = generic_read_dir, | |
659 | + .readdir = ext4_readdir, | |
660 | + .unlocked_ioctl = ext4_ioctl, | |
661 | +#ifdef CONFIG_COMPAT | |
662 | + .compat_ioctl = ext4_compat_ioctl, | |
663 | +#endif | |
664 | + .fsync = ext4_sync_file, | |
665 | + .release = ext4_release_dir, | |
666 | +}; |
fs/ext4/ext4.h
... | ... | @@ -1612,7 +1612,11 @@ |
1612 | 1612 | u32 *seed; |
1613 | 1613 | }; |
1614 | 1614 | |
1615 | -#define EXT4_HTREE_EOF 0x7fffffff | |
1615 | + | |
1616 | +/* 32 and 64 bit signed EOF for dx directories */ | |
1617 | +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) | |
1618 | +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) | |
1619 | + | |
1616 | 1620 | |
1617 | 1621 | /* |
1618 | 1622 | * Control parameters used by ext4_htree_next_block |
fs/ext4/hash.c
... | ... | @@ -200,8 +200,8 @@ |
200 | 200 | return -1; |
201 | 201 | } |
202 | 202 | hash = hash & ~1; |
203 | - if (hash == (EXT4_HTREE_EOF << 1)) | |
204 | - hash = (EXT4_HTREE_EOF-1) << 1; | |
203 | + if (hash == (EXT4_HTREE_EOF_32BIT << 1)) | |
204 | + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1; | |
205 | 205 | hinfo->hash = hash; |
206 | 206 | hinfo->minor_hash = minor_hash; |
207 | 207 | return 0; |