Commit d1f5273e9adb40724a85272f248f210dc4ce919a

Authored by Fan Yong
Committed by Theodore Ts'o
1 parent 6a8a13e038

ext4: return 32/64-bit dir name hash according to usage type

Traditionally ext2/3/4 has returned a 32-bit hash value from llseek()
to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
and telldir().  However, this causes problems if there are 32-bit hash
collisions, since the NFSv2 server can get stuck resending the same
entries from the directory repeatedly.

Allow ext4 to return a full 64-bit hash (both major and minor) for
telldir to decrease the chance of hash collisions.  This still needs
integration on the NFS side.

Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
(blame me if something is not correct)

Signed-off-by: Fan Yong <yong.fan@whamcloud.com>
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Signed-off-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

Showing 3 changed files with 176 additions and 48 deletions Side-by-side Diff

... ... @@ -32,25 +32,9 @@
32 32 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
33 33 };
34 34  
35   -static int ext4_readdir(struct file *, void *, filldir_t);
36 35 static int ext4_dx_readdir(struct file *filp,
37 36 void *dirent, filldir_t filldir);
38   -static int ext4_release_dir(struct inode *inode,
39   - struct file *filp);
40 37  
41   -const struct file_operations ext4_dir_operations = {
42   - .llseek = ext4_llseek,
43   - .read = generic_read_dir,
44   - .readdir = ext4_readdir, /* we take BKL. needed?*/
45   - .unlocked_ioctl = ext4_ioctl,
46   -#ifdef CONFIG_COMPAT
47   - .compat_ioctl = ext4_compat_ioctl,
48   -#endif
49   - .fsync = ext4_sync_file,
50   - .release = ext4_release_dir,
51   -};
52   -
53   -
54 38 static unsigned char get_dtype(struct super_block *sb, int filetype)
55 39 {
56 40 if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
... ... @@ -60,6 +44,26 @@
60 44 return (ext4_filetype_table[filetype]);
61 45 }
62 46  
  47 +/**
  48 + * Check if the given dir-inode refers to an htree-indexed directory
  49 + * (or a directory which could potentially get converted to use htree
  50 + * indexing).
  51 + *
  52 + * Return 1 if it is a dx dir, 0 if not
  53 + */
  54 +static int is_dx_dir(struct inode *inode)
  55 +{
  56 + struct super_block *sb = inode->i_sb;
  57 +
  58 + if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
  59 + EXT4_FEATURE_COMPAT_DIR_INDEX) &&
  60 + ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
  61 + ((inode->i_size >> sb->s_blocksize_bits) == 1)))
  62 + return 1;
  63 +
  64 + return 0;
  65 +}
  66 +
63 67 /*
64 68 * Return 0 if the directory entry is OK, and 1 if there is a problem
65 69 *
66 70  
67 71  
... ... @@ -115,18 +119,13 @@
115 119 unsigned int offset;
116 120 int i, stored;
117 121 struct ext4_dir_entry_2 *de;
118   - struct super_block *sb;
119 122 int err;
120 123 struct inode *inode = filp->f_path.dentry->d_inode;
  124 + struct super_block *sb = inode->i_sb;
121 125 int ret = 0;
122 126 int dir_has_error = 0;
123 127  
124   - sb = inode->i_sb;
125   -
126   - if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
127   - EXT4_FEATURE_COMPAT_DIR_INDEX) &&
128   - ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
129   - ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
  128 + if (is_dx_dir(inode)) {
130 129 err = ext4_dx_readdir(filp, dirent, filldir);
131 130 if (err != ERR_BAD_DX_DIR) {
132 131 ret = err;
133 132  
134 133  
135 134  
136 135  
137 136  
... ... @@ -254,24 +253,136 @@
254 253 return ret;
255 254 }
256 255  
  256 +static inline int is_32bit_api(void)
  257 +{
  258 +#ifdef CONFIG_COMPAT
  259 + return is_compat_task();
  260 +#else
  261 + return (BITS_PER_LONG == 32);
  262 +#endif
  263 +}
  264 +
257 265 /*
258 266 * These functions convert from the major/minor hash to an f_pos
259   - * value.
  267 + * value for dx directories
260 268 *
261   - * Currently we only use major hash numer. This is unfortunate, but
262   - * on 32-bit machines, the same VFS interface is used for lseek and
263   - * llseek, so if we use the 64 bit offset, then the 32-bit versions of
264   - * lseek/telldir/seekdir will blow out spectacularly, and from within
265   - * the ext2 low-level routine, we don't know if we're being called by
266   - * a 64-bit version of the system call or the 32-bit version of the
267   - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir
268   - * cookie. Sigh.
  269 + * Upper layers (for example NFS) should specify FMODE_32BITHASH or
  270 + * FMODE_64BITHASH explicitly. On the other hand, we allow ext4 to be mounted
  271 + * directly on both 32-bit and 64-bit nodes; in that case neither
  272 + * FMODE_32BITHASH nor FMODE_64BITHASH is specified.
269 273 */
270   -#define hash2pos(major, minor) (major >> 1)
271   -#define pos2maj_hash(pos) ((pos << 1) & 0xffffffff)
272   -#define pos2min_hash(pos) (0)
  274 +static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor)
  275 +{
  276 + if ((filp->f_mode & FMODE_32BITHASH) ||
  277 + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
  278 + return major >> 1;
  279 + else
  280 + return ((__u64)(major >> 1) << 32) | (__u64)minor;
  281 +}
273 282  
  283 +static inline __u32 pos2maj_hash(struct file *filp, loff_t pos)
  284 +{
  285 + if ((filp->f_mode & FMODE_32BITHASH) ||
  286 + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
  287 + return (pos << 1) & 0xffffffff;
  288 + else
  289 + return ((pos >> 32) << 1) & 0xffffffff;
  290 +}
  291 +
  292 +static inline __u32 pos2min_hash(struct file *filp, loff_t pos)
  293 +{
  294 + if ((filp->f_mode & FMODE_32BITHASH) ||
  295 + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
  296 + return 0;
  297 + else
  298 + return pos & 0xffffffff;
  299 +}
  300 +
274 301 /*
  302 + * Return 32- or 64-bit end-of-file for dx directories
  303 + */
  304 +static inline loff_t ext4_get_htree_eof(struct file *filp)
  305 +{
  306 + if ((filp->f_mode & FMODE_32BITHASH) ||
  307 + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
  308 + return EXT4_HTREE_EOF_32BIT;
  309 + else
  310 + return EXT4_HTREE_EOF_64BIT;
  311 +}
  312 +
  313 +
  314 +/*
  315 + * ext4_dir_llseek() based on generic_file_llseek() to handle both
  316 + * non-htree and htree directories, where the "offset" is in terms
  317 + * of the filename hash value instead of the byte offset.
  318 + *
  319 + * NOTE: offsets obtained *before* ext4_set_inode_flag(dir, EXT4_INODE_INDEX)
  320 + * become invalid once the directory has been converted into a dx directory
  321 + */
  322 +loff_t ext4_dir_llseek(struct file *file, loff_t offset, int origin)
  323 +{
  324 + struct inode *inode = file->f_mapping->host;
  325 + loff_t ret = -EINVAL;
  326 + int dx_dir = is_dx_dir(inode);
  327 +
  328 + mutex_lock(&inode->i_mutex);
  329 +
  330 + /* NOTE: relative offsets with dx directories might not work
  331 + * as expected, as it is difficult to figure out the
  332 + * correct offset between dx hashes */
  333 +
  334 + switch (origin) {
  335 + case SEEK_END:
  336 + if (unlikely(offset > 0))
  337 + goto out_err; /* not supported for directories */
  338 +
  339 + /* so only negative offsets are left, does that have a
  340 + * meaning for directories at all? */
  341 + if (dx_dir)
  342 + offset += ext4_get_htree_eof(file);
  343 + else
  344 + offset += inode->i_size;
  345 + break;
  346 + case SEEK_CUR:
  347 + /*
  348 + * Here we special-case the lseek(fd, 0, SEEK_CUR)
  349 + * position-querying operation. Avoid rewriting the "same"
  350 + * f_pos value back to the file because a concurrent read(),
  351 + * write() or lseek() might have altered it
  352 + */
  353 + if (offset == 0) {
  354 + offset = file->f_pos;
  355 + goto out_ok;
  356 + }
  357 +
  358 + offset += file->f_pos;
  359 + break;
  360 + }
  361 +
  362 + if (unlikely(offset < 0))
  363 + goto out_err;
  364 +
  365 + if (!dx_dir) {
  366 + if (offset > inode->i_sb->s_maxbytes)
  367 + goto out_err;
  368 + } else if (offset > ext4_get_htree_eof(file))
  369 + goto out_err;
  370 +
  371 + /* Special lock needed here? */
  372 + if (offset != file->f_pos) {
  373 + file->f_pos = offset;
  374 + file->f_version = 0;
  375 + }
  376 +
  377 +out_ok:
  378 + ret = offset;
  379 +out_err:
  380 + mutex_unlock(&inode->i_mutex);
  381 +
  382 + return ret;
  383 +}
  384 +
  385 +/*
275 386 * This structure holds the nodes of the red-black tree used to store
276 387 * the directory entry in hash order.
277 388 */
278 389  
... ... @@ -330,15 +441,16 @@
330 441 }
331 442  
332 443  
333   -static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
  444 +static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
  445 + loff_t pos)
334 446 {
335 447 struct dir_private_info *p;
336 448  
337 449 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
338 450 if (!p)
339 451 return NULL;
340   - p->curr_hash = pos2maj_hash(pos);
341   - p->curr_minor_hash = pos2min_hash(pos);
  452 + p->curr_hash = pos2maj_hash(filp, pos);
  453 + p->curr_minor_hash = pos2min_hash(filp, pos);
342 454 return p;
343 455 }
344 456  
... ... @@ -429,7 +541,7 @@
429 541 "null fname?!?\n");
430 542 return 0;
431 543 }
432   - curr_pos = hash2pos(fname->hash, fname->minor_hash);
  544 + curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
433 545 while (fname) {
434 546 error = filldir(dirent, fname->name,
435 547 fname->name_len, curr_pos,
436 548  
... ... @@ -454,13 +566,13 @@
454 566 int ret;
455 567  
456 568 if (!info) {
457   - info = ext4_htree_create_dir_info(filp->f_pos);
  569 + info = ext4_htree_create_dir_info(filp, filp->f_pos);
458 570 if (!info)
459 571 return -ENOMEM;
460 572 filp->private_data = info;
461 573 }
462 574  
463   - if (filp->f_pos == EXT4_HTREE_EOF)
  575 + if (filp->f_pos == ext4_get_htree_eof(filp))
464 576 return 0; /* EOF */
465 577  
466 578 /* Some one has messed with f_pos; reset the world */
... ... @@ -468,8 +580,8 @@
468 580 free_rb_tree_fname(&info->root);
469 581 info->curr_node = NULL;
470 582 info->extra_fname = NULL;
471   - info->curr_hash = pos2maj_hash(filp->f_pos);
472   - info->curr_minor_hash = pos2min_hash(filp->f_pos);
  583 + info->curr_hash = pos2maj_hash(filp, filp->f_pos);
  584 + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
473 585 }
474 586  
475 587 /*
... ... @@ -501,7 +613,7 @@
501 613 if (ret < 0)
502 614 return ret;
503 615 if (ret == 0) {
504   - filp->f_pos = EXT4_HTREE_EOF;
  616 + filp->f_pos = ext4_get_htree_eof(filp);
505 617 break;
506 618 }
507 619 info->curr_node = rb_first(&info->root);
... ... @@ -521,7 +633,7 @@
521 633 info->curr_minor_hash = fname->minor_hash;
522 634 } else {
523 635 if (info->next_hash == ~0) {
524   - filp->f_pos = EXT4_HTREE_EOF;
  636 + filp->f_pos = ext4_get_htree_eof(filp);
525 637 break;
526 638 }
527 639 info->curr_hash = info->next_hash;
... ... @@ -540,4 +652,16 @@
540 652  
541 653 return 0;
542 654 }
  655 +
  656 +const struct file_operations ext4_dir_operations = {
  657 + .llseek = ext4_dir_llseek,
  658 + .read = generic_read_dir,
  659 + .readdir = ext4_readdir,
  660 + .unlocked_ioctl = ext4_ioctl,
  661 +#ifdef CONFIG_COMPAT
  662 + .compat_ioctl = ext4_compat_ioctl,
  663 +#endif
  664 + .fsync = ext4_sync_file,
  665 + .release = ext4_release_dir,
  666 +};
... ... @@ -1612,7 +1612,11 @@
1612 1612 u32 *seed;
1613 1613 };
1614 1614  
1615   -#define EXT4_HTREE_EOF 0x7fffffff
  1615 +
  1616 +/* 32 and 64 bit signed EOF for dx directories */
  1617 +#define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1)
  1618 +#define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1)
  1619 +
1616 1620  
1617 1621 /*
1618 1622 * Control parameters used by ext4_htree_next_block
... ... @@ -200,8 +200,8 @@
200 200 return -1;
201 201 }
202 202 hash = hash & ~1;
203   - if (hash == (EXT4_HTREE_EOF << 1))
204   - hash = (EXT4_HTREE_EOF-1) << 1;
  203 + if (hash == (EXT4_HTREE_EOF_32BIT << 1))
  204 + hash = (EXT4_HTREE_EOF_32BIT - 1) << 1;
205 205 hinfo->hash = hash;
206 206 hinfo->minor_hash = minor_hash;
207 207 return 0;