Commit 7bdb0d18bfd381cc5491eb95973ec5604b356c7e

Authored by Tristan Ye
Committed by Joel Becker
1 parent 75d9bbc738

ocfs2: Add a mount option "coherency=*" to handle cluster coherency for O_DIRECT writes.

Currently, the default behavior of O_DIRECT writes is to allow
concurrent writes from multiple nodes to the same file, with no cluster
coherency guaranteed (no EX lock held).  This can leave stale data in
the cache for buffered reads on other nodes.

The new mount option introduces a choice between two different
behaviors for O_DIRECT writes:

    * coherency=full, as the default value, will disallow
                      concurrent O_DIRECT writes by taking
                      EX locks.

    * coherency=buffered, allows concurrent O_DIRECT writes
                          without EX lock among nodes, which
                          gains high performance at the risk of
                          getting stale data on other nodes.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>

Showing 4 changed files with 52 additions and 2 deletions Side-by-side Diff

Documentation/filesystems/ocfs2.txt
... ... @@ -87,4 +87,11 @@
87 87 reservations - users should rarely need to change this
88 88 value. If allocation reservations are turned off, this
89 89 option will have no effect.
  90 +coherency=full (*) Disallow concurrent O_DIRECT writes, cluster inode
  91 + lock will be taken to force other nodes to drop their caches,
  92 + therefore full cluster coherency is guaranteed even
  93 + for O_DIRECT writes.
  94 +coherency=buffered Allow concurrent O_DIRECT writes without EX lock among
  95 + nodes, which gains high performance at risk of getting
  96 + stale data on other nodes.
... ... @@ -2225,6 +2225,8 @@
2225 2225 struct file *file = iocb->ki_filp;
2226 2226 struct inode *inode = file->f_path.dentry->d_inode;
2227 2227 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
  2228 + int full_coherency = !(osb->s_mount_opt &
  2229 + OCFS2_MOUNT_COHERENCY_BUFFERED);
2228 2230  
2229 2231 mlog_entry("(0x%p, %u, '%.*s')\n", file,
2230 2232 (unsigned int)nr_segs,
2231 2233  
... ... @@ -2248,12 +2250,35 @@
2248 2250 have_alloc_sem = 1;
2249 2251 }
2250 2252  
2251   - /* concurrent O_DIRECT writes are allowed */
2252   - rw_level = !direct_io;
  2253 + /*
  2254 + * Concurrent O_DIRECT writes are allowed with
  2255 + * mount_option "coherency=buffered".
  2256 + */
  2257 + rw_level = (!direct_io || full_coherency);
  2258 +
2253 2259 ret = ocfs2_rw_lock(inode, rw_level);
2254 2260 if (ret < 0) {
2255 2261 mlog_errno(ret);
2256 2262 goto out_sems;
  2263 + }
  2264 +
  2265 + /*
  2266 + * O_DIRECT writes with "coherency=full" need to take EX cluster
  2267 + * inode_lock to guarantee coherency.
  2268 + */
  2269 + if (direct_io && full_coherency) {
  2270 + /*
  2271 + * We need to take and drop the inode lock to force
  2272 + * other nodes to drop their caches. Buffered I/O
  2273 + * already does this in write_begin().
  2274 + */
  2275 + ret = ocfs2_inode_lock(inode, NULL, 1);
  2276 + if (ret < 0) {
  2277 + mlog_errno(ret);
  2278 + goto out_sems;
  2279 + }
  2280 +
  2281 + ocfs2_inode_unlock(inode, 1);
2257 2282 }
2258 2283  
2259 2284 can_do_direct = direct_io;
... ... @@ -263,6 +263,9 @@
263 263 control lists */
264 264 OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */
265 265 OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */
  266 +
  267 + OCFS2_MOUNT_COHERENCY_BUFFERED = 1 << 12 /* Allow concurrent O_DIRECT
  268 + writes */
266 269 };
267 270  
268 271 #define OCFS2_OSB_SOFT_RO 0x0001
... ... @@ -177,6 +177,8 @@
177 177 Opt_noacl,
178 178 Opt_usrquota,
179 179 Opt_grpquota,
  180 + Opt_coherency_buffered,
  181 + Opt_coherency_full,
180 182 Opt_resv_level,
181 183 Opt_dir_resv_level,
182 184 Opt_err,
... ... @@ -205,6 +207,8 @@
205 207 {Opt_noacl, "noacl"},
206 208 {Opt_usrquota, "usrquota"},
207 209 {Opt_grpquota, "grpquota"},
  210 + {Opt_coherency_buffered, "coherency=buffered"},
  211 + {Opt_coherency_full, "coherency=full"},
208 212 {Opt_resv_level, "resv_level=%u"},
209 213 {Opt_dir_resv_level, "dir_resv_level=%u"},
210 214 {Opt_err, NULL}
... ... @@ -1452,6 +1456,12 @@
1452 1456 case Opt_grpquota:
1453 1457 mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
1454 1458 break;
  1459 + case Opt_coherency_buffered:
  1460 + mopt->mount_opt |= OCFS2_MOUNT_COHERENCY_BUFFERED;
  1461 + break;
  1462 + case Opt_coherency_full:
  1463 + mopt->mount_opt &= ~OCFS2_MOUNT_COHERENCY_BUFFERED;
  1464 + break;
1455 1465 case Opt_acl:
1456 1466 mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
1457 1467 mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL;
... ... @@ -1549,6 +1559,11 @@
1549 1559 seq_printf(s, ",usrquota");
1550 1560 if (opts & OCFS2_MOUNT_GRPQUOTA)
1551 1561 seq_printf(s, ",grpquota");
  1562 +
  1563 + if (opts & OCFS2_MOUNT_COHERENCY_BUFFERED)
  1564 + seq_printf(s, ",coherency=buffered");
  1565 + else
  1566 + seq_printf(s, ",coherency=full");
1552 1567  
1553 1568 if (opts & OCFS2_MOUNT_NOUSERXATTR)
1554 1569 seq_printf(s, ",nouser_xattr");