ocfs2: take data locks around extend

We need to take a data lock around extends to protect the pages that ocfs2_zero_extend is going to be pulling into the page cache. Otherwise an extend on one node might populate the page cache with data pages that have no lock coverage. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

ocfs2: take data locks around extend
We need to take a data lock around extends to protect the pages that ocfs2_zero_extend is going to be pulling into the page cache. Otherwise an extend on one node might populate the page cache with data pages that have no lock coverage. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Mark Fasheh
1 parent 0c056c50a6
Showing 3 changed files with 87 additions and 33 deletions Side-by-side Diff
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/file.c
@@ -276,13 +276,29 @@
 	return ret;
 }
  
+/* This can also be called from ocfs2_write_zero_page() which has done
+ * it's own cluster locking. */
+int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
+			       unsigned from, unsigned to)
+{
+	int ret;
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = block_prepare_write(page, from, to, ocfs2_get_block);
+
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+	return ret;
+}
+
 /*
  * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
  * from loopback.  It must be able to perform its own locking around
  * ocfs2_get_block().
  */
-int ocfs2_prepare_write(struct file *file, struct page *page,
-			unsigned from, unsigned to)
+static int ocfs2_prepare_write(struct file *file, struct page *page,
+			       unsigned from, unsigned to)
 {
 	struct inode *inode = page->mapping->host;
 	int ret;
  
@@ -295,12 +311,8 @@
 		goto out;
 	}
  
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = ocfs2_prepare_write_nolock(inode, page, from, to);
  
-	ret = block_prepare_write(page, from, to, ocfs2_get_block);
-
-	up_read(&OCFS2_I(inode)->ip_alloc_sem);
-
 	ocfs2_meta_unlock(inode, 0);
 out:
 	mlog_exit(ret);
  
@@ -625,11 +637,31 @@
 	int ret;
  
 	mlog_entry_void();
+
+	/*
+	 * We get PR data locks even for O_DIRECT.  This allows
+	 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
+	 * extending and buffered zeroing writes race.  If they did
+	 * race then the buffered zeroing could be written back after
+	 * the O_DIRECT I/O.  It's one thing to tell people not to mix
+	 * buffered and O_DIRECT writes, but expecting them to
+	 * understand that file extension is also an implicit buffered
+	 * write is too much.  By getting the PR we force writeback of
+	 * the buffered zeroing before proceeding.
+	 */
+	ret = ocfs2_data_lock(inode, 0);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+	ocfs2_data_unlock(inode, 0);
+
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
 					    nr_segs, 
 					    ocfs2_direct_IO_get_blocks,
 					    ocfs2_dio_end_io);
+out:
 	mlog_exit(ret);
 	return ret;
 }
@@ -22,8 +22,8 @@
 #ifndef OCFS2_AOPS_H
 #define OCFS2_AOPS_H
  
-int ocfs2_prepare_write(struct file *file, struct page *page,
-			unsigned from, unsigned to);
+int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
+			       unsigned from, unsigned to);
  
 struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode,
 							 struct page *page,
@@ -613,7 +613,8 @@
  
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->commit_write(). */
+ * worry about recursive locking in ->prepare_write() and
+ * ->commit_write(). */
 static int ocfs2_write_zero_page(struct inode *inode,
 				 u64 size)
 {
@@ -641,7 +642,7 @@
 		goto out;
 	}
  
-	ret = ocfs2_prepare_write(NULL, page, offset, offset);
+	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
  
  
@@ -695,13 +696,26 @@
 	return ret;
 }
  
+/* 
+ * A tail_to_skip value > 0 indicates that we're being called from
+ * ocfs2_file_aio_write(). This has the following implications:
+ *
+ * - we don't want to update i_size
+ * - di_bh will be NULL, which is fine because it's only used in the
+ *   case where we want to update i_size.
+ * - ocfs2_zero_extend() will then only be filling the hole created
+ *   between i_size and the start of the write.
+ */
 static int ocfs2_extend_file(struct inode *inode,
 			     struct buffer_head *di_bh,
-			     u64 new_i_size)
+			     u64 new_i_size,
+			     size_t tail_to_skip)
 {
 	int ret = 0;
 	u32 clusters_to_add;
  
+	BUG_ON(!tail_to_skip && !di_bh);
+
 	/* setattr sometimes calls us like this. */
 	if (new_i_size == 0)
 		goto out;
  
  
  
  
  
@@ -714,27 +728,44 @@
 		OCFS2_I(inode)->ip_clusters;
  
 	if (clusters_to_add) {
-		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		/* 
+		 * protect the pages that ocfs2_zero_extend is going to
+		 * be pulling into the page cache.. we do this before the
+		 * metadata extend so that we don't get into the situation
+		 * where we've extended the metadata but can't get the data
+		 * lock to zero.
+		 */
+		ret = ocfs2_data_lock(inode, 1);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
  
-		ret = ocfs2_zero_extend(inode, new_i_size);
+		ret = ocfs2_extend_allocation(inode, clusters_to_add);
 		if (ret < 0) {
 			mlog_errno(ret);
-			goto out;
+			goto out_unlock;
 		}
-	} 
  
-	/* No allocation required, we just use this helper to
-	 * do a trivial update of i_size. */
-	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
+		ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out_unlock;
+		}
 	}
  
+	if (!tail_to_skip) {
+		/* We're being called from ocfs2_setattr() which wants
+		 * us to update i_size */
+		ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+out_unlock:
+	if (clusters_to_add) /* this is the only case in which we lock */
+		ocfs2_data_unlock(inode, 1);
+
 out:
 	return ret;
 }
@@ -793,7 +824,7 @@
 		if (i_size_read(inode) > attr->ia_size)
 			status = ocfs2_truncate_file(inode, bh, attr->ia_size);
 		else
-			status = ocfs2_extend_file(inode, bh, attr->ia_size);
+			status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
 		if (status < 0) {
 			if (status != -ENOSPC)
 				mlog_errno(status);
  
@@ -1049,19 +1080,10 @@
 		if (!clusters)
 			break;
  
-		ret = ocfs2_extend_allocation(inode, clusters);
+		ret = ocfs2_extend_file(inode, NULL, newsize, count);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
 				mlog_errno(ret);
-			goto out;
-		}
-
-		/* Fill any holes which would've been created by this
-		 * write. If we're O_APPEND, this will wind up
-		 * (correctly) being a noop. */
-		ret = ocfs2_zero_extend(inode, (u64) newsize - count);
-		if (ret < 0) {
-			mlog_errno(ret);
 			goto out;
 		}
 		break;
...	...	@@ -276,13 +276,29 @@
276	276	return ret;
277	277	}
278	278
	279	+/* This can also be called from ocfs2_write_zero_page() which has done
	280	+ * it's own cluster locking. */
	281	+int ocfs2_prepare_write_nolock(struct inode inode, struct page page,
	282	+ unsigned from, unsigned to)
	283	+{
	284	+ int ret;
	285	+
	286	+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
	287	+
	288	+ ret = block_prepare_write(page, from, to, ocfs2_get_block);
	289	+
	290	+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
	291	+
	292	+ return ret;
	293	+}
	294	+
279	295	/*
280	296	* ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
281	297	* from loopback. It must be able to perform its own locking around
282	298	* ocfs2_get_block().
283	299	*/
284		-int ocfs2_prepare_write(struct file file, struct page page,
285		- unsigned from, unsigned to)
	300	+static int ocfs2_prepare_write(struct file file, struct page page,
	301	+ unsigned from, unsigned to)
286	302	{
287	303	struct inode *inode = page->mapping->host;
288	304	int ret;
289	305
...	...	@@ -295,12 +311,8 @@
295	311	goto out;
296	312	}
297	313
298		- down_read(&OCFS2_I(inode)->ip_alloc_sem);
	314	+ ret = ocfs2_prepare_write_nolock(inode, page, from, to);
299	315
300		- ret = block_prepare_write(page, from, to, ocfs2_get_block);
301		-
302		- up_read(&OCFS2_I(inode)->ip_alloc_sem);
303		-
304	316	ocfs2_meta_unlock(inode, 0);
305	317	out:
306	318	mlog_exit(ret);
307	319
...	...	@@ -625,11 +637,31 @@
625	637	int ret;
626	638
627	639	mlog_entry_void();
	640	+
	641	+ /*
	642	+ * We get PR data locks even for O_DIRECT. This allows
	643	+ * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
	644	+ * extending and buffered zeroing writes race. If they did
	645	+ * race then the buffered zeroing could be written back after
	646	+ * the O_DIRECT I/O. It's one thing to tell people not to mix
	647	+ * buffered and O_DIRECT writes, but expecting them to
	648	+ * understand that file extension is also an implicit buffered
	649	+ * write is too much. By getting the PR we force writeback of
	650	+ * the buffered zeroing before proceeding.
	651	+ */
	652	+ ret = ocfs2_data_lock(inode, 0);
	653	+ if (ret < 0) {
	654	+ mlog_errno(ret);
	655	+ goto out;
	656	+ }
	657	+ ocfs2_data_unlock(inode, 0);
	658	+
628	659	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
629	660	inode->i_sb->s_bdev, iov, offset,
630	661	nr_segs,
631	662	ocfs2_direct_IO_get_blocks,
632	663	ocfs2_dio_end_io);
	664	+out:
633	665	mlog_exit(ret);
634	666	return ret;
635	667	}
...	...	@@ -22,8 +22,8 @@
22	22	#ifndef OCFS2_AOPS_H
23	23	#define OCFS2_AOPS_H
24	24
25		-int ocfs2_prepare_write(struct file file, struct page page,
26		- unsigned from, unsigned to);
	25	+int ocfs2_prepare_write_nolock(struct inode inode, struct page page,
	26	+ unsigned from, unsigned to);
27	27
28	28	struct ocfs2_journal_handle ocfs2_start_walk_page_trans(struct inode inode,
29	29	struct page *page,
...	...	@@ -613,7 +613,8 @@
613	613
614	614	/* Some parts of this taken from generic_cont_expand, which turned out
615	615	* to be too fragile to do exactly what we need without us having to
616		- * worry about recursive locking in ->commit_write(). */
	616	+ * worry about recursive locking in ->prepare_write() and
	617	+ * ->commit_write(). */
617	618	static int ocfs2_write_zero_page(struct inode *inode,
618	619	u64 size)
619	620	{
...	...	@@ -641,7 +642,7 @@
641	642	goto out;
642	643	}
643	644
644		- ret = ocfs2_prepare_write(NULL, page, offset, offset);
	645	+ ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
645	646	if (ret < 0) {
646	647	mlog_errno(ret);
647	648	goto out_unlock;
648	649
649	650
...	...	@@ -695,13 +696,26 @@
695	696	return ret;
696	697	}
697	698
	699	+/*
	700	+ * A tail_to_skip value > 0 indicates that we're being called from
	701	+ * ocfs2_file_aio_write(). This has the following implications:
	702	+ *
	703	+ * - we don't want to update i_size
	704	+ * - di_bh will be NULL, which is fine because it's only used in the
	705	+ * case where we want to update i_size.
	706	+ * - ocfs2_zero_extend() will then only be filling the hole created
	707	+ * between i_size and the start of the write.
	708	+ */
698	709	static int ocfs2_extend_file(struct inode *inode,
699	710	struct buffer_head *di_bh,
700		- u64 new_i_size)
	711	+ u64 new_i_size,
	712	+ size_t tail_to_skip)
701	713	{
702	714	int ret = 0;
703	715	u32 clusters_to_add;
704	716
	717	+ BUG_ON(!tail_to_skip && !di_bh);
	718	+
705	719	/* setattr sometimes calls us like this. */
706	720	if (new_i_size == 0)
707	721	goto out;
708	722
709	723
710	724
711	725
712	726
...	...	@@ -714,27 +728,44 @@
714	728	OCFS2_I(inode)->ip_clusters;
715	729
716	730	if (clusters_to_add) {
717		- ret = ocfs2_extend_allocation(inode, clusters_to_add);
	731	+ /*
	732	+ * protect the pages that ocfs2_zero_extend is going to
	733	+ * be pulling into the page cache.. we do this before the
	734	+ * metadata extend so that we don't get into the situation
	735	+ * where we've extended the metadata but can't get the data
	736	+ * lock to zero.
	737	+ */
	738	+ ret = ocfs2_data_lock(inode, 1);
718	739	if (ret < 0) {
719	740	mlog_errno(ret);
720	741	goto out;
721	742	}
722	743
723		- ret = ocfs2_zero_extend(inode, new_i_size);
	744	+ ret = ocfs2_extend_allocation(inode, clusters_to_add);
724	745	if (ret < 0) {
725	746	mlog_errno(ret);
726		- goto out;
	747	+ goto out_unlock;
727	748	}
728		- }
729	749
730		- /* No allocation required, we just use this helper to
731		- * do a trivial update of i_size. */
732		- ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
733		- if (ret < 0) {
734		- mlog_errno(ret);
735		- goto out;
	750	+ ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip);
	751	+ if (ret < 0) {
	752	+ mlog_errno(ret);
	753	+ goto out_unlock;
	754	+ }
736	755	}
737	756
	757	+ if (!tail_to_skip) {
	758	+ /* We're being called from ocfs2_setattr() which wants
	759	+ * us to update i_size */
	760	+ ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	761	+ if (ret < 0)
	762	+ mlog_errno(ret);
	763	+ }
	764	+
	765	+out_unlock:
	766	+ if (clusters_to_add) /* this is the only case in which we lock */
	767	+ ocfs2_data_unlock(inode, 1);
	768	+
738	769	out:
739	770	return ret;
740	771	}
...	...	@@ -793,7 +824,7 @@
793	824	if (i_size_read(inode) > attr->ia_size)
794	825	status = ocfs2_truncate_file(inode, bh, attr->ia_size);
795	826	else
796		- status = ocfs2_extend_file(inode, bh, attr->ia_size);
	827	+ status = ocfs2_extend_file(inode, bh, attr->ia_size, 0);
797	828	if (status < 0) {
798	829	if (status != -ENOSPC)
799	830	mlog_errno(status);
800	831
...	...	@@ -1049,19 +1080,10 @@
1049	1080	if (!clusters)
1050	1081	break;
1051	1082
1052		- ret = ocfs2_extend_allocation(inode, clusters);
	1083	+ ret = ocfs2_extend_file(inode, NULL, newsize, count);
1053	1084	if (ret < 0) {
1054	1085	if (ret != -ENOSPC)
1055	1086	mlog_errno(ret);
1056		- goto out;
1057		- }
1058		-
1059		- /* Fill any holes which would've been created by this
1060		- * write. If we're O_APPEND, this will wind up
1061		- * (correctly) being a noop. */
1062		- ret = ocfs2_zero_extend(inode, (u64) newsize - count);
1063		- if (ret < 0) {
1064		- mlog_errno(ret);
1065	1087	goto out;
1066	1088	}
1067	1089	break;