Commit 0e5b88cd9975dca6c191cc9bd11f233fac4ca882

Authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: break out of shrink_delalloc earlier
  btrfs: fix not enough reserved space
  btrfs: fix dip leak
  Btrfs: make sure not to return overlapping extents to fiemap
  Btrfs: deal with short returns from copy_from_user
  Btrfs: fix regressions in copy_from_user handling

Showing 5 changed files

fs/btrfs/ctree.h
... ... @@ -729,6 +729,15 @@
729 729 u64 disk_total; /* total bytes on disk, takes mirrors into
730 730 account */
731 731  
  732 + /*
  733 + * we bump reservation progress every time we decrement
  734 + * bytes_reserved. This way people waiting for reservations
  735 + * know something good has happened and they can check
  736 + * for progress. The number here isn't to be trusted, it
  737 + * just shows reclaim activity
  738 + */
  739 + unsigned long reservation_progress;
  740 +
732 741 int full; /* indicates that we cannot allocate any more
733 742 chunks for this space */
734 743 int force_alloc; /* set if we need to force a chunk alloc for
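The comment above describes a deliberately loose signal: the counter's absolute value means nothing, only a change does, so waiters can snapshot it, sleep, and re-check. A minimal userspace sketch of that idiom (hypothetical names, not the kernel code; in btrfs the bump happens under space_info->lock and readers pair it with smp_mb()):

    struct space_pool {
            unsigned long bytes_reserved;
            unsigned long reservation_progress;     /* bumped on every release */
    };

    static void release_bytes(struct space_pool *pool, unsigned long n)
    {
            pool->bytes_reserved -= n;
            pool->reservation_progress++;           /* wake hint for waiters */
    }

    /* a waiter compares against the snapshot it took before sleeping */
    static int reclaim_made_progress(const struct space_pool *pool,
                                     unsigned long snapshot)
    {
            return pool->reservation_progress != snapshot;
    }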
fs/btrfs/extent-tree.c
... ... @@ -3342,15 +3342,16 @@
3342 3342 u64 max_reclaim;
3343 3343 u64 reclaimed = 0;
3344 3344 long time_left;
3345   - int pause = 1;
3346 3345 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3347 3346 int loops = 0;
  3347 + unsigned long progress;
3348 3348  
3349 3349 block_rsv = &root->fs_info->delalloc_block_rsv;
3350 3350 space_info = block_rsv->space_info;
3351 3351  
3352 3352 smp_mb();
3353 3353 reserved = space_info->bytes_reserved;
  3354 + progress = space_info->reservation_progress;
3354 3355  
3355 3356 if (reserved == 0)
3356 3357 return 0;
... ... @@ -3365,32 +3366,37 @@
3365 3366 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3366 3367  
3367 3368 spin_lock(&space_info->lock);
3368   - if (reserved > space_info->bytes_reserved) {
3369   - loops = 0;
  3369 + if (reserved > space_info->bytes_reserved)
3370 3370 reclaimed += reserved - space_info->bytes_reserved;
3371   - } else {
3372   - loops++;
3373   - }
3374 3371 reserved = space_info->bytes_reserved;
3375 3372 spin_unlock(&space_info->lock);
3376 3373  
  3374 + loops++;
  3375 +
3377 3376 if (reserved == 0 || reclaimed >= max_reclaim)
3378 3377 break;
3379 3378  
3380 3379 if (trans && trans->transaction->blocked)
3381 3380 return -EAGAIN;
3382 3381  
3383   - __set_current_state(TASK_INTERRUPTIBLE);
3384   - time_left = schedule_timeout(pause);
  3382 + time_left = schedule_timeout_interruptible(1);
3385 3383  
3386 3384 /* We were interrupted, exit */
3387 3385 if (time_left)
3388 3386 break;
3389 3387  
3390   - pause <<= 1;
3391   - if (pause > HZ / 10)
3392   - pause = HZ / 10;
  3388 + /* we've kicked the IO a few times, if anything has been freed,
  3389 + * exit. There is no sense in looping here for a long time
  3390 + * when we really need to commit the transaction, or there are
  3391 + * just too many writers without enough free space
  3392 + */
3393 3393  
  3394 + if (loops > 3) {
  3395 + smp_mb();
  3396 + if (progress != space_info->reservation_progress)
  3397 + break;
  3398 + }
  3399 +
3394 3400 }
3395 3401 return reclaimed >= to_reclaim;
3396 3402 }
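shrink_delalloc now counts every pass through the loop and, after three one-jiffy sleeps, bails out as soon as the snapshot taken at entry differs from reservation_progress: someone has freed space, so there is no point spinning when the caller really needs to commit the transaction. A standalone model of that retry policy (userspace sketch, hypothetical names):

    #include <stdbool.h>

    struct reclaim_state {
            unsigned long progress;         /* mirrors reservation_progress */
    };

    static bool should_keep_waiting(const struct reclaim_state *s,
                                    unsigned long snapshot, int loops)
    {
            /* the first few sleeps are unconditional; after that, stop
             * as soon as anyone has released space since the snapshot
             * was taken */
            if (loops <= 3)
                    return true;
            return s->progress == snapshot;
    }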
... ... @@ -3612,6 +3618,7 @@
3612 3618 if (num_bytes) {
3613 3619 spin_lock(&space_info->lock);
3614 3620 space_info->bytes_reserved -= num_bytes;
  3621 + space_info->reservation_progress++;
3615 3622 spin_unlock(&space_info->lock);
3616 3623 }
3617 3624 }
... ... @@ -3844,6 +3851,7 @@
3844 3851 if (block_rsv->reserved >= block_rsv->size) {
3845 3852 num_bytes = block_rsv->reserved - block_rsv->size;
3846 3853 sinfo->bytes_reserved -= num_bytes;
  3854 + sinfo->reservation_progress++;
3847 3855 block_rsv->reserved = block_rsv->size;
3848 3856 block_rsv->full = 1;
3849 3857 }
... ... @@ -4005,7 +4013,6 @@
4005 4013 to_reserve = 0;
4006 4014 }
4007 4015 spin_unlock(&BTRFS_I(inode)->accounting_lock);
4008   -
4009 4016 to_reserve += calc_csum_metadata_size(inode, num_bytes);
4010 4017 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4011 4018 if (ret)
... ... @@ -4133,6 +4140,7 @@
4133 4140 btrfs_set_block_group_used(&cache->item, old_val);
4134 4141 cache->reserved -= num_bytes;
4135 4142 cache->space_info->bytes_reserved -= num_bytes;
  4143 + cache->space_info->reservation_progress++;
4136 4144 cache->space_info->bytes_used += num_bytes;
4137 4145 cache->space_info->disk_used += num_bytes * factor;
4138 4146 spin_unlock(&cache->lock);
... ... @@ -4184,6 +4192,7 @@
4184 4192 if (reserved) {
4185 4193 cache->reserved -= num_bytes;
4186 4194 cache->space_info->bytes_reserved -= num_bytes;
  4195 + cache->space_info->reservation_progress++;
4187 4196 }
4188 4197 spin_unlock(&cache->lock);
4189 4198 spin_unlock(&cache->space_info->lock);
... ... @@ -4234,6 +4243,7 @@
4234 4243 space_info->bytes_readonly += num_bytes;
4235 4244 cache->reserved -= num_bytes;
4236 4245 space_info->bytes_reserved -= num_bytes;
  4246 + space_info->reservation_progress++;
4237 4247 }
4238 4248 spin_unlock(&cache->lock);
4239 4249 spin_unlock(&space_info->lock);
... ... @@ -4712,6 +4722,7 @@
4712 4722 if (ret) {
4713 4723 spin_lock(&cache->space_info->lock);
4714 4724 cache->space_info->bytes_reserved -= buf->len;
  4725 + cache->space_info->reservation_progress++;
4715 4726 spin_unlock(&cache->space_info->lock);
4716 4727 }
4717 4728 goto out;
fs/btrfs/extent_io.c
... ... @@ -3046,17 +3046,38 @@
3046 3046 }
3047 3047  
3048 3048 while (!end) {
3049   - off = extent_map_end(em);
3050   - if (off >= max)
3051   - end = 1;
  3049 + u64 offset_in_extent;
3052 3050  
3053   - em_start = em->start;
3054   - em_len = em->len;
  3051 + /* break if the extent we found is outside the range */
  3052 + if (em->start >= max || extent_map_end(em) < off)
  3053 + break;
  3054 +
  3055 + /*
  3056 + * get_extent may return an extent that starts before our
  3057 + * requested range. We have to make sure the ranges
  3058 + * we return to fiemap always move forward and don't
  3059 + * overlap, so adjust the offsets here
  3060 + */
  3061 + em_start = max(em->start, off);
  3062 +
  3063 + /*
  3064 + * record the offset from the start of the extent
  3065 + * for adjusting the disk offset below
  3066 + */
  3067 + offset_in_extent = em_start - em->start;
3055 3068 em_end = extent_map_end(em);
  3069 + em_len = em_end - em_start;
3056 3070 emflags = em->flags;
3057 3071 disko = 0;
3058 3072 flags = 0;
3059 3073  
  3074 + /*
  3075 + * bump off for our next call to get_extent
  3076 + */
  3077 + off = extent_map_end(em);
  3078 + if (off >= max)
  3079 + end = 1;
  3080 +
3060 3081 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3061 3082 end = 1;
3062 3083 flags |= FIEMAP_EXTENT_LAST;
... ... @@ -3067,7 +3088,7 @@
3067 3088 flags |= (FIEMAP_EXTENT_DELALLOC |
3068 3089 FIEMAP_EXTENT_UNKNOWN);
3069 3090 } else {
3070   - disko = em->block_start;
  3091 + disko = em->block_start + offset_in_extent;
3071 3092 }
3072 3093 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3073 3094 flags |= FIEMAP_EXTENT_ENCODED;
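The fiemap change above handles get_extent returning an extent that begins before the requested offset: the reported start is clamped forward to the query offset and the disk address is shifted by the same amount, so consecutive fiemap entries always move forward and never overlap. A compilable model of that clamping (hypothetical helper, not the kernel function):

    #include <stdint.h>

    static void clamp_extent(uint64_t em_start, uint64_t em_end,
                             uint64_t disk_start, uint64_t off,
                             uint64_t *out_start, uint64_t *out_len,
                             uint64_t *out_disk)
    {
            uint64_t start = em_start > off ? em_start : off;
            uint64_t offset_in_extent = start - em_start;

            *out_start = start;
            *out_len   = em_end - start;
            *out_disk  = disk_start + offset_in_extent;  /* matches disko above */
    }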
fs/btrfs/file.c
... ... @@ -70,6 +70,19 @@
70 70  
71 71 /* Flush processor's dcache for this page */
72 72 flush_dcache_page(page);
  73 +
  74 + /*
  75 + * if we get a partial write, we can end up with
  76 + * partially up to date pages. These add
  77 + * a lot of complexity, so make sure they don't
  78 + * happen by forcing this copy to be retried.
  79 + *
  80 + * The rest of the btrfs_file_write code will fall
  81 + * back to page at a time copies after we return 0.
  82 + */
  83 + if (!PageUptodate(page) && copied < count)
  84 + copied = 0;
  85 +
73 86 iov_iter_advance(i, copied);
74 87 write_bytes -= copied;
75 88 total_copied += copied;
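The check added above enforces a simple rule: a short copy into a page that was not already up to date leaves a partially valid page, so the copy is discarded entirely rather than tracked. Because iov_iter_advance(i, 0) then advances nothing, the caller re-attempts the same range, falling back to page-at-a-time copies. A minimal model of the rule (hypothetical standalone helper):

    #include <stdbool.h>
    #include <stddef.h>

    /* returns how many bytes the caller may account as written */
    static size_t account_copy(bool page_uptodate, size_t copied, size_t count)
    {
            /* a short copy into a not-up-to-date page would leave the
             * page partially valid; report zero so the write path
             * retries instead */
            if (!page_uptodate && copied < count)
                    return 0;
            return copied;
    }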
... ... @@ -763,6 +776,27 @@
763 776 }
764 777  
765 778 /*
  779 + * on error we return an unlocked page and the error value
  780 + * on success we return a locked page and 0
  781 + */
  782 +static int prepare_uptodate_page(struct page *page, u64 pos)
  783 +{
  784 + int ret = 0;
  785 +
  786 + if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
  787 + ret = btrfs_readpage(NULL, page);
  788 + if (ret)
  789 + return ret;
  790 + lock_page(page);
  791 + if (!PageUptodate(page)) {
  792 + unlock_page(page);
  793 + return -EIO;
  794 + }
  795 + }
  796 + return 0;
  797 +}
  798 +
  799 +/*
766 800 * this gets pages into the page cache and locks them down, it also properly
767 801 * waits for data=ordered extents to finish before allowing the pages to be
768 802 * modified.
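prepare_uptodate_page only issues a read when the write boundary falls inside the page, i.e. when pos is not page-aligned; a write that covers the whole page needs no read before being overwritten. The caller applies it to just the first and last pages of the range, the only two that can be partially written. A tiny model of that test (userspace sketch; the 4 KiB page size is an assumption):

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE_ASSUMED 4096u         /* assumption: 4 KiB pages */

    static bool needs_readpage(uint64_t pos, bool page_uptodate)
    {
            /* read only if the write edge lands mid-page and the page
             * does not already hold valid data */
            return (pos & (PAGE_SIZE_ASSUMED - 1)) != 0 && !page_uptodate;
    }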
... ... @@ -777,6 +811,7 @@
777 811 unsigned long index = pos >> PAGE_CACHE_SHIFT;
778 812 struct inode *inode = fdentry(file)->d_inode;
779 813 int err = 0;
  814 + int faili = 0;
780 815 u64 start_pos;
781 816 u64 last_pos;
782 817  
... ... @@ -794,15 +829,24 @@
794 829 for (i = 0; i < num_pages; i++) {
795 830 pages[i] = grab_cache_page(inode->i_mapping, index + i);
796 831 if (!pages[i]) {
797   - int c;
798   - for (c = i - 1; c >= 0; c--) {
799   - unlock_page(pages[c]);
800   - page_cache_release(pages[c]);
801   - }
802   - return -ENOMEM;
  832 + faili = i - 1;
  833 + err = -ENOMEM;
  834 + goto fail;
803 835 }
  836 +
  837 + if (i == 0)
  838 + err = prepare_uptodate_page(pages[i], pos);
  839 + if (i == num_pages - 1)
  840 + err = prepare_uptodate_page(pages[i],
  841 + pos + write_bytes);
  842 + if (err) {
  843 + page_cache_release(pages[i]);
  844 + faili = i - 1;
  845 + goto fail;
  846 + }
804 847 wait_on_page_writeback(pages[i]);
805 848 }
  849 + err = 0;
806 850 if (start_pos < inode->i_size) {
807 851 struct btrfs_ordered_extent *ordered;
808 852 lock_extent_bits(&BTRFS_I(inode)->io_tree,
... ... @@ -842,6 +886,14 @@
842 886 WARN_ON(!PageLocked(pages[i]));
843 887 }
844 888 return 0;
  889 +fail:
  890 + while (faili >= 0) {
  891 + unlock_page(pages[faili]);
  892 + page_cache_release(pages[faili]);
  893 + faili--;
  894 + }
  895 + return err;
  896 +
845 897 }
846 898  
847 899 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
... ... @@ -851,7 +903,6 @@
851 903 struct file *file = iocb->ki_filp;
852 904 struct inode *inode = fdentry(file)->d_inode;
853 905 struct btrfs_root *root = BTRFS_I(inode)->root;
854   - struct page *pinned[2];
855 906 struct page **pages = NULL;
856 907 struct iov_iter i;
857 908 loff_t *ppos = &iocb->ki_pos;
... ... @@ -872,9 +923,6 @@
872 923 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
873 924 (file->f_flags & O_DIRECT));
874 925  
875   - pinned[0] = NULL;
876   - pinned[1] = NULL;
877   -
878 926 start_pos = pos;
879 927  
880 928 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
... ... @@ -962,32 +1010,6 @@
962 1010 first_index = pos >> PAGE_CACHE_SHIFT;
963 1011 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
964 1012  
965   - /*
966   - * there are lots of better ways to do this, but this code
967   - * makes sure the first and last page in the file range are
968   - * up to date and ready for cow
969   - */
970   - if ((pos & (PAGE_CACHE_SIZE - 1))) {
971   - pinned[0] = grab_cache_page(inode->i_mapping, first_index);
972   - if (!PageUptodate(pinned[0])) {
973   - ret = btrfs_readpage(NULL, pinned[0]);
974   - BUG_ON(ret);
975   - wait_on_page_locked(pinned[0]);
976   - } else {
977   - unlock_page(pinned[0]);
978   - }
979   - }
980   - if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
981   - pinned[1] = grab_cache_page(inode->i_mapping, last_index);
982   - if (!PageUptodate(pinned[1])) {
983   - ret = btrfs_readpage(NULL, pinned[1]);
984   - BUG_ON(ret);
985   - wait_on_page_locked(pinned[1]);
986   - } else {
987   - unlock_page(pinned[1]);
988   - }
989   - }
990   -
991 1013 while (iov_iter_count(&i) > 0) {
992 1014 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
993 1015 size_t write_bytes = min(iov_iter_count(&i),
... ... @@ -1024,9 +1046,21 @@
1024 1046  
1025 1047 copied = btrfs_copy_from_user(pos, num_pages,
1026 1048 write_bytes, pages, &i);
1027   - dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
1028   - PAGE_CACHE_SHIFT;
1029 1049  
  1050 + /*
  1051 + * if we have trouble faulting in the pages, fall
  1052 + * back to one page at a time
  1053 + */
  1054 + if (copied < write_bytes)
  1055 + nrptrs = 1;
  1056 +
  1057 + if (copied == 0)
  1058 + dirty_pages = 0;
  1059 + else
  1060 + dirty_pages = (copied + offset +
  1061 + PAGE_CACHE_SIZE - 1) >>
  1062 + PAGE_CACHE_SHIFT;
  1063 +
1030 1064 if (num_pages > dirty_pages) {
1031 1065 if (copied > 0)
1032 1066 atomic_inc(
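The dirty-page computation above rounds the copied span up to whole pages but now special-cases copied == 0. With 4 KiB pages, offset = 1000 and copied = 5000 gives (5000 + 1000 + 4095) >> 12 = 2 dirty pages; with copied = 0 the old expression still yielded (1000 + 4095) >> 12 = 1, wrongly dirtying a page after a failed copy. A compilable model (4 KiB page size assumed):

    #include <stddef.h>

    #define PAGE_SHIFT_ASSUMED 12           /* assumption: 4 KiB pages */
    #define PAGE_SIZE_ASSUMED  (1u << PAGE_SHIFT_ASSUMED)

    static size_t count_dirty_pages(size_t copied, size_t offset)
    {
            if (copied == 0)                /* failed copy dirties nothing */
                    return 0;
            return (copied + offset + PAGE_SIZE_ASSUMED - 1)
                            >> PAGE_SHIFT_ASSUMED;
    }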
... ... @@ -1069,10 +1103,6 @@
1069 1103 err = ret;
1070 1104  
1071 1105 kfree(pages);
1072   - if (pinned[0])
1073   - page_cache_release(pinned[0]);
1074   - if (pinned[1])
1075   - page_cache_release(pinned[1]);
1076 1106 *ppos = pos;
1077 1107  
1078 1108 /*
fs/btrfs/inode.c
... ... @@ -4821,10 +4821,11 @@
4821 4821 goto fail;
4822 4822  
4823 4823 /*
4824   - * 1 item for inode ref
  4824 + * 2 items for inode and inode ref
4825 4825 * 2 items for dir items
  4826 + * 1 item for parent inode
4826 4827 */
4827   - trans = btrfs_start_transaction(root, 3);
  4828 + trans = btrfs_start_transaction(root, 5);
4828 4829 if (IS_ERR(trans)) {
4829 4830 err = PTR_ERR(trans);
4830 4831 goto fail;
... ... @@ -6056,6 +6057,7 @@
6056 6057 if (!skip_sum) {
6057 6058 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
6058 6059 if (!dip->csums) {
  6060 + kfree(dip);
6059 6061 ret = -ENOMEM;
6060 6062 goto free_ordered;
6061 6063 }
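The one-line fix above plugs a leak: dip had just been allocated, and the free_ordered error path did not release it, so a failed csums allocation leaked the dio_private. The general shape, allocations released in reverse order on failure (hypothetical standalone structs, not the btrfs types):

    #include <stdlib.h>

    struct dio_private_model { unsigned int *csums; };

    static struct dio_private_model *dio_private_alloc(size_t nr_csums)
    {
            struct dio_private_model *dip = malloc(sizeof(*dip));
            if (!dip)
                    return NULL;

            dip->csums = malloc(nr_csums * sizeof(*dip->csums));
            if (!dip->csums) {
                    free(dip);      /* the missing kfree(dip) was the leak */
                    return NULL;
            }
            return dip;
    }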