Commit 15ac08a8b2c129abccf1be47b6ab09491e013db2

Authored by Christoph Hellwig
Committed by Lachlan McIlroy
1 parent e055f13a6d

[XFS] replace b_fspriv with b_mount

Replace the b_fspriv pointer and it's ugly accessors with a properly types
xfs_mount pointer.  Also switch log reocvery over to it instead of using
b_fspriv for the mount pointer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

Showing 6 changed files with 19 additions and 32 deletions Inline Diff

fs/xfs/linux-2.6/xfs_buf.c
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include <linux/stddef.h> 19 #include <linux/stddef.h>
20 #include <linux/errno.h> 20 #include <linux/errno.h>
21 #include <linux/slab.h> 21 #include <linux/slab.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/vmalloc.h> 24 #include <linux/vmalloc.h>
25 #include <linux/bio.h> 25 #include <linux/bio.h>
26 #include <linux/sysctl.h> 26 #include <linux/sysctl.h>
27 #include <linux/proc_fs.h> 27 #include <linux/proc_fs.h>
28 #include <linux/workqueue.h> 28 #include <linux/workqueue.h>
29 #include <linux/percpu.h> 29 #include <linux/percpu.h>
30 #include <linux/blkdev.h> 30 #include <linux/blkdev.h>
31 #include <linux/hash.h> 31 #include <linux/hash.h>
32 #include <linux/kthread.h> 32 #include <linux/kthread.h>
33 #include <linux/migrate.h> 33 #include <linux/migrate.h>
34 #include <linux/backing-dev.h> 34 #include <linux/backing-dev.h>
35 #include <linux/freezer.h> 35 #include <linux/freezer.h>
36 36
37 static kmem_zone_t *xfs_buf_zone; 37 static kmem_zone_t *xfs_buf_zone;
38 STATIC int xfsbufd(void *); 38 STATIC int xfsbufd(void *);
39 STATIC int xfsbufd_wakeup(int, gfp_t); 39 STATIC int xfsbufd_wakeup(int, gfp_t);
40 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 40 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
41 static struct shrinker xfs_buf_shake = { 41 static struct shrinker xfs_buf_shake = {
42 .shrink = xfsbufd_wakeup, 42 .shrink = xfsbufd_wakeup,
43 .seeks = DEFAULT_SEEKS, 43 .seeks = DEFAULT_SEEKS,
44 }; 44 };
45 45
46 static struct workqueue_struct *xfslogd_workqueue; 46 static struct workqueue_struct *xfslogd_workqueue;
47 struct workqueue_struct *xfsdatad_workqueue; 47 struct workqueue_struct *xfsdatad_workqueue;
48 48
49 #ifdef XFS_BUF_TRACE 49 #ifdef XFS_BUF_TRACE
50 void 50 void
51 xfs_buf_trace( 51 xfs_buf_trace(
52 xfs_buf_t *bp, 52 xfs_buf_t *bp,
53 char *id, 53 char *id,
54 void *data, 54 void *data,
55 void *ra) 55 void *ra)
56 { 56 {
57 ktrace_enter(xfs_buf_trace_buf, 57 ktrace_enter(xfs_buf_trace_buf,
58 bp, id, 58 bp, id,
59 (void *)(unsigned long)bp->b_flags, 59 (void *)(unsigned long)bp->b_flags,
60 (void *)(unsigned long)bp->b_hold.counter, 60 (void *)(unsigned long)bp->b_hold.counter,
61 (void *)(unsigned long)bp->b_sema.count, 61 (void *)(unsigned long)bp->b_sema.count,
62 (void *)current, 62 (void *)current,
63 data, ra, 63 data, ra,
64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), 64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
65 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff), 65 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
66 (void *)(unsigned long)bp->b_buffer_length, 66 (void *)(unsigned long)bp->b_buffer_length,
67 NULL, NULL, NULL, NULL, NULL); 67 NULL, NULL, NULL, NULL, NULL);
68 } 68 }
69 ktrace_t *xfs_buf_trace_buf; 69 ktrace_t *xfs_buf_trace_buf;
70 #define XFS_BUF_TRACE_SIZE 4096 70 #define XFS_BUF_TRACE_SIZE 4096
71 #define XB_TRACE(bp, id, data) \ 71 #define XB_TRACE(bp, id, data) \
72 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0)) 72 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
73 #else 73 #else
74 #define XB_TRACE(bp, id, data) do { } while (0) 74 #define XB_TRACE(bp, id, data) do { } while (0)
75 #endif 75 #endif
76 76
77 #ifdef XFS_BUF_LOCK_TRACKING 77 #ifdef XFS_BUF_LOCK_TRACKING
78 # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 78 # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
79 # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) 79 # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
80 # define XB_GET_OWNER(bp) ((bp)->b_last_holder) 80 # define XB_GET_OWNER(bp) ((bp)->b_last_holder)
81 #else 81 #else
82 # define XB_SET_OWNER(bp) do { } while (0) 82 # define XB_SET_OWNER(bp) do { } while (0)
83 # define XB_CLEAR_OWNER(bp) do { } while (0) 83 # define XB_CLEAR_OWNER(bp) do { } while (0)
84 # define XB_GET_OWNER(bp) do { } while (0) 84 # define XB_GET_OWNER(bp) do { } while (0)
85 #endif 85 #endif
86 86
87 #define xb_to_gfp(flags) \ 87 #define xb_to_gfp(flags) \
88 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ 88 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
89 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) 89 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
90 90
91 #define xb_to_km(flags) \ 91 #define xb_to_km(flags) \
92 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) 92 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
93 93
94 #define xfs_buf_allocate(flags) \ 94 #define xfs_buf_allocate(flags) \
95 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) 95 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
96 #define xfs_buf_deallocate(bp) \ 96 #define xfs_buf_deallocate(bp) \
97 kmem_zone_free(xfs_buf_zone, (bp)); 97 kmem_zone_free(xfs_buf_zone, (bp));
98 98
99 /* 99 /*
100 * Page Region interfaces. 100 * Page Region interfaces.
101 * 101 *
102 * For pages in filesystems where the blocksize is smaller than the 102 * For pages in filesystems where the blocksize is smaller than the
103 * pagesize, we use the page->private field (long) to hold a bitmap 103 * pagesize, we use the page->private field (long) to hold a bitmap
104 * of uptodate regions within the page. 104 * of uptodate regions within the page.
105 * 105 *
106 * Each such region is "bytes per page / bits per long" bytes long. 106 * Each such region is "bytes per page / bits per long" bytes long.
107 * 107 *
108 * NBPPR == number-of-bytes-per-page-region 108 * NBPPR == number-of-bytes-per-page-region
109 * BTOPR == bytes-to-page-region (rounded up) 109 * BTOPR == bytes-to-page-region (rounded up)
110 * BTOPRT == bytes-to-page-region-truncated (rounded down) 110 * BTOPRT == bytes-to-page-region-truncated (rounded down)
111 */ 111 */
112 #if (BITS_PER_LONG == 32) 112 #if (BITS_PER_LONG == 32)
113 #define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 113 #define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
114 #elif (BITS_PER_LONG == 64) 114 #elif (BITS_PER_LONG == 64)
115 #define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ 115 #define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
116 #else 116 #else
117 #error BITS_PER_LONG must be 32 or 64 117 #error BITS_PER_LONG must be 32 or 64
118 #endif 118 #endif
119 #define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) 119 #define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
120 #define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) 120 #define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
121 #define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) 121 #define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
122 122
123 STATIC unsigned long 123 STATIC unsigned long
124 page_region_mask( 124 page_region_mask(
125 size_t offset, 125 size_t offset,
126 size_t length) 126 size_t length)
127 { 127 {
128 unsigned long mask; 128 unsigned long mask;
129 int first, final; 129 int first, final;
130 130
131 first = BTOPR(offset); 131 first = BTOPR(offset);
132 final = BTOPRT(offset + length - 1); 132 final = BTOPRT(offset + length - 1);
133 first = min(first, final); 133 first = min(first, final);
134 134
135 mask = ~0UL; 135 mask = ~0UL;
136 mask <<= BITS_PER_LONG - (final - first); 136 mask <<= BITS_PER_LONG - (final - first);
137 mask >>= BITS_PER_LONG - (final); 137 mask >>= BITS_PER_LONG - (final);
138 138
139 ASSERT(offset + length <= PAGE_CACHE_SIZE); 139 ASSERT(offset + length <= PAGE_CACHE_SIZE);
140 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); 140 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
141 141
142 return mask; 142 return mask;
143 } 143 }
144 144
145 STATIC_INLINE void 145 STATIC_INLINE void
146 set_page_region( 146 set_page_region(
147 struct page *page, 147 struct page *page,
148 size_t offset, 148 size_t offset,
149 size_t length) 149 size_t length)
150 { 150 {
151 set_page_private(page, 151 set_page_private(page,
152 page_private(page) | page_region_mask(offset, length)); 152 page_private(page) | page_region_mask(offset, length));
153 if (page_private(page) == ~0UL) 153 if (page_private(page) == ~0UL)
154 SetPageUptodate(page); 154 SetPageUptodate(page);
155 } 155 }
156 156
157 STATIC_INLINE int 157 STATIC_INLINE int
158 test_page_region( 158 test_page_region(
159 struct page *page, 159 struct page *page,
160 size_t offset, 160 size_t offset,
161 size_t length) 161 size_t length)
162 { 162 {
163 unsigned long mask = page_region_mask(offset, length); 163 unsigned long mask = page_region_mask(offset, length);
164 164
165 return (mask && (page_private(page) & mask) == mask); 165 return (mask && (page_private(page) & mask) == mask);
166 } 166 }
167 167
168 /* 168 /*
169 * Mapping of multi-page buffers into contiguous virtual space 169 * Mapping of multi-page buffers into contiguous virtual space
170 */ 170 */
171 171
172 typedef struct a_list { 172 typedef struct a_list {
173 void *vm_addr; 173 void *vm_addr;
174 struct a_list *next; 174 struct a_list *next;
175 } a_list_t; 175 } a_list_t;
176 176
177 static a_list_t *as_free_head; 177 static a_list_t *as_free_head;
178 static int as_list_len; 178 static int as_list_len;
179 static DEFINE_SPINLOCK(as_lock); 179 static DEFINE_SPINLOCK(as_lock);
180 180
181 /* 181 /*
182 * Try to batch vunmaps because they are costly. 182 * Try to batch vunmaps because they are costly.
183 */ 183 */
184 STATIC void 184 STATIC void
185 free_address( 185 free_address(
186 void *addr) 186 void *addr)
187 { 187 {
188 a_list_t *aentry; 188 a_list_t *aentry;
189 189
190 #ifdef CONFIG_XEN 190 #ifdef CONFIG_XEN
191 /* 191 /*
192 * Xen needs to be able to make sure it can get an exclusive 192 * Xen needs to be able to make sure it can get an exclusive
193 * RO mapping of pages it wants to turn into a pagetable. If 193 * RO mapping of pages it wants to turn into a pagetable. If
194 * a newly allocated page is also still being vmap()ed by xfs, 194 * a newly allocated page is also still being vmap()ed by xfs,
195 * it will cause pagetable construction to fail. This is a 195 * it will cause pagetable construction to fail. This is a
196 * quick workaround to always eagerly unmap pages so that Xen 196 * quick workaround to always eagerly unmap pages so that Xen
197 * is happy. 197 * is happy.
198 */ 198 */
199 vunmap(addr); 199 vunmap(addr);
200 return; 200 return;
201 #endif 201 #endif
202 202
203 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); 203 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
204 if (likely(aentry)) { 204 if (likely(aentry)) {
205 spin_lock(&as_lock); 205 spin_lock(&as_lock);
206 aentry->next = as_free_head; 206 aentry->next = as_free_head;
207 aentry->vm_addr = addr; 207 aentry->vm_addr = addr;
208 as_free_head = aentry; 208 as_free_head = aentry;
209 as_list_len++; 209 as_list_len++;
210 spin_unlock(&as_lock); 210 spin_unlock(&as_lock);
211 } else { 211 } else {
212 vunmap(addr); 212 vunmap(addr);
213 } 213 }
214 } 214 }
215 215
216 STATIC void 216 STATIC void
217 purge_addresses(void) 217 purge_addresses(void)
218 { 218 {
219 a_list_t *aentry, *old; 219 a_list_t *aentry, *old;
220 220
221 if (as_free_head == NULL) 221 if (as_free_head == NULL)
222 return; 222 return;
223 223
224 spin_lock(&as_lock); 224 spin_lock(&as_lock);
225 aentry = as_free_head; 225 aentry = as_free_head;
226 as_free_head = NULL; 226 as_free_head = NULL;
227 as_list_len = 0; 227 as_list_len = 0;
228 spin_unlock(&as_lock); 228 spin_unlock(&as_lock);
229 229
230 while ((old = aentry) != NULL) { 230 while ((old = aentry) != NULL) {
231 vunmap(aentry->vm_addr); 231 vunmap(aentry->vm_addr);
232 aentry = aentry->next; 232 aentry = aentry->next;
233 kfree(old); 233 kfree(old);
234 } 234 }
235 } 235 }
236 236
237 /* 237 /*
238 * Internal xfs_buf_t object manipulation 238 * Internal xfs_buf_t object manipulation
239 */ 239 */
240 240
241 STATIC void 241 STATIC void
242 _xfs_buf_initialize( 242 _xfs_buf_initialize(
243 xfs_buf_t *bp, 243 xfs_buf_t *bp,
244 xfs_buftarg_t *target, 244 xfs_buftarg_t *target,
245 xfs_off_t range_base, 245 xfs_off_t range_base,
246 size_t range_length, 246 size_t range_length,
247 xfs_buf_flags_t flags) 247 xfs_buf_flags_t flags)
248 { 248 {
249 /* 249 /*
250 * We don't want certain flags to appear in b_flags. 250 * We don't want certain flags to appear in b_flags.
251 */ 251 */
252 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); 252 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
253 253
254 memset(bp, 0, sizeof(xfs_buf_t)); 254 memset(bp, 0, sizeof(xfs_buf_t));
255 atomic_set(&bp->b_hold, 1); 255 atomic_set(&bp->b_hold, 1);
256 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_list); 257 INIT_LIST_HEAD(&bp->b_list);
258 INIT_LIST_HEAD(&bp->b_hash_list); 258 INIT_LIST_HEAD(&bp->b_hash_list);
259 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 259 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
260 XB_SET_OWNER(bp); 260 XB_SET_OWNER(bp);
261 bp->b_target = target; 261 bp->b_target = target;
262 bp->b_file_offset = range_base; 262 bp->b_file_offset = range_base;
263 /* 263 /*
264 * Set buffer_length and count_desired to the same value initially. 264 * Set buffer_length and count_desired to the same value initially.
265 * I/O routines should use count_desired, which will be the same in 265 * I/O routines should use count_desired, which will be the same in
266 * most cases but may be reset (e.g. XFS recovery). 266 * most cases but may be reset (e.g. XFS recovery).
267 */ 267 */
268 bp->b_buffer_length = bp->b_count_desired = range_length; 268 bp->b_buffer_length = bp->b_count_desired = range_length;
269 bp->b_flags = flags; 269 bp->b_flags = flags;
270 bp->b_bn = XFS_BUF_DADDR_NULL; 270 bp->b_bn = XFS_BUF_DADDR_NULL;
271 atomic_set(&bp->b_pin_count, 0); 271 atomic_set(&bp->b_pin_count, 0);
272 init_waitqueue_head(&bp->b_waiters); 272 init_waitqueue_head(&bp->b_waiters);
273 273
274 XFS_STATS_INC(xb_create); 274 XFS_STATS_INC(xb_create);
275 XB_TRACE(bp, "initialize", target); 275 XB_TRACE(bp, "initialize", target);
276 } 276 }
277 277
278 /* 278 /*
279 * Allocate a page array capable of holding a specified number 279 * Allocate a page array capable of holding a specified number
280 * of pages, and point the page buf at it. 280 * of pages, and point the page buf at it.
281 */ 281 */
282 STATIC int 282 STATIC int
283 _xfs_buf_get_pages( 283 _xfs_buf_get_pages(
284 xfs_buf_t *bp, 284 xfs_buf_t *bp,
285 int page_count, 285 int page_count,
286 xfs_buf_flags_t flags) 286 xfs_buf_flags_t flags)
287 { 287 {
288 /* Make sure that we have a page list */ 288 /* Make sure that we have a page list */
289 if (bp->b_pages == NULL) { 289 if (bp->b_pages == NULL) {
290 bp->b_offset = xfs_buf_poff(bp->b_file_offset); 290 bp->b_offset = xfs_buf_poff(bp->b_file_offset);
291 bp->b_page_count = page_count; 291 bp->b_page_count = page_count;
292 if (page_count <= XB_PAGES) { 292 if (page_count <= XB_PAGES) {
293 bp->b_pages = bp->b_page_array; 293 bp->b_pages = bp->b_page_array;
294 } else { 294 } else {
295 bp->b_pages = kmem_alloc(sizeof(struct page *) * 295 bp->b_pages = kmem_alloc(sizeof(struct page *) *
296 page_count, xb_to_km(flags)); 296 page_count, xb_to_km(flags));
297 if (bp->b_pages == NULL) 297 if (bp->b_pages == NULL)
298 return -ENOMEM; 298 return -ENOMEM;
299 } 299 }
300 memset(bp->b_pages, 0, sizeof(struct page *) * page_count); 300 memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
301 } 301 }
302 return 0; 302 return 0;
303 } 303 }
304 304
305 /* 305 /*
306 * Frees b_pages if it was allocated. 306 * Frees b_pages if it was allocated.
307 */ 307 */
308 STATIC void 308 STATIC void
309 _xfs_buf_free_pages( 309 _xfs_buf_free_pages(
310 xfs_buf_t *bp) 310 xfs_buf_t *bp)
311 { 311 {
312 if (bp->b_pages != bp->b_page_array) { 312 if (bp->b_pages != bp->b_page_array) {
313 kmem_free(bp->b_pages); 313 kmem_free(bp->b_pages);
314 } 314 }
315 } 315 }
316 316
317 /* 317 /*
318 * Releases the specified buffer. 318 * Releases the specified buffer.
319 * 319 *
320 * The modification state of any associated pages is left unchanged. 320 * The modification state of any associated pages is left unchanged.
321 * The buffer most not be on any hash - use xfs_buf_rele instead for 321 * The buffer most not be on any hash - use xfs_buf_rele instead for
322 * hashed and refcounted buffers 322 * hashed and refcounted buffers
323 */ 323 */
324 void 324 void
325 xfs_buf_free( 325 xfs_buf_free(
326 xfs_buf_t *bp) 326 xfs_buf_t *bp)
327 { 327 {
328 XB_TRACE(bp, "free", 0); 328 XB_TRACE(bp, "free", 0);
329 329
330 ASSERT(list_empty(&bp->b_hash_list)); 330 ASSERT(list_empty(&bp->b_hash_list));
331 331
332 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 332 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
333 uint i; 333 uint i;
334 334
335 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 335 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
336 free_address(bp->b_addr - bp->b_offset); 336 free_address(bp->b_addr - bp->b_offset);
337 337
338 for (i = 0; i < bp->b_page_count; i++) { 338 for (i = 0; i < bp->b_page_count; i++) {
339 struct page *page = bp->b_pages[i]; 339 struct page *page = bp->b_pages[i];
340 340
341 if (bp->b_flags & _XBF_PAGE_CACHE) 341 if (bp->b_flags & _XBF_PAGE_CACHE)
342 ASSERT(!PagePrivate(page)); 342 ASSERT(!PagePrivate(page));
343 page_cache_release(page); 343 page_cache_release(page);
344 } 344 }
345 _xfs_buf_free_pages(bp); 345 _xfs_buf_free_pages(bp);
346 } 346 }
347 347
348 xfs_buf_deallocate(bp); 348 xfs_buf_deallocate(bp);
349 } 349 }
350 350
351 /* 351 /*
352 * Finds all pages for buffer in question and builds it's page list. 352 * Finds all pages for buffer in question and builds it's page list.
353 */ 353 */
354 STATIC int 354 STATIC int
355 _xfs_buf_lookup_pages( 355 _xfs_buf_lookup_pages(
356 xfs_buf_t *bp, 356 xfs_buf_t *bp,
357 uint flags) 357 uint flags)
358 { 358 {
359 struct address_space *mapping = bp->b_target->bt_mapping; 359 struct address_space *mapping = bp->b_target->bt_mapping;
360 size_t blocksize = bp->b_target->bt_bsize; 360 size_t blocksize = bp->b_target->bt_bsize;
361 size_t size = bp->b_count_desired; 361 size_t size = bp->b_count_desired;
362 size_t nbytes, offset; 362 size_t nbytes, offset;
363 gfp_t gfp_mask = xb_to_gfp(flags); 363 gfp_t gfp_mask = xb_to_gfp(flags);
364 unsigned short page_count, i; 364 unsigned short page_count, i;
365 pgoff_t first; 365 pgoff_t first;
366 xfs_off_t end; 366 xfs_off_t end;
367 int error; 367 int error;
368 368
369 end = bp->b_file_offset + bp->b_buffer_length; 369 end = bp->b_file_offset + bp->b_buffer_length;
370 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 370 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
371 371
372 error = _xfs_buf_get_pages(bp, page_count, flags); 372 error = _xfs_buf_get_pages(bp, page_count, flags);
373 if (unlikely(error)) 373 if (unlikely(error))
374 return error; 374 return error;
375 bp->b_flags |= _XBF_PAGE_CACHE; 375 bp->b_flags |= _XBF_PAGE_CACHE;
376 376
377 offset = bp->b_offset; 377 offset = bp->b_offset;
378 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 378 first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
379 379
380 for (i = 0; i < bp->b_page_count; i++) { 380 for (i = 0; i < bp->b_page_count; i++) {
381 struct page *page; 381 struct page *page;
382 uint retries = 0; 382 uint retries = 0;
383 383
384 retry: 384 retry:
385 page = find_or_create_page(mapping, first + i, gfp_mask); 385 page = find_or_create_page(mapping, first + i, gfp_mask);
386 if (unlikely(page == NULL)) { 386 if (unlikely(page == NULL)) {
387 if (flags & XBF_READ_AHEAD) { 387 if (flags & XBF_READ_AHEAD) {
388 bp->b_page_count = i; 388 bp->b_page_count = i;
389 for (i = 0; i < bp->b_page_count; i++) 389 for (i = 0; i < bp->b_page_count; i++)
390 unlock_page(bp->b_pages[i]); 390 unlock_page(bp->b_pages[i]);
391 return -ENOMEM; 391 return -ENOMEM;
392 } 392 }
393 393
394 /* 394 /*
395 * This could deadlock. 395 * This could deadlock.
396 * 396 *
397 * But until all the XFS lowlevel code is revamped to 397 * But until all the XFS lowlevel code is revamped to
398 * handle buffer allocation failures we can't do much. 398 * handle buffer allocation failures we can't do much.
399 */ 399 */
400 if (!(++retries % 100)) 400 if (!(++retries % 100))
401 printk(KERN_ERR 401 printk(KERN_ERR
402 "XFS: possible memory allocation " 402 "XFS: possible memory allocation "
403 "deadlock in %s (mode:0x%x)\n", 403 "deadlock in %s (mode:0x%x)\n",
404 __func__, gfp_mask); 404 __func__, gfp_mask);
405 405
406 XFS_STATS_INC(xb_page_retries); 406 XFS_STATS_INC(xb_page_retries);
407 xfsbufd_wakeup(0, gfp_mask); 407 xfsbufd_wakeup(0, gfp_mask);
408 congestion_wait(WRITE, HZ/50); 408 congestion_wait(WRITE, HZ/50);
409 goto retry; 409 goto retry;
410 } 410 }
411 411
412 XFS_STATS_INC(xb_page_found); 412 XFS_STATS_INC(xb_page_found);
413 413
414 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 414 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
415 size -= nbytes; 415 size -= nbytes;
416 416
417 ASSERT(!PagePrivate(page)); 417 ASSERT(!PagePrivate(page));
418 if (!PageUptodate(page)) { 418 if (!PageUptodate(page)) {
419 page_count--; 419 page_count--;
420 if (blocksize >= PAGE_CACHE_SIZE) { 420 if (blocksize >= PAGE_CACHE_SIZE) {
421 if (flags & XBF_READ) 421 if (flags & XBF_READ)
422 bp->b_flags |= _XBF_PAGE_LOCKED; 422 bp->b_flags |= _XBF_PAGE_LOCKED;
423 } else if (!PagePrivate(page)) { 423 } else if (!PagePrivate(page)) {
424 if (test_page_region(page, offset, nbytes)) 424 if (test_page_region(page, offset, nbytes))
425 page_count++; 425 page_count++;
426 } 426 }
427 } 427 }
428 428
429 bp->b_pages[i] = page; 429 bp->b_pages[i] = page;
430 offset = 0; 430 offset = 0;
431 } 431 }
432 432
433 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 433 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
434 for (i = 0; i < bp->b_page_count; i++) 434 for (i = 0; i < bp->b_page_count; i++)
435 unlock_page(bp->b_pages[i]); 435 unlock_page(bp->b_pages[i]);
436 } 436 }
437 437
438 if (page_count == bp->b_page_count) 438 if (page_count == bp->b_page_count)
439 bp->b_flags |= XBF_DONE; 439 bp->b_flags |= XBF_DONE;
440 440
441 XB_TRACE(bp, "lookup_pages", (long)page_count); 441 XB_TRACE(bp, "lookup_pages", (long)page_count);
442 return error; 442 return error;
443 } 443 }
444 444
445 /* 445 /*
446 * Map buffer into kernel address-space if nessecary. 446 * Map buffer into kernel address-space if nessecary.
447 */ 447 */
448 STATIC int 448 STATIC int
449 _xfs_buf_map_pages( 449 _xfs_buf_map_pages(
450 xfs_buf_t *bp, 450 xfs_buf_t *bp,
451 uint flags) 451 uint flags)
452 { 452 {
453 /* A single page buffer is always mappable */ 453 /* A single page buffer is always mappable */
454 if (bp->b_page_count == 1) { 454 if (bp->b_page_count == 1) {
455 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 455 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
456 bp->b_flags |= XBF_MAPPED; 456 bp->b_flags |= XBF_MAPPED;
457 } else if (flags & XBF_MAPPED) { 457 } else if (flags & XBF_MAPPED) {
458 if (as_list_len > 64) 458 if (as_list_len > 64)
459 purge_addresses(); 459 purge_addresses();
460 bp->b_addr = vmap(bp->b_pages, bp->b_page_count, 460 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
461 VM_MAP, PAGE_KERNEL); 461 VM_MAP, PAGE_KERNEL);
462 if (unlikely(bp->b_addr == NULL)) 462 if (unlikely(bp->b_addr == NULL))
463 return -ENOMEM; 463 return -ENOMEM;
464 bp->b_addr += bp->b_offset; 464 bp->b_addr += bp->b_offset;
465 bp->b_flags |= XBF_MAPPED; 465 bp->b_flags |= XBF_MAPPED;
466 } 466 }
467 467
468 return 0; 468 return 0;
469 } 469 }
470 470
471 /* 471 /*
472 * Finding and Reading Buffers 472 * Finding and Reading Buffers
473 */ 473 */
474 474
475 /* 475 /*
476 * Look up, and creates if absent, a lockable buffer for 476 * Look up, and creates if absent, a lockable buffer for
477 * a given range of an inode. The buffer is returned 477 * a given range of an inode. The buffer is returned
478 * locked. If other overlapping buffers exist, they are 478 * locked. If other overlapping buffers exist, they are
479 * released before the new buffer is created and locked, 479 * released before the new buffer is created and locked,
480 * which may imply that this call will block until those buffers 480 * which may imply that this call will block until those buffers
481 * are unlocked. No I/O is implied by this call. 481 * are unlocked. No I/O is implied by this call.
482 */ 482 */
483 xfs_buf_t * 483 xfs_buf_t *
484 _xfs_buf_find( 484 _xfs_buf_find(
485 xfs_buftarg_t *btp, /* block device target */ 485 xfs_buftarg_t *btp, /* block device target */
486 xfs_off_t ioff, /* starting offset of range */ 486 xfs_off_t ioff, /* starting offset of range */
487 size_t isize, /* length of range */ 487 size_t isize, /* length of range */
488 xfs_buf_flags_t flags, 488 xfs_buf_flags_t flags,
489 xfs_buf_t *new_bp) 489 xfs_buf_t *new_bp)
490 { 490 {
491 xfs_off_t range_base; 491 xfs_off_t range_base;
492 size_t range_length; 492 size_t range_length;
493 xfs_bufhash_t *hash; 493 xfs_bufhash_t *hash;
494 xfs_buf_t *bp, *n; 494 xfs_buf_t *bp, *n;
495 495
496 range_base = (ioff << BBSHIFT); 496 range_base = (ioff << BBSHIFT);
497 range_length = (isize << BBSHIFT); 497 range_length = (isize << BBSHIFT);
498 498
499 /* Check for IOs smaller than the sector size / not sector aligned */ 499 /* Check for IOs smaller than the sector size / not sector aligned */
500 ASSERT(!(range_length < (1 << btp->bt_sshift))); 500 ASSERT(!(range_length < (1 << btp->bt_sshift)));
501 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 501 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
502 502
503 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 503 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
504 504
505 spin_lock(&hash->bh_lock); 505 spin_lock(&hash->bh_lock);
506 506
507 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 507 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
508 ASSERT(btp == bp->b_target); 508 ASSERT(btp == bp->b_target);
509 if (bp->b_file_offset == range_base && 509 if (bp->b_file_offset == range_base &&
510 bp->b_buffer_length == range_length) { 510 bp->b_buffer_length == range_length) {
511 /* 511 /*
512 * If we look at something, bring it to the 512 * If we look at something, bring it to the
513 * front of the list for next time. 513 * front of the list for next time.
514 */ 514 */
515 atomic_inc(&bp->b_hold); 515 atomic_inc(&bp->b_hold);
516 list_move(&bp->b_hash_list, &hash->bh_list); 516 list_move(&bp->b_hash_list, &hash->bh_list);
517 goto found; 517 goto found;
518 } 518 }
519 } 519 }
520 520
521 /* No match found */ 521 /* No match found */
522 if (new_bp) { 522 if (new_bp) {
523 _xfs_buf_initialize(new_bp, btp, range_base, 523 _xfs_buf_initialize(new_bp, btp, range_base,
524 range_length, flags); 524 range_length, flags);
525 new_bp->b_hash = hash; 525 new_bp->b_hash = hash;
526 list_add(&new_bp->b_hash_list, &hash->bh_list); 526 list_add(&new_bp->b_hash_list, &hash->bh_list);
527 } else { 527 } else {
528 XFS_STATS_INC(xb_miss_locked); 528 XFS_STATS_INC(xb_miss_locked);
529 } 529 }
530 530
531 spin_unlock(&hash->bh_lock); 531 spin_unlock(&hash->bh_lock);
532 return new_bp; 532 return new_bp;
533 533
534 found: 534 found:
535 spin_unlock(&hash->bh_lock); 535 spin_unlock(&hash->bh_lock);
536 536
537 /* Attempt to get the semaphore without sleeping, 537 /* Attempt to get the semaphore without sleeping,
538 * if this does not work then we need to drop the 538 * if this does not work then we need to drop the
539 * spinlock and do a hard attempt on the semaphore. 539 * spinlock and do a hard attempt on the semaphore.
540 */ 540 */
541 if (down_trylock(&bp->b_sema)) { 541 if (down_trylock(&bp->b_sema)) {
542 if (!(flags & XBF_TRYLOCK)) { 542 if (!(flags & XBF_TRYLOCK)) {
543 /* wait for buffer ownership */ 543 /* wait for buffer ownership */
544 XB_TRACE(bp, "get_lock", 0); 544 XB_TRACE(bp, "get_lock", 0);
545 xfs_buf_lock(bp); 545 xfs_buf_lock(bp);
546 XFS_STATS_INC(xb_get_locked_waited); 546 XFS_STATS_INC(xb_get_locked_waited);
547 } else { 547 } else {
548 /* We asked for a trylock and failed, no need 548 /* We asked for a trylock and failed, no need
549 * to look at file offset and length here, we 549 * to look at file offset and length here, we
550 * know that this buffer at least overlaps our 550 * know that this buffer at least overlaps our
551 * buffer and is locked, therefore our buffer 551 * buffer and is locked, therefore our buffer
552 * either does not exist, or is this buffer. 552 * either does not exist, or is this buffer.
553 */ 553 */
554 xfs_buf_rele(bp); 554 xfs_buf_rele(bp);
555 XFS_STATS_INC(xb_busy_locked); 555 XFS_STATS_INC(xb_busy_locked);
556 return NULL; 556 return NULL;
557 } 557 }
558 } else { 558 } else {
559 /* trylock worked */ 559 /* trylock worked */
560 XB_SET_OWNER(bp); 560 XB_SET_OWNER(bp);
561 } 561 }
562 562
563 if (bp->b_flags & XBF_STALE) { 563 if (bp->b_flags & XBF_STALE) {
564 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 564 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
565 bp->b_flags &= XBF_MAPPED; 565 bp->b_flags &= XBF_MAPPED;
566 } 566 }
567 XB_TRACE(bp, "got_lock", 0); 567 XB_TRACE(bp, "got_lock", 0);
568 XFS_STATS_INC(xb_get_locked); 568 XFS_STATS_INC(xb_get_locked);
569 return bp; 569 return bp;
570 } 570 }
571 571
572 /* 572 /*
573 * Assembles a buffer covering the specified range. 573 * Assembles a buffer covering the specified range.
574 * Storage in memory for all portions of the buffer will be allocated, 574 * Storage in memory for all portions of the buffer will be allocated,
575 * although backing storage may not be. 575 * although backing storage may not be.
576 */ 576 */
577 xfs_buf_t * 577 xfs_buf_t *
578 xfs_buf_get_flags( 578 xfs_buf_get_flags(
579 xfs_buftarg_t *target,/* target for buffer */ 579 xfs_buftarg_t *target,/* target for buffer */
580 xfs_off_t ioff, /* starting offset of range */ 580 xfs_off_t ioff, /* starting offset of range */
581 size_t isize, /* length of range */ 581 size_t isize, /* length of range */
582 xfs_buf_flags_t flags) 582 xfs_buf_flags_t flags)
583 { 583 {
584 xfs_buf_t *bp, *new_bp; 584 xfs_buf_t *bp, *new_bp;
585 int error = 0, i; 585 int error = 0, i;
586 586
587 new_bp = xfs_buf_allocate(flags); 587 new_bp = xfs_buf_allocate(flags);
588 if (unlikely(!new_bp)) 588 if (unlikely(!new_bp))
589 return NULL; 589 return NULL;
590 590
591 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 591 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
592 if (bp == new_bp) { 592 if (bp == new_bp) {
593 error = _xfs_buf_lookup_pages(bp, flags); 593 error = _xfs_buf_lookup_pages(bp, flags);
594 if (error) 594 if (error)
595 goto no_buffer; 595 goto no_buffer;
596 } else { 596 } else {
597 xfs_buf_deallocate(new_bp); 597 xfs_buf_deallocate(new_bp);
598 if (unlikely(bp == NULL)) 598 if (unlikely(bp == NULL))
599 return NULL; 599 return NULL;
600 } 600 }
601 601
602 for (i = 0; i < bp->b_page_count; i++) 602 for (i = 0; i < bp->b_page_count; i++)
603 mark_page_accessed(bp->b_pages[i]); 603 mark_page_accessed(bp->b_pages[i]);
604 604
605 if (!(bp->b_flags & XBF_MAPPED)) { 605 if (!(bp->b_flags & XBF_MAPPED)) {
606 error = _xfs_buf_map_pages(bp, flags); 606 error = _xfs_buf_map_pages(bp, flags);
607 if (unlikely(error)) { 607 if (unlikely(error)) {
608 printk(KERN_WARNING "%s: failed to map pages\n", 608 printk(KERN_WARNING "%s: failed to map pages\n",
609 __func__); 609 __func__);
610 goto no_buffer; 610 goto no_buffer;
611 } 611 }
612 } 612 }
613 613
614 XFS_STATS_INC(xb_get); 614 XFS_STATS_INC(xb_get);
615 615
616 /* 616 /*
617 * Always fill in the block number now, the mapped cases can do 617 * Always fill in the block number now, the mapped cases can do
618 * their own overlay of this later. 618 * their own overlay of this later.
619 */ 619 */
620 bp->b_bn = ioff; 620 bp->b_bn = ioff;
621 bp->b_count_desired = bp->b_buffer_length; 621 bp->b_count_desired = bp->b_buffer_length;
622 622
623 XB_TRACE(bp, "get", (unsigned long)flags); 623 XB_TRACE(bp, "get", (unsigned long)flags);
624 return bp; 624 return bp;
625 625
626 no_buffer: 626 no_buffer:
627 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 627 if (flags & (XBF_LOCK | XBF_TRYLOCK))
628 xfs_buf_unlock(bp); 628 xfs_buf_unlock(bp);
629 xfs_buf_rele(bp); 629 xfs_buf_rele(bp);
630 return NULL; 630 return NULL;
631 } 631 }
632 632
633 STATIC int 633 STATIC int
634 _xfs_buf_read( 634 _xfs_buf_read(
635 xfs_buf_t *bp, 635 xfs_buf_t *bp,
636 xfs_buf_flags_t flags) 636 xfs_buf_flags_t flags)
637 { 637 {
638 int status; 638 int status;
639 639
640 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags); 640 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
641 641
642 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); 642 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
643 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 643 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
644 644
645 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 645 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
646 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 646 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
647 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \ 647 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
648 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 648 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
649 649
650 status = xfs_buf_iorequest(bp); 650 status = xfs_buf_iorequest(bp);
651 if (!status && !(flags & XBF_ASYNC)) 651 if (!status && !(flags & XBF_ASYNC))
652 status = xfs_buf_iowait(bp); 652 status = xfs_buf_iowait(bp);
653 return status; 653 return status;
654 } 654 }
655 655
656 xfs_buf_t * 656 xfs_buf_t *
657 xfs_buf_read_flags( 657 xfs_buf_read_flags(
658 xfs_buftarg_t *target, 658 xfs_buftarg_t *target,
659 xfs_off_t ioff, 659 xfs_off_t ioff,
660 size_t isize, 660 size_t isize,
661 xfs_buf_flags_t flags) 661 xfs_buf_flags_t flags)
662 { 662 {
663 xfs_buf_t *bp; 663 xfs_buf_t *bp;
664 664
665 flags |= XBF_READ; 665 flags |= XBF_READ;
666 666
667 bp = xfs_buf_get_flags(target, ioff, isize, flags); 667 bp = xfs_buf_get_flags(target, ioff, isize, flags);
668 if (bp) { 668 if (bp) {
669 if (!XFS_BUF_ISDONE(bp)) { 669 if (!XFS_BUF_ISDONE(bp)) {
670 XB_TRACE(bp, "read", (unsigned long)flags); 670 XB_TRACE(bp, "read", (unsigned long)flags);
671 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
672 _xfs_buf_read(bp, flags); 672 _xfs_buf_read(bp, flags);
673 } else if (flags & XBF_ASYNC) { 673 } else if (flags & XBF_ASYNC) {
674 XB_TRACE(bp, "read_async", (unsigned long)flags); 674 XB_TRACE(bp, "read_async", (unsigned long)flags);
675 /* 675 /*
676 * Read ahead call which is already satisfied, 676 * Read ahead call which is already satisfied,
677 * drop the buffer 677 * drop the buffer
678 */ 678 */
679 goto no_buffer; 679 goto no_buffer;
680 } else { 680 } else {
681 XB_TRACE(bp, "read_done", (unsigned long)flags); 681 XB_TRACE(bp, "read_done", (unsigned long)flags);
682 /* We do not want read in the flags */ 682 /* We do not want read in the flags */
683 bp->b_flags &= ~XBF_READ; 683 bp->b_flags &= ~XBF_READ;
684 } 684 }
685 } 685 }
686 686
687 return bp; 687 return bp;
688 688
689 no_buffer: 689 no_buffer:
690 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 690 if (flags & (XBF_LOCK | XBF_TRYLOCK))
691 xfs_buf_unlock(bp); 691 xfs_buf_unlock(bp);
692 xfs_buf_rele(bp); 692 xfs_buf_rele(bp);
693 return NULL; 693 return NULL;
694 } 694 }
695 695
696 /* 696 /*
697 * If we are not low on memory then do the readahead in a deadlock 697 * If we are not low on memory then do the readahead in a deadlock
698 * safe manner. 698 * safe manner.
699 */ 699 */
700 void 700 void
701 xfs_buf_readahead( 701 xfs_buf_readahead(
702 xfs_buftarg_t *target, 702 xfs_buftarg_t *target,
703 xfs_off_t ioff, 703 xfs_off_t ioff,
704 size_t isize, 704 size_t isize,
705 xfs_buf_flags_t flags) 705 xfs_buf_flags_t flags)
706 { 706 {
707 struct backing_dev_info *bdi; 707 struct backing_dev_info *bdi;
708 708
709 bdi = target->bt_mapping->backing_dev_info; 709 bdi = target->bt_mapping->backing_dev_info;
710 if (bdi_read_congested(bdi)) 710 if (bdi_read_congested(bdi))
711 return; 711 return;
712 712
713 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 713 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
714 xfs_buf_read_flags(target, ioff, isize, flags); 714 xfs_buf_read_flags(target, ioff, isize, flags);
715 } 715 }
716 716
717 xfs_buf_t * 717 xfs_buf_t *
718 xfs_buf_get_empty( 718 xfs_buf_get_empty(
719 size_t len, 719 size_t len,
720 xfs_buftarg_t *target) 720 xfs_buftarg_t *target)
721 { 721 {
722 xfs_buf_t *bp; 722 xfs_buf_t *bp;
723 723
724 bp = xfs_buf_allocate(0); 724 bp = xfs_buf_allocate(0);
725 if (bp) 725 if (bp)
726 _xfs_buf_initialize(bp, target, 0, len, 0); 726 _xfs_buf_initialize(bp, target, 0, len, 0);
727 return bp; 727 return bp;
728 } 728 }
729 729
730 static inline struct page * 730 static inline struct page *
731 mem_to_page( 731 mem_to_page(
732 void *addr) 732 void *addr)
733 { 733 {
734 if ((!is_vmalloc_addr(addr))) { 734 if ((!is_vmalloc_addr(addr))) {
735 return virt_to_page(addr); 735 return virt_to_page(addr);
736 } else { 736 } else {
737 return vmalloc_to_page(addr); 737 return vmalloc_to_page(addr);
738 } 738 }
739 } 739 }
740 740
741 int 741 int
742 xfs_buf_associate_memory( 742 xfs_buf_associate_memory(
743 xfs_buf_t *bp, 743 xfs_buf_t *bp,
744 void *mem, 744 void *mem,
745 size_t len) 745 size_t len)
746 { 746 {
747 int rval; 747 int rval;
748 int i = 0; 748 int i = 0;
749 unsigned long pageaddr; 749 unsigned long pageaddr;
750 unsigned long offset; 750 unsigned long offset;
751 size_t buflen; 751 size_t buflen;
752 int page_count; 752 int page_count;
753 753
754 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 754 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
755 offset = (unsigned long)mem - pageaddr; 755 offset = (unsigned long)mem - pageaddr;
756 buflen = PAGE_CACHE_ALIGN(len + offset); 756 buflen = PAGE_CACHE_ALIGN(len + offset);
757 page_count = buflen >> PAGE_CACHE_SHIFT; 757 page_count = buflen >> PAGE_CACHE_SHIFT;
758 758
759 /* Free any previous set of page pointers */ 759 /* Free any previous set of page pointers */
760 if (bp->b_pages) 760 if (bp->b_pages)
761 _xfs_buf_free_pages(bp); 761 _xfs_buf_free_pages(bp);
762 762
763 bp->b_pages = NULL; 763 bp->b_pages = NULL;
764 bp->b_addr = mem; 764 bp->b_addr = mem;
765 765
766 rval = _xfs_buf_get_pages(bp, page_count, 0); 766 rval = _xfs_buf_get_pages(bp, page_count, 0);
767 if (rval) 767 if (rval)
768 return rval; 768 return rval;
769 769
770 bp->b_offset = offset; 770 bp->b_offset = offset;
771 771
772 for (i = 0; i < bp->b_page_count; i++) { 772 for (i = 0; i < bp->b_page_count; i++) {
773 bp->b_pages[i] = mem_to_page((void *)pageaddr); 773 bp->b_pages[i] = mem_to_page((void *)pageaddr);
774 pageaddr += PAGE_CACHE_SIZE; 774 pageaddr += PAGE_CACHE_SIZE;
775 } 775 }
776 776
777 bp->b_count_desired = len; 777 bp->b_count_desired = len;
778 bp->b_buffer_length = buflen; 778 bp->b_buffer_length = buflen;
779 bp->b_flags |= XBF_MAPPED; 779 bp->b_flags |= XBF_MAPPED;
780 bp->b_flags &= ~_XBF_PAGE_LOCKED; 780 bp->b_flags &= ~_XBF_PAGE_LOCKED;
781 781
782 return 0; 782 return 0;
783 } 783 }
784 784
785 xfs_buf_t * 785 xfs_buf_t *
786 xfs_buf_get_noaddr( 786 xfs_buf_get_noaddr(
787 size_t len, 787 size_t len,
788 xfs_buftarg_t *target) 788 xfs_buftarg_t *target)
789 { 789 {
790 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 790 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
791 int error, i; 791 int error, i;
792 xfs_buf_t *bp; 792 xfs_buf_t *bp;
793 793
794 bp = xfs_buf_allocate(0); 794 bp = xfs_buf_allocate(0);
795 if (unlikely(bp == NULL)) 795 if (unlikely(bp == NULL))
796 goto fail; 796 goto fail;
797 _xfs_buf_initialize(bp, target, 0, len, 0); 797 _xfs_buf_initialize(bp, target, 0, len, 0);
798 798
799 error = _xfs_buf_get_pages(bp, page_count, 0); 799 error = _xfs_buf_get_pages(bp, page_count, 0);
800 if (error) 800 if (error)
801 goto fail_free_buf; 801 goto fail_free_buf;
802 802
803 for (i = 0; i < page_count; i++) { 803 for (i = 0; i < page_count; i++) {
804 bp->b_pages[i] = alloc_page(GFP_KERNEL); 804 bp->b_pages[i] = alloc_page(GFP_KERNEL);
805 if (!bp->b_pages[i]) 805 if (!bp->b_pages[i])
806 goto fail_free_mem; 806 goto fail_free_mem;
807 } 807 }
808 bp->b_flags |= _XBF_PAGES; 808 bp->b_flags |= _XBF_PAGES;
809 809
810 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 810 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
811 if (unlikely(error)) { 811 if (unlikely(error)) {
812 printk(KERN_WARNING "%s: failed to map pages\n", 812 printk(KERN_WARNING "%s: failed to map pages\n",
813 __func__); 813 __func__);
814 goto fail_free_mem; 814 goto fail_free_mem;
815 } 815 }
816 816
817 xfs_buf_unlock(bp); 817 xfs_buf_unlock(bp);
818 818
819 XB_TRACE(bp, "no_daddr", len); 819 XB_TRACE(bp, "no_daddr", len);
820 return bp; 820 return bp;
821 821
822 fail_free_mem: 822 fail_free_mem:
823 while (--i >= 0) 823 while (--i >= 0)
824 __free_page(bp->b_pages[i]); 824 __free_page(bp->b_pages[i]);
825 _xfs_buf_free_pages(bp); 825 _xfs_buf_free_pages(bp);
826 fail_free_buf: 826 fail_free_buf:
827 xfs_buf_deallocate(bp); 827 xfs_buf_deallocate(bp);
828 fail: 828 fail:
829 return NULL; 829 return NULL;
830 } 830 }
831 831
832 /* 832 /*
833 * Increment reference count on buffer, to hold the buffer concurrently 833 * Increment reference count on buffer, to hold the buffer concurrently
834 * with another thread which may release (free) the buffer asynchronously. 834 * with another thread which may release (free) the buffer asynchronously.
835 * Must hold the buffer already to call this function. 835 * Must hold the buffer already to call this function.
836 */ 836 */
837 void 837 void
838 xfs_buf_hold( 838 xfs_buf_hold(
839 xfs_buf_t *bp) 839 xfs_buf_t *bp)
840 { 840 {
841 atomic_inc(&bp->b_hold); 841 atomic_inc(&bp->b_hold);
842 XB_TRACE(bp, "hold", 0); 842 XB_TRACE(bp, "hold", 0);
843 } 843 }
844 844
845 /* 845 /*
846 * Releases a hold on the specified buffer. If the 846 * Releases a hold on the specified buffer. If the
847 * the hold count is 1, calls xfs_buf_free. 847 * the hold count is 1, calls xfs_buf_free.
848 */ 848 */
849 void 849 void
850 xfs_buf_rele( 850 xfs_buf_rele(
851 xfs_buf_t *bp) 851 xfs_buf_t *bp)
852 { 852 {
853 xfs_bufhash_t *hash = bp->b_hash; 853 xfs_bufhash_t *hash = bp->b_hash;
854 854
855 XB_TRACE(bp, "rele", bp->b_relse); 855 XB_TRACE(bp, "rele", bp->b_relse);
856 856
857 if (unlikely(!hash)) { 857 if (unlikely(!hash)) {
858 ASSERT(!bp->b_relse); 858 ASSERT(!bp->b_relse);
859 if (atomic_dec_and_test(&bp->b_hold)) 859 if (atomic_dec_and_test(&bp->b_hold))
860 xfs_buf_free(bp); 860 xfs_buf_free(bp);
861 return; 861 return;
862 } 862 }
863 863
864 ASSERT(atomic_read(&bp->b_hold) > 0); 864 ASSERT(atomic_read(&bp->b_hold) > 0);
865 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 865 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
866 if (bp->b_relse) { 866 if (bp->b_relse) {
867 atomic_inc(&bp->b_hold); 867 atomic_inc(&bp->b_hold);
868 spin_unlock(&hash->bh_lock); 868 spin_unlock(&hash->bh_lock);
869 (*(bp->b_relse)) (bp); 869 (*(bp->b_relse)) (bp);
870 } else if (bp->b_flags & XBF_FS_MANAGED) { 870 } else if (bp->b_flags & XBF_FS_MANAGED) {
871 spin_unlock(&hash->bh_lock); 871 spin_unlock(&hash->bh_lock);
872 } else { 872 } else {
873 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 873 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
874 list_del_init(&bp->b_hash_list); 874 list_del_init(&bp->b_hash_list);
875 spin_unlock(&hash->bh_lock); 875 spin_unlock(&hash->bh_lock);
876 xfs_buf_free(bp); 876 xfs_buf_free(bp);
877 } 877 }
878 } 878 }
879 } 879 }
880 880
881 881
882 /* 882 /*
883 * Mutual exclusion on buffers. Locking model: 883 * Mutual exclusion on buffers. Locking model:
884 * 884 *
885 * Buffers associated with inodes for which buffer locking 885 * Buffers associated with inodes for which buffer locking
886 * is not enabled are not protected by semaphores, and are 886 * is not enabled are not protected by semaphores, and are
887 * assumed to be exclusively owned by the caller. There is a 887 * assumed to be exclusively owned by the caller. There is a
888 * spinlock in the buffer, used by the caller when concurrent 888 * spinlock in the buffer, used by the caller when concurrent
889 * access is possible. 889 * access is possible.
890 */ 890 */
891 891
892 /* 892 /*
893 * Locks a buffer object, if it is not already locked. 893 * Locks a buffer object, if it is not already locked.
894 * Note that this in no way locks the underlying pages, so it is only 894 * Note that this in no way locks the underlying pages, so it is only
895 * useful for synchronizing concurrent use of buffer objects, not for 895 * useful for synchronizing concurrent use of buffer objects, not for
896 * synchronizing independent access to the underlying pages. 896 * synchronizing independent access to the underlying pages.
897 */ 897 */
898 int 898 int
899 xfs_buf_cond_lock( 899 xfs_buf_cond_lock(
900 xfs_buf_t *bp) 900 xfs_buf_t *bp)
901 { 901 {
902 int locked; 902 int locked;
903 903
904 locked = down_trylock(&bp->b_sema) == 0; 904 locked = down_trylock(&bp->b_sema) == 0;
905 if (locked) { 905 if (locked) {
906 XB_SET_OWNER(bp); 906 XB_SET_OWNER(bp);
907 } 907 }
908 XB_TRACE(bp, "cond_lock", (long)locked); 908 XB_TRACE(bp, "cond_lock", (long)locked);
909 return locked ? 0 : -EBUSY; 909 return locked ? 0 : -EBUSY;
910 } 910 }
911 911
912 #if defined(DEBUG) || defined(XFS_BLI_TRACE) 912 #if defined(DEBUG) || defined(XFS_BLI_TRACE)
913 int 913 int
914 xfs_buf_lock_value( 914 xfs_buf_lock_value(
915 xfs_buf_t *bp) 915 xfs_buf_t *bp)
916 { 916 {
917 return bp->b_sema.count; 917 return bp->b_sema.count;
918 } 918 }
919 #endif 919 #endif
920 920
921 /* 921 /*
922 * Locks a buffer object. 922 * Locks a buffer object.
923 * Note that this in no way locks the underlying pages, so it is only 923 * Note that this in no way locks the underlying pages, so it is only
924 * useful for synchronizing concurrent use of buffer objects, not for 924 * useful for synchronizing concurrent use of buffer objects, not for
925 * synchronizing independent access to the underlying pages. 925 * synchronizing independent access to the underlying pages.
926 */ 926 */
927 void 927 void
928 xfs_buf_lock( 928 xfs_buf_lock(
929 xfs_buf_t *bp) 929 xfs_buf_t *bp)
930 { 930 {
931 XB_TRACE(bp, "lock", 0); 931 XB_TRACE(bp, "lock", 0);
932 if (atomic_read(&bp->b_io_remaining)) 932 if (atomic_read(&bp->b_io_remaining))
933 blk_run_address_space(bp->b_target->bt_mapping); 933 blk_run_address_space(bp->b_target->bt_mapping);
934 down(&bp->b_sema); 934 down(&bp->b_sema);
935 XB_SET_OWNER(bp); 935 XB_SET_OWNER(bp);
936 XB_TRACE(bp, "locked", 0); 936 XB_TRACE(bp, "locked", 0);
937 } 937 }
938 938
939 /* 939 /*
940 * Releases the lock on the buffer object. 940 * Releases the lock on the buffer object.
941 * If the buffer is marked delwri but is not queued, do so before we 941 * If the buffer is marked delwri but is not queued, do so before we
942 * unlock the buffer as we need to set flags correctly. We also need to 942 * unlock the buffer as we need to set flags correctly. We also need to
943 * take a reference for the delwri queue because the unlocker is going to 943 * take a reference for the delwri queue because the unlocker is going to
944 * drop their's and they don't know we just queued it. 944 * drop their's and they don't know we just queued it.
945 */ 945 */
946 void 946 void
947 xfs_buf_unlock( 947 xfs_buf_unlock(
948 xfs_buf_t *bp) 948 xfs_buf_t *bp)
949 { 949 {
950 if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { 950 if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
951 atomic_inc(&bp->b_hold); 951 atomic_inc(&bp->b_hold);
952 bp->b_flags |= XBF_ASYNC; 952 bp->b_flags |= XBF_ASYNC;
953 xfs_buf_delwri_queue(bp, 0); 953 xfs_buf_delwri_queue(bp, 0);
954 } 954 }
955 955
956 XB_CLEAR_OWNER(bp); 956 XB_CLEAR_OWNER(bp);
957 up(&bp->b_sema); 957 up(&bp->b_sema);
958 XB_TRACE(bp, "unlock", 0); 958 XB_TRACE(bp, "unlock", 0);
959 } 959 }
960 960
961 961
962 /* 962 /*
963 * Pinning Buffer Storage in Memory 963 * Pinning Buffer Storage in Memory
964 * Ensure that no attempt to force a buffer to disk will succeed. 964 * Ensure that no attempt to force a buffer to disk will succeed.
965 */ 965 */
966 void 966 void
967 xfs_buf_pin( 967 xfs_buf_pin(
968 xfs_buf_t *bp) 968 xfs_buf_t *bp)
969 { 969 {
970 atomic_inc(&bp->b_pin_count); 970 atomic_inc(&bp->b_pin_count);
971 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter); 971 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
972 } 972 }
973 973
974 void 974 void
975 xfs_buf_unpin( 975 xfs_buf_unpin(
976 xfs_buf_t *bp) 976 xfs_buf_t *bp)
977 { 977 {
978 if (atomic_dec_and_test(&bp->b_pin_count)) 978 if (atomic_dec_and_test(&bp->b_pin_count))
979 wake_up_all(&bp->b_waiters); 979 wake_up_all(&bp->b_waiters);
980 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter); 980 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
981 } 981 }
982 982
983 int 983 int
984 xfs_buf_ispin( 984 xfs_buf_ispin(
985 xfs_buf_t *bp) 985 xfs_buf_t *bp)
986 { 986 {
987 return atomic_read(&bp->b_pin_count); 987 return atomic_read(&bp->b_pin_count);
988 } 988 }
989 989
990 STATIC void 990 STATIC void
991 xfs_buf_wait_unpin( 991 xfs_buf_wait_unpin(
992 xfs_buf_t *bp) 992 xfs_buf_t *bp)
993 { 993 {
994 DECLARE_WAITQUEUE (wait, current); 994 DECLARE_WAITQUEUE (wait, current);
995 995
996 if (atomic_read(&bp->b_pin_count) == 0) 996 if (atomic_read(&bp->b_pin_count) == 0)
997 return; 997 return;
998 998
999 add_wait_queue(&bp->b_waiters, &wait); 999 add_wait_queue(&bp->b_waiters, &wait);
1000 for (;;) { 1000 for (;;) {
1001 set_current_state(TASK_UNINTERRUPTIBLE); 1001 set_current_state(TASK_UNINTERRUPTIBLE);
1002 if (atomic_read(&bp->b_pin_count) == 0) 1002 if (atomic_read(&bp->b_pin_count) == 0)
1003 break; 1003 break;
1004 if (atomic_read(&bp->b_io_remaining)) 1004 if (atomic_read(&bp->b_io_remaining))
1005 blk_run_address_space(bp->b_target->bt_mapping); 1005 blk_run_address_space(bp->b_target->bt_mapping);
1006 schedule(); 1006 schedule();
1007 } 1007 }
1008 remove_wait_queue(&bp->b_waiters, &wait); 1008 remove_wait_queue(&bp->b_waiters, &wait);
1009 set_current_state(TASK_RUNNING); 1009 set_current_state(TASK_RUNNING);
1010 } 1010 }
1011 1011
1012 /* 1012 /*
1013 * Buffer Utility Routines 1013 * Buffer Utility Routines
1014 */ 1014 */
1015 1015
1016 STATIC void 1016 STATIC void
1017 xfs_buf_iodone_work( 1017 xfs_buf_iodone_work(
1018 struct work_struct *work) 1018 struct work_struct *work)
1019 { 1019 {
1020 xfs_buf_t *bp = 1020 xfs_buf_t *bp =
1021 container_of(work, xfs_buf_t, b_iodone_work); 1021 container_of(work, xfs_buf_t, b_iodone_work);
1022 1022
1023 /* 1023 /*
1024 * We can get an EOPNOTSUPP to ordered writes. Here we clear the 1024 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
1025 * ordered flag and reissue them. Because we can't tell the higher 1025 * ordered flag and reissue them. Because we can't tell the higher
1026 * layers directly that they should not issue ordered I/O anymore, they 1026 * layers directly that they should not issue ordered I/O anymore, they
1027 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion. 1027 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
1028 */ 1028 */
1029 if ((bp->b_error == EOPNOTSUPP) && 1029 if ((bp->b_error == EOPNOTSUPP) &&
1030 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { 1030 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1031 XB_TRACE(bp, "ordered_retry", bp->b_iodone); 1031 XB_TRACE(bp, "ordered_retry", bp->b_iodone);
1032 bp->b_flags &= ~XBF_ORDERED; 1032 bp->b_flags &= ~XBF_ORDERED;
1033 bp->b_flags |= _XFS_BARRIER_FAILED; 1033 bp->b_flags |= _XFS_BARRIER_FAILED;
1034 xfs_buf_iorequest(bp); 1034 xfs_buf_iorequest(bp);
1035 } else if (bp->b_iodone) 1035 } else if (bp->b_iodone)
1036 (*(bp->b_iodone))(bp); 1036 (*(bp->b_iodone))(bp);
1037 else if (bp->b_flags & XBF_ASYNC) 1037 else if (bp->b_flags & XBF_ASYNC)
1038 xfs_buf_relse(bp); 1038 xfs_buf_relse(bp);
1039 } 1039 }
1040 1040
1041 void 1041 void
1042 xfs_buf_ioend( 1042 xfs_buf_ioend(
1043 xfs_buf_t *bp, 1043 xfs_buf_t *bp,
1044 int schedule) 1044 int schedule)
1045 { 1045 {
1046 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); 1046 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1047 if (bp->b_error == 0) 1047 if (bp->b_error == 0)
1048 bp->b_flags |= XBF_DONE; 1048 bp->b_flags |= XBF_DONE;
1049 1049
1050 XB_TRACE(bp, "iodone", bp->b_iodone); 1050 XB_TRACE(bp, "iodone", bp->b_iodone);
1051 1051
1052 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1052 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
1053 if (schedule) { 1053 if (schedule) {
1054 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1054 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1055 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1055 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
1056 } else { 1056 } else {
1057 xfs_buf_iodone_work(&bp->b_iodone_work); 1057 xfs_buf_iodone_work(&bp->b_iodone_work);
1058 } 1058 }
1059 } else { 1059 } else {
1060 complete(&bp->b_iowait); 1060 complete(&bp->b_iowait);
1061 } 1061 }
1062 } 1062 }
1063 1063
1064 void 1064 void
1065 xfs_buf_ioerror( 1065 xfs_buf_ioerror(
1066 xfs_buf_t *bp, 1066 xfs_buf_t *bp,
1067 int error) 1067 int error)
1068 { 1068 {
1069 ASSERT(error >= 0 && error <= 0xffff); 1069 ASSERT(error >= 0 && error <= 0xffff);
1070 bp->b_error = (unsigned short)error; 1070 bp->b_error = (unsigned short)error;
1071 XB_TRACE(bp, "ioerror", (unsigned long)error); 1071 XB_TRACE(bp, "ioerror", (unsigned long)error);
1072 } 1072 }
1073 1073
1074 int 1074 int
1075 xfs_bawrite( 1075 xfs_bawrite(
1076 void *mp, 1076 void *mp,
1077 struct xfs_buf *bp) 1077 struct xfs_buf *bp)
1078 { 1078 {
1079 XB_TRACE(bp, "bawrite", 0); 1079 XB_TRACE(bp, "bawrite", 0);
1080 1080
1081 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 1081 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
1082 1082
1083 xfs_buf_delwri_dequeue(bp); 1083 xfs_buf_delwri_dequeue(bp);
1084 1084
1085 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); 1085 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
1086 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); 1086 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
1087 1087
1088 bp->b_fspriv3 = mp; 1088 bp->b_mount = mp;
1089 bp->b_strat = xfs_bdstrat_cb; 1089 bp->b_strat = xfs_bdstrat_cb;
1090 return xfs_bdstrat_cb(bp); 1090 return xfs_bdstrat_cb(bp);
1091 } 1091 }
1092 1092
1093 void 1093 void
1094 xfs_bdwrite( 1094 xfs_bdwrite(
1095 void *mp, 1095 void *mp,
1096 struct xfs_buf *bp) 1096 struct xfs_buf *bp)
1097 { 1097 {
1098 XB_TRACE(bp, "bdwrite", 0); 1098 XB_TRACE(bp, "bdwrite", 0);
1099 1099
1100 bp->b_strat = xfs_bdstrat_cb; 1100 bp->b_strat = xfs_bdstrat_cb;
1101 bp->b_fspriv3 = mp; 1101 bp->b_mount = mp;
1102 1102
1103 bp->b_flags &= ~XBF_READ; 1103 bp->b_flags &= ~XBF_READ;
1104 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1104 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1105 1105
1106 xfs_buf_delwri_queue(bp, 1); 1106 xfs_buf_delwri_queue(bp, 1);
1107 } 1107 }
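Both write helpers now stash the owning mount in the typed b_mount field before handing the buffer to xfs_bdstrat_cb, so the strategy routine can reach the mount without the old untyped accessor. A minimal, hypothetical caller for illustration (error handling elided; the function name is not part of this change):

	int
	example_flush_buf(
		struct xfs_mount	*mp,
		xfs_buf_t		*bp)
	{
		/* Asynchronous write; bp->b_mount is set inside xfs_bawrite(). */
		return xfs_bawrite(mp, bp);
	}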
1108 1108
1109 STATIC_INLINE void 1109 STATIC_INLINE void
1110 _xfs_buf_ioend( 1110 _xfs_buf_ioend(
1111 xfs_buf_t *bp, 1111 xfs_buf_t *bp,
1112 int schedule) 1112 int schedule)
1113 { 1113 {
1114 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1114 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1115 bp->b_flags &= ~_XBF_PAGE_LOCKED; 1115 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1116 xfs_buf_ioend(bp, schedule); 1116 xfs_buf_ioend(bp, schedule);
1117 } 1117 }
1118 } 1118 }
1119 1119
1120 STATIC void 1120 STATIC void
1121 xfs_buf_bio_end_io( 1121 xfs_buf_bio_end_io(
1122 struct bio *bio, 1122 struct bio *bio,
1123 int error) 1123 int error)
1124 { 1124 {
1125 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1125 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1126 unsigned int blocksize = bp->b_target->bt_bsize; 1126 unsigned int blocksize = bp->b_target->bt_bsize;
1127 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1127 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1128 1128
1129 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1129 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1130 bp->b_error = EIO; 1130 bp->b_error = EIO;
1131 1131
1132 do { 1132 do {
1133 struct page *page = bvec->bv_page; 1133 struct page *page = bvec->bv_page;
1134 1134
1135 ASSERT(!PagePrivate(page)); 1135 ASSERT(!PagePrivate(page));
1136 if (unlikely(bp->b_error)) { 1136 if (unlikely(bp->b_error)) {
1137 if (bp->b_flags & XBF_READ) 1137 if (bp->b_flags & XBF_READ)
1138 ClearPageUptodate(page); 1138 ClearPageUptodate(page);
1139 } else if (blocksize >= PAGE_CACHE_SIZE) { 1139 } else if (blocksize >= PAGE_CACHE_SIZE) {
1140 SetPageUptodate(page); 1140 SetPageUptodate(page);
1141 } else if (!PagePrivate(page) && 1141 } else if (!PagePrivate(page) &&
1142 (bp->b_flags & _XBF_PAGE_CACHE)) { 1142 (bp->b_flags & _XBF_PAGE_CACHE)) {
1143 set_page_region(page, bvec->bv_offset, bvec->bv_len); 1143 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1144 } 1144 }
1145 1145
1146 if (--bvec >= bio->bi_io_vec) 1146 if (--bvec >= bio->bi_io_vec)
1147 prefetchw(&bvec->bv_page->flags); 1147 prefetchw(&bvec->bv_page->flags);
1148 1148
1149 if (bp->b_flags & _XBF_PAGE_LOCKED) 1149 if (bp->b_flags & _XBF_PAGE_LOCKED)
1150 unlock_page(page); 1150 unlock_page(page);
1151 } while (bvec >= bio->bi_io_vec); 1151 } while (bvec >= bio->bi_io_vec);
1152 1152
1153 _xfs_buf_ioend(bp, 1); 1153 _xfs_buf_ioend(bp, 1);
1154 bio_put(bio); 1154 bio_put(bio);
1155 } 1155 }
1156 1156
1157 STATIC void 1157 STATIC void
1158 _xfs_buf_ioapply( 1158 _xfs_buf_ioapply(
1159 xfs_buf_t *bp) 1159 xfs_buf_t *bp)
1160 { 1160 {
1161 int rw, map_i, total_nr_pages, nr_pages; 1161 int rw, map_i, total_nr_pages, nr_pages;
1162 struct bio *bio; 1162 struct bio *bio;
1163 int offset = bp->b_offset; 1163 int offset = bp->b_offset;
1164 int size = bp->b_count_desired; 1164 int size = bp->b_count_desired;
1165 sector_t sector = bp->b_bn; 1165 sector_t sector = bp->b_bn;
1166 unsigned int blocksize = bp->b_target->bt_bsize; 1166 unsigned int blocksize = bp->b_target->bt_bsize;
1167 1167
1168 total_nr_pages = bp->b_page_count; 1168 total_nr_pages = bp->b_page_count;
1169 map_i = 0; 1169 map_i = 0;
1170 1170
1171 if (bp->b_flags & XBF_ORDERED) { 1171 if (bp->b_flags & XBF_ORDERED) {
1172 ASSERT(!(bp->b_flags & XBF_READ)); 1172 ASSERT(!(bp->b_flags & XBF_READ));
1173 rw = WRITE_BARRIER; 1173 rw = WRITE_BARRIER;
1174 } else if (bp->b_flags & _XBF_RUN_QUEUES) { 1174 } else if (bp->b_flags & _XBF_RUN_QUEUES) {
1175 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1175 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1176 bp->b_flags &= ~_XBF_RUN_QUEUES; 1176 bp->b_flags &= ~_XBF_RUN_QUEUES;
1177 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; 1177 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
1178 } else { 1178 } else {
1179 rw = (bp->b_flags & XBF_WRITE) ? WRITE : 1179 rw = (bp->b_flags & XBF_WRITE) ? WRITE :
1180 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1180 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1181 } 1181 }
1182 1182
1183 /* Special code path for reading a sub page size buffer in -- 1183 /* Special code path for reading a sub page size buffer in --
1184 * we populate the whole page, and hence the other metadata 1184 * we populate the whole page, and hence the other metadata
1185 * in the same page. This optimization is only valid when the 1185 * in the same page. This optimization is only valid when the
1186 * filesystem block size is not smaller than the page size. 1186 * filesystem block size is not smaller than the page size.
1187 */ 1187 */
1188 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && 1188 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1189 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == 1189 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1190 (XBF_READ|_XBF_PAGE_LOCKED)) && 1190 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1191 (blocksize >= PAGE_CACHE_SIZE)) { 1191 (blocksize >= PAGE_CACHE_SIZE)) {
1192 bio = bio_alloc(GFP_NOIO, 1); 1192 bio = bio_alloc(GFP_NOIO, 1);
1193 1193
1194 bio->bi_bdev = bp->b_target->bt_bdev; 1194 bio->bi_bdev = bp->b_target->bt_bdev;
1195 bio->bi_sector = sector - (offset >> BBSHIFT); 1195 bio->bi_sector = sector - (offset >> BBSHIFT);
1196 bio->bi_end_io = xfs_buf_bio_end_io; 1196 bio->bi_end_io = xfs_buf_bio_end_io;
1197 bio->bi_private = bp; 1197 bio->bi_private = bp;
1198 1198
1199 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); 1199 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1200 size = 0; 1200 size = 0;
1201 1201
1202 atomic_inc(&bp->b_io_remaining); 1202 atomic_inc(&bp->b_io_remaining);
1203 1203
1204 goto submit_io; 1204 goto submit_io;
1205 } 1205 }
1206 1206
1207 next_chunk: 1207 next_chunk:
1208 atomic_inc(&bp->b_io_remaining); 1208 atomic_inc(&bp->b_io_remaining);
1209 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); 1209 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1210 if (nr_pages > total_nr_pages) 1210 if (nr_pages > total_nr_pages)
1211 nr_pages = total_nr_pages; 1211 nr_pages = total_nr_pages;
1212 1212
1213 bio = bio_alloc(GFP_NOIO, nr_pages); 1213 bio = bio_alloc(GFP_NOIO, nr_pages);
1214 bio->bi_bdev = bp->b_target->bt_bdev; 1214 bio->bi_bdev = bp->b_target->bt_bdev;
1215 bio->bi_sector = sector; 1215 bio->bi_sector = sector;
1216 bio->bi_end_io = xfs_buf_bio_end_io; 1216 bio->bi_end_io = xfs_buf_bio_end_io;
1217 bio->bi_private = bp; 1217 bio->bi_private = bp;
1218 1218
1219 for (; size && nr_pages; nr_pages--, map_i++) { 1219 for (; size && nr_pages; nr_pages--, map_i++) {
1220 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1220 int rbytes, nbytes = PAGE_CACHE_SIZE - offset;
1221 1221
1222 if (nbytes > size) 1222 if (nbytes > size)
1223 nbytes = size; 1223 nbytes = size;
1224 1224
1225 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); 1225 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
1226 if (rbytes < nbytes) 1226 if (rbytes < nbytes)
1227 break; 1227 break;
1228 1228
1229 offset = 0; 1229 offset = 0;
1230 sector += nbytes >> BBSHIFT; 1230 sector += nbytes >> BBSHIFT;
1231 size -= nbytes; 1231 size -= nbytes;
1232 total_nr_pages--; 1232 total_nr_pages--;
1233 } 1233 }
1234 1234
1235 submit_io: 1235 submit_io:
1236 if (likely(bio->bi_size)) { 1236 if (likely(bio->bi_size)) {
1237 submit_bio(rw, bio); 1237 submit_bio(rw, bio);
1238 if (size) 1238 if (size)
1239 goto next_chunk; 1239 goto next_chunk;
1240 } else { 1240 } else {
1241 bio_put(bio); 1241 bio_put(bio);
1242 xfs_buf_ioerror(bp, EIO); 1242 xfs_buf_ioerror(bp, EIO);
1243 } 1243 }
1244 } 1244 }
1245 1245
1246 int 1246 int
1247 xfs_buf_iorequest( 1247 xfs_buf_iorequest(
1248 xfs_buf_t *bp) 1248 xfs_buf_t *bp)
1249 { 1249 {
1250 XB_TRACE(bp, "iorequest", 0); 1250 XB_TRACE(bp, "iorequest", 0);
1251 1251
1252 if (bp->b_flags & XBF_DELWRI) { 1252 if (bp->b_flags & XBF_DELWRI) {
1253 xfs_buf_delwri_queue(bp, 1); 1253 xfs_buf_delwri_queue(bp, 1);
1254 return 0; 1254 return 0;
1255 } 1255 }
1256 1256
1257 if (bp->b_flags & XBF_WRITE) { 1257 if (bp->b_flags & XBF_WRITE) {
1258 xfs_buf_wait_unpin(bp); 1258 xfs_buf_wait_unpin(bp);
1259 } 1259 }
1260 1260
1261 xfs_buf_hold(bp); 1261 xfs_buf_hold(bp);
1262 1262
1263 /* Set the count to 1 initially; this will stop an I/O 1263 /* Set the count to 1 initially; this will stop an I/O

1264 * completion callout which happens before we have started 1264 * completion callout which happens before we have started
1265 * all the I/O from calling xfs_buf_ioend too early. 1265 * all the I/O from calling xfs_buf_ioend too early.
1266 */ 1266 */
1267 atomic_set(&bp->b_io_remaining, 1); 1267 atomic_set(&bp->b_io_remaining, 1);
1268 _xfs_buf_ioapply(bp); 1268 _xfs_buf_ioapply(bp);
1269 _xfs_buf_ioend(bp, 0); 1269 _xfs_buf_ioend(bp, 0);
1270 1270
1271 xfs_buf_rele(bp); 1271 xfs_buf_rele(bp);
1272 return 0; 1272 return 0;
1273 } 1273 }
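xfs_buf_iorequest() holds b_io_remaining at 1 across bio submission so the final decrement in _xfs_buf_ioend() cannot complete the buffer before all I/O has been issued; a synchronous caller then pairs it with xfs_buf_iowait(). A minimal sketch of that pattern, assuming the buffer was already set up elsewhere (e.g. via xfs_buf_read_flags()):

	int
	example_read_sync(
		xfs_buf_t	*bp)
	{
		bp->b_flags |= XBF_READ;	/* issue as a read */
		xfs_buf_iorequest(bp);		/* submit all bios */
		return xfs_buf_iowait(bp);	/* 0 or the I/O error code */
	}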
1274 1274
1275 /* 1275 /*
1276 * Waits for I/O to complete on the buffer supplied. 1276 * Waits for I/O to complete on the buffer supplied.
1277 * It returns immediately if no I/O is pending. 1277 * It returns immediately if no I/O is pending.
1278 * It returns the I/O error code, if any, or 0 if there was no error. 1278 * It returns the I/O error code, if any, or 0 if there was no error.
1279 */ 1279 */
1280 int 1280 int
1281 xfs_buf_iowait( 1281 xfs_buf_iowait(
1282 xfs_buf_t *bp) 1282 xfs_buf_t *bp)
1283 { 1283 {
1284 XB_TRACE(bp, "iowait", 0); 1284 XB_TRACE(bp, "iowait", 0);
1285 if (atomic_read(&bp->b_io_remaining)) 1285 if (atomic_read(&bp->b_io_remaining))
1286 blk_run_address_space(bp->b_target->bt_mapping); 1286 blk_run_address_space(bp->b_target->bt_mapping);
1287 wait_for_completion(&bp->b_iowait); 1287 wait_for_completion(&bp->b_iowait);
1288 XB_TRACE(bp, "iowaited", (long)bp->b_error); 1288 XB_TRACE(bp, "iowaited", (long)bp->b_error);
1289 return bp->b_error; 1289 return bp->b_error;
1290 } 1290 }
1291 1291
1292 xfs_caddr_t 1292 xfs_caddr_t
1293 xfs_buf_offset( 1293 xfs_buf_offset(
1294 xfs_buf_t *bp, 1294 xfs_buf_t *bp,
1295 size_t offset) 1295 size_t offset)
1296 { 1296 {
1297 struct page *page; 1297 struct page *page;
1298 1298
1299 if (bp->b_flags & XBF_MAPPED) 1299 if (bp->b_flags & XBF_MAPPED)
1300 return XFS_BUF_PTR(bp) + offset; 1300 return XFS_BUF_PTR(bp) + offset;
1301 1301
1302 offset += bp->b_offset; 1302 offset += bp->b_offset;
1303 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1303 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
1304 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1304 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
1305 } 1305 }
1306 1306
1307 /* 1307 /*
1308 * Move data into or out of a buffer. 1308 * Move data into or out of a buffer.
1309 */ 1309 */
1310 void 1310 void
1311 xfs_buf_iomove( 1311 xfs_buf_iomove(
1312 xfs_buf_t *bp, /* buffer to process */ 1312 xfs_buf_t *bp, /* buffer to process */
1313 size_t boff, /* starting buffer offset */ 1313 size_t boff, /* starting buffer offset */
1314 size_t bsize, /* length to copy */ 1314 size_t bsize, /* length to copy */
1315 caddr_t data, /* data address */ 1315 caddr_t data, /* data address */
1316 xfs_buf_rw_t mode) /* read/write/zero flag */ 1316 xfs_buf_rw_t mode) /* read/write/zero flag */
1317 { 1317 {
1318 size_t bend, cpoff, csize; 1318 size_t bend, cpoff, csize;
1319 struct page *page; 1319 struct page *page;
1320 1320
1321 bend = boff + bsize; 1321 bend = boff + bsize;
1322 while (boff < bend) { 1322 while (boff < bend) {
1323 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1323 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1324 cpoff = xfs_buf_poff(boff + bp->b_offset); 1324 cpoff = xfs_buf_poff(boff + bp->b_offset);
1325 csize = min_t(size_t, 1325 csize = min_t(size_t,
1326 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1326 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
1327 1327
1328 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1328 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1329 1329
1330 switch (mode) { 1330 switch (mode) {
1331 case XBRW_ZERO: 1331 case XBRW_ZERO:
1332 memset(page_address(page) + cpoff, 0, csize); 1332 memset(page_address(page) + cpoff, 0, csize);
1333 break; 1333 break;
1334 case XBRW_READ: 1334 case XBRW_READ:
1335 memcpy(data, page_address(page) + cpoff, csize); 1335 memcpy(data, page_address(page) + cpoff, csize);
1336 break; 1336 break;
1337 case XBRW_WRITE: 1337 case XBRW_WRITE:
1338 memcpy(page_address(page) + cpoff, data, csize); 1338 memcpy(page_address(page) + cpoff, data, csize);
1339 } 1339 }
1340 1340
1341 boff += csize; 1341 boff += csize;
1342 data += csize; 1342 data += csize;
1343 } 1343 }
1344 } 1344 }
1345 1345
1346 /* 1346 /*
1347 * Handling of buffer targets (buftargs). 1347 * Handling of buffer targets (buftargs).
1348 */ 1348 */
1349 1349
1350 /* 1350 /*
1351 * Wait for any bufs with callbacks that have been submitted but 1351 * Wait for any bufs with callbacks that have been submitted but
1352 * have not yet returned... walk the hash list for the target. 1352 * have not yet returned... walk the hash list for the target.
1353 */ 1353 */
1354 void 1354 void
1355 xfs_wait_buftarg( 1355 xfs_wait_buftarg(
1356 xfs_buftarg_t *btp) 1356 xfs_buftarg_t *btp)
1357 { 1357 {
1358 xfs_buf_t *bp, *n; 1358 xfs_buf_t *bp, *n;
1359 xfs_bufhash_t *hash; 1359 xfs_bufhash_t *hash;
1360 uint i; 1360 uint i;
1361 1361
1362 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1362 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1363 hash = &btp->bt_hash[i]; 1363 hash = &btp->bt_hash[i];
1364 again: 1364 again:
1365 spin_lock(&hash->bh_lock); 1365 spin_lock(&hash->bh_lock);
1366 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 1366 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
1367 ASSERT(btp == bp->b_target); 1367 ASSERT(btp == bp->b_target);
1368 if (!(bp->b_flags & XBF_FS_MANAGED)) { 1368 if (!(bp->b_flags & XBF_FS_MANAGED)) {
1369 spin_unlock(&hash->bh_lock); 1369 spin_unlock(&hash->bh_lock);
1370 /* 1370 /*
1371 * Catch superblock reference count leaks 1371 * Catch superblock reference count leaks
1372 * immediately 1372 * immediately
1373 */ 1373 */
1374 BUG_ON(bp->b_bn == 0); 1374 BUG_ON(bp->b_bn == 0);
1375 delay(100); 1375 delay(100);
1376 goto again; 1376 goto again;
1377 } 1377 }
1378 } 1378 }
1379 spin_unlock(&hash->bh_lock); 1379 spin_unlock(&hash->bh_lock);
1380 } 1380 }
1381 } 1381 }
1382 1382
1383 /* 1383 /*
1384 * Allocate buffer hash table for a given target. 1384 * Allocate buffer hash table for a given target.
1385 * For devices containing metadata (i.e. not the log/realtime devices) 1385 * For devices containing metadata (i.e. not the log/realtime devices)
1386 * we need to allocate a much larger hash table. 1386 * we need to allocate a much larger hash table.
1387 */ 1387 */
1388 STATIC void 1388 STATIC void
1389 xfs_alloc_bufhash( 1389 xfs_alloc_bufhash(
1390 xfs_buftarg_t *btp, 1390 xfs_buftarg_t *btp,
1391 int external) 1391 int external)
1392 { 1392 {
1393 unsigned int i; 1393 unsigned int i;
1394 1394
1395 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1395 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1396 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1396 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1397 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1397 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
1398 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1398 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
1399 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1399 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1400 spin_lock_init(&btp->bt_hash[i].bh_lock); 1400 spin_lock_init(&btp->bt_hash[i].bh_lock);
1401 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1401 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
1402 } 1402 }
1403 } 1403 }
1404 1404
1405 STATIC void 1405 STATIC void
1406 xfs_free_bufhash( 1406 xfs_free_bufhash(
1407 xfs_buftarg_t *btp) 1407 xfs_buftarg_t *btp)
1408 { 1408 {
1409 kmem_free(btp->bt_hash); 1409 kmem_free(btp->bt_hash);
1410 btp->bt_hash = NULL; 1410 btp->bt_hash = NULL;
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * buftarg list for delwrite queue processing 1414 * buftarg list for delwrite queue processing
1415 */ 1415 */
1416 static LIST_HEAD(xfs_buftarg_list); 1416 static LIST_HEAD(xfs_buftarg_list);
1417 static DEFINE_SPINLOCK(xfs_buftarg_lock); 1417 static DEFINE_SPINLOCK(xfs_buftarg_lock);
1418 1418
1419 STATIC void 1419 STATIC void
1420 xfs_register_buftarg( 1420 xfs_register_buftarg(
1421 xfs_buftarg_t *btp) 1421 xfs_buftarg_t *btp)
1422 { 1422 {
1423 spin_lock(&xfs_buftarg_lock); 1423 spin_lock(&xfs_buftarg_lock);
1424 list_add(&btp->bt_list, &xfs_buftarg_list); 1424 list_add(&btp->bt_list, &xfs_buftarg_list);
1425 spin_unlock(&xfs_buftarg_lock); 1425 spin_unlock(&xfs_buftarg_lock);
1426 } 1426 }
1427 1427
1428 STATIC void 1428 STATIC void
1429 xfs_unregister_buftarg( 1429 xfs_unregister_buftarg(
1430 xfs_buftarg_t *btp) 1430 xfs_buftarg_t *btp)
1431 { 1431 {
1432 spin_lock(&xfs_buftarg_lock); 1432 spin_lock(&xfs_buftarg_lock);
1433 list_del(&btp->bt_list); 1433 list_del(&btp->bt_list);
1434 spin_unlock(&xfs_buftarg_lock); 1434 spin_unlock(&xfs_buftarg_lock);
1435 } 1435 }
1436 1436
1437 void 1437 void
1438 xfs_free_buftarg( 1438 xfs_free_buftarg(
1439 xfs_buftarg_t *btp) 1439 xfs_buftarg_t *btp)
1440 { 1440 {
1441 xfs_flush_buftarg(btp, 1); 1441 xfs_flush_buftarg(btp, 1);
1442 xfs_blkdev_issue_flush(btp); 1442 xfs_blkdev_issue_flush(btp);
1443 xfs_free_bufhash(btp); 1443 xfs_free_bufhash(btp);
1444 iput(btp->bt_mapping->host); 1444 iput(btp->bt_mapping->host);
1445 1445
1446 /* Unregister the buftarg first so that we don't get a 1446 /* Unregister the buftarg first so that we don't get a
1447 * wakeup finding a non-existent task 1447 * wakeup finding a non-existent task
1448 */ 1448 */
1449 xfs_unregister_buftarg(btp); 1449 xfs_unregister_buftarg(btp);
1450 kthread_stop(btp->bt_task); 1450 kthread_stop(btp->bt_task);
1451 1451
1452 kmem_free(btp); 1452 kmem_free(btp);
1453 } 1453 }
1454 1454
1455 STATIC int 1455 STATIC int
1456 xfs_setsize_buftarg_flags( 1456 xfs_setsize_buftarg_flags(
1457 xfs_buftarg_t *btp, 1457 xfs_buftarg_t *btp,
1458 unsigned int blocksize, 1458 unsigned int blocksize,
1459 unsigned int sectorsize, 1459 unsigned int sectorsize,
1460 int verbose) 1460 int verbose)
1461 { 1461 {
1462 btp->bt_bsize = blocksize; 1462 btp->bt_bsize = blocksize;
1463 btp->bt_sshift = ffs(sectorsize) - 1; 1463 btp->bt_sshift = ffs(sectorsize) - 1;
1464 btp->bt_smask = sectorsize - 1; 1464 btp->bt_smask = sectorsize - 1;
1465 1465
1466 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1466 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1467 printk(KERN_WARNING 1467 printk(KERN_WARNING
1468 "XFS: Cannot set_blocksize to %u on device %s\n", 1468 "XFS: Cannot set_blocksize to %u on device %s\n",
1469 sectorsize, XFS_BUFTARG_NAME(btp)); 1469 sectorsize, XFS_BUFTARG_NAME(btp));
1470 return EINVAL; 1470 return EINVAL;
1471 } 1471 }
1472 1472
1473 if (verbose && 1473 if (verbose &&
1474 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { 1474 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1475 printk(KERN_WARNING 1475 printk(KERN_WARNING
1476 "XFS: %u byte sectors in use on device %s. " 1476 "XFS: %u byte sectors in use on device %s. "
1477 "This is suboptimal; %u or greater is ideal.\n", 1477 "This is suboptimal; %u or greater is ideal.\n",
1478 sectorsize, XFS_BUFTARG_NAME(btp), 1478 sectorsize, XFS_BUFTARG_NAME(btp),
1479 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); 1479 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1480 } 1480 }
1481 1481
1482 return 0; 1482 return 0;
1483 } 1483 }
1484 1484
1485 /* 1485 /*
1486 * When allocating the initial buffer target we have not yet 1486 * When allocating the initial buffer target we have not yet
1487 * read in the superblock, so don't know what sized sectors 1487 * read in the superblock, so don't know what sized sectors
1488 * are being used at this early stage. Play safe. 1488 * are being used at this early stage. Play safe.
1489 */ 1489 */
1490 STATIC int 1490 STATIC int
1491 xfs_setsize_buftarg_early( 1491 xfs_setsize_buftarg_early(
1492 xfs_buftarg_t *btp, 1492 xfs_buftarg_t *btp,
1493 struct block_device *bdev) 1493 struct block_device *bdev)
1494 { 1494 {
1495 return xfs_setsize_buftarg_flags(btp, 1495 return xfs_setsize_buftarg_flags(btp,
1496 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); 1496 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
1497 } 1497 }
1498 1498
1499 int 1499 int
1500 xfs_setsize_buftarg( 1500 xfs_setsize_buftarg(
1501 xfs_buftarg_t *btp, 1501 xfs_buftarg_t *btp,
1502 unsigned int blocksize, 1502 unsigned int blocksize,
1503 unsigned int sectorsize) 1503 unsigned int sectorsize)
1504 { 1504 {
1505 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1505 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1506 } 1506 }
1507 1507
1508 STATIC int 1508 STATIC int
1509 xfs_mapping_buftarg( 1509 xfs_mapping_buftarg(
1510 xfs_buftarg_t *btp, 1510 xfs_buftarg_t *btp,
1511 struct block_device *bdev) 1511 struct block_device *bdev)
1512 { 1512 {
1513 struct backing_dev_info *bdi; 1513 struct backing_dev_info *bdi;
1514 struct inode *inode; 1514 struct inode *inode;
1515 struct address_space *mapping; 1515 struct address_space *mapping;
1516 static const struct address_space_operations mapping_aops = { 1516 static const struct address_space_operations mapping_aops = {
1517 .sync_page = block_sync_page, 1517 .sync_page = block_sync_page,
1518 .migratepage = fail_migrate_page, 1518 .migratepage = fail_migrate_page,
1519 }; 1519 };
1520 1520
1521 inode = new_inode(bdev->bd_inode->i_sb); 1521 inode = new_inode(bdev->bd_inode->i_sb);
1522 if (!inode) { 1522 if (!inode) {
1523 printk(KERN_WARNING 1523 printk(KERN_WARNING
1524 "XFS: Cannot allocate mapping inode for device %s\n", 1524 "XFS: Cannot allocate mapping inode for device %s\n",
1525 XFS_BUFTARG_NAME(btp)); 1525 XFS_BUFTARG_NAME(btp));
1526 return ENOMEM; 1526 return ENOMEM;
1527 } 1527 }
1528 inode->i_mode = S_IFBLK; 1528 inode->i_mode = S_IFBLK;
1529 inode->i_bdev = bdev; 1529 inode->i_bdev = bdev;
1530 inode->i_rdev = bdev->bd_dev; 1530 inode->i_rdev = bdev->bd_dev;
1531 bdi = blk_get_backing_dev_info(bdev); 1531 bdi = blk_get_backing_dev_info(bdev);
1532 if (!bdi) 1532 if (!bdi)
1533 bdi = &default_backing_dev_info; 1533 bdi = &default_backing_dev_info;
1534 mapping = &inode->i_data; 1534 mapping = &inode->i_data;
1535 mapping->a_ops = &mapping_aops; 1535 mapping->a_ops = &mapping_aops;
1536 mapping->backing_dev_info = bdi; 1536 mapping->backing_dev_info = bdi;
1537 mapping_set_gfp_mask(mapping, GFP_NOFS); 1537 mapping_set_gfp_mask(mapping, GFP_NOFS);
1538 btp->bt_mapping = mapping; 1538 btp->bt_mapping = mapping;
1539 return 0; 1539 return 0;
1540 } 1540 }
1541 1541
1542 STATIC int 1542 STATIC int
1543 xfs_alloc_delwrite_queue( 1543 xfs_alloc_delwrite_queue(
1544 xfs_buftarg_t *btp) 1544 xfs_buftarg_t *btp)
1545 { 1545 {
1546 int error = 0; 1546 int error = 0;
1547 1547
1548 INIT_LIST_HEAD(&btp->bt_list); 1548 INIT_LIST_HEAD(&btp->bt_list);
1549 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1549 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1550 spin_lock_init(&btp->bt_delwrite_lock); 1550 spin_lock_init(&btp->bt_delwrite_lock);
1551 btp->bt_flags = 0; 1551 btp->bt_flags = 0;
1552 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1552 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
1553 if (IS_ERR(btp->bt_task)) { 1553 if (IS_ERR(btp->bt_task)) {
1554 error = PTR_ERR(btp->bt_task); 1554 error = PTR_ERR(btp->bt_task);
1555 goto out_error; 1555 goto out_error;
1556 } 1556 }
1557 xfs_register_buftarg(btp); 1557 xfs_register_buftarg(btp);
1558 out_error: 1558 out_error:
1559 return error; 1559 return error;
1560 } 1560 }
1561 1561
1562 xfs_buftarg_t * 1562 xfs_buftarg_t *
1563 xfs_alloc_buftarg( 1563 xfs_alloc_buftarg(
1564 struct block_device *bdev, 1564 struct block_device *bdev,
1565 int external) 1565 int external)
1566 { 1566 {
1567 xfs_buftarg_t *btp; 1567 xfs_buftarg_t *btp;
1568 1568
1569 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1569 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1570 1570
1571 btp->bt_dev = bdev->bd_dev; 1571 btp->bt_dev = bdev->bd_dev;
1572 btp->bt_bdev = bdev; 1572 btp->bt_bdev = bdev;
1573 if (xfs_setsize_buftarg_early(btp, bdev)) 1573 if (xfs_setsize_buftarg_early(btp, bdev))
1574 goto error; 1574 goto error;
1575 if (xfs_mapping_buftarg(btp, bdev)) 1575 if (xfs_mapping_buftarg(btp, bdev))
1576 goto error; 1576 goto error;
1577 if (xfs_alloc_delwrite_queue(btp)) 1577 if (xfs_alloc_delwrite_queue(btp))
1578 goto error; 1578 goto error;
1579 xfs_alloc_bufhash(btp, external); 1579 xfs_alloc_bufhash(btp, external);
1580 return btp; 1580 return btp;
1581 1581
1582 error: 1582 error:
1583 kmem_free(btp); 1583 kmem_free(btp);
1584 return NULL; 1584 return NULL;
1585 } 1585 }
1586 1586
1587 1587
1588 /* 1588 /*
1589 * Delayed write buffer handling 1589 * Delayed write buffer handling
1590 */ 1590 */
1591 STATIC void 1591 STATIC void
1592 xfs_buf_delwri_queue( 1592 xfs_buf_delwri_queue(
1593 xfs_buf_t *bp, 1593 xfs_buf_t *bp,
1594 int unlock) 1594 int unlock)
1595 { 1595 {
1596 struct list_head *dwq = &bp->b_target->bt_delwrite_queue; 1596 struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
1597 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1597 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1598 1598
1599 XB_TRACE(bp, "delwri_q", (long)unlock); 1599 XB_TRACE(bp, "delwri_q", (long)unlock);
1600 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); 1600 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
1601 1601
1602 spin_lock(dwlk); 1602 spin_lock(dwlk);
1603 /* If already in the queue, dequeue and place at tail */ 1603 /* If already in the queue, dequeue and place at tail */
1604 if (!list_empty(&bp->b_list)) { 1604 if (!list_empty(&bp->b_list)) {
1605 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1605 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1606 if (unlock) 1606 if (unlock)
1607 atomic_dec(&bp->b_hold); 1607 atomic_dec(&bp->b_hold);
1608 list_del(&bp->b_list); 1608 list_del(&bp->b_list);
1609 } 1609 }
1610 1610
1611 bp->b_flags |= _XBF_DELWRI_Q; 1611 bp->b_flags |= _XBF_DELWRI_Q;
1612 list_add_tail(&bp->b_list, dwq); 1612 list_add_tail(&bp->b_list, dwq);
1613 bp->b_queuetime = jiffies; 1613 bp->b_queuetime = jiffies;
1614 spin_unlock(dwlk); 1614 spin_unlock(dwlk);
1615 1615
1616 if (unlock) 1616 if (unlock)
1617 xfs_buf_unlock(bp); 1617 xfs_buf_unlock(bp);
1618 } 1618 }
1619 1619
1620 void 1620 void
1621 xfs_buf_delwri_dequeue( 1621 xfs_buf_delwri_dequeue(
1622 xfs_buf_t *bp) 1622 xfs_buf_t *bp)
1623 { 1623 {
1624 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1624 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1625 int dequeued = 0; 1625 int dequeued = 0;
1626 1626
1627 spin_lock(dwlk); 1627 spin_lock(dwlk);
1628 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { 1628 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1629 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1629 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1630 list_del_init(&bp->b_list); 1630 list_del_init(&bp->b_list);
1631 dequeued = 1; 1631 dequeued = 1;
1632 } 1632 }
1633 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); 1633 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1634 spin_unlock(dwlk); 1634 spin_unlock(dwlk);
1635 1635
1636 if (dequeued) 1636 if (dequeued)
1637 xfs_buf_rele(bp); 1637 xfs_buf_rele(bp);
1638 1638
1639 XB_TRACE(bp, "delwri_dq", (long)dequeued); 1639 XB_TRACE(bp, "delwri_dq", (long)dequeued);
1640 } 1640 }
1641 1641
1642 STATIC void 1642 STATIC void
1643 xfs_buf_runall_queues( 1643 xfs_buf_runall_queues(
1644 struct workqueue_struct *queue) 1644 struct workqueue_struct *queue)
1645 { 1645 {
1646 flush_workqueue(queue); 1646 flush_workqueue(queue);
1647 } 1647 }
1648 1648
1649 STATIC int 1649 STATIC int
1650 xfsbufd_wakeup( 1650 xfsbufd_wakeup(
1651 int priority, 1651 int priority,
1652 gfp_t mask) 1652 gfp_t mask)
1653 { 1653 {
1654 xfs_buftarg_t *btp; 1654 xfs_buftarg_t *btp;
1655 1655
1656 spin_lock(&xfs_buftarg_lock); 1656 spin_lock(&xfs_buftarg_lock);
1657 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1657 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1658 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1658 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1659 continue; 1659 continue;
1660 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1660 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1661 wake_up_process(btp->bt_task); 1661 wake_up_process(btp->bt_task);
1662 } 1662 }
1663 spin_unlock(&xfs_buftarg_lock); 1663 spin_unlock(&xfs_buftarg_lock);
1664 return 0; 1664 return 0;
1665 } 1665 }
1666 1666
1667 /* 1667 /*
1668 * Move as many buffers as specified to the supplied list 1668 * Move as many buffers as specified to the supplied list
1669 * indicating if we skipped any buffers to prevent deadlocks. 1669 * indicating if we skipped any buffers to prevent deadlocks.
1670 */ 1670 */
1671 STATIC int 1671 STATIC int
1672 xfs_buf_delwri_split( 1672 xfs_buf_delwri_split(
1673 xfs_buftarg_t *target, 1673 xfs_buftarg_t *target,
1674 struct list_head *list, 1674 struct list_head *list,
1675 unsigned long age) 1675 unsigned long age)
1676 { 1676 {
1677 xfs_buf_t *bp, *n; 1677 xfs_buf_t *bp, *n;
1678 struct list_head *dwq = &target->bt_delwrite_queue; 1678 struct list_head *dwq = &target->bt_delwrite_queue;
1679 spinlock_t *dwlk = &target->bt_delwrite_lock; 1679 spinlock_t *dwlk = &target->bt_delwrite_lock;
1680 int skipped = 0; 1680 int skipped = 0;
1681 int force; 1681 int force;
1682 1682
1683 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1683 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1684 INIT_LIST_HEAD(list); 1684 INIT_LIST_HEAD(list);
1685 spin_lock(dwlk); 1685 spin_lock(dwlk);
1686 list_for_each_entry_safe(bp, n, dwq, b_list) { 1686 list_for_each_entry_safe(bp, n, dwq, b_list) {
1687 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); 1687 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
1688 ASSERT(bp->b_flags & XBF_DELWRI); 1688 ASSERT(bp->b_flags & XBF_DELWRI);
1689 1689
1690 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1690 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
1691 if (!force && 1691 if (!force &&
1692 time_before(jiffies, bp->b_queuetime + age)) { 1692 time_before(jiffies, bp->b_queuetime + age)) {
1693 xfs_buf_unlock(bp); 1693 xfs_buf_unlock(bp);
1694 break; 1694 break;
1695 } 1695 }
1696 1696
1697 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| 1697 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
1698 _XBF_RUN_QUEUES); 1698 _XBF_RUN_QUEUES);
1699 bp->b_flags |= XBF_WRITE; 1699 bp->b_flags |= XBF_WRITE;
1700 list_move_tail(&bp->b_list, list); 1700 list_move_tail(&bp->b_list, list);
1701 } else 1701 } else
1702 skipped++; 1702 skipped++;
1703 } 1703 }
1704 spin_unlock(dwlk); 1704 spin_unlock(dwlk);
1705 1705
1706 return skipped; 1706 return skipped;
1707 1707
1708 } 1708 }
1709 1709
1710 STATIC int 1710 STATIC int
1711 xfsbufd( 1711 xfsbufd(
1712 void *data) 1712 void *data)
1713 { 1713 {
1714 struct list_head tmp; 1714 struct list_head tmp;
1715 xfs_buftarg_t *target = (xfs_buftarg_t *)data; 1715 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1716 int count; 1716 int count;
1717 xfs_buf_t *bp; 1717 xfs_buf_t *bp;
1718 1718
1719 current->flags |= PF_MEMALLOC; 1719 current->flags |= PF_MEMALLOC;
1720 1720
1721 set_freezable(); 1721 set_freezable();
1722 1722
1723 do { 1723 do {
1724 if (unlikely(freezing(current))) { 1724 if (unlikely(freezing(current))) {
1725 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1725 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1726 refrigerator(); 1726 refrigerator();
1727 } else { 1727 } else {
1728 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1728 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1729 } 1729 }
1730 1730
1731 schedule_timeout_interruptible( 1731 schedule_timeout_interruptible(
1732 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1732 xfs_buf_timer_centisecs * msecs_to_jiffies(10));
1733 1733
1734 xfs_buf_delwri_split(target, &tmp, 1734 xfs_buf_delwri_split(target, &tmp,
1735 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1735 xfs_buf_age_centisecs * msecs_to_jiffies(10));
1736 1736
1737 count = 0; 1737 count = 0;
1738 while (!list_empty(&tmp)) { 1738 while (!list_empty(&tmp)) {
1739 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1739 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1740 ASSERT(target == bp->b_target); 1740 ASSERT(target == bp->b_target);
1741 1741
1742 list_del_init(&bp->b_list); 1742 list_del_init(&bp->b_list);
1743 xfs_buf_iostrategy(bp); 1743 xfs_buf_iostrategy(bp);
1744 count++; 1744 count++;
1745 } 1745 }
1746 1746
1747 if (as_list_len > 0) 1747 if (as_list_len > 0)
1748 purge_addresses(); 1748 purge_addresses();
1749 if (count) 1749 if (count)
1750 blk_run_address_space(target->bt_mapping); 1750 blk_run_address_space(target->bt_mapping);
1751 1751
1752 } while (!kthread_should_stop()); 1752 } while (!kthread_should_stop());
1753 1753
1754 return 0; 1754 return 0;
1755 } 1755 }
1756 1756
1757 /* 1757 /*
1758 * Go through all incore buffers, and release buffers if they belong to 1758 * Go through all incore buffers, and release buffers if they belong to
1759 * the given device. This is used in filesystem error handling to 1759 * the given device. This is used in filesystem error handling to
1760 * preserve the consistency of its metadata. 1760 * preserve the consistency of its metadata.
1761 */ 1761 */
1762 int 1762 int
1763 xfs_flush_buftarg( 1763 xfs_flush_buftarg(
1764 xfs_buftarg_t *target, 1764 xfs_buftarg_t *target,
1765 int wait) 1765 int wait)
1766 { 1766 {
1767 struct list_head tmp; 1767 struct list_head tmp;
1768 xfs_buf_t *bp, *n; 1768 xfs_buf_t *bp, *n;
1769 int pincount = 0; 1769 int pincount = 0;
1770 1770
1771 xfs_buf_runall_queues(xfsdatad_workqueue); 1771 xfs_buf_runall_queues(xfsdatad_workqueue);
1772 xfs_buf_runall_queues(xfslogd_workqueue); 1772 xfs_buf_runall_queues(xfslogd_workqueue);
1773 1773
1774 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1774 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1775 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1775 pincount = xfs_buf_delwri_split(target, &tmp, 0);
1776 1776
1777 /* 1777 /*
1778 * Dropped the delayed write list lock, now walk the temporary list 1778 * Dropped the delayed write list lock, now walk the temporary list
1779 */ 1779 */
1780 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1780 list_for_each_entry_safe(bp, n, &tmp, b_list) {
1781 ASSERT(target == bp->b_target); 1781 ASSERT(target == bp->b_target);
1782 if (wait) 1782 if (wait)
1783 bp->b_flags &= ~XBF_ASYNC; 1783 bp->b_flags &= ~XBF_ASYNC;
1784 else 1784 else
1785 list_del_init(&bp->b_list); 1785 list_del_init(&bp->b_list);
1786 1786
1787 xfs_buf_iostrategy(bp); 1787 xfs_buf_iostrategy(bp);
1788 } 1788 }
1789 1789
1790 if (wait) 1790 if (wait)
1791 blk_run_address_space(target->bt_mapping); 1791 blk_run_address_space(target->bt_mapping);
1792 1792
1793 /* 1793 /*
1794 * Remaining list items must be flushed before returning 1794 * Remaining list items must be flushed before returning
1795 */ 1795 */
1796 while (!list_empty(&tmp)) { 1796 while (!list_empty(&tmp)) {
1797 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1797 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1798 1798
1799 list_del_init(&bp->b_list); 1799 list_del_init(&bp->b_list);
1800 xfs_iowait(bp); 1800 xfs_iowait(bp);
1801 xfs_buf_relse(bp); 1801 xfs_buf_relse(bp);
1802 } 1802 }
1803 1803
1804 return pincount; 1804 return pincount;
1805 } 1805 }
1806 1806
1807 int __init 1807 int __init
1808 xfs_buf_init(void) 1808 xfs_buf_init(void)
1809 { 1809 {
1810 #ifdef XFS_BUF_TRACE 1810 #ifdef XFS_BUF_TRACE
1811 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS); 1811 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
1812 #endif 1812 #endif
1813 1813
1814 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1814 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1815 KM_ZONE_HWALIGN, NULL); 1815 KM_ZONE_HWALIGN, NULL);
1816 if (!xfs_buf_zone) 1816 if (!xfs_buf_zone)
1817 goto out_free_trace_buf; 1817 goto out_free_trace_buf;
1818 1818
1819 xfslogd_workqueue = create_workqueue("xfslogd"); 1819 xfslogd_workqueue = create_workqueue("xfslogd");
1820 if (!xfslogd_workqueue) 1820 if (!xfslogd_workqueue)
1821 goto out_free_buf_zone; 1821 goto out_free_buf_zone;
1822 1822
1823 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1823 xfsdatad_workqueue = create_workqueue("xfsdatad");
1824 if (!xfsdatad_workqueue) 1824 if (!xfsdatad_workqueue)
1825 goto out_destroy_xfslogd_workqueue; 1825 goto out_destroy_xfslogd_workqueue;
1826 1826
1827 register_shrinker(&xfs_buf_shake); 1827 register_shrinker(&xfs_buf_shake);
1828 return 0; 1828 return 0;
1829 1829
1830 out_destroy_xfslogd_workqueue: 1830 out_destroy_xfslogd_workqueue:
1831 destroy_workqueue(xfslogd_workqueue); 1831 destroy_workqueue(xfslogd_workqueue);
1832 out_free_buf_zone: 1832 out_free_buf_zone:
1833 kmem_zone_destroy(xfs_buf_zone); 1833 kmem_zone_destroy(xfs_buf_zone);
1834 out_free_trace_buf: 1834 out_free_trace_buf:
1835 #ifdef XFS_BUF_TRACE 1835 #ifdef XFS_BUF_TRACE
1836 ktrace_free(xfs_buf_trace_buf); 1836 ktrace_free(xfs_buf_trace_buf);
1837 #endif 1837 #endif
1838 return -ENOMEM; 1838 return -ENOMEM;
1839 } 1839 }
1840 1840
1841 void 1841 void
1842 xfs_buf_terminate(void) 1842 xfs_buf_terminate(void)
1843 { 1843 {
1844 unregister_shrinker(&xfs_buf_shake); 1844 unregister_shrinker(&xfs_buf_shake);
1845 destroy_workqueue(xfsdatad_workqueue); 1845 destroy_workqueue(xfsdatad_workqueue);
1846 destroy_workqueue(xfslogd_workqueue); 1846 destroy_workqueue(xfslogd_workqueue);
1847 kmem_zone_destroy(xfs_buf_zone); 1847 kmem_zone_destroy(xfs_buf_zone);
1848 #ifdef XFS_BUF_TRACE 1848 #ifdef XFS_BUF_TRACE
1849 ktrace_free(xfs_buf_trace_buf); 1849 ktrace_free(xfs_buf_trace_buf);
1850 #endif 1850 #endif
1851 } 1851 }
1852 1852
1853 #ifdef CONFIG_KDB_MODULES 1853 #ifdef CONFIG_KDB_MODULES
1854 struct list_head * 1854 struct list_head *
1855 xfs_get_buftarg_list(void) 1855 xfs_get_buftarg_list(void)
1856 { 1856 {
1857 return &xfs_buftarg_list; 1857 return &xfs_buftarg_list;
1858 } 1858 }
1859 #endif 1859 #endif
1860 1860
fs/xfs/linux-2.6/xfs_buf.h
1 /* 1 /*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #ifndef __XFS_BUF_H__ 18 #ifndef __XFS_BUF_H__
19 #define __XFS_BUF_H__ 19 #define __XFS_BUF_H__
20 20
21 #include <linux/list.h> 21 #include <linux/list.h>
22 #include <linux/types.h> 22 #include <linux/types.h>
23 #include <linux/spinlock.h> 23 #include <linux/spinlock.h>
24 #include <asm/system.h> 24 #include <asm/system.h>
25 #include <linux/mm.h> 25 #include <linux/mm.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/buffer_head.h> 27 #include <linux/buffer_head.h>
28 #include <linux/uio.h> 28 #include <linux/uio.h>
29 29
30 /* 30 /*
31 * Base types 31 * Base types
32 */ 32 */
33 33
34 #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) 34 #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
35 35
36 #define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) 36 #define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
37 #define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) 37 #define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
38 #define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) 38 #define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
39 #define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) 39 #define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
40 40
41 typedef enum { 41 typedef enum {
42 XBRW_READ = 1, /* transfer into target memory */ 42 XBRW_READ = 1, /* transfer into target memory */
43 XBRW_WRITE = 2, /* transfer from target memory */ 43 XBRW_WRITE = 2, /* transfer from target memory */
44 XBRW_ZERO = 3, /* Zero target memory */ 44 XBRW_ZERO = 3, /* Zero target memory */
45 } xfs_buf_rw_t; 45 } xfs_buf_rw_t;
46 46
47 typedef enum { 47 typedef enum {
48 XBF_READ = (1 << 0), /* buffer intended for reading from device */ 48 XBF_READ = (1 << 0), /* buffer intended for reading from device */
49 XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ 49 XBF_WRITE = (1 << 1), /* buffer intended for writing to device */
50 XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ 50 XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */
51 XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ 51 XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
52 XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ 52 XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */
53 XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ 53 XBF_DELWRI = (1 << 6), /* buffer has dirty pages */
54 XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ 54 XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */
55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ 55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
56 XBF_ORDERED = (1 << 11), /* use ordered writes */ 56 XBF_ORDERED = (1 << 11), /* use ordered writes */
57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ 57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
58 58
59 /* flags used only as arguments to access routines */ 59 /* flags used only as arguments to access routines */
60 XBF_LOCK = (1 << 14), /* lock requested */ 60 XBF_LOCK = (1 << 14), /* lock requested */
61 XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ 61 XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */
62 XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ 62 XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */
63 63
64 /* flags used only internally */ 64 /* flags used only internally */
65 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ 65 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
66 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ 66 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */
67 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ 67 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
68 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ 68 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
69 69
70 /* 70 /*
71 * Special flag for supporting metadata blocks smaller than a FSB. 71 * Special flag for supporting metadata blocks smaller than a FSB.
72 * 72 *
73 * In this case we can have multiple xfs_buf_t on a single page and 73 * In this case we can have multiple xfs_buf_t on a single page and
74 * need to lock out concurrent xfs_buf_t readers as they only 74 * need to lock out concurrent xfs_buf_t readers as they only
75 * serialise access to the buffer. 75 * serialise access to the buffer.
76 * 76 *
77 * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation 77 * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
78 * between reads of the page. Hence we can have one thread read the 78 * between reads of the page. Hence we can have one thread read the
79 * page and modify it, but then race with another thread that thinks 79 * page and modify it, but then race with another thread that thinks
80 * the page is not up-to-date and hence reads it again. 80 * the page is not up-to-date and hence reads it again.
81 * 81 *
82 * The result is that the first modification to the page is lost. 82 * The result is that the first modification to the page is lost.
83 * This sort of AGF/AGI reading race can happen when unlinking inodes 83 * This sort of AGF/AGI reading race can happen when unlinking inodes
84 * that require truncation and results in the AGI unlinked list 84 * that require truncation and results in the AGI unlinked list
85 * modifications being lost. 85 * modifications being lost.
86 */ 86 */
87 _XBF_PAGE_LOCKED = (1 << 22), 87 _XBF_PAGE_LOCKED = (1 << 22),
88 88
89 /* 89 /*
90 * If we try a barrier write, but it fails we have to communicate 90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten 91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to 92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information. 93 * keep this information.
94 */ 94 */
95 _XFS_BARRIER_FAILED = (1 << 23), 95 _XFS_BARRIER_FAILED = (1 << 23),
96 } xfs_buf_flags_t; 96 } xfs_buf_flags_t;
97 97
98 typedef enum { 98 typedef enum {
99 XBT_FORCE_SLEEP = 0, 99 XBT_FORCE_SLEEP = 0,
100 XBT_FORCE_FLUSH = 1, 100 XBT_FORCE_FLUSH = 1,
101 } xfs_buftarg_flags_t; 101 } xfs_buftarg_flags_t;
102 102
103 typedef struct xfs_bufhash { 103 typedef struct xfs_bufhash {
104 struct list_head bh_list; 104 struct list_head bh_list;
105 spinlock_t bh_lock; 105 spinlock_t bh_lock;
106 } xfs_bufhash_t; 106 } xfs_bufhash_t;
107 107
108 typedef struct xfs_buftarg { 108 typedef struct xfs_buftarg {
109 dev_t bt_dev; 109 dev_t bt_dev;
110 struct block_device *bt_bdev; 110 struct block_device *bt_bdev;
111 struct address_space *bt_mapping; 111 struct address_space *bt_mapping;
112 unsigned int bt_bsize; 112 unsigned int bt_bsize;
113 unsigned int bt_sshift; 113 unsigned int bt_sshift;
114 size_t bt_smask; 114 size_t bt_smask;
115 115
116 /* per device buffer hash table */ 116 /* per device buffer hash table */
117 uint bt_hashmask; 117 uint bt_hashmask;
118 uint bt_hashshift; 118 uint bt_hashshift;
119 xfs_bufhash_t *bt_hash; 119 xfs_bufhash_t *bt_hash;
120 120
121 /* per device delwri queue */ 121 /* per device delwri queue */
122 struct task_struct *bt_task; 122 struct task_struct *bt_task;
123 struct list_head bt_list; 123 struct list_head bt_list;
124 struct list_head bt_delwrite_queue; 124 struct list_head bt_delwrite_queue;
125 spinlock_t bt_delwrite_lock; 125 spinlock_t bt_delwrite_lock;
126 unsigned long bt_flags; 126 unsigned long bt_flags;
127 } xfs_buftarg_t; 127 } xfs_buftarg_t;
128 128
129 /* 129 /*
130 * xfs_buf_t: Buffer structure for pagecache-based buffers 130 * xfs_buf_t: Buffer structure for pagecache-based buffers
131 * 131 *
132 * This buffer structure is used by the pagecache buffer management routines 132 * This buffer structure is used by the pagecache buffer management routines
133 * to refer to an assembly of pages forming a logical buffer. 133 * to refer to an assembly of pages forming a logical buffer.
134 * 134 *
135 * The buffer structure is used on a temporary basis only, and discarded when 135 * The buffer structure is used on a temporary basis only, and discarded when
136 * released. The real data storage is recorded in the pagecache. Buffers are 136 * released. The real data storage is recorded in the pagecache. Buffers are
137 * hashed to the block device on which the file system resides. 137 * hashed to the block device on which the file system resides.
138 */ 138 */
139 139
140 struct xfs_buf; 140 struct xfs_buf;
141 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 141 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
142 typedef void (*xfs_buf_relse_t)(struct xfs_buf *); 142 typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
143 typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); 143 typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
144 144
145 #define XB_PAGES 2 145 #define XB_PAGES 2
146 146
147 typedef struct xfs_buf { 147 typedef struct xfs_buf {
148 struct semaphore b_sema; /* semaphore for lockables */ 148 struct semaphore b_sema; /* semaphore for lockables */
149 unsigned long b_queuetime; /* time buffer was queued */ 149 unsigned long b_queuetime; /* time buffer was queued */
150 atomic_t b_pin_count; /* pin count */ 150 atomic_t b_pin_count; /* pin count */
151 wait_queue_head_t b_waiters; /* unpin waiters */ 151 wait_queue_head_t b_waiters; /* unpin waiters */
152 struct list_head b_list; 152 struct list_head b_list;
153 xfs_buf_flags_t b_flags; /* status flags */ 153 xfs_buf_flags_t b_flags; /* status flags */
154 struct list_head b_hash_list; /* hash table list */ 154 struct list_head b_hash_list; /* hash table list */
155 xfs_bufhash_t *b_hash; /* hash table list start */ 155 xfs_bufhash_t *b_hash; /* hash table list start */
156 xfs_buftarg_t *b_target; /* buffer target (device) */ 156 xfs_buftarg_t *b_target; /* buffer target (device) */
157 atomic_t b_hold; /* reference count */ 157 atomic_t b_hold; /* reference count */
158 xfs_daddr_t b_bn; /* block number for I/O */ 158 xfs_daddr_t b_bn; /* block number for I/O */
159 xfs_off_t b_file_offset; /* offset in file */ 159 xfs_off_t b_file_offset; /* offset in file */
160 size_t b_buffer_length;/* size of buffer in bytes */ 160 size_t b_buffer_length;/* size of buffer in bytes */
161 size_t b_count_desired;/* desired transfer size */ 161 size_t b_count_desired;/* desired transfer size */
162 void *b_addr; /* virtual address of buffer */ 162 void *b_addr; /* virtual address of buffer */
163 struct work_struct b_iodone_work; 163 struct work_struct b_iodone_work;
164 atomic_t b_io_remaining; /* #outstanding I/O requests */ 164 atomic_t b_io_remaining; /* #outstanding I/O requests */
165 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 165 xfs_buf_iodone_t b_iodone; /* I/O completion function */
166 xfs_buf_relse_t b_relse; /* releasing function */ 166 xfs_buf_relse_t b_relse; /* releasing function */
167 xfs_buf_bdstrat_t b_strat; /* pre-write function */ 167 xfs_buf_bdstrat_t b_strat; /* pre-write function */
168 struct completion b_iowait; /* queue for I/O waiters */ 168 struct completion b_iowait; /* queue for I/O waiters */
169 void *b_fspriv; 169 void *b_fspriv;
170 void *b_fspriv2; 170 void *b_fspriv2;
171 void *b_fspriv3; 171 struct xfs_mount *b_mount;
172 unsigned short b_error; /* error code on I/O */ 172 unsigned short b_error; /* error code on I/O */
173 unsigned int b_page_count; /* size of page array */ 173 unsigned int b_page_count; /* size of page array */
174 unsigned int b_offset; /* page offset in first page */ 174 unsigned int b_offset; /* page offset in first page */
175 struct page **b_pages; /* array of page pointers */ 175 struct page **b_pages; /* array of page pointers */
176 struct page *b_page_array[XB_PAGES]; /* inline pages */ 176 struct page *b_page_array[XB_PAGES]; /* inline pages */
177 #ifdef XFS_BUF_LOCK_TRACKING 177 #ifdef XFS_BUF_LOCK_TRACKING
178 int b_last_holder; 178 int b_last_holder;
179 #endif 179 #endif
180 } xfs_buf_t; 180 } xfs_buf_t;
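With the untyped b_fspriv3 slot replaced by the typed struct xfs_mount *b_mount, code that needs the mount can dereference the field directly instead of casting a void pointer. A hypothetical helper, purely for illustration:

	static inline struct xfs_mount *
	example_buf_mount(
		xfs_buf_t	*bp)
	{
		/* b_mount is set by xfs_bawrite()/xfs_bdwrite() before I/O. */
		return bp->b_mount;
	}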
181 181
182 182
183 /* Finding and Reading Buffers */ 183 /* Finding and Reading Buffers */
184 extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, 184 extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
185 xfs_buf_flags_t, xfs_buf_t *); 185 xfs_buf_flags_t, xfs_buf_t *);
186 #define xfs_incore(buftarg,blkno,len,lockit) \ 186 #define xfs_incore(buftarg,blkno,len,lockit) \
187 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) 187 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
188 188
189 extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, 189 extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t,
190 xfs_buf_flags_t); 190 xfs_buf_flags_t);
191 #define xfs_buf_get(target, blkno, len, flags) \ 191 #define xfs_buf_get(target, blkno, len, flags) \
192 xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) 192 xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
193 193
194 extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t, 194 extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t,
195 xfs_buf_flags_t); 195 xfs_buf_flags_t);
196 #define xfs_buf_read(target, blkno, len, flags) \ 196 #define xfs_buf_read(target, blkno, len, flags) \
197 xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) 197 xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
198 198
199 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 199 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
200 extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 200 extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
201 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 201 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
202 extern void xfs_buf_hold(xfs_buf_t *); 202 extern void xfs_buf_hold(xfs_buf_t *);
203 extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 203 extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
204 xfs_buf_flags_t); 204 xfs_buf_flags_t);
205 205
206 /* Releasing Buffers */ 206 /* Releasing Buffers */
207 extern void xfs_buf_free(xfs_buf_t *); 207 extern void xfs_buf_free(xfs_buf_t *);
208 extern void xfs_buf_rele(xfs_buf_t *); 208 extern void xfs_buf_rele(xfs_buf_t *);
209 209
210 /* Locking and Unlocking Buffers */ 210 /* Locking and Unlocking Buffers */
211 extern int xfs_buf_cond_lock(xfs_buf_t *); 211 extern int xfs_buf_cond_lock(xfs_buf_t *);
212 extern int xfs_buf_lock_value(xfs_buf_t *); 212 extern int xfs_buf_lock_value(xfs_buf_t *);
213 extern void xfs_buf_lock(xfs_buf_t *); 213 extern void xfs_buf_lock(xfs_buf_t *);
214 extern void xfs_buf_unlock(xfs_buf_t *); 214 extern void xfs_buf_unlock(xfs_buf_t *);
215 215
216 /* Buffer Read and Write Routines */ 216 /* Buffer Read and Write Routines */
217 extern int xfs_bawrite(void *mp, xfs_buf_t *bp); 217 extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
218 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); 218 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
219 extern void xfs_buf_ioend(xfs_buf_t *, int); 219 extern void xfs_buf_ioend(xfs_buf_t *, int);
220 extern void xfs_buf_ioerror(xfs_buf_t *, int); 220 extern void xfs_buf_ioerror(xfs_buf_t *, int);
221 extern int xfs_buf_iorequest(xfs_buf_t *); 221 extern int xfs_buf_iorequest(xfs_buf_t *);
222 extern int xfs_buf_iowait(xfs_buf_t *); 222 extern int xfs_buf_iowait(xfs_buf_t *);
223 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 223 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
224 xfs_buf_rw_t); 224 xfs_buf_rw_t);
225 225
226 static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 226 static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
227 { 227 {
228 return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp); 228 return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
229 } 229 }
230 230
231 static inline int xfs_buf_geterror(xfs_buf_t *bp) 231 static inline int xfs_buf_geterror(xfs_buf_t *bp)
232 { 232 {
233 return bp ? bp->b_error : ENOMEM; 233 return bp ? bp->b_error : ENOMEM;
234 } 234 }
235 235
236 /* Buffer Utility Routines */ 236 /* Buffer Utility Routines */
237 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 237 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
238 238
239 /* Pinning Buffer Storage in Memory */ 239 /* Pinning Buffer Storage in Memory */
240 extern void xfs_buf_pin(xfs_buf_t *); 240 extern void xfs_buf_pin(xfs_buf_t *);
241 extern void xfs_buf_unpin(xfs_buf_t *); 241 extern void xfs_buf_unpin(xfs_buf_t *);
242 extern int xfs_buf_ispin(xfs_buf_t *); 242 extern int xfs_buf_ispin(xfs_buf_t *);
243 243
244 /* Delayed Write Buffer Routines */ 244 /* Delayed Write Buffer Routines */
245 extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 245 extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
246 246
247 /* Buffer Daemon Setup Routines */ 247 /* Buffer Daemon Setup Routines */
248 extern int xfs_buf_init(void); 248 extern int xfs_buf_init(void);
249 extern void xfs_buf_terminate(void); 249 extern void xfs_buf_terminate(void);
250 250
251 #ifdef XFS_BUF_TRACE 251 #ifdef XFS_BUF_TRACE
252 extern ktrace_t *xfs_buf_trace_buf; 252 extern ktrace_t *xfs_buf_trace_buf;
253 extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); 253 extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
254 #else 254 #else
255 #define xfs_buf_trace(bp,id,ptr,ra) do { } while (0) 255 #define xfs_buf_trace(bp,id,ptr,ra) do { } while (0)
256 #endif 256 #endif
257 257
258 #define xfs_buf_target_name(target) \ 258 #define xfs_buf_target_name(target) \
259 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 259 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
260 260
261 261
262 #define XFS_B_ASYNC XBF_ASYNC 262 #define XFS_B_ASYNC XBF_ASYNC
263 #define XFS_B_DELWRI XBF_DELWRI 263 #define XFS_B_DELWRI XBF_DELWRI
264 #define XFS_B_READ XBF_READ 264 #define XFS_B_READ XBF_READ
265 #define XFS_B_WRITE XBF_WRITE 265 #define XFS_B_WRITE XBF_WRITE
266 #define XFS_B_STALE XBF_STALE 266 #define XFS_B_STALE XBF_STALE
267 267
268 #define XFS_BUF_TRYLOCK XBF_TRYLOCK 268 #define XFS_BUF_TRYLOCK XBF_TRYLOCK
269 #define XFS_INCORE_TRYLOCK XBF_TRYLOCK 269 #define XFS_INCORE_TRYLOCK XBF_TRYLOCK
270 #define XFS_BUF_LOCK XBF_LOCK 270 #define XFS_BUF_LOCK XBF_LOCK
271 #define XFS_BUF_MAPPED XBF_MAPPED 271 #define XFS_BUF_MAPPED XBF_MAPPED
272 272
273 #define BUF_BUSY XBF_DONT_BLOCK 273 #define BUF_BUSY XBF_DONT_BLOCK
274 274
275 #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 275 #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
276 #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 276 #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
277 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 277 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
278 278
279 #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 279 #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE)
280 #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 280 #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE)
281 #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) 281 #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE)
282 #define XFS_BUF_SUPER_STALE(bp) do { \ 282 #define XFS_BUF_SUPER_STALE(bp) do { \
283 XFS_BUF_STALE(bp); \ 283 XFS_BUF_STALE(bp); \
284 xfs_buf_delwri_dequeue(bp); \ 284 xfs_buf_delwri_dequeue(bp); \
285 XFS_BUF_DONE(bp); \ 285 XFS_BUF_DONE(bp); \
286 } while (0) 286 } while (0)
287 287
288 #define XFS_BUF_MANAGE XBF_FS_MANAGED 288 #define XFS_BUF_MANAGE XBF_FS_MANAGED
289 #define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) 289 #define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
290 290
291 #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 291 #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
292 #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 292 #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
293 #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 293 #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
294 294
295 #define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) 295 #define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no)
296 #define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) 296 #define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp)
297 #define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) 297 #define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0)
298 298
299 #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) 299 #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
300 #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) 300 #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
301 #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) 301 #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
302 302
303 #define XFS_BUF_BUSY(bp) do { } while (0) 303 #define XFS_BUF_BUSY(bp) do { } while (0)
304 #define XFS_BUF_UNBUSY(bp) do { } while (0) 304 #define XFS_BUF_UNBUSY(bp) do { } while (0)
305 #define XFS_BUF_ISBUSY(bp) (1) 305 #define XFS_BUF_ISBUSY(bp) (1)
306 306
307 #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) 307 #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
308 #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) 308 #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
309 #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) 309 #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
310 310
311 #define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED) 311 #define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED)
312 #define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) 312 #define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
313 #define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) 313 #define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
314 314
315 #define XFS_BUF_SHUT(bp) do { } while (0) 315 #define XFS_BUF_SHUT(bp) do { } while (0)
316 #define XFS_BUF_UNSHUT(bp) do { } while (0) 316 #define XFS_BUF_UNSHUT(bp) do { } while (0)
317 #define XFS_BUF_ISSHUT(bp) (0) 317 #define XFS_BUF_ISSHUT(bp) (0)
318 318
319 #define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) 319 #define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
320 #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 320 #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
321 #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 321 #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
322 #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) 322 #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
323 323
324 #define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) 324 #define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
325 #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) 325 #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
326 #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) 326 #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
327 327
328 #define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) 328 #define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
329 #define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) 329 #define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
330 #define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) 330 #define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
331 #define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func)) 331 #define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func))
332 #define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL) 332 #define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL)
333 333
334 #define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) 334 #define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv)
335 #define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 335 #define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
336 #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 336 #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
337 #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 337 #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
338 #define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3)
339 #define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val))
340 #define XFS_BUF_SET_START(bp) do { } while (0) 338 #define XFS_BUF_SET_START(bp) do { } while (0)
341 #define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) 339 #define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
342 340
343 #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 341 #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
344 #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 342 #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
345 #define XFS_BUF_ADDR(bp) ((bp)->b_bn) 343 #define XFS_BUF_ADDR(bp) ((bp)->b_bn)
346 #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) 344 #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
347 #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) 345 #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
348 #define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) 346 #define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off))
349 #define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) 347 #define XFS_BUF_COUNT(bp) ((bp)->b_count_desired)
350 #define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) 348 #define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt))
351 #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 349 #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
352 #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 350 #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
353 351
354 #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 352 #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0)
355 #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 353 #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
356 #define XFS_BUF_SET_REF(bp, ref) do { } while (0) 354 #define XFS_BUF_SET_REF(bp, ref) do { } while (0)
357 355
358 #define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) 356 #define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp)
359 357
360 #define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) 358 #define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp)
361 #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) 359 #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
362 #define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) 360 #define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp)
363 #define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) 361 #define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp)
364 #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); 362 #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
365 363
366 #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) 364 #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
367 #define XFS_BUF_TARGET(bp) ((bp)->b_target) 365 #define XFS_BUF_TARGET(bp) ((bp)->b_target)
368 #define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) 366 #define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
369 367
370 static inline void xfs_buf_relse(xfs_buf_t *bp) 368 static inline void xfs_buf_relse(xfs_buf_t *bp)
371 { 369 {
372 if (!bp->b_relse) 370 if (!bp->b_relse)
373 xfs_buf_unlock(bp); 371 xfs_buf_unlock(bp);
374 xfs_buf_rele(bp); 372 xfs_buf_rele(bp);
375 } 373 }
376 374
377 #define xfs_bpin(bp) xfs_buf_pin(bp) 375 #define xfs_bpin(bp) xfs_buf_pin(bp)
378 #define xfs_bunpin(bp) xfs_buf_unpin(bp) 376 #define xfs_bunpin(bp) xfs_buf_unpin(bp)
379 377
380 #define xfs_buftrace(id, bp) \ 378 #define xfs_buftrace(id, bp) \
381 xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) 379 xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
382 380
383 #define xfs_biodone(bp) xfs_buf_ioend(bp, 0) 381 #define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
384 382
385 #define xfs_biomove(bp, off, len, data, rw) \ 383 #define xfs_biomove(bp, off, len, data, rw) \
386 xfs_buf_iomove((bp), (off), (len), (data), \ 384 xfs_buf_iomove((bp), (off), (len), (data), \
387 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) 385 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ)
388 386
389 #define xfs_biozero(bp, off, len) \ 387 #define xfs_biozero(bp, off, len) \
390 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 388 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
391 389
392 390
393 static inline int XFS_bwrite(xfs_buf_t *bp) 391 static inline int XFS_bwrite(xfs_buf_t *bp)
394 { 392 {
395 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 393 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
396 int error = 0; 394 int error = 0;
397 395
398 if (!iowait) 396 if (!iowait)
399 bp->b_flags |= _XBF_RUN_QUEUES; 397 bp->b_flags |= _XBF_RUN_QUEUES;
400 398
401 xfs_buf_delwri_dequeue(bp); 399 xfs_buf_delwri_dequeue(bp);
402 xfs_buf_iostrategy(bp); 400 xfs_buf_iostrategy(bp);
403 if (iowait) { 401 if (iowait) {
404 error = xfs_buf_iowait(bp); 402 error = xfs_buf_iowait(bp);
405 xfs_buf_relse(bp); 403 xfs_buf_relse(bp);
406 } 404 }
407 return error; 405 return error;
408 } 406 }
409 407
410 #define XFS_bdstrat(bp) xfs_buf_iorequest(bp) 408 #define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
411 409
412 #define xfs_iowait(bp) xfs_buf_iowait(bp) 410 #define xfs_iowait(bp) xfs_buf_iowait(bp)
413 411
414 #define xfs_baread(target, rablkno, ralen) \ 412 #define xfs_baread(target, rablkno, ralen) \
415 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK) 413 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
416 414
417 415
418 /* 416 /*
419 * Handling of buftargs. 417 * Handling of buftargs.
420 */ 418 */
421 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 419 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
422 extern void xfs_free_buftarg(xfs_buftarg_t *); 420 extern void xfs_free_buftarg(xfs_buftarg_t *);
423 extern void xfs_wait_buftarg(xfs_buftarg_t *); 421 extern void xfs_wait_buftarg(xfs_buftarg_t *);
424 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 422 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
425 extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 423 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
426 #ifdef CONFIG_KDB_MODULES 424 #ifdef CONFIG_KDB_MODULES
427 extern struct list_head *xfs_get_buftarg_list(void); 425 extern struct list_head *xfs_get_buftarg_list(void);
428 #endif 426 #endif
429 427
430 #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 428 #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
431 #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 429 #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
432 430
433 #define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) 431 #define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1)
434 #define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) 432 #define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1)
435 433
436 #endif /* __XFS_BUF_H__ */ 434 #endif /* __XFS_BUF_H__ */
437 435
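The compat macros and inline helpers above are easiest to see in context, so here is a minimal illustrative sketch of how a caller could drive a synchronous write through this interface. It is not part of this commit: the helper name example_write_block and its arguments are hypothetical, and only calls and macros declared in xfs_buf.h above are used.

/*
 * Illustrative sketch only: get a locked, mapped buffer, mark it for a
 * synchronous write, and let XFS_bwrite() dequeue any delayed write,
 * issue the I/O, wait for completion and release the buffer.
 */
STATIC int
example_write_block(
	xfs_buftarg_t	*target,
	xfs_off_t	blkno,
	size_t		len)
{
	xfs_buf_t	*bp;

	bp = xfs_buf_get(target, blkno, len, XBF_LOCK | XBF_MAPPED);
	if (!bp)
		return ENOMEM;

	XFS_BUF_WRITE(bp);	/* b_flags |= XBF_WRITE */
	XFS_BUF_UNASYNC(bp);	/* keep it synchronous so XFS_bwrite() waits */

	/* propagates any error reported by xfs_buf_iowait() */
	return XFS_bwrite(bp);
}

Because the buffer is not marked XBF_ASYNC, XFS_bwrite() takes the iowait/relse path shown in the inline above; an asynchronous caller would set XFS_BUF_ASYNC(bp) before the call and XFS_bwrite() would return without waiting.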
fs/xfs/linux-2.6/xfs_lrw.c
1 /* 1 /*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_bit.h" 20 #include "xfs_bit.h"
21 #include "xfs_log.h" 21 #include "xfs_log.h"
22 #include "xfs_inum.h" 22 #include "xfs_inum.h"
23 #include "xfs_trans.h" 23 #include "xfs_trans.h"
24 #include "xfs_sb.h" 24 #include "xfs_sb.h"
25 #include "xfs_ag.h" 25 #include "xfs_ag.h"
26 #include "xfs_dir2.h" 26 #include "xfs_dir2.h"
27 #include "xfs_alloc.h" 27 #include "xfs_alloc.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_quota.h" 29 #include "xfs_quota.h"
30 #include "xfs_mount.h" 30 #include "xfs_mount.h"
31 #include "xfs_bmap_btree.h" 31 #include "xfs_bmap_btree.h"
32 #include "xfs_alloc_btree.h" 32 #include "xfs_alloc_btree.h"
33 #include "xfs_ialloc_btree.h" 33 #include "xfs_ialloc_btree.h"
34 #include "xfs_dir2_sf.h" 34 #include "xfs_dir2_sf.h"
35 #include "xfs_attr_sf.h" 35 #include "xfs_attr_sf.h"
36 #include "xfs_dinode.h" 36 #include "xfs_dinode.h"
37 #include "xfs_inode.h" 37 #include "xfs_inode.h"
38 #include "xfs_bmap.h" 38 #include "xfs_bmap.h"
39 #include "xfs_btree.h" 39 #include "xfs_btree.h"
40 #include "xfs_ialloc.h" 40 #include "xfs_ialloc.h"
41 #include "xfs_rtalloc.h" 41 #include "xfs_rtalloc.h"
42 #include "xfs_error.h" 42 #include "xfs_error.h"
43 #include "xfs_itable.h" 43 #include "xfs_itable.h"
44 #include "xfs_rw.h" 44 #include "xfs_rw.h"
45 #include "xfs_acl.h" 45 #include "xfs_acl.h"
46 #include "xfs_attr.h" 46 #include "xfs_attr.h"
47 #include "xfs_inode_item.h" 47 #include "xfs_inode_item.h"
48 #include "xfs_buf_item.h" 48 #include "xfs_buf_item.h"
49 #include "xfs_utils.h" 49 #include "xfs_utils.h"
50 #include "xfs_iomap.h" 50 #include "xfs_iomap.h"
51 #include "xfs_vnodeops.h" 51 #include "xfs_vnodeops.h"
52 52
53 #include <linux/capability.h> 53 #include <linux/capability.h>
54 #include <linux/writeback.h> 54 #include <linux/writeback.h>
55 55
56 56
57 #if defined(XFS_RW_TRACE) 57 #if defined(XFS_RW_TRACE)
58 void 58 void
59 xfs_rw_enter_trace( 59 xfs_rw_enter_trace(
60 int tag, 60 int tag,
61 xfs_inode_t *ip, 61 xfs_inode_t *ip,
62 void *data, 62 void *data,
63 size_t segs, 63 size_t segs,
64 loff_t offset, 64 loff_t offset,
65 int ioflags) 65 int ioflags)
66 { 66 {
67 if (ip->i_rwtrace == NULL) 67 if (ip->i_rwtrace == NULL)
68 return; 68 return;
69 ktrace_enter(ip->i_rwtrace, 69 ktrace_enter(ip->i_rwtrace,
70 (void *)(unsigned long)tag, 70 (void *)(unsigned long)tag,
71 (void *)ip, 71 (void *)ip,
72 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), 72 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
73 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), 73 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
74 (void *)data, 74 (void *)data,
75 (void *)((unsigned long)segs), 75 (void *)((unsigned long)segs),
76 (void *)((unsigned long)((offset >> 32) & 0xffffffff)), 76 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
77 (void *)((unsigned long)(offset & 0xffffffff)), 77 (void *)((unsigned long)(offset & 0xffffffff)),
78 (void *)((unsigned long)ioflags), 78 (void *)((unsigned long)ioflags),
79 (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)), 79 (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
80 (void *)((unsigned long)(ip->i_new_size & 0xffffffff)), 80 (void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
81 (void *)((unsigned long)current_pid()), 81 (void *)((unsigned long)current_pid()),
82 (void *)NULL, 82 (void *)NULL,
83 (void *)NULL, 83 (void *)NULL,
84 (void *)NULL, 84 (void *)NULL,
85 (void *)NULL); 85 (void *)NULL);
86 } 86 }
87 87
88 void 88 void
89 xfs_inval_cached_trace( 89 xfs_inval_cached_trace(
90 xfs_inode_t *ip, 90 xfs_inode_t *ip,
91 xfs_off_t offset, 91 xfs_off_t offset,
92 xfs_off_t len, 92 xfs_off_t len,
93 xfs_off_t first, 93 xfs_off_t first,
94 xfs_off_t last) 94 xfs_off_t last)
95 { 95 {
96 96
97 if (ip->i_rwtrace == NULL) 97 if (ip->i_rwtrace == NULL)
98 return; 98 return;
99 ktrace_enter(ip->i_rwtrace, 99 ktrace_enter(ip->i_rwtrace,
100 (void *)(__psint_t)XFS_INVAL_CACHED, 100 (void *)(__psint_t)XFS_INVAL_CACHED,
101 (void *)ip, 101 (void *)ip,
102 (void *)((unsigned long)((offset >> 32) & 0xffffffff)), 102 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
103 (void *)((unsigned long)(offset & 0xffffffff)), 103 (void *)((unsigned long)(offset & 0xffffffff)),
104 (void *)((unsigned long)((len >> 32) & 0xffffffff)), 104 (void *)((unsigned long)((len >> 32) & 0xffffffff)),
105 (void *)((unsigned long)(len & 0xffffffff)), 105 (void *)((unsigned long)(len & 0xffffffff)),
106 (void *)((unsigned long)((first >> 32) & 0xffffffff)), 106 (void *)((unsigned long)((first >> 32) & 0xffffffff)),
107 (void *)((unsigned long)(first & 0xffffffff)), 107 (void *)((unsigned long)(first & 0xffffffff)),
108 (void *)((unsigned long)((last >> 32) & 0xffffffff)), 108 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
109 (void *)((unsigned long)(last & 0xffffffff)), 109 (void *)((unsigned long)(last & 0xffffffff)),
110 (void *)((unsigned long)current_pid()), 110 (void *)((unsigned long)current_pid()),
111 (void *)NULL, 111 (void *)NULL,
112 (void *)NULL, 112 (void *)NULL,
113 (void *)NULL, 113 (void *)NULL,
114 (void *)NULL, 114 (void *)NULL,
115 (void *)NULL); 115 (void *)NULL);
116 } 116 }
117 #endif 117 #endif
118 118
119 /* 119 /*
120 * xfs_iozero 120 * xfs_iozero
121 * 121 *
122 * xfs_iozero clears the specified range of buffer supplied, 122 * xfs_iozero clears the specified range of buffer supplied,
123 * and marks all the affected blocks as valid and modified. If 123 * and marks all the affected blocks as valid and modified. If
124 * an affected block is not allocated, it will be allocated. If 124 * an affected block is not allocated, it will be allocated. If
125 * an affected block is not completely overwritten, and is not 125 * an affected block is not completely overwritten, and is not
126 * valid before the operation, it will be read from disk before 126 * valid before the operation, it will be read from disk before
127 * being partially zeroed. 127 * being partially zeroed.
128 */ 128 */
129 STATIC int 129 STATIC int
130 xfs_iozero( 130 xfs_iozero(
131 struct xfs_inode *ip, /* inode */ 131 struct xfs_inode *ip, /* inode */
132 loff_t pos, /* offset in file */ 132 loff_t pos, /* offset in file */
133 size_t count) /* size of data to zero */ 133 size_t count) /* size of data to zero */
134 { 134 {
135 struct page *page; 135 struct page *page;
136 struct address_space *mapping; 136 struct address_space *mapping;
137 int status; 137 int status;
138 138
139 mapping = VFS_I(ip)->i_mapping; 139 mapping = VFS_I(ip)->i_mapping;
140 do { 140 do {
141 unsigned offset, bytes; 141 unsigned offset, bytes;
142 void *fsdata; 142 void *fsdata;
143 143
144 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 144 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
145 bytes = PAGE_CACHE_SIZE - offset; 145 bytes = PAGE_CACHE_SIZE - offset;
146 if (bytes > count) 146 if (bytes > count)
147 bytes = count; 147 bytes = count;
148 148
149 status = pagecache_write_begin(NULL, mapping, pos, bytes, 149 status = pagecache_write_begin(NULL, mapping, pos, bytes,
150 AOP_FLAG_UNINTERRUPTIBLE, 150 AOP_FLAG_UNINTERRUPTIBLE,
151 &page, &fsdata); 151 &page, &fsdata);
152 if (status) 152 if (status)
153 break; 153 break;
154 154
155 zero_user(page, offset, bytes); 155 zero_user(page, offset, bytes);
156 156
157 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, 157 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
158 page, fsdata); 158 page, fsdata);
159 WARN_ON(status <= 0); /* can't return less than zero! */ 159 WARN_ON(status <= 0); /* can't return less than zero! */
160 pos += bytes; 160 pos += bytes;
161 count -= bytes; 161 count -= bytes;
162 status = 0; 162 status = 0;
163 } while (count); 163 } while (count);
164 164
165 return (-status); 165 return (-status);
166 } 166 }
167 167
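As a worked example of the xfs_iozero() loop above (assuming PAGE_CACHE_SIZE is 4096): zeroing count = 6000 bytes at pos = 10000 first handles the partial page, offset = 10000 & 4095 = 1808 and bytes = 4096 - 1808 = 2288, then continues page-aligned at pos = 12288 with offset = 0 and zeroes the remaining 3712 bytes. Each pass goes through pagecache_write_begin()/zero_user()/pagecache_write_end(), so the affected blocks end up valid and dirty in the page cache rather than being written out synchronously.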
168 ssize_t /* bytes read, or (-) error */ 168 ssize_t /* bytes read, or (-) error */
169 xfs_read( 169 xfs_read(
170 xfs_inode_t *ip, 170 xfs_inode_t *ip,
171 struct kiocb *iocb, 171 struct kiocb *iocb,
172 const struct iovec *iovp, 172 const struct iovec *iovp,
173 unsigned int segs, 173 unsigned int segs,
174 loff_t *offset, 174 loff_t *offset,
175 int ioflags) 175 int ioflags)
176 { 176 {
177 struct file *file = iocb->ki_filp; 177 struct file *file = iocb->ki_filp;
178 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
179 xfs_mount_t *mp = ip->i_mount; 179 xfs_mount_t *mp = ip->i_mount;
180 size_t size = 0; 180 size_t size = 0;
181 ssize_t ret = 0; 181 ssize_t ret = 0;
182 xfs_fsize_t n; 182 xfs_fsize_t n;
183 unsigned long seg; 183 unsigned long seg;
184 184
185 185
186 XFS_STATS_INC(xs_read_calls); 186 XFS_STATS_INC(xs_read_calls);
187 187
188 /* START copy & waste from filemap.c */ 188 /* START copy & waste from filemap.c */
189 for (seg = 0; seg < segs; seg++) { 189 for (seg = 0; seg < segs; seg++) {
190 const struct iovec *iv = &iovp[seg]; 190 const struct iovec *iv = &iovp[seg];
191 191
192 /* 192 /*
193 * If any segment has a negative length, or the cumulative 193 * If any segment has a negative length, or the cumulative
194 * length ever wraps negative then return -EINVAL. 194 * length ever wraps negative then return -EINVAL.
195 */ 195 */
196 size += iv->iov_len; 196 size += iv->iov_len;
197 if (unlikely((ssize_t)(size|iv->iov_len) < 0)) 197 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
198 return XFS_ERROR(-EINVAL); 198 return XFS_ERROR(-EINVAL);
199 } 199 }
200 /* END copy & waste from filemap.c */ 200 /* END copy & waste from filemap.c */
201 201
202 if (unlikely(ioflags & IO_ISDIRECT)) { 202 if (unlikely(ioflags & IO_ISDIRECT)) {
203 xfs_buftarg_t *target = 203 xfs_buftarg_t *target =
204 XFS_IS_REALTIME_INODE(ip) ? 204 XFS_IS_REALTIME_INODE(ip) ?
205 mp->m_rtdev_targp : mp->m_ddev_targp; 205 mp->m_rtdev_targp : mp->m_ddev_targp;
206 if ((*offset & target->bt_smask) || 206 if ((*offset & target->bt_smask) ||
207 (size & target->bt_smask)) { 207 (size & target->bt_smask)) {
208 if (*offset == ip->i_size) { 208 if (*offset == ip->i_size) {
209 return (0); 209 return (0);
210 } 210 }
211 return -XFS_ERROR(EINVAL); 211 return -XFS_ERROR(EINVAL);
212 } 212 }
213 } 213 }
214 214
215 n = XFS_MAXIOFFSET(mp) - *offset; 215 n = XFS_MAXIOFFSET(mp) - *offset;
216 if ((n <= 0) || (size == 0)) 216 if ((n <= 0) || (size == 0))
217 return 0; 217 return 0;
218 218
219 if (n < size) 219 if (n < size)
220 size = n; 220 size = n;
221 221
222 if (XFS_FORCED_SHUTDOWN(mp)) 222 if (XFS_FORCED_SHUTDOWN(mp))
223 return -EIO; 223 return -EIO;
224 224
225 if (unlikely(ioflags & IO_ISDIRECT)) 225 if (unlikely(ioflags & IO_ISDIRECT))
226 mutex_lock(&inode->i_mutex); 226 mutex_lock(&inode->i_mutex);
227 xfs_ilock(ip, XFS_IOLOCK_SHARED); 227 xfs_ilock(ip, XFS_IOLOCK_SHARED);
228 228
229 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { 229 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
230 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); 230 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
231 int iolock = XFS_IOLOCK_SHARED; 231 int iolock = XFS_IOLOCK_SHARED;
232 232
233 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, 233 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
234 dmflags, &iolock); 234 dmflags, &iolock);
235 if (ret) { 235 if (ret) {
236 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 236 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
237 if (unlikely(ioflags & IO_ISDIRECT)) 237 if (unlikely(ioflags & IO_ISDIRECT))
238 mutex_unlock(&inode->i_mutex); 238 mutex_unlock(&inode->i_mutex);
239 return ret; 239 return ret;
240 } 240 }
241 } 241 }
242 242
243 if (unlikely(ioflags & IO_ISDIRECT)) { 243 if (unlikely(ioflags & IO_ISDIRECT)) {
244 if (inode->i_mapping->nrpages) 244 if (inode->i_mapping->nrpages)
245 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), 245 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
246 -1, FI_REMAPF_LOCKED); 246 -1, FI_REMAPF_LOCKED);
247 mutex_unlock(&inode->i_mutex); 247 mutex_unlock(&inode->i_mutex);
248 if (ret) { 248 if (ret) {
249 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 249 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
250 return ret; 250 return ret;
251 } 251 }
252 } 252 }
253 253
254 xfs_rw_enter_trace(XFS_READ_ENTER, ip, 254 xfs_rw_enter_trace(XFS_READ_ENTER, ip,
255 (void *)iovp, segs, *offset, ioflags); 255 (void *)iovp, segs, *offset, ioflags);
256 256
257 iocb->ki_pos = *offset; 257 iocb->ki_pos = *offset;
258 ret = generic_file_aio_read(iocb, iovp, segs, *offset); 258 ret = generic_file_aio_read(iocb, iovp, segs, *offset);
259 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) 259 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
260 ret = wait_on_sync_kiocb(iocb); 260 ret = wait_on_sync_kiocb(iocb);
261 if (ret > 0) 261 if (ret > 0)
262 XFS_STATS_ADD(xs_read_bytes, ret); 262 XFS_STATS_ADD(xs_read_bytes, ret);
263 263
264 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 264 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
265 return ret; 265 return ret;
266 } 266 }
267 267
268 ssize_t 268 ssize_t
269 xfs_splice_read( 269 xfs_splice_read(
270 xfs_inode_t *ip, 270 xfs_inode_t *ip,
271 struct file *infilp, 271 struct file *infilp,
272 loff_t *ppos, 272 loff_t *ppos,
273 struct pipe_inode_info *pipe, 273 struct pipe_inode_info *pipe,
274 size_t count, 274 size_t count,
275 int flags, 275 int flags,
276 int ioflags) 276 int ioflags)
277 { 277 {
278 xfs_mount_t *mp = ip->i_mount; 278 xfs_mount_t *mp = ip->i_mount;
279 ssize_t ret; 279 ssize_t ret;
280 280
281 XFS_STATS_INC(xs_read_calls); 281 XFS_STATS_INC(xs_read_calls);
282 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 282 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
283 return -EIO; 283 return -EIO;
284 284
285 xfs_ilock(ip, XFS_IOLOCK_SHARED); 285 xfs_ilock(ip, XFS_IOLOCK_SHARED);
286 286
287 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { 287 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
288 int iolock = XFS_IOLOCK_SHARED; 288 int iolock = XFS_IOLOCK_SHARED;
289 int error; 289 int error;
290 290
291 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, 291 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
292 FILP_DELAY_FLAG(infilp), &iolock); 292 FILP_DELAY_FLAG(infilp), &iolock);
293 if (error) { 293 if (error) {
294 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 294 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
295 return -error; 295 return -error;
296 } 296 }
297 } 297 }
298 xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip, 298 xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip,
299 pipe, count, *ppos, ioflags); 299 pipe, count, *ppos, ioflags);
300 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 300 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
301 if (ret > 0) 301 if (ret > 0)
302 XFS_STATS_ADD(xs_read_bytes, ret); 302 XFS_STATS_ADD(xs_read_bytes, ret);
303 303
304 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 304 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
305 return ret; 305 return ret;
306 } 306 }
307 307
308 ssize_t 308 ssize_t
309 xfs_splice_write( 309 xfs_splice_write(
310 xfs_inode_t *ip, 310 xfs_inode_t *ip,
311 struct pipe_inode_info *pipe, 311 struct pipe_inode_info *pipe,
312 struct file *outfilp, 312 struct file *outfilp,
313 loff_t *ppos, 313 loff_t *ppos,
314 size_t count, 314 size_t count,
315 int flags, 315 int flags,
316 int ioflags) 316 int ioflags)
317 { 317 {
318 xfs_mount_t *mp = ip->i_mount; 318 xfs_mount_t *mp = ip->i_mount;
319 ssize_t ret; 319 ssize_t ret;
320 struct inode *inode = outfilp->f_mapping->host; 320 struct inode *inode = outfilp->f_mapping->host;
321 xfs_fsize_t isize, new_size; 321 xfs_fsize_t isize, new_size;
322 322
323 XFS_STATS_INC(xs_write_calls); 323 XFS_STATS_INC(xs_write_calls);
324 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 324 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
325 return -EIO; 325 return -EIO;
326 326
327 xfs_ilock(ip, XFS_IOLOCK_EXCL); 327 xfs_ilock(ip, XFS_IOLOCK_EXCL);
328 328
329 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { 329 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
330 int iolock = XFS_IOLOCK_EXCL; 330 int iolock = XFS_IOLOCK_EXCL;
331 int error; 331 int error;
332 332
333 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, 333 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
334 FILP_DELAY_FLAG(outfilp), &iolock); 334 FILP_DELAY_FLAG(outfilp), &iolock);
335 if (error) { 335 if (error) {
336 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 336 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
337 return -error; 337 return -error;
338 } 338 }
339 } 339 }
340 340
341 new_size = *ppos + count; 341 new_size = *ppos + count;
342 342
343 xfs_ilock(ip, XFS_ILOCK_EXCL); 343 xfs_ilock(ip, XFS_ILOCK_EXCL);
344 if (new_size > ip->i_size) 344 if (new_size > ip->i_size)
345 ip->i_new_size = new_size; 345 ip->i_new_size = new_size;
346 xfs_iunlock(ip, XFS_ILOCK_EXCL); 346 xfs_iunlock(ip, XFS_ILOCK_EXCL);
347 347
348 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip, 348 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip,
349 pipe, count, *ppos, ioflags); 349 pipe, count, *ppos, ioflags);
350 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 350 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
351 if (ret > 0) 351 if (ret > 0)
352 XFS_STATS_ADD(xs_write_bytes, ret); 352 XFS_STATS_ADD(xs_write_bytes, ret);
353 353
354 isize = i_size_read(inode); 354 isize = i_size_read(inode);
355 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) 355 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
356 *ppos = isize; 356 *ppos = isize;
357 357
358 if (*ppos > ip->i_size) { 358 if (*ppos > ip->i_size) {
359 xfs_ilock(ip, XFS_ILOCK_EXCL); 359 xfs_ilock(ip, XFS_ILOCK_EXCL);
360 if (*ppos > ip->i_size) 360 if (*ppos > ip->i_size)
361 ip->i_size = *ppos; 361 ip->i_size = *ppos;
362 xfs_iunlock(ip, XFS_ILOCK_EXCL); 362 xfs_iunlock(ip, XFS_ILOCK_EXCL);
363 } 363 }
364 364
365 if (ip->i_new_size) { 365 if (ip->i_new_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL); 366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 ip->i_new_size = 0; 367 ip->i_new_size = 0;
368 if (ip->i_d.di_size > ip->i_size) 368 if (ip->i_d.di_size > ip->i_size)
369 ip->i_d.di_size = ip->i_size; 369 ip->i_d.di_size = ip->i_size;
370 xfs_iunlock(ip, XFS_ILOCK_EXCL); 370 xfs_iunlock(ip, XFS_ILOCK_EXCL);
371 } 371 }
372 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 372 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
373 return ret; 373 return ret;
374 } 374 }
375 375
376 /* 376 /*
377 * This routine is called to handle zeroing any space in the last 377 * This routine is called to handle zeroing any space in the last
378 * block of the file that is beyond the EOF. We do this since the 378 * block of the file that is beyond the EOF. We do this since the
379 * size is being increased without writing anything to that block 379 * size is being increased without writing anything to that block
380 * and we don't want anyone to read the garbage on the disk. 380 * and we don't want anyone to read the garbage on the disk.
381 */ 381 */
382 STATIC int /* error (positive) */ 382 STATIC int /* error (positive) */
383 xfs_zero_last_block( 383 xfs_zero_last_block(
384 xfs_inode_t *ip, 384 xfs_inode_t *ip,
385 xfs_fsize_t offset, 385 xfs_fsize_t offset,
386 xfs_fsize_t isize) 386 xfs_fsize_t isize)
387 { 387 {
388 xfs_fileoff_t last_fsb; 388 xfs_fileoff_t last_fsb;
389 xfs_mount_t *mp = ip->i_mount; 389 xfs_mount_t *mp = ip->i_mount;
390 int nimaps; 390 int nimaps;
391 int zero_offset; 391 int zero_offset;
392 int zero_len; 392 int zero_len;
393 int error = 0; 393 int error = 0;
394 xfs_bmbt_irec_t imap; 394 xfs_bmbt_irec_t imap;
395 395
396 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 396 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
397 397
398 zero_offset = XFS_B_FSB_OFFSET(mp, isize); 398 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
399 if (zero_offset == 0) { 399 if (zero_offset == 0) {
400 /* 400 /*
401 * There are no extra bytes in the last block on disk to 401 * There are no extra bytes in the last block on disk to
402 * zero, so return. 402 * zero, so return.
403 */ 403 */
404 return 0; 404 return 0;
405 } 405 }
406 406
407 last_fsb = XFS_B_TO_FSBT(mp, isize); 407 last_fsb = XFS_B_TO_FSBT(mp, isize);
408 nimaps = 1; 408 nimaps = 1;
409 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, 409 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
410 &nimaps, NULL, NULL); 410 &nimaps, NULL, NULL);
411 if (error) { 411 if (error) {
412 return error; 412 return error;
413 } 413 }
414 ASSERT(nimaps > 0); 414 ASSERT(nimaps > 0);
415 /* 415 /*
416 * If the block underlying isize is just a hole, then there 416 * If the block underlying isize is just a hole, then there
417 * is nothing to zero. 417 * is nothing to zero.
418 */ 418 */
419 if (imap.br_startblock == HOLESTARTBLOCK) { 419 if (imap.br_startblock == HOLESTARTBLOCK) {
420 return 0; 420 return 0;
421 } 421 }
422 /* 422 /*
423 * Zero the part of the last block beyond the EOF, and write it 423 * Zero the part of the last block beyond the EOF, and write it
424 * out sync. We need to drop the ilock while we do this so we 424 * out sync. We need to drop the ilock while we do this so we
425 * don't deadlock when the buffer cache calls back to us. 425 * don't deadlock when the buffer cache calls back to us.
426 */ 426 */
427 xfs_iunlock(ip, XFS_ILOCK_EXCL); 427 xfs_iunlock(ip, XFS_ILOCK_EXCL);
428 428
429 zero_len = mp->m_sb.sb_blocksize - zero_offset; 429 zero_len = mp->m_sb.sb_blocksize - zero_offset;
430 if (isize + zero_len > offset) 430 if (isize + zero_len > offset)
431 zero_len = offset - isize; 431 zero_len = offset - isize;
432 error = xfs_iozero(ip, isize, zero_len); 432 error = xfs_iozero(ip, isize, zero_len);
433 433
434 xfs_ilock(ip, XFS_ILOCK_EXCL); 434 xfs_ilock(ip, XFS_ILOCK_EXCL);
435 ASSERT(error >= 0); 435 ASSERT(error >= 0);
436 return error; 436 return error;
437 } 437 }
438 438
439 /* 439 /*
440 * Zero any on disk space between the current EOF and the new, 440 * Zero any on disk space between the current EOF and the new,
441 * larger EOF. This handles the normal case of zeroing the remainder 441 * larger EOF. This handles the normal case of zeroing the remainder
442 * of the last block in the file and the unusual case of zeroing blocks 442 * of the last block in the file and the unusual case of zeroing blocks
443 * out beyond the size of the file. This second case only happens 443 * out beyond the size of the file. This second case only happens
444 * with fixed size extents and when the system crashes before the inode 444 * with fixed size extents and when the system crashes before the inode
445 * size was updated but after blocks were allocated. If fill is set, 445 * size was updated but after blocks were allocated. If fill is set,
446 * then any holes in the range are filled and zeroed. If not, the holes 446 * then any holes in the range are filled and zeroed. If not, the holes
447 * are left alone as holes. 447 * are left alone as holes.
448 */ 448 */
449 449
450 int /* error (positive) */ 450 int /* error (positive) */
451 xfs_zero_eof( 451 xfs_zero_eof(
452 xfs_inode_t *ip, 452 xfs_inode_t *ip,
453 xfs_off_t offset, /* starting I/O offset */ 453 xfs_off_t offset, /* starting I/O offset */
454 xfs_fsize_t isize) /* current inode size */ 454 xfs_fsize_t isize) /* current inode size */
455 { 455 {
456 xfs_mount_t *mp = ip->i_mount; 456 xfs_mount_t *mp = ip->i_mount;
457 xfs_fileoff_t start_zero_fsb; 457 xfs_fileoff_t start_zero_fsb;
458 xfs_fileoff_t end_zero_fsb; 458 xfs_fileoff_t end_zero_fsb;
459 xfs_fileoff_t zero_count_fsb; 459 xfs_fileoff_t zero_count_fsb;
460 xfs_fileoff_t last_fsb; 460 xfs_fileoff_t last_fsb;
461 xfs_fileoff_t zero_off; 461 xfs_fileoff_t zero_off;
462 xfs_fsize_t zero_len; 462 xfs_fsize_t zero_len;
463 int nimaps; 463 int nimaps;
464 int error = 0; 464 int error = 0;
465 xfs_bmbt_irec_t imap; 465 xfs_bmbt_irec_t imap;
466 466
467 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 467 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
468 ASSERT(offset > isize); 468 ASSERT(offset > isize);
469 469
470 /* 470 /*
471 * First handle zeroing the block on which isize resides. 471 * First handle zeroing the block on which isize resides.
472 * We only zero a part of that block so it is handled specially. 472 * We only zero a part of that block so it is handled specially.
473 */ 473 */
474 error = xfs_zero_last_block(ip, offset, isize); 474 error = xfs_zero_last_block(ip, offset, isize);
475 if (error) { 475 if (error) {
476 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 476 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
477 return error; 477 return error;
478 } 478 }
479 479
480 /* 480 /*
481 * Calculate the range between the new size and the old 481 * Calculate the range between the new size and the old
482 * where blocks needing to be zeroed may exist. To get the 482 * where blocks needing to be zeroed may exist. To get the
483 * block where the last byte in the file currently resides, 483 * block where the last byte in the file currently resides,
484 * we need to subtract one from the size and truncate back 484 * we need to subtract one from the size and truncate back
485 * to a block boundary. We subtract 1 in case the size is 485 * to a block boundary. We subtract 1 in case the size is
486 * exactly on a block boundary. 486 * exactly on a block boundary.
487 */ 487 */
488 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; 488 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
489 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); 489 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
490 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); 490 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
491 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); 491 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
492 if (last_fsb == end_zero_fsb) { 492 if (last_fsb == end_zero_fsb) {
493 /* 493 /*
494 * The size was only incremented on its last block. 494 * The size was only incremented on its last block.
495 * We took care of that above, so just return. 495 * We took care of that above, so just return.
496 */ 496 */
497 return 0; 497 return 0;
498 } 498 }
499 499
500 ASSERT(start_zero_fsb <= end_zero_fsb); 500 ASSERT(start_zero_fsb <= end_zero_fsb);
501 while (start_zero_fsb <= end_zero_fsb) { 501 while (start_zero_fsb <= end_zero_fsb) {
502 nimaps = 1; 502 nimaps = 1;
503 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 503 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
504 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 504 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
505 0, NULL, 0, &imap, &nimaps, NULL, NULL); 505 0, NULL, 0, &imap, &nimaps, NULL, NULL);
506 if (error) { 506 if (error) {
507 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 507 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
508 return error; 508 return error;
509 } 509 }
510 ASSERT(nimaps > 0); 510 ASSERT(nimaps > 0);
511 511
512 if (imap.br_state == XFS_EXT_UNWRITTEN || 512 if (imap.br_state == XFS_EXT_UNWRITTEN ||
513 imap.br_startblock == HOLESTARTBLOCK) { 513 imap.br_startblock == HOLESTARTBLOCK) {
514 /* 514 /*
515 * This loop handles initializing pages that were 515 * This loop handles initializing pages that were
516 * partially initialized by the code below this 516 * partially initialized by the code below this
517 * loop. It basically zeroes the part of the page 517 * loop. It basically zeroes the part of the page
518 * that sits on a hole and sets the page as P_HOLE 518 * that sits on a hole and sets the page as P_HOLE
519 * and calls remapf if it is a mapped file. 519 * and calls remapf if it is a mapped file.
520 */ 520 */
521 start_zero_fsb = imap.br_startoff + imap.br_blockcount; 521 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
522 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 522 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
523 continue; 523 continue;
524 } 524 }
525 525
526 /* 526 /*
527 * There are blocks we need to zero. 527 * There are blocks we need to zero.
528 * Drop the inode lock while we're doing the I/O. 528 * Drop the inode lock while we're doing the I/O.
529 * We'll still have the iolock to protect us. 529 * We'll still have the iolock to protect us.
530 */ 530 */
531 xfs_iunlock(ip, XFS_ILOCK_EXCL); 531 xfs_iunlock(ip, XFS_ILOCK_EXCL);
532 532
533 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); 533 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
534 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); 534 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
535 535
536 if ((zero_off + zero_len) > offset) 536 if ((zero_off + zero_len) > offset)
537 zero_len = offset - zero_off; 537 zero_len = offset - zero_off;
538 538
539 error = xfs_iozero(ip, zero_off, zero_len); 539 error = xfs_iozero(ip, zero_off, zero_len);
540 if (error) { 540 if (error) {
541 goto out_lock; 541 goto out_lock;
542 } 542 }
543 543
544 start_zero_fsb = imap.br_startoff + imap.br_blockcount; 544 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
545 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 545 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
546 546
547 xfs_ilock(ip, XFS_ILOCK_EXCL); 547 xfs_ilock(ip, XFS_ILOCK_EXCL);
548 } 548 }
549 549
550 return 0; 550 return 0;
551 551
552 out_lock: 552 out_lock:
553 xfs_ilock(ip, XFS_ILOCK_EXCL); 553 xfs_ilock(ip, XFS_ILOCK_EXCL);
554 ASSERT(error >= 0); 554 ASSERT(error >= 0);
555 return error; 555 return error;
556 } 556 }
557 557
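To make the range calculation in xfs_zero_eof() concrete (assuming a 4 KiB filesystem block size): growing a file from isize = 10000 to offset = 50000 first has xfs_zero_last_block() zero the tail of block 2 (zero_offset = 1808, zero_len = 2288), then the loop zeroes whole blocks from start_zero_fsb = XFS_B_TO_FSB(mp, 10000) = 3 through end_zero_fsb = XFS_B_TO_FSBT(mp, 49999) = 12, skipping any mapping that comes back as a hole or an unwritten extent.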
558 ssize_t /* bytes written, or (-) error */ 558 ssize_t /* bytes written, or (-) error */
559 xfs_write( 559 xfs_write(
560 struct xfs_inode *xip, 560 struct xfs_inode *xip,
561 struct kiocb *iocb, 561 struct kiocb *iocb,
562 const struct iovec *iovp, 562 const struct iovec *iovp,
563 unsigned int nsegs, 563 unsigned int nsegs,
564 loff_t *offset, 564 loff_t *offset,
565 int ioflags) 565 int ioflags)
566 { 566 {
567 struct file *file = iocb->ki_filp; 567 struct file *file = iocb->ki_filp;
568 struct address_space *mapping = file->f_mapping; 568 struct address_space *mapping = file->f_mapping;
569 struct inode *inode = mapping->host; 569 struct inode *inode = mapping->host;
570 unsigned long segs = nsegs; 570 unsigned long segs = nsegs;
571 xfs_mount_t *mp; 571 xfs_mount_t *mp;
572 ssize_t ret = 0, error = 0; 572 ssize_t ret = 0, error = 0;
573 xfs_fsize_t isize, new_size; 573 xfs_fsize_t isize, new_size;
574 int iolock; 574 int iolock;
575 int eventsent = 0; 575 int eventsent = 0;
576 size_t ocount = 0, count; 576 size_t ocount = 0, count;
577 loff_t pos; 577 loff_t pos;
578 int need_i_mutex; 578 int need_i_mutex;
579 579
580 XFS_STATS_INC(xs_write_calls); 580 XFS_STATS_INC(xs_write_calls);
581 581
582 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); 582 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
583 if (error) 583 if (error)
584 return error; 584 return error;
585 585
586 count = ocount; 586 count = ocount;
587 pos = *offset; 587 pos = *offset;
588 588
589 if (count == 0) 589 if (count == 0)
590 return 0; 590 return 0;
591 591
592 mp = xip->i_mount; 592 mp = xip->i_mount;
593 593
594 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 594 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
595 595
596 if (XFS_FORCED_SHUTDOWN(mp)) 596 if (XFS_FORCED_SHUTDOWN(mp))
597 return -EIO; 597 return -EIO;
598 598
599 relock: 599 relock:
600 if (ioflags & IO_ISDIRECT) { 600 if (ioflags & IO_ISDIRECT) {
601 iolock = XFS_IOLOCK_SHARED; 601 iolock = XFS_IOLOCK_SHARED;
602 need_i_mutex = 0; 602 need_i_mutex = 0;
603 } else { 603 } else {
604 iolock = XFS_IOLOCK_EXCL; 604 iolock = XFS_IOLOCK_EXCL;
605 need_i_mutex = 1; 605 need_i_mutex = 1;
606 mutex_lock(&inode->i_mutex); 606 mutex_lock(&inode->i_mutex);
607 } 607 }
608 608
609 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); 609 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
610 610
611 start: 611 start:
612 error = -generic_write_checks(file, &pos, &count, 612 error = -generic_write_checks(file, &pos, &count,
613 S_ISBLK(inode->i_mode)); 613 S_ISBLK(inode->i_mode));
614 if (error) { 614 if (error) {
615 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 615 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
616 goto out_unlock_mutex; 616 goto out_unlock_mutex;
617 } 617 }
618 618
619 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && 619 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
620 !(ioflags & IO_INVIS) && !eventsent)) { 620 !(ioflags & IO_INVIS) && !eventsent)) {
621 int dmflags = FILP_DELAY_FLAG(file); 621 int dmflags = FILP_DELAY_FLAG(file);
622 622
623 if (need_i_mutex) 623 if (need_i_mutex)
624 dmflags |= DM_FLAGS_IMUX; 624 dmflags |= DM_FLAGS_IMUX;
625 625
626 xfs_iunlock(xip, XFS_ILOCK_EXCL); 626 xfs_iunlock(xip, XFS_ILOCK_EXCL);
627 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, 627 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
628 pos, count, dmflags, &iolock); 628 pos, count, dmflags, &iolock);
629 if (error) { 629 if (error) {
630 goto out_unlock_internal; 630 goto out_unlock_internal;
631 } 631 }
632 xfs_ilock(xip, XFS_ILOCK_EXCL); 632 xfs_ilock(xip, XFS_ILOCK_EXCL);
633 eventsent = 1; 633 eventsent = 1;
634 634
635 /* 635 /*
636 * The iolock was dropped and reacquired in XFS_SEND_DATA 636 * The iolock was dropped and reacquired in XFS_SEND_DATA
637 * so we have to recheck the size when appending. 637 * so we have to recheck the size when appending.
638 * We will only "goto start;" once, since having sent the 638 * We will only "goto start;" once, since having sent the
639 * event prevents another call to XFS_SEND_DATA, which is 639 * event prevents another call to XFS_SEND_DATA, which is
640 * what allows the size to change in the first place. 640 * what allows the size to change in the first place.
641 */ 641 */
642 if ((file->f_flags & O_APPEND) && pos != xip->i_size) 642 if ((file->f_flags & O_APPEND) && pos != xip->i_size)
643 goto start; 643 goto start;
644 } 644 }
645 645
646 if (ioflags & IO_ISDIRECT) { 646 if (ioflags & IO_ISDIRECT) {
647 xfs_buftarg_t *target = 647 xfs_buftarg_t *target =
648 XFS_IS_REALTIME_INODE(xip) ? 648 XFS_IS_REALTIME_INODE(xip) ?
649 mp->m_rtdev_targp : mp->m_ddev_targp; 649 mp->m_rtdev_targp : mp->m_ddev_targp;
650 650
651 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 651 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
652 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 652 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
653 return XFS_ERROR(-EINVAL); 653 return XFS_ERROR(-EINVAL);
654 } 654 }
655 655
656 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { 656 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
657 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 657 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
658 iolock = XFS_IOLOCK_EXCL; 658 iolock = XFS_IOLOCK_EXCL;
659 need_i_mutex = 1; 659 need_i_mutex = 1;
660 mutex_lock(&inode->i_mutex); 660 mutex_lock(&inode->i_mutex);
661 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); 661 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
662 goto start; 662 goto start;
663 } 663 }
664 } 664 }
665 665
666 new_size = pos + count; 666 new_size = pos + count;
667 if (new_size > xip->i_size) 667 if (new_size > xip->i_size)
668 xip->i_new_size = new_size; 668 xip->i_new_size = new_size;
669 669
670 if (likely(!(ioflags & IO_INVIS))) 670 if (likely(!(ioflags & IO_INVIS)))
671 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 671 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
672 672
673 /* 673 /*
674 * If the offset is beyond the size of the file, we have a couple 674 * If the offset is beyond the size of the file, we have a couple
675 * of things to do. First, if there is already space allocated 675 * of things to do. First, if there is already space allocated
676 * we need to either create holes or zero the disk or ... 676 * we need to either create holes or zero the disk or ...
677 * 677 *
678 * If there is a page where the previous size lands, we need 678 * If there is a page where the previous size lands, we need
679 * to zero it out up to the new size. 679 * to zero it out up to the new size.
680 */ 680 */
681 681
682 if (pos > xip->i_size) { 682 if (pos > xip->i_size) {
683 error = xfs_zero_eof(xip, pos, xip->i_size); 683 error = xfs_zero_eof(xip, pos, xip->i_size);
684 if (error) { 684 if (error) {
685 xfs_iunlock(xip, XFS_ILOCK_EXCL); 685 xfs_iunlock(xip, XFS_ILOCK_EXCL);
686 goto out_unlock_internal; 686 goto out_unlock_internal;
687 } 687 }
688 } 688 }
689 xfs_iunlock(xip, XFS_ILOCK_EXCL); 689 xfs_iunlock(xip, XFS_ILOCK_EXCL);
690 690
691 /* 691 /*
692 * If we're writing the file then make sure to clear the 692 * If we're writing the file then make sure to clear the
693 * setuid and setgid bits if the process is not being run 693 * setuid and setgid bits if the process is not being run
694 * by root. This keeps people from modifying setuid and 694 * by root. This keeps people from modifying setuid and
695 * setgid binaries. 695 * setgid binaries.
696 */ 696 */
697 697
698 if (((xip->i_d.di_mode & S_ISUID) || 698 if (((xip->i_d.di_mode & S_ISUID) ||
699 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == 699 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
700 (S_ISGID | S_IXGRP))) && 700 (S_ISGID | S_IXGRP))) &&
701 !capable(CAP_FSETID)) { 701 !capable(CAP_FSETID)) {
702 error = xfs_write_clear_setuid(xip); 702 error = xfs_write_clear_setuid(xip);
703 if (likely(!error)) 703 if (likely(!error))
704 error = -file_remove_suid(file); 704 error = -file_remove_suid(file);
705 if (unlikely(error)) { 705 if (unlikely(error)) {
706 goto out_unlock_internal; 706 goto out_unlock_internal;
707 } 707 }
708 } 708 }
709 709
710 retry: 710 retry:
711 /* We can write back this queue in page reclaim */ 711 /* We can write back this queue in page reclaim */
712 current->backing_dev_info = mapping->backing_dev_info; 712 current->backing_dev_info = mapping->backing_dev_info;
713 713
714 if ((ioflags & IO_ISDIRECT)) { 714 if ((ioflags & IO_ISDIRECT)) {
715 if (mapping->nrpages) { 715 if (mapping->nrpages) {
716 WARN_ON(need_i_mutex == 0); 716 WARN_ON(need_i_mutex == 0);
717 xfs_inval_cached_trace(xip, pos, -1, 717 xfs_inval_cached_trace(xip, pos, -1,
718 (pos & PAGE_CACHE_MASK), -1); 718 (pos & PAGE_CACHE_MASK), -1);
719 error = xfs_flushinval_pages(xip, 719 error = xfs_flushinval_pages(xip,
720 (pos & PAGE_CACHE_MASK), 720 (pos & PAGE_CACHE_MASK),
721 -1, FI_REMAPF_LOCKED); 721 -1, FI_REMAPF_LOCKED);
722 if (error) 722 if (error)
723 goto out_unlock_internal; 723 goto out_unlock_internal;
724 } 724 }
725 725
726 if (need_i_mutex) { 726 if (need_i_mutex) {
727 /* demote the lock now the cached pages are gone */ 727 /* demote the lock now the cached pages are gone */
728 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); 728 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
729 mutex_unlock(&inode->i_mutex); 729 mutex_unlock(&inode->i_mutex);
730 730
731 iolock = XFS_IOLOCK_SHARED; 731 iolock = XFS_IOLOCK_SHARED;
732 need_i_mutex = 0; 732 need_i_mutex = 0;
733 } 733 }
734 734
735 xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs, 735 xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
736 *offset, ioflags); 736 *offset, ioflags);
737 ret = generic_file_direct_write(iocb, iovp, 737 ret = generic_file_direct_write(iocb, iovp,
738 &segs, pos, offset, count, ocount); 738 &segs, pos, offset, count, ocount);
739 739
740 /* 740 /*
741 * direct-io write to a hole: fall through to buffered I/O 741 * direct-io write to a hole: fall through to buffered I/O
742 * for completing the rest of the request. 742 * for completing the rest of the request.
743 */ 743 */
744 if (ret >= 0 && ret != count) { 744 if (ret >= 0 && ret != count) {
745 XFS_STATS_ADD(xs_write_bytes, ret); 745 XFS_STATS_ADD(xs_write_bytes, ret);
746 746
747 pos += ret; 747 pos += ret;
748 count -= ret; 748 count -= ret;
749 749
750 ioflags &= ~IO_ISDIRECT; 750 ioflags &= ~IO_ISDIRECT;
751 xfs_iunlock(xip, iolock); 751 xfs_iunlock(xip, iolock);
752 goto relock; 752 goto relock;
753 } 753 }
754 } else { 754 } else {
755 xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, 755 xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
756 *offset, ioflags); 756 *offset, ioflags);
757 ret = generic_file_buffered_write(iocb, iovp, segs, 757 ret = generic_file_buffered_write(iocb, iovp, segs,
758 pos, offset, count, ret); 758 pos, offset, count, ret);
759 } 759 }
760 760
761 current->backing_dev_info = NULL; 761 current->backing_dev_info = NULL;
762 762
763 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) 763 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
764 ret = wait_on_sync_kiocb(iocb); 764 ret = wait_on_sync_kiocb(iocb);
765 765
766 if (ret == -ENOSPC && 766 if (ret == -ENOSPC &&
767 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { 767 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
768 xfs_iunlock(xip, iolock); 768 xfs_iunlock(xip, iolock);
769 if (need_i_mutex) 769 if (need_i_mutex)
770 mutex_unlock(&inode->i_mutex); 770 mutex_unlock(&inode->i_mutex);
771 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, 771 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
772 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, 772 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
773 0, 0, 0); /* Delay flag intentionally unused */ 773 0, 0, 0); /* Delay flag intentionally unused */
774 if (need_i_mutex) 774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex); 775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(xip, iolock); 776 xfs_ilock(xip, iolock);
777 if (error) 777 if (error)
778 goto out_unlock_internal; 778 goto out_unlock_internal;
779 pos = xip->i_size; 779 pos = xip->i_size;
780 ret = 0; 780 ret = 0;
781 goto retry; 781 goto retry;
782 } 782 }
783 783
784 isize = i_size_read(inode); 784 isize = i_size_read(inode);
785 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) 785 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
786 *offset = isize; 786 *offset = isize;
787 787
788 if (*offset > xip->i_size) { 788 if (*offset > xip->i_size) {
789 xfs_ilock(xip, XFS_ILOCK_EXCL); 789 xfs_ilock(xip, XFS_ILOCK_EXCL);
790 if (*offset > xip->i_size) 790 if (*offset > xip->i_size)
791 xip->i_size = *offset; 791 xip->i_size = *offset;
792 xfs_iunlock(xip, XFS_ILOCK_EXCL); 792 xfs_iunlock(xip, XFS_ILOCK_EXCL);
793 } 793 }
794 794
795 error = -ret; 795 error = -ret;
796 if (ret <= 0) 796 if (ret <= 0)
797 goto out_unlock_internal; 797 goto out_unlock_internal;
798 798
799 XFS_STATS_ADD(xs_write_bytes, ret); 799 XFS_STATS_ADD(xs_write_bytes, ret);
800 800
801 /* Handle various SYNC-type writes */ 801 /* Handle various SYNC-type writes */
802 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 802 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
803 int error2; 803 int error2;
804 804
805 xfs_iunlock(xip, iolock); 805 xfs_iunlock(xip, iolock);
806 if (need_i_mutex) 806 if (need_i_mutex)
807 mutex_unlock(&inode->i_mutex); 807 mutex_unlock(&inode->i_mutex);
808 error2 = sync_page_range(inode, mapping, pos, ret); 808 error2 = sync_page_range(inode, mapping, pos, ret);
809 if (!error) 809 if (!error)
810 error = error2; 810 error = error2;
811 if (need_i_mutex) 811 if (need_i_mutex)
812 mutex_lock(&inode->i_mutex); 812 mutex_lock(&inode->i_mutex);
813 xfs_ilock(xip, iolock); 813 xfs_ilock(xip, iolock);
814 error2 = xfs_write_sync_logforce(mp, xip); 814 error2 = xfs_write_sync_logforce(mp, xip);
815 if (!error) 815 if (!error)
816 error = error2; 816 error = error2;
817 } 817 }
818 818
819 out_unlock_internal: 819 out_unlock_internal:
820 if (xip->i_new_size) { 820 if (xip->i_new_size) {
821 xfs_ilock(xip, XFS_ILOCK_EXCL); 821 xfs_ilock(xip, XFS_ILOCK_EXCL);
822 xip->i_new_size = 0; 822 xip->i_new_size = 0;
823 /* 823 /*
824 * If this was a direct or synchronous I/O that failed (such 824 * If this was a direct or synchronous I/O that failed (such
825 * as ENOSPC) then part of the I/O may have been written to 825 * as ENOSPC) then part of the I/O may have been written to
826 * disk before the error occurred. In this case the on-disk 826 * disk before the error occurred. In this case the on-disk
827 * file size may have been adjusted beyond the in-memory file 827 * file size may have been adjusted beyond the in-memory file
828 * size and now needs to be truncated back. 828 * size and now needs to be truncated back.
829 */ 829 */
830 if (xip->i_d.di_size > xip->i_size) 830 if (xip->i_d.di_size > xip->i_size)
831 xip->i_d.di_size = xip->i_size; 831 xip->i_d.di_size = xip->i_size;
832 xfs_iunlock(xip, XFS_ILOCK_EXCL); 832 xfs_iunlock(xip, XFS_ILOCK_EXCL);
833 } 833 }
834 xfs_iunlock(xip, iolock); 834 xfs_iunlock(xip, iolock);
835 out_unlock_mutex: 835 out_unlock_mutex:
836 if (need_i_mutex) 836 if (need_i_mutex)
837 mutex_unlock(&inode->i_mutex); 837 mutex_unlock(&inode->i_mutex);
838 return -error; 838 return -error;
839 } 839 }
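
The direct-I/O branch of xfs_write() above falls back to buffered I/O when generic_file_direct_write() stops short, typically because the write ran into a hole. Condensed to its essentials, with the XFS iolock/i_mutex handling stripped out, the control flow looks roughly like this (an illustrative sketch, not additional code from this commit):

	written = generic_file_direct_write(iocb, iovp, &segs, pos,
					offset, count, ocount);
	if (written >= 0 && written != count) {
		/* short direct write: finish the tail through the page cache */
		pos += written;
		count -= written;
		ioflags &= ~IO_ISDIRECT;
		written = generic_file_buffered_write(iocb, iovp, segs,
					pos, offset, count, written);
	}
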
840 840
841 /* 841 /*
842 * All xfs metadata buffers except log state machine buffers 842 * All xfs metadata buffers except log state machine buffers
843 * get this attached as their b_bdstrat callback function. 843 * get this attached as their b_bdstrat callback function.
844 * This is so that we can catch a buffer 844 * This is so that we can catch a buffer
845 * after prematurely unpinning it to forcibly shutdown the filesystem. 845 * after prematurely unpinning it to forcibly shutdown the filesystem.
846 */ 846 */
847 int 847 int
848 xfs_bdstrat_cb(struct xfs_buf *bp) 848 xfs_bdstrat_cb(struct xfs_buf *bp)
849 { 849 {
850 xfs_mount_t *mp; 850 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
851
852 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
853 if (!XFS_FORCED_SHUTDOWN(mp)) {
854 xfs_buf_iorequest(bp);
855 return 0;
856 } else {
857 xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 851 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
858 /* 852 /*
859 * Metadata write that didn't get logged but 853 * Metadata write that didn't get logged but
860 * written delayed anyway. These aren't associated 854 * written delayed anyway. These aren't associated
861 * with a transaction, and can be ignored. 855 * with a transaction, and can be ignored.
862 */ 856 */
863 if (XFS_BUF_IODONE_FUNC(bp) == NULL && 857 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
864 (XFS_BUF_ISREAD(bp)) == 0) 858 (XFS_BUF_ISREAD(bp)) == 0)
865 return (xfs_bioerror_relse(bp)); 859 return (xfs_bioerror_relse(bp));
866 else 860 else
867 return (xfs_bioerror(bp)); 861 return (xfs_bioerror(bp));
868 } 862 }
863
864 xfs_buf_iorequest(bp);
865 return 0;
869 } 866 }
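
The hunk above is the heart of this change for xfs_bdstrat_cb(): the old code pulled the mount pointer out of the opaque third private slot through XFS_BUF_FSPRIVATE3(), while the new code dereferences the typed bp->b_mount field directly and handles the forced-shutdown case first. A minimal before/after sketch of that access pattern, with the shutdown error handling simplified to a single call (all names are taken from the diff itself):

	/* before: mount pointer hidden behind an untyped accessor */
	xfs_mount_t	*mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
	if (XFS_FORCED_SHUTDOWN(mp))
		return xfs_bioerror(bp);

	/* after: properly typed back-pointer, set up in xfs_buf_item_init() */
	if (XFS_FORCED_SHUTDOWN(bp->b_mount))
		return xfs_bioerror(bp);
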
870 867
871 /* 868 /*
872 * Wrapper around bdstrat so that we can stop data from going to disk in case 869 * Wrapper around bdstrat so that we can stop data from going to disk in case
873 * we are shutting down the filesystem. Typically user data goes thru this 870 * we are shutting down the filesystem. Typically user data goes thru this
874 * path; one of the exceptions is the superblock. 871 * path; one of the exceptions is the superblock.
875 */ 872 */
876 void 873 void
877 xfsbdstrat( 874 xfsbdstrat(
878 struct xfs_mount *mp, 875 struct xfs_mount *mp,
879 struct xfs_buf *bp) 876 struct xfs_buf *bp)
880 { 877 {
881 ASSERT(mp); 878 ASSERT(mp);
882 if (!XFS_FORCED_SHUTDOWN(mp)) { 879 if (!XFS_FORCED_SHUTDOWN(mp)) {
883 xfs_buf_iorequest(bp); 880 xfs_buf_iorequest(bp);
884 return; 881 return;
885 } 882 }
886 883
887 xfs_buftrace("XFSBDSTRAT IOERROR", bp); 884 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
888 xfs_bioerror_relse(bp); 885 xfs_bioerror_relse(bp);
889 } 886 }
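
xfsbdstrat() is the caller-supplied-mount variant of the same shutdown check: callers that already hold an xfs_mount pointer pass it explicitly instead of relying on the buffer's back-pointer. A typical synchronous use looks roughly like the following sketch; the surrounding XFS_BUF_READ()/xfs_iowait() calls reflect how this era's log-recovery code drives it and should be read as an assumption, not as part of this commit:

	XFS_BUF_READ(bp);
	xfsbdstrat(mp, bp);		/* no-op turned into an error on shutdown */
	error = xfs_iowait(bp);		/* wait for the I/O to complete */
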
890 887
891 /* 888 /*
892 * If the underlying (data/log/rt) device is readonly, there are some 889 * If the underlying (data/log/rt) device is readonly, there are some
893 * operations that cannot proceed. 890 * operations that cannot proceed.
894 */ 891 */
895 int 892 int
896 xfs_dev_is_read_only( 893 xfs_dev_is_read_only(
897 xfs_mount_t *mp, 894 xfs_mount_t *mp,
898 char *message) 895 char *message)
899 { 896 {
900 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 897 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
901 xfs_readonly_buftarg(mp->m_logdev_targp) || 898 xfs_readonly_buftarg(mp->m_logdev_targp) ||
902 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 899 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
903 cmn_err(CE_NOTE, 900 cmn_err(CE_NOTE,
904 "XFS: %s required on read-only device.", message); 901 "XFS: %s required on read-only device.", message);
905 cmn_err(CE_NOTE, 902 cmn_err(CE_NOTE,
906 "XFS: write access unavailable, cannot proceed."); 903 "XFS: write access unavailable, cannot proceed.");
907 return EROFS; 904 return EROFS;
908 } 905 }
fs/xfs/xfs_buf_item.c
1 /* 1 /*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dmapi.h" 27 #include "xfs_dmapi.h"
28 #include "xfs_mount.h" 28 #include "xfs_mount.h"
29 #include "xfs_buf_item.h" 29 #include "xfs_buf_item.h"
30 #include "xfs_trans_priv.h" 30 #include "xfs_trans_priv.h"
31 #include "xfs_error.h" 31 #include "xfs_error.h"
32 32
33 33
34 kmem_zone_t *xfs_buf_item_zone; 34 kmem_zone_t *xfs_buf_item_zone;
35 35
36 #ifdef XFS_TRANS_DEBUG 36 #ifdef XFS_TRANS_DEBUG
37 /* 37 /*
38 * This function uses an alternate strategy for tracking the bytes 38 * This function uses an alternate strategy for tracking the bytes
39 * that the user requests to be logged. This can then be used 39 * that the user requests to be logged. This can then be used
40 * in conjunction with the bli_orig array in the buf log item to 40 * in conjunction with the bli_orig array in the buf log item to
41 * catch bugs in our callers' code. 41 * catch bugs in our callers' code.
42 * 42 *
43 * We also double check the bits set in xfs_buf_item_log using a 43 * We also double check the bits set in xfs_buf_item_log using a
44 * simple algorithm to check that every byte is accounted for. 44 * simple algorithm to check that every byte is accounted for.
45 */ 45 */
46 STATIC void 46 STATIC void
47 xfs_buf_item_log_debug( 47 xfs_buf_item_log_debug(
48 xfs_buf_log_item_t *bip, 48 xfs_buf_log_item_t *bip,
49 uint first, 49 uint first,
50 uint last) 50 uint last)
51 { 51 {
52 uint x; 52 uint x;
53 uint byte; 53 uint byte;
54 uint nbytes; 54 uint nbytes;
55 uint chunk_num; 55 uint chunk_num;
56 uint word_num; 56 uint word_num;
57 uint bit_num; 57 uint bit_num;
58 uint bit_set; 58 uint bit_set;
59 uint *wordp; 59 uint *wordp;
60 60
61 ASSERT(bip->bli_logged != NULL); 61 ASSERT(bip->bli_logged != NULL);
62 byte = first; 62 byte = first;
63 nbytes = last - first + 1; 63 nbytes = last - first + 1;
64 bfset(bip->bli_logged, first, nbytes); 64 bfset(bip->bli_logged, first, nbytes);
65 for (x = 0; x < nbytes; x++) { 65 for (x = 0; x < nbytes; x++) {
66 chunk_num = byte >> XFS_BLI_SHIFT; 66 chunk_num = byte >> XFS_BLI_SHIFT;
67 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 67 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
68 bit_num = chunk_num & (NBWORD - 1); 68 bit_num = chunk_num & (NBWORD - 1);
69 wordp = &(bip->bli_format.blf_data_map[word_num]); 69 wordp = &(bip->bli_format.blf_data_map[word_num]);
70 bit_set = *wordp & (1 << bit_num); 70 bit_set = *wordp & (1 << bit_num);
71 ASSERT(bit_set); 71 ASSERT(bit_set);
72 byte++; 72 byte++;
73 } 73 }
74 } 74 }
75 75
76 /* 76 /*
77 * This function is called when we flush something into a buffer without 77 * This function is called when we flush something into a buffer without
78 * logging it. This happens for things like inodes which are logged 78 * logging it. This happens for things like inodes which are logged
79 * separately from the buffer. 79 * separately from the buffer.
80 */ 80 */
81 void 81 void
82 xfs_buf_item_flush_log_debug( 82 xfs_buf_item_flush_log_debug(
83 xfs_buf_t *bp, 83 xfs_buf_t *bp,
84 uint first, 84 uint first,
85 uint last) 85 uint last)
86 { 86 {
87 xfs_buf_log_item_t *bip; 87 xfs_buf_log_item_t *bip;
88 uint nbytes; 88 uint nbytes;
89 89
90 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); 90 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
91 if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { 91 if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
92 return; 92 return;
93 } 93 }
94 94
95 ASSERT(bip->bli_logged != NULL); 95 ASSERT(bip->bli_logged != NULL);
96 nbytes = last - first + 1; 96 nbytes = last - first + 1;
97 bfset(bip->bli_logged, first, nbytes); 97 bfset(bip->bli_logged, first, nbytes);
98 } 98 }
99 99
100 /* 100 /*
101 * This function is called to verify that our callers have logged 101 * This function is called to verify that our callers have logged
102 * all the bytes that they changed. 102 * all the bytes that they changed.
103 * 103 *
104 * It does this by comparing the original copy of the buffer stored in 104 * It does this by comparing the original copy of the buffer stored in
105 * the buf log item's bli_orig array to the current copy of the buffer 105 * the buf log item's bli_orig array to the current copy of the buffer
106 * and ensuring that all bytes which mismatch are set in the bli_logged 106 * and ensuring that all bytes which mismatch are set in the bli_logged
107 * array of the buf log item. 107 * array of the buf log item.
108 */ 108 */
109 STATIC void 109 STATIC void
110 xfs_buf_item_log_check( 110 xfs_buf_item_log_check(
111 xfs_buf_log_item_t *bip) 111 xfs_buf_log_item_t *bip)
112 { 112 {
113 char *orig; 113 char *orig;
114 char *buffer; 114 char *buffer;
115 int x; 115 int x;
116 xfs_buf_t *bp; 116 xfs_buf_t *bp;
117 117
118 ASSERT(bip->bli_orig != NULL); 118 ASSERT(bip->bli_orig != NULL);
119 ASSERT(bip->bli_logged != NULL); 119 ASSERT(bip->bli_logged != NULL);
120 120
121 bp = bip->bli_buf; 121 bp = bip->bli_buf;
122 ASSERT(XFS_BUF_COUNT(bp) > 0); 122 ASSERT(XFS_BUF_COUNT(bp) > 0);
123 ASSERT(XFS_BUF_PTR(bp) != NULL); 123 ASSERT(XFS_BUF_PTR(bp) != NULL);
124 orig = bip->bli_orig; 124 orig = bip->bli_orig;
125 buffer = XFS_BUF_PTR(bp); 125 buffer = XFS_BUF_PTR(bp);
126 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 126 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
127 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 127 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
128 cmn_err(CE_PANIC, 128 cmn_err(CE_PANIC,
129 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 129 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
130 bip, bp, orig, x); 130 bip, bp, orig, x);
131 } 131 }
132 } 132 }
133 #else 133 #else
134 #define xfs_buf_item_log_debug(x,y,z) 134 #define xfs_buf_item_log_debug(x,y,z)
135 #define xfs_buf_item_log_check(x) 135 #define xfs_buf_item_log_check(x)
136 #endif 136 #endif
137 137
138 STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 138 STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
139 STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); 139 STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
140 140
141 /* 141 /*
142 * This returns the number of log iovecs needed to log the 142 * This returns the number of log iovecs needed to log the
143 * given buf log item. 143 * given buf log item.
144 * 144 *
145 * It calculates this as 1 iovec for the buf log format structure 145 * It calculates this as 1 iovec for the buf log format structure
146 * and 1 for each stretch of non-contiguous chunks to be logged. 146 * and 1 for each stretch of non-contiguous chunks to be logged.
147 * Contiguous chunks are logged in a single iovec. 147 * Contiguous chunks are logged in a single iovec.
148 * 148 *
149 * If the XFS_BLI_STALE flag has been set, then log nothing. 149 * If the XFS_BLI_STALE flag has been set, then log nothing.
150 */ 150 */
151 STATIC uint 151 STATIC uint
152 xfs_buf_item_size( 152 xfs_buf_item_size(
153 xfs_buf_log_item_t *bip) 153 xfs_buf_log_item_t *bip)
154 { 154 {
155 uint nvecs; 155 uint nvecs;
156 int next_bit; 156 int next_bit;
157 int last_bit; 157 int last_bit;
158 xfs_buf_t *bp; 158 xfs_buf_t *bp;
159 159
160 ASSERT(atomic_read(&bip->bli_refcount) > 0); 160 ASSERT(atomic_read(&bip->bli_refcount) > 0);
161 if (bip->bli_flags & XFS_BLI_STALE) { 161 if (bip->bli_flags & XFS_BLI_STALE) {
162 /* 162 /*
163 * The buffer is stale, so all we need to log 163 * The buffer is stale, so all we need to log
164 * is the buf log format structure with the 164 * is the buf log format structure with the
165 * cancel flag in it. 165 * cancel flag in it.
166 */ 166 */
167 xfs_buf_item_trace("SIZE STALE", bip); 167 xfs_buf_item_trace("SIZE STALE", bip);
168 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 168 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
169 return 1; 169 return 1;
170 } 170 }
171 171
172 bp = bip->bli_buf; 172 bp = bip->bli_buf;
173 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 173 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
174 nvecs = 1; 174 nvecs = 1;
175 last_bit = xfs_next_bit(bip->bli_format.blf_data_map, 175 last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
176 bip->bli_format.blf_map_size, 0); 176 bip->bli_format.blf_map_size, 0);
177 ASSERT(last_bit != -1); 177 ASSERT(last_bit != -1);
178 nvecs++; 178 nvecs++;
179 while (last_bit != -1) { 179 while (last_bit != -1) {
180 /* 180 /*
181 * This takes the bit number to start looking from and 181 * This takes the bit number to start looking from and
182 * returns the next set bit from there. It returns -1 182 * returns the next set bit from there. It returns -1
183 * if there are no more bits set or the start bit is 183 * if there are no more bits set or the start bit is
184 * beyond the end of the bitmap. 184 * beyond the end of the bitmap.
185 */ 185 */
186 next_bit = xfs_next_bit(bip->bli_format.blf_data_map, 186 next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
187 bip->bli_format.blf_map_size, 187 bip->bli_format.blf_map_size,
188 last_bit + 1); 188 last_bit + 1);
189 /* 189 /*
190 * If we run out of bits, leave the loop, 190 * If we run out of bits, leave the loop,
191 * else if we find a new set of bits bump the number of vecs, 191 * else if we find a new set of bits bump the number of vecs,
192 * else keep scanning the current set of bits. 192 * else keep scanning the current set of bits.
193 */ 193 */
194 if (next_bit == -1) { 194 if (next_bit == -1) {
195 last_bit = -1; 195 last_bit = -1;
196 } else if (next_bit != last_bit + 1) { 196 } else if (next_bit != last_bit + 1) {
197 last_bit = next_bit; 197 last_bit = next_bit;
198 nvecs++; 198 nvecs++;
199 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 199 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
200 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 200 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
201 XFS_BLI_CHUNK)) { 201 XFS_BLI_CHUNK)) {
202 last_bit = next_bit; 202 last_bit = next_bit;
203 nvecs++; 203 nvecs++;
204 } else { 204 } else {
205 last_bit++; 205 last_bit++;
206 } 206 }
207 } 207 }
208 208
209 xfs_buf_item_trace("SIZE NORM", bip); 209 xfs_buf_item_trace("SIZE NORM", bip);
210 return nvecs; 210 return nvecs;
211 } 211 }
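
The loop above counts one iovec per run of contiguous set bits in blf_data_map, plus one for the buf log format header, breaking a run when the logged chunks are not physically contiguous within the buffer. A stand-alone sketch of the same counting rule, with the buffer-offset contiguity check omitted (illustrative only, not kernel code):

	static int
	count_contiguous_runs(const unsigned int *map, int nbits)
	{
		int	i, runs = 0, in_run = 0;

		for (i = 0; i < nbits; i++) {
			int set = (map[i / 32] >> (i % 32)) & 1;

			if (set && !in_run)
				runs++;		/* first bit of a new run -> new iovec */
			in_run = set;
		}
		return runs;		/* xfs_buf_item_size() reports this + 1 */
	}
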
212 212
213 /* 213 /*
214 * This is called to fill in the vector of log iovecs for the 214 * This is called to fill in the vector of log iovecs for the
215 * given log buf item. It fills the first entry with a buf log 215 * given log buf item. It fills the first entry with a buf log
216 * format structure, and the rest point to contiguous chunks 216 * format structure, and the rest point to contiguous chunks
217 * within the buffer. 217 * within the buffer.
218 */ 218 */
219 STATIC void 219 STATIC void
220 xfs_buf_item_format( 220 xfs_buf_item_format(
221 xfs_buf_log_item_t *bip, 221 xfs_buf_log_item_t *bip,
222 xfs_log_iovec_t *log_vector) 222 xfs_log_iovec_t *log_vector)
223 { 223 {
224 uint base_size; 224 uint base_size;
225 uint nvecs; 225 uint nvecs;
226 xfs_log_iovec_t *vecp; 226 xfs_log_iovec_t *vecp;
227 xfs_buf_t *bp; 227 xfs_buf_t *bp;
228 int first_bit; 228 int first_bit;
229 int last_bit; 229 int last_bit;
230 int next_bit; 230 int next_bit;
231 uint nbits; 231 uint nbits;
232 uint buffer_offset; 232 uint buffer_offset;
233 233
234 ASSERT(atomic_read(&bip->bli_refcount) > 0); 234 ASSERT(atomic_read(&bip->bli_refcount) > 0);
235 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 235 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
236 (bip->bli_flags & XFS_BLI_STALE)); 236 (bip->bli_flags & XFS_BLI_STALE));
237 bp = bip->bli_buf; 237 bp = bip->bli_buf;
238 vecp = log_vector; 238 vecp = log_vector;
239 239
240 /* 240 /*
241 * The size of the base structure is the size of the 241 * The size of the base structure is the size of the
242 * declared structure plus the space for the extra words 242 * declared structure plus the space for the extra words
243 * of the bitmap. We subtract one from the map size, because 243 * of the bitmap. We subtract one from the map size, because
244 * the first element of the bitmap is accounted for in the 244 * the first element of the bitmap is accounted for in the
245 * size of the base structure. 245 * size of the base structure.
246 */ 246 */
247 base_size = 247 base_size =
248 (uint)(sizeof(xfs_buf_log_format_t) + 248 (uint)(sizeof(xfs_buf_log_format_t) +
249 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 249 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
250 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 250 vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
251 vecp->i_len = base_size; 251 vecp->i_len = base_size;
252 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); 252 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
253 vecp++; 253 vecp++;
254 nvecs = 1; 254 nvecs = 1;
255 255
256 if (bip->bli_flags & XFS_BLI_STALE) { 256 if (bip->bli_flags & XFS_BLI_STALE) {
257 /* 257 /*
258 * The buffer is stale, so all we need to log 258 * The buffer is stale, so all we need to log
259 * is the buf log format structure with the 259 * is the buf log format structure with the
260 * cancel flag in it. 260 * cancel flag in it.
261 */ 261 */
262 xfs_buf_item_trace("FORMAT STALE", bip); 262 xfs_buf_item_trace("FORMAT STALE", bip);
263 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 263 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
264 bip->bli_format.blf_size = nvecs; 264 bip->bli_format.blf_size = nvecs;
265 return; 265 return;
266 } 266 }
267 267
268 /* 268 /*
269 * Fill in an iovec for each set of contiguous chunks. 269 * Fill in an iovec for each set of contiguous chunks.
270 */ 270 */
271 first_bit = xfs_next_bit(bip->bli_format.blf_data_map, 271 first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
272 bip->bli_format.blf_map_size, 0); 272 bip->bli_format.blf_map_size, 0);
273 ASSERT(first_bit != -1); 273 ASSERT(first_bit != -1);
274 last_bit = first_bit; 274 last_bit = first_bit;
275 nbits = 1; 275 nbits = 1;
276 for (;;) { 276 for (;;) {
277 /* 277 /*
278 * This takes the bit number to start looking from and 278 * This takes the bit number to start looking from and
279 * returns the next set bit from there. It returns -1 279 * returns the next set bit from there. It returns -1
280 * if there are no more bits set or the start bit is 280 * if there are no more bits set or the start bit is
281 * beyond the end of the bitmap. 281 * beyond the end of the bitmap.
282 */ 282 */
283 next_bit = xfs_next_bit(bip->bli_format.blf_data_map, 283 next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
284 bip->bli_format.blf_map_size, 284 bip->bli_format.blf_map_size,
285 (uint)last_bit + 1); 285 (uint)last_bit + 1);
286 /* 286 /*
287 * If we run out of bits fill in the last iovec and get 287 * If we run out of bits fill in the last iovec and get
288 * out of the loop. 288 * out of the loop.
289 * Else if we start a new set of bits then fill in the 289 * Else if we start a new set of bits then fill in the
290 * iovec for the series we were looking at and start 290 * iovec for the series we were looking at and start
291 * counting the bits in the new one. 291 * counting the bits in the new one.
292 * Else we're still in the same set of bits so just 292 * Else we're still in the same set of bits so just
293 * keep counting and scanning. 293 * keep counting and scanning.
294 */ 294 */
295 if (next_bit == -1) { 295 if (next_bit == -1) {
296 buffer_offset = first_bit * XFS_BLI_CHUNK; 296 buffer_offset = first_bit * XFS_BLI_CHUNK;
297 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 297 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
298 vecp->i_len = nbits * XFS_BLI_CHUNK; 298 vecp->i_len = nbits * XFS_BLI_CHUNK;
299 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 299 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
300 nvecs++; 300 nvecs++;
301 break; 301 break;
302 } else if (next_bit != last_bit + 1) { 302 } else if (next_bit != last_bit + 1) {
303 buffer_offset = first_bit * XFS_BLI_CHUNK; 303 buffer_offset = first_bit * XFS_BLI_CHUNK;
304 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 304 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
305 vecp->i_len = nbits * XFS_BLI_CHUNK; 305 vecp->i_len = nbits * XFS_BLI_CHUNK;
306 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 306 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
307 nvecs++; 307 nvecs++;
308 vecp++; 308 vecp++;
309 first_bit = next_bit; 309 first_bit = next_bit;
310 last_bit = next_bit; 310 last_bit = next_bit;
311 nbits = 1; 311 nbits = 1;
312 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 312 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
313 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 313 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
314 XFS_BLI_CHUNK)) { 314 XFS_BLI_CHUNK)) {
315 buffer_offset = first_bit * XFS_BLI_CHUNK; 315 buffer_offset = first_bit * XFS_BLI_CHUNK;
316 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 316 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
317 vecp->i_len = nbits * XFS_BLI_CHUNK; 317 vecp->i_len = nbits * XFS_BLI_CHUNK;
318 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 318 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
319 /* You would think we need to bump the nvecs here too, but we do not 319 /* You would think we need to bump the nvecs here too, but we do not
320 * this number is used by recovery, and it gets confused by the boundary 320 * this number is used by recovery, and it gets confused by the boundary
321 * split here 321 * split here
322 * nvecs++; 322 * nvecs++;
323 */ 323 */
324 vecp++; 324 vecp++;
325 first_bit = next_bit; 325 first_bit = next_bit;
326 last_bit = next_bit; 326 last_bit = next_bit;
327 nbits = 1; 327 nbits = 1;
328 } else { 328 } else {
329 last_bit++; 329 last_bit++;
330 nbits++; 330 nbits++;
331 } 331 }
332 } 332 }
333 bip->bli_format.blf_size = nvecs; 333 bip->bli_format.blf_size = nvecs;
334 334
335 /* 335 /*
336 * Check to make sure everything is consistent. 336 * Check to make sure everything is consistent.
337 */ 337 */
338 xfs_buf_item_trace("FORMAT NORM", bip); 338 xfs_buf_item_trace("FORMAT NORM", bip);
339 xfs_buf_item_log_check(bip); 339 xfs_buf_item_log_check(bip);
340 } 340 }
341 341
342 /* 342 /*
343 * This is called to pin the buffer associated with the buf log 343 * This is called to pin the buffer associated with the buf log
344 * item in memory so it cannot be written out. Simply call bpin() 344 * item in memory so it cannot be written out. Simply call bpin()
345 * on the buffer to do this. 345 * on the buffer to do this.
346 */ 346 */
347 STATIC void 347 STATIC void
348 xfs_buf_item_pin( 348 xfs_buf_item_pin(
349 xfs_buf_log_item_t *bip) 349 xfs_buf_log_item_t *bip)
350 { 350 {
351 xfs_buf_t *bp; 351 xfs_buf_t *bp;
352 352
353 bp = bip->bli_buf; 353 bp = bip->bli_buf;
354 ASSERT(XFS_BUF_ISBUSY(bp)); 354 ASSERT(XFS_BUF_ISBUSY(bp));
355 ASSERT(atomic_read(&bip->bli_refcount) > 0); 355 ASSERT(atomic_read(&bip->bli_refcount) > 0);
356 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 356 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
357 (bip->bli_flags & XFS_BLI_STALE)); 357 (bip->bli_flags & XFS_BLI_STALE));
358 xfs_buf_item_trace("PIN", bip); 358 xfs_buf_item_trace("PIN", bip);
359 xfs_buftrace("XFS_PIN", bp); 359 xfs_buftrace("XFS_PIN", bp);
360 xfs_bpin(bp); 360 xfs_bpin(bp);
361 } 361 }
362 362
363 363
364 /* 364 /*
365 * This is called to unpin the buffer associated with the buf log 365 * This is called to unpin the buffer associated with the buf log
366 * item which was previously pinned with a call to xfs_buf_item_pin(). 366 * item which was previously pinned with a call to xfs_buf_item_pin().
367 * Just call bunpin() on the buffer to do this. 367 * Just call bunpin() on the buffer to do this.
368 * 368 *
369 * Also drop the reference to the buf item for the current transaction. 369 * Also drop the reference to the buf item for the current transaction.
370 * If the XFS_BLI_STALE flag is set and we are the last reference, 370 * If the XFS_BLI_STALE flag is set and we are the last reference,
371 * then free up the buf log item and unlock the buffer. 371 * then free up the buf log item and unlock the buffer.
372 */ 372 */
373 STATIC void 373 STATIC void
374 xfs_buf_item_unpin( 374 xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 375 xfs_buf_log_item_t *bip,
376 int stale) 376 int stale)
377 { 377 {
378 struct xfs_ail *ailp; 378 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 379 xfs_buf_t *bp;
380 int freed; 380 int freed;
381 381
382 bp = bip->bli_buf; 382 bp = bip->bli_buf;
383 ASSERT(bp != NULL); 383 ASSERT(bp != NULL);
384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); 384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
385 ASSERT(atomic_read(&bip->bli_refcount) > 0); 385 ASSERT(atomic_read(&bip->bli_refcount) > 0);
386 xfs_buf_item_trace("UNPIN", bip); 386 xfs_buf_item_trace("UNPIN", bip);
387 xfs_buftrace("XFS_UNPIN", bp); 387 xfs_buftrace("XFS_UNPIN", bp);
388 388
389 freed = atomic_dec_and_test(&bip->bli_refcount); 389 freed = atomic_dec_and_test(&bip->bli_refcount);
390 ailp = bip->bli_item.li_ailp; 390 ailp = bip->bli_item.li_ailp;
391 xfs_bunpin(bp); 391 xfs_bunpin(bp);
392 if (freed && stale) { 392 if (freed && stale) {
393 ASSERT(bip->bli_flags & XFS_BLI_STALE); 393 ASSERT(bip->bli_flags & XFS_BLI_STALE);
394 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 394 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
395 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 395 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
396 ASSERT(XFS_BUF_ISSTALE(bp)); 396 ASSERT(XFS_BUF_ISSTALE(bp));
397 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 397 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
398 xfs_buf_item_trace("UNPIN STALE", bip); 398 xfs_buf_item_trace("UNPIN STALE", bip);
399 xfs_buftrace("XFS_UNPIN STALE", bp); 399 xfs_buftrace("XFS_UNPIN STALE", bp);
400 /* 400 /*
401 * If we get called here because of an IO error, we may 401 * If we get called here because of an IO error, we may
402 * or may not have the item on the AIL. xfs_trans_ail_delete() 402 * or may not have the item on the AIL. xfs_trans_ail_delete()
403 * will take care of that situation. 403 * will take care of that situation.
404 * xfs_trans_ail_delete() drops the AIL lock. 404 * xfs_trans_ail_delete() drops the AIL lock.
405 */ 405 */
406 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 406 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
408 XFS_BUF_SET_FSPRIVATE(bp, NULL); 408 XFS_BUF_SET_FSPRIVATE(bp, NULL);
409 XFS_BUF_CLR_IODONE_FUNC(bp); 409 XFS_BUF_CLR_IODONE_FUNC(bp);
410 } else { 410 } else {
411 spin_lock(&ailp->xa_lock); 411 spin_lock(&ailp->xa_lock);
412 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 412 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
413 xfs_buf_item_relse(bp); 413 xfs_buf_item_relse(bp);
414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); 414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
415 } 415 }
416 xfs_buf_relse(bp); 416 xfs_buf_relse(bp);
417 } 417 }
418 } 418 }
419 419
420 /* 420 /*
421 * this is called from uncommit in the forced-shutdown path. 421 * this is called from uncommit in the forced-shutdown path.
422 * we need to check to see if the reference count on the log item 422 * we need to check to see if the reference count on the log item
423 * is going to drop to zero. If so, unpin will free the log item 423 * is going to drop to zero. If so, unpin will free the log item
424 * so we need to free the item's descriptor (that points to the item) 424 * so we need to free the item's descriptor (that points to the item)
425 * in the transaction. 425 * in the transaction.
426 */ 426 */
427 STATIC void 427 STATIC void
428 xfs_buf_item_unpin_remove( 428 xfs_buf_item_unpin_remove(
429 xfs_buf_log_item_t *bip, 429 xfs_buf_log_item_t *bip,
430 xfs_trans_t *tp) 430 xfs_trans_t *tp)
431 { 431 {
432 xfs_buf_t *bp; 432 xfs_buf_t *bp;
433 xfs_log_item_desc_t *lidp; 433 xfs_log_item_desc_t *lidp;
434 int stale = 0; 434 int stale = 0;
435 435
436 bp = bip->bli_buf; 436 bp = bip->bli_buf;
437 /* 437 /*
438 * will xfs_buf_item_unpin() call xfs_buf_item_relse()? 438 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
439 */ 439 */
440 if ((atomic_read(&bip->bli_refcount) == 1) && 440 if ((atomic_read(&bip->bli_refcount) == 1) &&
441 (bip->bli_flags & XFS_BLI_STALE)) { 441 (bip->bli_flags & XFS_BLI_STALE)) {
442 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 442 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
443 xfs_buf_item_trace("UNPIN REMOVE", bip); 443 xfs_buf_item_trace("UNPIN REMOVE", bip);
444 xfs_buftrace("XFS_UNPIN_REMOVE", bp); 444 xfs_buftrace("XFS_UNPIN_REMOVE", bp);
445 /* 445 /*
446 * yes -- clear the xaction descriptor in-use flag 446 * yes -- clear the xaction descriptor in-use flag
447 * and free the chunk if required. We can safely 447 * and free the chunk if required. We can safely
448 * do some work here and then call buf_item_unpin 448 * do some work here and then call buf_item_unpin
449 * to do the rest because if the if is true, then 449 * to do the rest because if the if is true, then
450 * we are holding the buffer locked so no one else 450 * we are holding the buffer locked so no one else
451 * will be able to bump up the refcount. 451 * will be able to bump up the refcount.
452 */ 452 */
453 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); 453 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
454 stale = lidp->lid_flags & XFS_LID_BUF_STALE; 454 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
455 xfs_trans_free_item(tp, lidp); 455 xfs_trans_free_item(tp, lidp);
456 /* 456 /*
457 * Since the transaction no longer refers to the buffer, 457 * Since the transaction no longer refers to the buffer,
458 * the buffer should no longer refer to the transaction. 458 * the buffer should no longer refer to the transaction.
459 */ 459 */
460 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 460 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
461 } 461 }
462 462
463 xfs_buf_item_unpin(bip, stale); 463 xfs_buf_item_unpin(bip, stale);
464 464
465 return; 465 return;
466 } 466 }
467 467
468 /* 468 /*
469 * This is called to attempt to lock the buffer associated with this 469 * This is called to attempt to lock the buffer associated with this
470 * buf log item. Don't sleep on the buffer lock. If we can't get 470 * buf log item. Don't sleep on the buffer lock. If we can't get
471 * the lock right away, return 0. If we can get the lock, pull the 471 * the lock right away, return 0. If we can get the lock, pull the
472 * buffer from the free list, mark it busy, and return 1. 472 * buffer from the free list, mark it busy, and return 1.
473 */ 473 */
474 STATIC uint 474 STATIC uint
475 xfs_buf_item_trylock( 475 xfs_buf_item_trylock(
476 xfs_buf_log_item_t *bip) 476 xfs_buf_log_item_t *bip)
477 { 477 {
478 xfs_buf_t *bp; 478 xfs_buf_t *bp;
479 479
480 bp = bip->bli_buf; 480 bp = bip->bli_buf;
481 481
482 if (XFS_BUF_ISPINNED(bp)) { 482 if (XFS_BUF_ISPINNED(bp)) {
483 return XFS_ITEM_PINNED; 483 return XFS_ITEM_PINNED;
484 } 484 }
485 485
486 if (!XFS_BUF_CPSEMA(bp)) { 486 if (!XFS_BUF_CPSEMA(bp)) {
487 return XFS_ITEM_LOCKED; 487 return XFS_ITEM_LOCKED;
488 } 488 }
489 489
490 /* 490 /*
491 * Remove the buffer from the free list. Only do this 491 * Remove the buffer from the free list. Only do this
492 * if it's on the free list. Private buffers like the 492 * if it's on the free list. Private buffers like the
493 * superblock buffer are not. 493 * superblock buffer are not.
494 */ 494 */
495 XFS_BUF_HOLD(bp); 495 XFS_BUF_HOLD(bp);
496 496
497 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 497 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498 xfs_buf_item_trace("TRYLOCK SUCCESS", bip); 498 xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
499 return XFS_ITEM_SUCCESS; 499 return XFS_ITEM_SUCCESS;
500 } 500 }
501 501
502 /* 502 /*
503 * Release the buffer associated with the buf log item. 503 * Release the buffer associated with the buf log item.
504 * If there is no dirty logged data associated with the 504 * If there is no dirty logged data associated with the
505 * buffer recorded in the buf log item, then free the 505 * buffer recorded in the buf log item, then free the
506 * buf log item and remove the reference to it in the 506 * buf log item and remove the reference to it in the
507 * buffer. 507 * buffer.
508 * 508 *
509 * This call ignores the recursion count. It is only called 509 * This call ignores the recursion count. It is only called
510 * when the buffer should REALLY be unlocked, regardless 510 * when the buffer should REALLY be unlocked, regardless
511 * of the recursion count. 511 * of the recursion count.
512 * 512 *
513 * If the XFS_BLI_HOLD flag is set in the buf log item, then 513 * If the XFS_BLI_HOLD flag is set in the buf log item, then
514 * free the log item if necessary but do not unlock the buffer. 514 * free the log item if necessary but do not unlock the buffer.
515 * This is for support of xfs_trans_bhold(). Make sure the 515 * This is for support of xfs_trans_bhold(). Make sure the
516 * XFS_BLI_HOLD field is cleared if we don't free the item. 516 * XFS_BLI_HOLD field is cleared if we don't free the item.
517 */ 517 */
518 STATIC void 518 STATIC void
519 xfs_buf_item_unlock( 519 xfs_buf_item_unlock(
520 xfs_buf_log_item_t *bip) 520 xfs_buf_log_item_t *bip)
521 { 521 {
522 int aborted; 522 int aborted;
523 xfs_buf_t *bp; 523 xfs_buf_t *bp;
524 uint hold; 524 uint hold;
525 525
526 bp = bip->bli_buf; 526 bp = bip->bli_buf;
527 xfs_buftrace("XFS_UNLOCK", bp); 527 xfs_buftrace("XFS_UNLOCK", bp);
528 528
529 /* 529 /*
530 * Clear the buffer's association with this transaction. 530 * Clear the buffer's association with this transaction.
531 */ 531 */
532 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 532 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
533 533
534 /* 534 /*
535 * If this is a transaction abort, don't return early. 535 * If this is a transaction abort, don't return early.
536 * Instead, allow the brelse to happen. 536 * Instead, allow the brelse to happen.
537 * Normally it would be done for stale (cancelled) buffers 537 * Normally it would be done for stale (cancelled) buffers
538 * at unpin time, but we'll never go through the pin/unpin 538 * at unpin time, but we'll never go through the pin/unpin
539 * cycle if we abort inside commit. 539 * cycle if we abort inside commit.
540 */ 540 */
541 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 541 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
542 542
543 /* 543 /*
544 * If the buf item is marked stale, then don't do anything. 544 * If the buf item is marked stale, then don't do anything.
545 * We'll unlock the buffer and free the buf item when the 545 * We'll unlock the buffer and free the buf item when the
546 * buffer is unpinned for the last time. 546 * buffer is unpinned for the last time.
547 */ 547 */
548 if (bip->bli_flags & XFS_BLI_STALE) { 548 if (bip->bli_flags & XFS_BLI_STALE) {
549 bip->bli_flags &= ~XFS_BLI_LOGGED; 549 bip->bli_flags &= ~XFS_BLI_LOGGED;
550 xfs_buf_item_trace("UNLOCK STALE", bip); 550 xfs_buf_item_trace("UNLOCK STALE", bip);
551 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 551 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
552 if (!aborted) 552 if (!aborted)
553 return; 553 return;
554 } 554 }
555 555
556 /* 556 /*
557 * Drop the transaction's reference to the log item if 557 * Drop the transaction's reference to the log item if
558 * it was not logged as part of the transaction. Otherwise 558 * it was not logged as part of the transaction. Otherwise
559 * we'll drop the reference in xfs_buf_item_unpin() when 559 * we'll drop the reference in xfs_buf_item_unpin() when
560 * the transaction is really through with the buffer. 560 * the transaction is really through with the buffer.
561 */ 561 */
562 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 562 if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
563 atomic_dec(&bip->bli_refcount); 563 atomic_dec(&bip->bli_refcount);
564 } else { 564 } else {
565 /* 565 /*
566 * Clear the logged flag since this is per 566 * Clear the logged flag since this is per
567 * transaction state. 567 * transaction state.
568 */ 568 */
569 bip->bli_flags &= ~XFS_BLI_LOGGED; 569 bip->bli_flags &= ~XFS_BLI_LOGGED;
570 } 570 }
571 571
572 /* 572 /*
573 * Before possibly freeing the buf item, determine if we should 573 * Before possibly freeing the buf item, determine if we should
574 * release the buffer at the end of this routine. 574 * release the buffer at the end of this routine.
575 */ 575 */
576 hold = bip->bli_flags & XFS_BLI_HOLD; 576 hold = bip->bli_flags & XFS_BLI_HOLD;
577 xfs_buf_item_trace("UNLOCK", bip); 577 xfs_buf_item_trace("UNLOCK", bip);
578 578
579 /* 579 /*
580 * If the buf item isn't tracking any data, free it. 580 * If the buf item isn't tracking any data, free it.
581 * Otherwise, if XFS_BLI_HOLD is set clear it. 581 * Otherwise, if XFS_BLI_HOLD is set clear it.
582 */ 582 */
583 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 583 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
584 bip->bli_format.blf_map_size)) { 584 bip->bli_format.blf_map_size)) {
585 xfs_buf_item_relse(bp); 585 xfs_buf_item_relse(bp);
586 } else if (hold) { 586 } else if (hold) {
587 bip->bli_flags &= ~XFS_BLI_HOLD; 587 bip->bli_flags &= ~XFS_BLI_HOLD;
588 } 588 }
589 589
590 /* 590 /*
591 * Release the buffer if XFS_BLI_HOLD was not set. 591 * Release the buffer if XFS_BLI_HOLD was not set.
592 */ 592 */
593 if (!hold) { 593 if (!hold) {
594 xfs_buf_relse(bp); 594 xfs_buf_relse(bp);
595 } 595 }
596 } 596 }
597 597
598 /* 598 /*
599 * This is called to find out where the oldest active copy of the 599 * This is called to find out where the oldest active copy of the
600 * buf log item in the on disk log resides now that the last log 600 * buf log item in the on disk log resides now that the last log
601 * write of it completed at the given lsn. 601 * write of it completed at the given lsn.
602 * We always re-log all the dirty data in a buffer, so usually the 602 * We always re-log all the dirty data in a buffer, so usually the
603 * latest copy in the on disk log is the only one that matters. For 603 * latest copy in the on disk log is the only one that matters. For
604 * those cases we simply return the given lsn. 604 * those cases we simply return the given lsn.
605 * 605 *
606 * The one exception to this is for buffers full of newly allocated 606 * The one exception to this is for buffers full of newly allocated
607 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF 607 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
608 * flag set, indicating that only the di_next_unlinked fields from the 608 * flag set, indicating that only the di_next_unlinked fields from the
609 * inodes in the buffers will be replayed during recovery. If the 609 * inodes in the buffers will be replayed during recovery. If the
610 * original newly allocated inode images have not yet been flushed 610 * original newly allocated inode images have not yet been flushed
611 * when the buffer is so relogged, then we need to make sure that we 611 * when the buffer is so relogged, then we need to make sure that we
612 * keep the old images in the 'active' portion of the log. We do this 612 * keep the old images in the 'active' portion of the log. We do this
613 * by returning the original lsn of that transaction here rather than 613 * by returning the original lsn of that transaction here rather than
614 * the current one. 614 * the current one.
615 */ 615 */
616 STATIC xfs_lsn_t 616 STATIC xfs_lsn_t
617 xfs_buf_item_committed( 617 xfs_buf_item_committed(
618 xfs_buf_log_item_t *bip, 618 xfs_buf_log_item_t *bip,
619 xfs_lsn_t lsn) 619 xfs_lsn_t lsn)
620 { 620 {
621 xfs_buf_item_trace("COMMITTED", bip); 621 xfs_buf_item_trace("COMMITTED", bip);
622 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 622 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
623 (bip->bli_item.li_lsn != 0)) { 623 (bip->bli_item.li_lsn != 0)) {
624 return bip->bli_item.li_lsn; 624 return bip->bli_item.li_lsn;
625 } 625 }
626 return (lsn); 626 return (lsn);
627 } 627 }
628 628
629 /* 629 /*
630 * This is called to asynchronously write the buffer associated with this 630 * This is called to asynchronously write the buffer associated with this
631 * buf log item out to disk. The buffer will already have been locked by 631 * buf log item out to disk. The buffer will already have been locked by
632 * a successful call to xfs_buf_item_trylock(). If the buffer still has 632 * a successful call to xfs_buf_item_trylock(). If the buffer still has
633 * B_DELWRI set, then get it going out to disk with a call to bawrite(). 633 * B_DELWRI set, then get it going out to disk with a call to bawrite().
634 * If not, then just release the buffer. 634 * If not, then just release the buffer.
635 */ 635 */
636 STATIC void 636 STATIC void
637 xfs_buf_item_push( 637 xfs_buf_item_push(
638 xfs_buf_log_item_t *bip) 638 xfs_buf_log_item_t *bip)
639 { 639 {
640 xfs_buf_t *bp; 640 xfs_buf_t *bp;
641 641
642 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 642 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
643 xfs_buf_item_trace("PUSH", bip); 643 xfs_buf_item_trace("PUSH", bip);
644 644
645 bp = bip->bli_buf; 645 bp = bip->bli_buf;
646 646
647 if (XFS_BUF_ISDELAYWRITE(bp)) { 647 if (XFS_BUF_ISDELAYWRITE(bp)) {
648 int error; 648 int error;
649 error = xfs_bawrite(bip->bli_item.li_mountp, bp); 649 error = xfs_bawrite(bip->bli_item.li_mountp, bp);
650 if (error) 650 if (error)
651 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, 651 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
652 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", 652 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
653 error, bip, bp); 653 error, bip, bp);
654 } else { 654 } else {
655 xfs_buf_relse(bp); 655 xfs_buf_relse(bp);
656 } 656 }
657 } 657 }
658 658
659 /* ARGSUSED */ 659 /* ARGSUSED */
660 STATIC void 660 STATIC void
661 xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) 661 xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
662 { 662 {
663 } 663 }
664 664
665 /* 665 /*
666 * This is the ops vector shared by all buf log items. 666 * This is the ops vector shared by all buf log items.
667 */ 667 */
668 static struct xfs_item_ops xfs_buf_item_ops = { 668 static struct xfs_item_ops xfs_buf_item_ops = {
669 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, 669 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
670 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 670 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
671 xfs_buf_item_format, 671 xfs_buf_item_format,
672 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 672 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
673 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 673 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
674 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 674 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
675 xfs_buf_item_unpin_remove, 675 xfs_buf_item_unpin_remove,
676 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 676 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
677 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, 677 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
678 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 678 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
679 xfs_buf_item_committed, 679 xfs_buf_item_committed,
680 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, 680 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
681 .iop_pushbuf = NULL, 681 .iop_pushbuf = NULL,
682 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 682 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
683 xfs_buf_item_committing 683 xfs_buf_item_committing
684 }; 684 };
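
This ops vector is how the generic transaction and AIL code drives a buf log item without knowing its concrete type; each entry is one of the typed handlers above cast to the generic xfs_log_item_t signature. A rough sketch of how such a vector is consumed when the AIL pushes an item (the real xfsaild loop handles more return codes and is not part of this diff, so treat the shape below as an assumption):

	static void
	push_one_item(xfs_log_item_t *lip)
	{
		switch (lip->li_ops->iop_trylock(lip)) {
		case XFS_ITEM_SUCCESS:
			lip->li_ops->iop_push(lip);	/* e.g. xfs_buf_item_push() */
			break;
		case XFS_ITEM_PINNED:
		case XFS_ITEM_LOCKED:
			break;				/* revisit on a later push */
		}
	}
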
685 685
686 686
687 /* 687 /*
688 * Allocate a new buf log item to go with the given buffer. 688 * Allocate a new buf log item to go with the given buffer.
689 * Set the buffer's b_fsprivate field to point to the new 689 * Set the buffer's b_fsprivate field to point to the new
690 * buf log item. If there are other items attached to the 690 * buf log item. If there are other items attached to the
691 * buffer (see xfs_buf_attach_iodone() below), then put the 691 * buffer (see xfs_buf_attach_iodone() below), then put the
692 * buf log item at the front. 692 * buf log item at the front.
693 */ 693 */
694 void 694 void
695 xfs_buf_item_init( 695 xfs_buf_item_init(
696 xfs_buf_t *bp, 696 xfs_buf_t *bp,
697 xfs_mount_t *mp) 697 xfs_mount_t *mp)
698 { 698 {
699 xfs_log_item_t *lip; 699 xfs_log_item_t *lip;
700 xfs_buf_log_item_t *bip; 700 xfs_buf_log_item_t *bip;
701 int chunks; 701 int chunks;
702 int map_size; 702 int map_size;
703 703
704 /* 704 /*
705 * Check to see if there is already a buf log item for 705 * Check to see if there is already a buf log item for
706 * this buffer. If there is, it is guaranteed to be 706 * this buffer. If there is, it is guaranteed to be
707 * the first. If we do already have one, there is 707 * the first. If we do already have one, there is
708 * nothing to do here so return. 708 * nothing to do here so return.
709 */ 709 */
710 if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) 710 if (bp->b_mount != mp)
711 XFS_BUF_SET_FSPRIVATE3(bp, mp); 711 bp->b_mount = mp;
712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
715 if (lip->li_type == XFS_LI_BUF) { 715 if (lip->li_type == XFS_LI_BUF) {
716 return; 716 return;
717 } 717 }
718 } 718 }
719 719
720 /* 720 /*
721 * chunks is the number of XFS_BLI_CHUNK size pieces 721 * chunks is the number of XFS_BLI_CHUNK size pieces
722 * the buffer can be divided into. Make sure not to 722 * the buffer can be divided into. Make sure not to
723 * truncate any pieces. map_size is the size of the 723 * truncate any pieces. map_size is the size of the
724 * bitmap needed to describe the chunks of the buffer. 724 * bitmap needed to describe the chunks of the buffer.
725 */ 725 */
726 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 726 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
727 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 727 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
728 728
729 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 729 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
730 KM_SLEEP); 730 KM_SLEEP);
731 bip->bli_item.li_type = XFS_LI_BUF; 731 bip->bli_item.li_type = XFS_LI_BUF;
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_item.li_ailp = mp->m_ail; 734 bip->bli_item.li_ailp = mp->m_ail;
735 bip->bli_buf = bp; 735 bip->bli_buf = bp;
736 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
737 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
738 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 738 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
739 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 739 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
740 bip->bli_format.blf_map_size = map_size; 740 bip->bli_format.blf_map_size = map_size;
741 #ifdef XFS_BLI_TRACE 741 #ifdef XFS_BLI_TRACE
742 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS); 742 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
743 #endif 743 #endif
744 744
745 #ifdef XFS_TRANS_DEBUG 745 #ifdef XFS_TRANS_DEBUG
746 /* 746 /*
747 * Allocate the arrays for tracking what needs to be logged 747 * Allocate the arrays for tracking what needs to be logged
748 * and what our callers request to be logged. bli_orig 748 * and what our callers request to be logged. bli_orig
749 * holds a copy of the original, clean buffer for comparison 749 * holds a copy of the original, clean buffer for comparison
750 * against, and bli_logged keeps a 1 bit flag per byte in 750 * against, and bli_logged keeps a 1 bit flag per byte in
751 * the buffer to indicate which bytes the callers have asked 751 * the buffer to indicate which bytes the callers have asked
752 * to have logged. 752 * to have logged.
753 */ 753 */
754 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); 754 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
755 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); 755 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
756 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); 756 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
757 #endif 757 #endif
758 758
759 /* 759 /*
760 * Put the buf item into the list of items attached to the 760 * Put the buf item into the list of items attached to the
761 * buffer at the front. 761 * buffer at the front.
762 */ 762 */
763 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 763 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
764 bip->bli_item.li_bio_list = 764 bip->bli_item.li_bio_list =
765 XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 765 XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
766 } 766 }
767 XFS_BUF_SET_FSPRIVATE(bp, bip); 767 XFS_BUF_SET_FSPRIVATE(bp, bip);
768 } 768 }
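
The chunks/map_size arithmetic near the top of xfs_buf_item_init() rounds the buffer length up to whole XFS_BLI_CHUNK pieces and then sizes the dirty bitmap in words. A worked example, assuming the usual values XFS_BLI_CHUNK == 128 (so XFS_BLI_SHIFT == 7), NBWORD == 32 and BIT_TO_WORD_SHIFT == 5; these constants are not shown in this hunk, so treat them as assumptions:

	/* 4 KiB buffer, 128-byte chunks */
	chunks   = (4096 + (128 - 1)) >> 7;	/* = 32 chunks          */
	map_size = (32 + 32) >> 5;		/* = 2 bitmap words     */
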
769 769
770 770
771 /* 771 /*
772 * Mark bytes first through last inclusive as dirty in the buf 772 * Mark bytes first through last inclusive as dirty in the buf
773 * item's bitmap. 773 * item's bitmap.
774 */ 774 */
775 void 775 void
776 xfs_buf_item_log( 776 xfs_buf_item_log(
777 xfs_buf_log_item_t *bip, 777 xfs_buf_log_item_t *bip,
778 uint first, 778 uint first,
779 uint last) 779 uint last)
780 { 780 {
781 uint first_bit; 781 uint first_bit;
782 uint last_bit; 782 uint last_bit;
783 uint bits_to_set; 783 uint bits_to_set;
784 uint bits_set; 784 uint bits_set;
785 uint word_num; 785 uint word_num;
786 uint *wordp; 786 uint *wordp;
787 uint bit; 787 uint bit;
788 uint end_bit; 788 uint end_bit;
789 uint mask; 789 uint mask;
790 790
791 /* 791 /*
792 * Mark the item as having some dirty data for 792 * Mark the item as having some dirty data for
793 * quick reference in xfs_buf_item_dirty. 793 * quick reference in xfs_buf_item_dirty.
794 */ 794 */
795 bip->bli_flags |= XFS_BLI_DIRTY; 795 bip->bli_flags |= XFS_BLI_DIRTY;
796 796
797 /* 797 /*
798 * Convert byte offsets to bit numbers. 798 * Convert byte offsets to bit numbers.
799 */ 799 */
800 first_bit = first >> XFS_BLI_SHIFT; 800 first_bit = first >> XFS_BLI_SHIFT;
801 last_bit = last >> XFS_BLI_SHIFT; 801 last_bit = last >> XFS_BLI_SHIFT;
802 802
803 /* 803 /*
804 * Calculate the total number of bits to be set. 804 * Calculate the total number of bits to be set.
805 */ 805 */
806 bits_to_set = last_bit - first_bit + 1; 806 bits_to_set = last_bit - first_bit + 1;
807 807
808 /* 808 /*
809 * Get a pointer to the first word in the bitmap 809 * Get a pointer to the first word in the bitmap
810 * to set a bit in. 810 * to set a bit in.
811 */ 811 */
812 word_num = first_bit >> BIT_TO_WORD_SHIFT; 812 word_num = first_bit >> BIT_TO_WORD_SHIFT;
813 wordp = &(bip->bli_format.blf_data_map[word_num]); 813 wordp = &(bip->bli_format.blf_data_map[word_num]);
814 814
815 /* 815 /*
816 * Calculate the starting bit in the first word. 816 * Calculate the starting bit in the first word.
817 */ 817 */
818 bit = first_bit & (uint)(NBWORD - 1); 818 bit = first_bit & (uint)(NBWORD - 1);
819 819
820 /* 820 /*
821 * First set any bits in the first word of our range. 821 * First set any bits in the first word of our range.
822 * If it starts at bit 0 of the word, it will be 822 * If it starts at bit 0 of the word, it will be
823 * set below rather than here. That is what the variable 823 * set below rather than here. That is what the variable
824 * bit tells us. The variable bits_set tracks the number 824 * bit tells us. The variable bits_set tracks the number
825 * of bits that have been set so far. End_bit is the number 825 * of bits that have been set so far. End_bit is the number
826 * of the last bit to be set in this word plus one. 826 * of the last bit to be set in this word plus one.
827 */ 827 */
828 if (bit) { 828 if (bit) {
829 end_bit = MIN(bit + bits_to_set, (uint)NBWORD); 829 end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
830 mask = ((1 << (end_bit - bit)) - 1) << bit; 830 mask = ((1 << (end_bit - bit)) - 1) << bit;
831 *wordp |= mask; 831 *wordp |= mask;
832 wordp++; 832 wordp++;
833 bits_set = end_bit - bit; 833 bits_set = end_bit - bit;
834 } else { 834 } else {
835 bits_set = 0; 835 bits_set = 0;
836 } 836 }
837 837
838 /* 838 /*
839 * Now set bits a whole word at a time that are between 839 * Now set bits a whole word at a time that are between
840 * first_bit and last_bit. 840 * first_bit and last_bit.
841 */ 841 */
842 while ((bits_to_set - bits_set) >= NBWORD) { 842 while ((bits_to_set - bits_set) >= NBWORD) {
843 *wordp |= 0xffffffff; 843 *wordp |= 0xffffffff;
844 bits_set += NBWORD; 844 bits_set += NBWORD;
845 wordp++; 845 wordp++;
846 } 846 }
847 847
848 /* 848 /*
849 * Finally, set any bits left to be set in one last partial word. 849 * Finally, set any bits left to be set in one last partial word.
850 */ 850 */
851 end_bit = bits_to_set - bits_set; 851 end_bit = bits_to_set - bits_set;
852 if (end_bit) { 852 if (end_bit) {
853 mask = (1 << end_bit) - 1; 853 mask = (1 << end_bit) - 1;
854 *wordp |= mask; 854 *wordp |= mask;
855 } 855 }
856 856
857 xfs_buf_item_log_debug(bip, first, last); 857 xfs_buf_item_log_debug(bip, first, last);
858 } 858 }
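The dirty-range marking above is a three-phase bitmap update: a partial leading word, a run of whole words, then a partial trailing word. The standalone C sketch below (illustrative names, 32-bit words assumed; it is not the kernel code) shows the same technique in isolation. In xfs_buf_item_log() each bit covers a chunk of the buffer rather than a single byte, which is why the byte offsets are shifted down by XFS_BLI_SHIFT before the bit arithmetic.

#include <stdint.h>
#include <stdio.h>

#define NBWORD 32u

static void set_bit_range(uint32_t *map, unsigned first_bit, unsigned last_bit)
{
	unsigned bits_to_set = last_bit - first_bit + 1;
	uint32_t *wordp = &map[first_bit / NBWORD];
	unsigned bit = first_bit & (NBWORD - 1);
	unsigned bits_set = 0;

	if (bit) {
		/* partial leading word: set bits [bit, end) */
		unsigned end = (bit + bits_to_set < NBWORD) ? bit + bits_to_set : NBWORD;

		*wordp |= ((1u << (end - bit)) - 1) << bit;
		wordp++;
		bits_set = end - bit;
	}
	while (bits_to_set - bits_set >= NBWORD) {
		/* whole 32-bit words in the middle of the range */
		*wordp |= 0xffffffffu;
		bits_set += NBWORD;
		wordp++;
	}
	if (bits_to_set - bits_set) {
		/* partial trailing word */
		*wordp |= (1u << (bits_to_set - bits_set)) - 1;
	}
}

int main(void)
{
	uint32_t map[4] = { 0 };
	int i;

	set_bit_range(map, 30, 70);	/* spans three words */
	for (i = 0; i < 4; i++)
		printf("word %d: 0x%08x\n", i, (unsigned)map[i]);
	return 0;
}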
859 859
860 860
861 /* 861 /*
862 * Return 1 if the buffer has some data that has been logged (at any 862 * Return 1 if the buffer has some data that has been logged (at any
863 * point, not just the current transaction) and 0 if not. 863 * point, not just the current transaction) and 0 if not.
864 */ 864 */
865 uint 865 uint
866 xfs_buf_item_dirty( 866 xfs_buf_item_dirty(
867 xfs_buf_log_item_t *bip) 867 xfs_buf_log_item_t *bip)
868 { 868 {
869 return (bip->bli_flags & XFS_BLI_DIRTY); 869 return (bip->bli_flags & XFS_BLI_DIRTY);
870 } 870 }
871 871
872 STATIC void 872 STATIC void
873 xfs_buf_item_free( 873 xfs_buf_item_free(
874 xfs_buf_log_item_t *bip) 874 xfs_buf_log_item_t *bip)
875 { 875 {
876 #ifdef XFS_TRANS_DEBUG 876 #ifdef XFS_TRANS_DEBUG
877 kmem_free(bip->bli_orig); 877 kmem_free(bip->bli_orig);
878 kmem_free(bip->bli_logged); 878 kmem_free(bip->bli_logged);
879 #endif /* XFS_TRANS_DEBUG */ 879 #endif /* XFS_TRANS_DEBUG */
880 880
881 #ifdef XFS_BLI_TRACE 881 #ifdef XFS_BLI_TRACE
882 ktrace_free(bip->bli_trace); 882 ktrace_free(bip->bli_trace);
883 #endif 883 #endif
884 kmem_zone_free(xfs_buf_item_zone, bip); 884 kmem_zone_free(xfs_buf_item_zone, bip);
885 } 885 }
886 886
887 /* 887 /*
888 * This is called when the buf log item is no longer needed. It should 888 * This is called when the buf log item is no longer needed. It should
889 * free the buf log item associated with the given buffer and clear 889 * free the buf log item associated with the given buffer and clear
890 * the buffer's pointer to the buf log item. If there are no more 890 * the buffer's pointer to the buf log item. If there are no more
891 * items in the list, clear the b_iodone field of the buffer (see 891 * items in the list, clear the b_iodone field of the buffer (see
892 * xfs_buf_attach_iodone() below). 892 * xfs_buf_attach_iodone() below).
893 */ 893 */
894 void 894 void
895 xfs_buf_item_relse( 895 xfs_buf_item_relse(
896 xfs_buf_t *bp) 896 xfs_buf_t *bp)
897 { 897 {
898 xfs_buf_log_item_t *bip; 898 xfs_buf_log_item_t *bip;
899 899
900 xfs_buftrace("XFS_RELSE", bp); 900 xfs_buftrace("XFS_RELSE", bp);
901 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); 901 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
902 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); 902 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
903 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && 903 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
904 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 904 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
905 XFS_BUF_CLR_IODONE_FUNC(bp); 905 XFS_BUF_CLR_IODONE_FUNC(bp);
906 } 906 }
907 xfs_buf_rele(bp); 907 xfs_buf_rele(bp);
908 xfs_buf_item_free(bip); 908 xfs_buf_item_free(bip);
909 } 909 }
910 910
911 911
912 /* 912 /*
913 * Add the given log item with its callback to the list of callbacks 913 * Add the given log item with its callback to the list of callbacks
914 * to be called when the buffer's I/O completes. If it is not set 914 * to be called when the buffer's I/O completes. If it is not set
915 * already, set the buffer's b_iodone() routine to be 915 * already, set the buffer's b_iodone() routine to be
916 * xfs_buf_iodone_callbacks() and link the log item into the list of 916 * xfs_buf_iodone_callbacks() and link the log item into the list of
917 * items rooted at b_fsprivate. Items are always added as the second 917 * items rooted at b_fsprivate. Items are always added as the second
918 * entry in the list if there is a first, because the buf item code 918 * entry in the list if there is a first, because the buf item code
919 * assumes that the buf log item is first. 919 * assumes that the buf log item is first.
920 */ 920 */
921 void 921 void
922 xfs_buf_attach_iodone( 922 xfs_buf_attach_iodone(
923 xfs_buf_t *bp, 923 xfs_buf_t *bp,
924 void (*cb)(xfs_buf_t *, xfs_log_item_t *), 924 void (*cb)(xfs_buf_t *, xfs_log_item_t *),
925 xfs_log_item_t *lip) 925 xfs_log_item_t *lip)
926 { 926 {
927 xfs_log_item_t *head_lip; 927 xfs_log_item_t *head_lip;
928 928
929 ASSERT(XFS_BUF_ISBUSY(bp)); 929 ASSERT(XFS_BUF_ISBUSY(bp));
930 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 930 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
931 931
932 lip->li_cb = cb; 932 lip->li_cb = cb;
933 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 933 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
934 head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 934 head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
935 lip->li_bio_list = head_lip->li_bio_list; 935 lip->li_bio_list = head_lip->li_bio_list;
936 head_lip->li_bio_list = lip; 936 head_lip->li_bio_list = lip;
937 } else { 937 } else {
938 XFS_BUF_SET_FSPRIVATE(bp, lip); 938 XFS_BUF_SET_FSPRIVATE(bp, lip);
939 } 939 }
940 940
941 ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) || 941 ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
942 (XFS_BUF_IODONE_FUNC(bp) == NULL)); 942 (XFS_BUF_IODONE_FUNC(bp) == NULL));
943 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 943 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
944 } 944 }
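The ordering rule described in the comment above — the buf log item stays at the head and later callbacks are spliced in behind it — is easiest to see in isolation. A minimal sketch, with made-up types standing in for xfs_log_item_t:

#include <stddef.h>

struct demo_item {
	struct demo_item *bio_list;	/* next item on the buffer's list */
};

static void demo_attach(struct demo_item **headp, struct demo_item *lip)
{
	if (*headp != NULL) {
		/* keep the existing head first; the new item becomes second */
		lip->bio_list = (*headp)->bio_list;
		(*headp)->bio_list = lip;
	} else {
		/* empty list: the new item becomes the head */
		lip->bio_list = NULL;
		*headp = lip;
	}
}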
945 945
946 STATIC void 946 STATIC void
947 xfs_buf_do_callbacks( 947 xfs_buf_do_callbacks(
948 xfs_buf_t *bp, 948 xfs_buf_t *bp,
949 xfs_log_item_t *lip) 949 xfs_log_item_t *lip)
950 { 950 {
951 xfs_log_item_t *nlip; 951 xfs_log_item_t *nlip;
952 952
953 while (lip != NULL) { 953 while (lip != NULL) {
954 nlip = lip->li_bio_list; 954 nlip = lip->li_bio_list;
955 ASSERT(lip->li_cb != NULL); 955 ASSERT(lip->li_cb != NULL);
956 /* 956 /*
957 * Clear the next pointer so we don't have any 957 * Clear the next pointer so we don't have any
958 * confusion if the item is added to another buf. 958 * confusion if the item is added to another buf.
959 * Don't touch the log item after calling its 959 * Don't touch the log item after calling its
960 * callback, because it could have freed itself. 960 * callback, because it could have freed itself.
961 */ 961 */
962 lip->li_bio_list = NULL; 962 lip->li_bio_list = NULL;
963 lip->li_cb(bp, lip); 963 lip->li_cb(bp, lip);
964 lip = nlip; 964 lip = nlip;
965 } 965 }
966 } 966 }
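xfs_buf_do_callbacks() uses the "save the next pointer before the call" idiom, because a callback may free its own log item. A condensed sketch of that pattern (illustrative types, not the XFS ones):

#include <stddef.h>

struct demo_li {
	struct demo_li *next;
	void (*cb)(struct demo_li *);
};

static void demo_run_callbacks(struct demo_li *lip)
{
	while (lip != NULL) {
		struct demo_li *nlip = lip->next;	/* grab the next link first */

		lip->next = NULL;	/* unlink so a later re-add elsewhere is clean */
		lip->cb(lip);		/* lip may be freed by its callback */
		lip = nlip;		/* safe: the link was saved beforehand */
	}
}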
967 967
968 /* 968 /*
969 * This is the iodone() function for buffers which have had callbacks 969 * This is the iodone() function for buffers which have had callbacks
970 * attached to them by xfs_buf_attach_iodone(). It should remove each 970 * attached to them by xfs_buf_attach_iodone(). It should remove each
971 * log item from the buffer's list and call the callback of each in turn. 971 * log item from the buffer's list and call the callback of each in turn.
972 * When done, the buffer's fsprivate field is set to NULL and the buffer 972 * When done, the buffer's fsprivate field is set to NULL and the buffer
973 * is unlocked with a call to iodone(). 973 * is unlocked with a call to iodone().
974 */ 974 */
975 void 975 void
976 xfs_buf_iodone_callbacks( 976 xfs_buf_iodone_callbacks(
977 xfs_buf_t *bp) 977 xfs_buf_t *bp)
978 { 978 {
979 xfs_log_item_t *lip; 979 xfs_log_item_t *lip;
980 static ulong lasttime; 980 static ulong lasttime;
981 static xfs_buftarg_t *lasttarg; 981 static xfs_buftarg_t *lasttarg;
982 xfs_mount_t *mp; 982 xfs_mount_t *mp;
983 983
984 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 984 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
985 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 985 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
986 986
987 if (XFS_BUF_GETERROR(bp) != 0) { 987 if (XFS_BUF_GETERROR(bp) != 0) {
988 /* 988 /*
989 * If we've already decided to shutdown the filesystem 989 * If we've already decided to shutdown the filesystem
990 * because of IO errors, there's no point in giving this 990 * because of IO errors, there's no point in giving this
991 * a retry. 991 * a retry.
992 */ 992 */
993 mp = lip->li_mountp; 993 mp = lip->li_mountp;
994 if (XFS_FORCED_SHUTDOWN(mp)) { 994 if (XFS_FORCED_SHUTDOWN(mp)) {
995 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 995 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
996 XFS_BUF_SUPER_STALE(bp); 996 XFS_BUF_SUPER_STALE(bp);
997 xfs_buftrace("BUF_IODONE_CB", bp); 997 xfs_buftrace("BUF_IODONE_CB", bp);
998 xfs_buf_do_callbacks(bp, lip); 998 xfs_buf_do_callbacks(bp, lip);
999 XFS_BUF_SET_FSPRIVATE(bp, NULL); 999 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1000 XFS_BUF_CLR_IODONE_FUNC(bp); 1000 XFS_BUF_CLR_IODONE_FUNC(bp);
1001 1001
1002 /* 1002 /*
1003 * XFS_SHUT flag gets set when we go thru the 1003 * XFS_SHUT flag gets set when we go thru the
1004 * entire buffer cache and deliberately start 1004 * entire buffer cache and deliberately start
1005 * throwing away delayed write buffers. 1005 * throwing away delayed write buffers.
1006 * Since there's no biowait done on those, 1006 * Since there's no biowait done on those,
1007 * we should just brelse them. 1007 * we should just brelse them.
1008 */ 1008 */
1009 if (XFS_BUF_ISSHUT(bp)) { 1009 if (XFS_BUF_ISSHUT(bp)) {
1010 XFS_BUF_UNSHUT(bp); 1010 XFS_BUF_UNSHUT(bp);
1011 xfs_buf_relse(bp); 1011 xfs_buf_relse(bp);
1012 } else { 1012 } else {
1013 xfs_biodone(bp); 1013 xfs_biodone(bp);
1014 } 1014 }
1015 1015
1016 return; 1016 return;
1017 } 1017 }
1018 1018
1019 if ((XFS_BUF_TARGET(bp) != lasttarg) || 1019 if ((XFS_BUF_TARGET(bp) != lasttarg) ||
1020 (time_after(jiffies, (lasttime + 5*HZ)))) { 1020 (time_after(jiffies, (lasttime + 5*HZ)))) {
1021 lasttime = jiffies; 1021 lasttime = jiffies;
1022 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 1022 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
1023 " block 0x%llx in %s", 1023 " block 0x%llx in %s",
1024 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 1024 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
1025 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 1025 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
1026 } 1026 }
1027 lasttarg = XFS_BUF_TARGET(bp); 1027 lasttarg = XFS_BUF_TARGET(bp);
1028 1028
1029 if (XFS_BUF_ISASYNC(bp)) { 1029 if (XFS_BUF_ISASYNC(bp)) {
1030 /* 1030 /*
1031 * If the write was asynchronous then no one will be 1031 * If the write was asynchronous then no one will be
1032 * looking for the error. Clear the error state 1032 * looking for the error. Clear the error state
1033 * and write the buffer out again delayed write. 1033 * and write the buffer out again delayed write.
1034 * 1034 *
1035 * XXXsup This is OK, so long as we catch these 1035 * XXXsup This is OK, so long as we catch these
1036 * before we start the umount; we don't want these 1036 * before we start the umount; we don't want these
1037 * DELWRI metadata bufs to be hanging around. 1037 * DELWRI metadata bufs to be hanging around.
1038 */ 1038 */
1039 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1039 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
1040 1040
1041 if (!(XFS_BUF_ISSTALE(bp))) { 1041 if (!(XFS_BUF_ISSTALE(bp))) {
1042 XFS_BUF_DELAYWRITE(bp); 1042 XFS_BUF_DELAYWRITE(bp);
1043 XFS_BUF_DONE(bp); 1043 XFS_BUF_DONE(bp);
1044 XFS_BUF_SET_START(bp); 1044 XFS_BUF_SET_START(bp);
1045 } 1045 }
1046 ASSERT(XFS_BUF_IODONE_FUNC(bp)); 1046 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1047 xfs_buftrace("BUF_IODONE ASYNC", bp); 1047 xfs_buftrace("BUF_IODONE ASYNC", bp);
1048 xfs_buf_relse(bp); 1048 xfs_buf_relse(bp);
1049 } else { 1049 } else {
1050 /* 1050 /*
1051 * If the write of the buffer was not asynchronous, 1051 * If the write of the buffer was not asynchronous,
1052 * then we want to make sure to return the error 1052 * then we want to make sure to return the error
1053 * to the caller of bwrite(). Because of this we 1053 * to the caller of bwrite(). Because of this we
1054 * cannot clear the B_ERROR state at this point. 1054 * cannot clear the B_ERROR state at this point.
1055 * Instead we install a callback function that 1055 * Instead we install a callback function that
1056 * will be called when the buffer is released, and 1056 * will be called when the buffer is released, and
1057 * that routine will clear the error state and 1057 * that routine will clear the error state and
1058 * set the buffer to be written out again after 1058 * set the buffer to be written out again after
1059 * some delay. 1059 * some delay.
1060 */ 1060 */
1061 /* We actually overwrite the existing b-relse 1061 /* We actually overwrite the existing b-relse
1062 function at times, but we're gonna be shutting down 1062 function at times, but we're gonna be shutting down
1063 anyway. */ 1063 anyway. */
1064 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); 1064 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1065 XFS_BUF_DONE(bp); 1065 XFS_BUF_DONE(bp);
1066 XFS_BUF_FINISH_IOWAIT(bp); 1066 XFS_BUF_FINISH_IOWAIT(bp);
1067 } 1067 }
1068 return; 1068 return;
1069 } 1069 }
1070 #ifdef XFSERRORDEBUG 1070 #ifdef XFSERRORDEBUG
1071 xfs_buftrace("XFS BUFCB NOERR", bp); 1071 xfs_buftrace("XFS BUFCB NOERR", bp);
1072 #endif 1072 #endif
1073 xfs_buf_do_callbacks(bp, lip); 1073 xfs_buf_do_callbacks(bp, lip);
1074 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1074 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1075 XFS_BUF_CLR_IODONE_FUNC(bp); 1075 XFS_BUF_CLR_IODONE_FUNC(bp);
1076 xfs_biodone(bp); 1076 xfs_biodone(bp);
1077 } 1077 }
1078 1078
1079 /* 1079 /*
1080 * This is a callback routine attached to a buffer which gets an error 1080 * This is a callback routine attached to a buffer which gets an error
1081 * when being written out synchronously. 1081 * when being written out synchronously.
1082 */ 1082 */
1083 STATIC void 1083 STATIC void
1084 xfs_buf_error_relse( 1084 xfs_buf_error_relse(
1085 xfs_buf_t *bp) 1085 xfs_buf_t *bp)
1086 { 1086 {
1087 xfs_log_item_t *lip; 1087 xfs_log_item_t *lip;
1088 xfs_mount_t *mp; 1088 xfs_mount_t *mp;
1089 1089
1090 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1090 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1091 mp = (xfs_mount_t *)lip->li_mountp; 1091 mp = (xfs_mount_t *)lip->li_mountp;
1092 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 1092 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1093 1093
1094 XFS_BUF_STALE(bp); 1094 XFS_BUF_STALE(bp);
1095 XFS_BUF_DONE(bp); 1095 XFS_BUF_DONE(bp);
1096 XFS_BUF_UNDELAYWRITE(bp); 1096 XFS_BUF_UNDELAYWRITE(bp);
1097 XFS_BUF_ERROR(bp,0); 1097 XFS_BUF_ERROR(bp,0);
1098 xfs_buftrace("BUF_ERROR_RELSE", bp); 1098 xfs_buftrace("BUF_ERROR_RELSE", bp);
1099 if (! XFS_FORCED_SHUTDOWN(mp)) 1099 if (! XFS_FORCED_SHUTDOWN(mp))
1100 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1100 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1101 /* 1101 /*
1102 * We have to unpin the pinned buffers so do the 1102 * We have to unpin the pinned buffers so do the
1103 * callbacks. 1103 * callbacks.
1104 */ 1104 */
1105 xfs_buf_do_callbacks(bp, lip); 1105 xfs_buf_do_callbacks(bp, lip);
1106 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1106 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1107 XFS_BUF_CLR_IODONE_FUNC(bp); 1107 XFS_BUF_CLR_IODONE_FUNC(bp);
1108 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1108 XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
1109 xfs_buf_relse(bp); 1109 xfs_buf_relse(bp);
1110 } 1110 }
1111 1111
1112 1112
1113 /* 1113 /*
1114 * This is the iodone() function for buffers which have been 1114 * This is the iodone() function for buffers which have been
1115 * logged. It is called when they are eventually flushed out. 1115 * logged. It is called when they are eventually flushed out.
1116 * It should remove the buf item from the AIL, and free the buf item. 1116 * It should remove the buf item from the AIL, and free the buf item.
1117 * It is called by xfs_buf_iodone_callbacks() above which will take 1117 * It is called by xfs_buf_iodone_callbacks() above which will take
1118 * care of cleaning up the buffer itself. 1118 * care of cleaning up the buffer itself.
1119 */ 1119 */
1120 /* ARGSUSED */ 1120 /* ARGSUSED */
1121 void 1121 void
1122 xfs_buf_iodone( 1122 xfs_buf_iodone(
1123 xfs_buf_t *bp, 1123 xfs_buf_t *bp,
1124 xfs_buf_log_item_t *bip) 1124 xfs_buf_log_item_t *bip)
1125 { 1125 {
1126 struct xfs_ail *ailp = bip->bli_item.li_ailp; 1126 struct xfs_ail *ailp = bip->bli_item.li_ailp;
1127 1127
1128 ASSERT(bip->bli_buf == bp); 1128 ASSERT(bip->bli_buf == bp);
1129 1129
1130 xfs_buf_rele(bp); 1130 xfs_buf_rele(bp);
1131 1131
1132 /* 1132 /*
1133 * If we are forcibly shutting down, this may well be 1133 * If we are forcibly shutting down, this may well be
1134 * off the AIL already. That's because we simulate the 1134 * off the AIL already. That's because we simulate the
1135 * log-committed callbacks to unpin these buffers. Or we may never 1135 * log-committed callbacks to unpin these buffers. Or we may never
1136 * have put this item on AIL because the transaction was 1136 * have put this item on AIL because the transaction was
1137 * aborted forcibly. xfs_trans_ail_delete() takes care of these. 1137 * aborted forcibly. xfs_trans_ail_delete() takes care of these.
1138 * 1138 *
1139 * Either way, AIL is useless if we're forcing a shutdown. 1139 * Either way, AIL is useless if we're forcing a shutdown.
1140 */ 1140 */
1141 spin_lock(&ailp->xa_lock); 1141 spin_lock(&ailp->xa_lock);
1142 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 1142 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1143 xfs_buf_item_free(bip); 1143 xfs_buf_item_free(bip);
1144 } 1144 }
1145 1145
1146 #if defined(XFS_BLI_TRACE) 1146 #if defined(XFS_BLI_TRACE)
1147 void 1147 void
1148 xfs_buf_item_trace( 1148 xfs_buf_item_trace(
1149 char *id, 1149 char *id,
1150 xfs_buf_log_item_t *bip) 1150 xfs_buf_log_item_t *bip)
1151 { 1151 {
1152 xfs_buf_t *bp; 1152 xfs_buf_t *bp;
1153 ASSERT(bip->bli_trace != NULL); 1153 ASSERT(bip->bli_trace != NULL);
1154 1154
1155 bp = bip->bli_buf; 1155 bp = bip->bli_buf;
1156 ktrace_enter(bip->bli_trace, 1156 ktrace_enter(bip->bli_trace,
1157 (void *)id, 1157 (void *)id,
1158 (void *)bip->bli_buf, 1158 (void *)bip->bli_buf,
1159 (void *)((unsigned long)bip->bli_flags), 1159 (void *)((unsigned long)bip->bli_flags),
1160 (void *)((unsigned long)bip->bli_recur), 1160 (void *)((unsigned long)bip->bli_recur),
1161 (void *)((unsigned long)atomic_read(&bip->bli_refcount)), 1161 (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
1162 (void *)((unsigned long) 1162 (void *)((unsigned long)
1163 (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)), 1163 (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)),
1164 (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))), 1164 (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))),
1165 (void *)((unsigned long)XFS_BUF_COUNT(bp)), 1165 (void *)((unsigned long)XFS_BUF_COUNT(bp)),
1166 (void *)((unsigned long)XFS_BUF_BFLAGS(bp)), 1166 (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
1167 XFS_BUF_FSPRIVATE(bp, void *), 1167 XFS_BUF_FSPRIVATE(bp, void *),
1168 XFS_BUF_FSPRIVATE2(bp, void *), 1168 XFS_BUF_FSPRIVATE2(bp, void *),
1169 (void *)(unsigned long)XFS_BUF_ISPINNED(bp), 1169 (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
1170 (void *)XFS_BUF_IODONE_FUNC(bp), 1170 (void *)XFS_BUF_IODONE_FUNC(bp),
1171 (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))), 1171 (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
1172 (void *)bip->bli_item.li_desc, 1172 (void *)bip->bli_item.li_desc,
1173 (void *)((unsigned long)bip->bli_item.li_flags)); 1173 (void *)((unsigned long)bip->bli_item.li_flags));
1174 } 1174 }
1175 #endif /* XFS_BLI_TRACE */ 1175 #endif /* XFS_BLI_TRACE */
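ktrace_enter() above takes void * slots, so the 64-bit block address is split into two 32-bit halves before tracing. A tiny sketch of that split (hypothetical helper name):

#include <stdint.h>

/* Split a 64-bit disk address into the two 32-bit trace slots used above. */
static void split_addr(uint64_t addr, unsigned long *hi, unsigned long *lo)
{
	*hi = (unsigned long)(0xFFFFFFFFu & (addr >> 32));
	*lo = (unsigned long)(0xFFFFFFFFu & addr);
}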
1176 1176
fs/xfs/xfs_log_recover.c
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dir2.h" 27 #include "xfs_dir2.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_mount.h" 29 #include "xfs_mount.h"
30 #include "xfs_error.h" 30 #include "xfs_error.h"
31 #include "xfs_bmap_btree.h" 31 #include "xfs_bmap_btree.h"
32 #include "xfs_alloc_btree.h" 32 #include "xfs_alloc_btree.h"
33 #include "xfs_ialloc_btree.h" 33 #include "xfs_ialloc_btree.h"
34 #include "xfs_dir2_sf.h" 34 #include "xfs_dir2_sf.h"
35 #include "xfs_attr_sf.h" 35 #include "xfs_attr_sf.h"
36 #include "xfs_dinode.h" 36 #include "xfs_dinode.h"
37 #include "xfs_inode.h" 37 #include "xfs_inode.h"
38 #include "xfs_inode_item.h" 38 #include "xfs_inode_item.h"
39 #include "xfs_alloc.h" 39 #include "xfs_alloc.h"
40 #include "xfs_ialloc.h" 40 #include "xfs_ialloc.h"
41 #include "xfs_log_priv.h" 41 #include "xfs_log_priv.h"
42 #include "xfs_buf_item.h" 42 #include "xfs_buf_item.h"
43 #include "xfs_log_recover.h" 43 #include "xfs_log_recover.h"
44 #include "xfs_extfree_item.h" 44 #include "xfs_extfree_item.h"
45 #include "xfs_trans_priv.h" 45 #include "xfs_trans_priv.h"
46 #include "xfs_quota.h" 46 #include "xfs_quota.h"
47 #include "xfs_rw.h" 47 #include "xfs_rw.h"
48 #include "xfs_utils.h" 48 #include "xfs_utils.h"
49 49
50 STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 50 STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
51 STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 51 STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
52 STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, 52 STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
53 xlog_recover_item_t *item); 53 xlog_recover_item_t *item);
54 #if defined(DEBUG) 54 #if defined(DEBUG)
55 STATIC void xlog_recover_check_summary(xlog_t *); 55 STATIC void xlog_recover_check_summary(xlog_t *);
56 #else 56 #else
57 #define xlog_recover_check_summary(log) 57 #define xlog_recover_check_summary(log)
58 #endif 58 #endif
59 59
60 60
61 /* 61 /*
62 * Sector aligned buffer routines for buffer create/read/write/access 62 * Sector aligned buffer routines for buffer create/read/write/access
63 */ 63 */
64 64
65 #define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 65 #define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \
66 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 66 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
67 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 67 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
68 #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 68 #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
69 69
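The two macros above do sector alignment in units of 512-byte basic blocks. As a worked example, assume a log device with 4 KB sectors, i.e. l_sectbb_mask == 7 (the mask value here is an assumption for illustration):

#include <stdio.h>

int main(void)
{
	unsigned mask = 7;	/* sector size in basic blocks, minus one */
	unsigned bbs = 10;	/* block count that straddles a sector boundary */
	unsigned bno = 21;	/* arbitrary block number */

	/* XLOG_SECTOR_ROUNDUP_BBCOUNT: round the count up to a whole sector */
	unsigned up = (mask && (bbs & mask)) ? ((bbs + mask + 1) & ~mask) : bbs;

	/* XLOG_SECTOR_ROUNDDOWN_BLKNO: align the block number downwards */
	unsigned down = bno & ~mask;

	printf("roundup(%u) = %u, rounddown(%u) = %u\n", bbs, up, bno, down);	/* 16 and 16 */
	return 0;
}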
70 xfs_buf_t * 70 xfs_buf_t *
71 xlog_get_bp( 71 xlog_get_bp(
72 xlog_t *log, 72 xlog_t *log,
73 int num_bblks) 73 int num_bblks)
74 { 74 {
75 ASSERT(num_bblks > 0); 75 ASSERT(num_bblks > 0);
76 76
77 if (log->l_sectbb_log) { 77 if (log->l_sectbb_log) {
78 if (num_bblks > 1) 78 if (num_bblks > 1)
79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); 80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
81 } 81 }
82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); 82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
83 } 83 }
84 84
85 void 85 void
86 xlog_put_bp( 86 xlog_put_bp(
87 xfs_buf_t *bp) 87 xfs_buf_t *bp)
88 { 88 {
89 xfs_buf_free(bp); 89 xfs_buf_free(bp);
90 } 90 }
91 91
92 92
93 /* 93 /*
94 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 94 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
95 */ 95 */
96 int 96 int
97 xlog_bread( 97 xlog_bread(
98 xlog_t *log, 98 xlog_t *log,
99 xfs_daddr_t blk_no, 99 xfs_daddr_t blk_no,
100 int nbblks, 100 int nbblks,
101 xfs_buf_t *bp) 101 xfs_buf_t *bp)
102 { 102 {
103 int error; 103 int error;
104 104
105 if (log->l_sectbb_log) { 105 if (log->l_sectbb_log) {
106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
108 } 108 }
109 109
110 ASSERT(nbblks > 0); 110 ASSERT(nbblks > 0);
111 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 111 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
112 ASSERT(bp); 112 ASSERT(bp);
113 113
114 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 114 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
115 XFS_BUF_READ(bp); 115 XFS_BUF_READ(bp);
116 XFS_BUF_BUSY(bp); 116 XFS_BUF_BUSY(bp);
117 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 117 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
118 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 118 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
119 119
120 xfsbdstrat(log->l_mp, bp); 120 xfsbdstrat(log->l_mp, bp);
121 error = xfs_iowait(bp); 121 error = xfs_iowait(bp);
122 if (error) 122 if (error)
123 xfs_ioerror_alert("xlog_bread", log->l_mp, 123 xfs_ioerror_alert("xlog_bread", log->l_mp,
124 bp, XFS_BUF_ADDR(bp)); 124 bp, XFS_BUF_ADDR(bp));
125 return error; 125 return error;
126 } 126 }
127 127
128 /* 128 /*
129 * Write out the buffer at the given block for the given number of blocks. 129 * Write out the buffer at the given block for the given number of blocks.
130 * The buffer is kept locked across the write and is returned locked. 130 * The buffer is kept locked across the write and is returned locked.
131 * This can only be used for synchronous log writes. 131 * This can only be used for synchronous log writes.
132 */ 132 */
133 STATIC int 133 STATIC int
134 xlog_bwrite( 134 xlog_bwrite(
135 xlog_t *log, 135 xlog_t *log,
136 xfs_daddr_t blk_no, 136 xfs_daddr_t blk_no,
137 int nbblks, 137 int nbblks,
138 xfs_buf_t *bp) 138 xfs_buf_t *bp)
139 { 139 {
140 int error; 140 int error;
141 141
142 if (log->l_sectbb_log) { 142 if (log->l_sectbb_log) {
143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
145 } 145 }
146 146
147 ASSERT(nbblks > 0); 147 ASSERT(nbblks > 0);
148 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 148 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
149 149
150 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 150 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
151 XFS_BUF_ZEROFLAGS(bp); 151 XFS_BUF_ZEROFLAGS(bp);
152 XFS_BUF_BUSY(bp); 152 XFS_BUF_BUSY(bp);
153 XFS_BUF_HOLD(bp); 153 XFS_BUF_HOLD(bp);
154 XFS_BUF_PSEMA(bp, PRIBIO); 154 XFS_BUF_PSEMA(bp, PRIBIO);
155 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 155 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
156 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 156 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
157 157
158 if ((error = xfs_bwrite(log->l_mp, bp))) 158 if ((error = xfs_bwrite(log->l_mp, bp)))
159 xfs_ioerror_alert("xlog_bwrite", log->l_mp, 159 xfs_ioerror_alert("xlog_bwrite", log->l_mp,
160 bp, XFS_BUF_ADDR(bp)); 160 bp, XFS_BUF_ADDR(bp));
161 return error; 161 return error;
162 } 162 }
163 163
164 STATIC xfs_caddr_t 164 STATIC xfs_caddr_t
165 xlog_align( 165 xlog_align(
166 xlog_t *log, 166 xlog_t *log,
167 xfs_daddr_t blk_no, 167 xfs_daddr_t blk_no,
168 int nbblks, 168 int nbblks,
169 xfs_buf_t *bp) 169 xfs_buf_t *bp)
170 { 170 {
171 xfs_caddr_t ptr; 171 xfs_caddr_t ptr;
172 172
173 if (!log->l_sectbb_log) 173 if (!log->l_sectbb_log)
174 return XFS_BUF_PTR(bp); 174 return XFS_BUF_PTR(bp);
175 175
176 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); 176 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
177 ASSERT(XFS_BUF_SIZE(bp) >= 177 ASSERT(XFS_BUF_SIZE(bp) >=
178 BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); 178 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
179 return ptr; 179 return ptr;
180 } 180 }
181 181
182 #ifdef DEBUG 182 #ifdef DEBUG
183 /* 183 /*
184 * dump debug superblock and log record information 184 * dump debug superblock and log record information
185 */ 185 */
186 STATIC void 186 STATIC void
187 xlog_header_check_dump( 187 xlog_header_check_dump(
188 xfs_mount_t *mp, 188 xfs_mount_t *mp,
189 xlog_rec_header_t *head) 189 xlog_rec_header_t *head)
190 { 190 {
191 int b; 191 int b;
192 192
193 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 193 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
194 for (b = 0; b < 16; b++) 194 for (b = 0; b < 16; b++)
195 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); 195 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
196 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); 196 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
197 cmn_err(CE_DEBUG, " log : uuid = "); 197 cmn_err(CE_DEBUG, " log : uuid = ");
198 for (b = 0; b < 16; b++) 198 for (b = 0; b < 16; b++)
199 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); 199 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
200 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); 200 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
201 } 201 }
202 #else 202 #else
203 #define xlog_header_check_dump(mp, head) 203 #define xlog_header_check_dump(mp, head)
204 #endif 204 #endif
205 205
206 /* 206 /*
207 * check log record header for recovery 207 * check log record header for recovery
208 */ 208 */
209 STATIC int 209 STATIC int
210 xlog_header_check_recover( 210 xlog_header_check_recover(
211 xfs_mount_t *mp, 211 xfs_mount_t *mp,
212 xlog_rec_header_t *head) 212 xlog_rec_header_t *head)
213 { 213 {
214 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); 214 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
215 215
216 /* 216 /*
217 * IRIX doesn't write the h_fmt field and leaves it zeroed 217 * IRIX doesn't write the h_fmt field and leaves it zeroed
218 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover 218 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
219 * a dirty log created in IRIX. 219 * a dirty log created in IRIX.
220 */ 220 */
221 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 221 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
222 xlog_warn( 222 xlog_warn(
223 "XFS: dirty log written in incompatible format - can't recover"); 223 "XFS: dirty log written in incompatible format - can't recover");
224 xlog_header_check_dump(mp, head); 224 xlog_header_check_dump(mp, head);
225 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 225 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
226 XFS_ERRLEVEL_HIGH, mp); 226 XFS_ERRLEVEL_HIGH, mp);
227 return XFS_ERROR(EFSCORRUPTED); 227 return XFS_ERROR(EFSCORRUPTED);
228 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 228 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
229 xlog_warn( 229 xlog_warn(
230 "XFS: dirty log entry has mismatched uuid - can't recover"); 230 "XFS: dirty log entry has mismatched uuid - can't recover");
231 xlog_header_check_dump(mp, head); 231 xlog_header_check_dump(mp, head);
232 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 232 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
233 XFS_ERRLEVEL_HIGH, mp); 233 XFS_ERRLEVEL_HIGH, mp);
234 return XFS_ERROR(EFSCORRUPTED); 234 return XFS_ERROR(EFSCORRUPTED);
235 } 235 }
236 return 0; 236 return 0;
237 } 237 }
238 238
239 /* 239 /*
240 * read the head block of the log and check the header 240 * read the head block of the log and check the header
241 */ 241 */
242 STATIC int 242 STATIC int
243 xlog_header_check_mount( 243 xlog_header_check_mount(
244 xfs_mount_t *mp, 244 xfs_mount_t *mp,
245 xlog_rec_header_t *head) 245 xlog_rec_header_t *head)
246 { 246 {
247 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); 247 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
248 248
249 if (uuid_is_nil(&head->h_fs_uuid)) { 249 if (uuid_is_nil(&head->h_fs_uuid)) {
250 /* 250 /*
251 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If 251 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
252 * h_fs_uuid is nil, we assume this log was last mounted 252 * h_fs_uuid is nil, we assume this log was last mounted
253 * by IRIX and continue. 253 * by IRIX and continue.
254 */ 254 */
255 xlog_warn("XFS: nil uuid in log - IRIX style log"); 255 xlog_warn("XFS: nil uuid in log - IRIX style log");
256 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 256 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
257 xlog_warn("XFS: log has mismatched uuid - can't recover"); 257 xlog_warn("XFS: log has mismatched uuid - can't recover");
258 xlog_header_check_dump(mp, head); 258 xlog_header_check_dump(mp, head);
259 XFS_ERROR_REPORT("xlog_header_check_mount", 259 XFS_ERROR_REPORT("xlog_header_check_mount",
260 XFS_ERRLEVEL_HIGH, mp); 260 XFS_ERRLEVEL_HIGH, mp);
261 return XFS_ERROR(EFSCORRUPTED); 261 return XFS_ERROR(EFSCORRUPTED);
262 } 262 }
263 return 0; 263 return 0;
264 } 264 }
265 265
266 STATIC void 266 STATIC void
267 xlog_recover_iodone( 267 xlog_recover_iodone(
268 struct xfs_buf *bp) 268 struct xfs_buf *bp)
269 { 269 {
270 xfs_mount_t *mp;
271
272 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
273
274 if (XFS_BUF_GETERROR(bp)) { 270 if (XFS_BUF_GETERROR(bp)) {
275 /* 271 /*
276 * We're not going to bother about retrying 272 * We're not going to bother about retrying
277 * this during recovery. One strike! 273 * this during recovery. One strike!
278 */ 274 */
279 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
280 xfs_ioerror_alert("xlog_recover_iodone", 275 xfs_ioerror_alert("xlog_recover_iodone",
281 mp, bp, XFS_BUF_ADDR(bp)); 276 bp->b_mount, bp, XFS_BUF_ADDR(bp));
282 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 277 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
283 } 278 }
284 XFS_BUF_SET_FSPRIVATE(bp, NULL); 279 bp->b_mount = NULL;
285 XFS_BUF_CLR_IODONE_FUNC(bp); 280 XFS_BUF_CLR_IODONE_FUNC(bp);
286 xfs_biodone(bp); 281 xfs_biodone(bp);
287 } 282 }
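The replacement of the b_fspriv stash with bp->b_mount is visible on the right-hand side of the hunk above: the handler now reads the mount straight from the buffer and clears it when done. The submit side lives in a hunk outside this excerpt; the sketch below only assumes its general shape (hypothetical helper, using the XFS_BUF_SET_IODONE_FUNC accessor seen earlier in this diff):

/*
 * Hypothetical illustration of the pairing: the recovery write path stores
 * the mount in b_mount before I/O so xlog_recover_iodone() can use it.
 */
static void demo_recover_submit(xfs_mount_t *mp, xfs_buf_t *bp)
{
	bp->b_mount = mp;				/* new-style mount stash */
	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
	/* ... the buffer is then written out by the recovery code ... */
}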
288 283
289 /* 284 /*
290 * This routine finds (to an approximation) the first block in the physical 285 * This routine finds (to an approximation) the first block in the physical
291 * log which contains the given cycle. It uses a binary search algorithm. 286 * log which contains the given cycle. It uses a binary search algorithm.
292 * Note that the algorithm can not be perfect because the disk will not 287 * Note that the algorithm can not be perfect because the disk will not
293 * necessarily be perfect. 288 * necessarily be perfect.
294 */ 289 */
295 STATIC int 290 STATIC int
296 xlog_find_cycle_start( 291 xlog_find_cycle_start(
297 xlog_t *log, 292 xlog_t *log,
298 xfs_buf_t *bp, 293 xfs_buf_t *bp,
299 xfs_daddr_t first_blk, 294 xfs_daddr_t first_blk,
300 xfs_daddr_t *last_blk, 295 xfs_daddr_t *last_blk,
301 uint cycle) 296 uint cycle)
302 { 297 {
303 xfs_caddr_t offset; 298 xfs_caddr_t offset;
304 xfs_daddr_t mid_blk; 299 xfs_daddr_t mid_blk;
305 uint mid_cycle; 300 uint mid_cycle;
306 int error; 301 int error;
307 302
308 mid_blk = BLK_AVG(first_blk, *last_blk); 303 mid_blk = BLK_AVG(first_blk, *last_blk);
309 while (mid_blk != first_blk && mid_blk != *last_blk) { 304 while (mid_blk != first_blk && mid_blk != *last_blk) {
310 if ((error = xlog_bread(log, mid_blk, 1, bp))) 305 if ((error = xlog_bread(log, mid_blk, 1, bp)))
311 return error; 306 return error;
312 offset = xlog_align(log, mid_blk, 1, bp); 307 offset = xlog_align(log, mid_blk, 1, bp);
313 mid_cycle = xlog_get_cycle(offset); 308 mid_cycle = xlog_get_cycle(offset);
314 if (mid_cycle == cycle) { 309 if (mid_cycle == cycle) {
315 *last_blk = mid_blk; 310 *last_blk = mid_blk;
316 /* last_half_cycle == mid_cycle */ 311 /* last_half_cycle == mid_cycle */
317 } else { 312 } else {
318 first_blk = mid_blk; 313 first_blk = mid_blk;
319 /* first_half_cycle == mid_cycle */ 314 /* first_half_cycle == mid_cycle */
320 } 315 }
321 mid_blk = BLK_AVG(first_blk, *last_blk); 316 mid_blk = BLK_AVG(first_blk, *last_blk);
322 } 317 }
323 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 318 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
324 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 319 (mid_blk == *last_blk && mid_blk-1 == first_blk));
325 320
326 return 0; 321 return 0;
327 } 322 }
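The loop above is a binary search over log blocks whose cycle numbers change exactly once from an old value to the target value. The same narrowing step on an in-memory array (purely illustrative data and names):

#include <stdio.h>

static unsigned find_cycle_start(const unsigned *cycles, unsigned first,
				 unsigned last, unsigned cycle)
{
	unsigned mid = (first + last) / 2;

	while (mid != first && mid != last) {
		if (cycles[mid] == cycle)
			last = mid;	/* target cycle: answer is at or left of mid */
		else
			first = mid;	/* older cycle: answer is right of mid */
		mid = (first + last) / 2;
	}
	return last;			/* first block stamped with 'cycle' */
}

int main(void)
{
	/* cycle numbers as they might appear across log blocks: newer cycle
	 * at the front of the physical log, older cycle behind it */
	unsigned cycles[] = { 4, 4, 4, 4, 3, 3, 3, 3 };

	printf("cycle 3 starts at block %u\n",
	       find_cycle_start(cycles, 0, 7, 3));	/* prints 4 */
	return 0;
}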
328 323
329 /* 324 /*
330 * Check that the range of blocks does not contain the cycle number 325 * Check that the range of blocks does not contain the cycle number
331 * given. The scan needs to occur from front to back and the ptr into the 326 * given. The scan needs to occur from front to back and the ptr into the
332 * region must be updated since a later routine will need to perform another 327 * region must be updated since a later routine will need to perform another
333 * test. If the region is completely good, we end up returning the same 328 * test. If the region is completely good, we end up returning the same
334 * last block number. 329 * last block number.
335 * 330 *
336 * Set blkno to -1 if we encounter no errors. This is an invalid block number 331 * Set blkno to -1 if we encounter no errors. This is an invalid block number
337 * since we don't ever expect logs to get this large. 332 * since we don't ever expect logs to get this large.
338 */ 333 */
339 STATIC int 334 STATIC int
340 xlog_find_verify_cycle( 335 xlog_find_verify_cycle(
341 xlog_t *log, 336 xlog_t *log,
342 xfs_daddr_t start_blk, 337 xfs_daddr_t start_blk,
343 int nbblks, 338 int nbblks,
344 uint stop_on_cycle_no, 339 uint stop_on_cycle_no,
345 xfs_daddr_t *new_blk) 340 xfs_daddr_t *new_blk)
346 { 341 {
347 xfs_daddr_t i, j; 342 xfs_daddr_t i, j;
348 uint cycle; 343 uint cycle;
349 xfs_buf_t *bp; 344 xfs_buf_t *bp;
350 xfs_daddr_t bufblks; 345 xfs_daddr_t bufblks;
351 xfs_caddr_t buf = NULL; 346 xfs_caddr_t buf = NULL;
352 int error = 0; 347 int error = 0;
353 348
354 bufblks = 1 << ffs(nbblks); 349 bufblks = 1 << ffs(nbblks);
355 350
356 while (!(bp = xlog_get_bp(log, bufblks))) { 351 while (!(bp = xlog_get_bp(log, bufblks))) {
357 /* can't get enough memory to do everything in one big buffer */ 352 /* can't get enough memory to do everything in one big buffer */
358 bufblks >>= 1; 353 bufblks >>= 1;
359 if (bufblks <= log->l_sectbb_log) 354 if (bufblks <= log->l_sectbb_log)
360 return ENOMEM; 355 return ENOMEM;
361 } 356 }
362 357
363 for (i = start_blk; i < start_blk + nbblks; i += bufblks) { 358 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
364 int bcount; 359 int bcount;
365 360
366 bcount = min(bufblks, (start_blk + nbblks - i)); 361 bcount = min(bufblks, (start_blk + nbblks - i));
367 362
368 if ((error = xlog_bread(log, i, bcount, bp))) 363 if ((error = xlog_bread(log, i, bcount, bp)))
369 goto out; 364 goto out;
370 365
371 buf = xlog_align(log, i, bcount, bp); 366 buf = xlog_align(log, i, bcount, bp);
372 for (j = 0; j < bcount; j++) { 367 for (j = 0; j < bcount; j++) {
373 cycle = xlog_get_cycle(buf); 368 cycle = xlog_get_cycle(buf);
374 if (cycle == stop_on_cycle_no) { 369 if (cycle == stop_on_cycle_no) {
375 *new_blk = i+j; 370 *new_blk = i+j;
376 goto out; 371 goto out;
377 } 372 }
378 373
379 buf += BBSIZE; 374 buf += BBSIZE;
380 } 375 }
381 } 376 }
382 377
383 *new_blk = -1; 378 *new_blk = -1;
384 379
385 out: 380 out:
386 xlog_put_bp(bp); 381 xlog_put_bp(bp);
387 return error; 382 return error;
388 } 383 }
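One detail worth calling out in xlog_find_verify_cycle() is the allocation fallback: it asks for a large scratch buffer and keeps halving the request until xlog_get_bp() succeeds, giving up only when the size becomes too small to be useful. The same pattern in plain C, with malloc() standing in (illustrative only):

#include <stdlib.h>

/* Try to allocate 'want' bytes, halving on failure; returns NULL once the
 * halved request reaches 'min' or below. */
static void *alloc_scratch(size_t want, size_t min, size_t *got)
{
	void *buf;

	while ((buf = malloc(want)) == NULL) {
		want >>= 1;
		if (want <= min)
			return NULL;
	}
	*got = want;
	return buf;
}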
389 384
390 /* 385 /*
391 * Potentially backup over partial log record write. 386 * Potentially backup over partial log record write.
392 * 387 *
393 * In the typical case, last_blk is the number of the block directly after 388 * In the typical case, last_blk is the number of the block directly after
394 * a good log record. Therefore, we subtract one to get the block number 389 * a good log record. Therefore, we subtract one to get the block number
395 * of the last block in the given buffer. extra_bblks contains the number 390 * of the last block in the given buffer. extra_bblks contains the number
396 * of blocks we would have read on a previous read. This happens when the 391 * of blocks we would have read on a previous read. This happens when the
397 * last log record is split over the end of the physical log. 392 * last log record is split over the end of the physical log.
398 * 393 *
399 * extra_bblks is the number of blocks potentially verified on a previous 394 * extra_bblks is the number of blocks potentially verified on a previous
400 * call to this routine. 395 * call to this routine.
401 */ 396 */
402 STATIC int 397 STATIC int
403 xlog_find_verify_log_record( 398 xlog_find_verify_log_record(
404 xlog_t *log, 399 xlog_t *log,
405 xfs_daddr_t start_blk, 400 xfs_daddr_t start_blk,
406 xfs_daddr_t *last_blk, 401 xfs_daddr_t *last_blk,
407 int extra_bblks) 402 int extra_bblks)
408 { 403 {
409 xfs_daddr_t i; 404 xfs_daddr_t i;
410 xfs_buf_t *bp; 405 xfs_buf_t *bp;
411 xfs_caddr_t offset = NULL; 406 xfs_caddr_t offset = NULL;
412 xlog_rec_header_t *head = NULL; 407 xlog_rec_header_t *head = NULL;
413 int error = 0; 408 int error = 0;
414 int smallmem = 0; 409 int smallmem = 0;
415 int num_blks = *last_blk - start_blk; 410 int num_blks = *last_blk - start_blk;
416 int xhdrs; 411 int xhdrs;
417 412
418 ASSERT(start_blk != 0 || *last_blk != start_blk); 413 ASSERT(start_blk != 0 || *last_blk != start_blk);
419 414
420 if (!(bp = xlog_get_bp(log, num_blks))) { 415 if (!(bp = xlog_get_bp(log, num_blks))) {
421 if (!(bp = xlog_get_bp(log, 1))) 416 if (!(bp = xlog_get_bp(log, 1)))
422 return ENOMEM; 417 return ENOMEM;
423 smallmem = 1; 418 smallmem = 1;
424 } else { 419 } else {
425 if ((error = xlog_bread(log, start_blk, num_blks, bp))) 420 if ((error = xlog_bread(log, start_blk, num_blks, bp)))
426 goto out; 421 goto out;
427 offset = xlog_align(log, start_blk, num_blks, bp); 422 offset = xlog_align(log, start_blk, num_blks, bp);
428 offset += ((num_blks - 1) << BBSHIFT); 423 offset += ((num_blks - 1) << BBSHIFT);
429 } 424 }
430 425
431 for (i = (*last_blk) - 1; i >= 0; i--) { 426 for (i = (*last_blk) - 1; i >= 0; i--) {
432 if (i < start_blk) { 427 if (i < start_blk) {
433 /* valid log record not found */ 428 /* valid log record not found */
434 xlog_warn( 429 xlog_warn(
435 "XFS: Log inconsistent (didn't find previous header)"); 430 "XFS: Log inconsistent (didn't find previous header)");
436 ASSERT(0); 431 ASSERT(0);
437 error = XFS_ERROR(EIO); 432 error = XFS_ERROR(EIO);
438 goto out; 433 goto out;
439 } 434 }
440 435
441 if (smallmem) { 436 if (smallmem) {
442 if ((error = xlog_bread(log, i, 1, bp))) 437 if ((error = xlog_bread(log, i, 1, bp)))
443 goto out; 438 goto out;
444 offset = xlog_align(log, i, 1, bp); 439 offset = xlog_align(log, i, 1, bp);
445 } 440 }
446 441
447 head = (xlog_rec_header_t *)offset; 442 head = (xlog_rec_header_t *)offset;
448 443
449 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) 444 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
450 break; 445 break;
451 446
452 if (!smallmem) 447 if (!smallmem)
453 offset -= BBSIZE; 448 offset -= BBSIZE;
454 } 449 }
455 450
456 /* 451 /*
457 * We hit the beginning of the physical log & still no header. Return 452 * We hit the beginning of the physical log & still no header. Return
458 * to caller. If caller can handle a return of -1, then this routine 453 * to caller. If caller can handle a return of -1, then this routine
459 * will be called again for the end of the physical log. 454 * will be called again for the end of the physical log.
460 */ 455 */
461 if (i == -1) { 456 if (i == -1) {
462 error = -1; 457 error = -1;
463 goto out; 458 goto out;
464 } 459 }
465 460
466 /* 461 /*
467 * We have the final block of the good log (the first block 462 * We have the final block of the good log (the first block
468 * of the log record _before_ the head). So we check the uuid. 463 * of the log record _before_ the head). So we check the uuid.
469 */ 464 */
470 if ((error = xlog_header_check_mount(log->l_mp, head))) 465 if ((error = xlog_header_check_mount(log->l_mp, head)))
471 goto out; 466 goto out;
472 467
473 /* 468 /*
474 * We may have found a log record header before we expected one. 469 * We may have found a log record header before we expected one.
475 * last_blk will be the 1st block # with a given cycle #. We may end 470 * last_blk will be the 1st block # with a given cycle #. We may end
476 * up reading an entire log record. In this case, we don't want to 471 * up reading an entire log record. In this case, we don't want to
477 * reset last_blk. Only when last_blk points in the middle of a log 472 * reset last_blk. Only when last_blk points in the middle of a log
478 * record do we update last_blk. 473 * record do we update last_blk.
479 */ 474 */
480 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 475 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
481 uint h_size = be32_to_cpu(head->h_size); 476 uint h_size = be32_to_cpu(head->h_size);
482 477
483 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; 478 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
484 if (h_size % XLOG_HEADER_CYCLE_SIZE) 479 if (h_size % XLOG_HEADER_CYCLE_SIZE)
485 xhdrs++; 480 xhdrs++;
486 } else { 481 } else {
487 xhdrs = 1; 482 xhdrs = 1;
488 } 483 }
489 484
490 if (*last_blk - i + extra_bblks != 485 if (*last_blk - i + extra_bblks !=
491 BTOBB(be32_to_cpu(head->h_len)) + xhdrs) 486 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
492 *last_blk = i; 487 *last_blk = i;
493 488
494 out: 489 out:
495 xlog_put_bp(bp); 490 xlog_put_bp(bp);
496 return error; 491 return error;
497 } 492 }
498 493
499 /* 494 /*
500 * Head is defined to be the point of the log where the next log write 495 * Head is defined to be the point of the log where the next log write
502 * could go. This means that incomplete LR writes at the end are 496 * could go. This means that incomplete LR writes at the end are
502 * eliminated when calculating the head. We aren't guaranteed that previous 497 * eliminated when calculating the head. We aren't guaranteed that previous
503 * LR have complete transactions. We only know that a cycle number of 498 * LR have complete transactions. We only know that a cycle number of
504 * current cycle number -1 won't be present in the log if we start writing 499 * current cycle number -1 won't be present in the log if we start writing
505 * from our current block number. 500 * from our current block number.
506 * 501 *
507 * last_blk contains the block number of the first block with a given 502 * last_blk contains the block number of the first block with a given
508 * cycle number. 503 * cycle number.
509 * 504 *
510 * Return: zero if normal, non-zero if error. 505 * Return: zero if normal, non-zero if error.
511 */ 506 */
512 STATIC int 507 STATIC int
513 xlog_find_head( 508 xlog_find_head(
514 xlog_t *log, 509 xlog_t *log,
515 xfs_daddr_t *return_head_blk) 510 xfs_daddr_t *return_head_blk)
516 { 511 {
517 xfs_buf_t *bp; 512 xfs_buf_t *bp;
518 xfs_caddr_t offset; 513 xfs_caddr_t offset;
519 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; 514 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
520 int num_scan_bblks; 515 int num_scan_bblks;
521 uint first_half_cycle, last_half_cycle; 516 uint first_half_cycle, last_half_cycle;
522 uint stop_on_cycle; 517 uint stop_on_cycle;
523 int error, log_bbnum = log->l_logBBsize; 518 int error, log_bbnum = log->l_logBBsize;
524 519
525 /* Is the end of the log device zeroed? */ 520 /* Is the end of the log device zeroed? */
526 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { 521 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
527 *return_head_blk = first_blk; 522 *return_head_blk = first_blk;
528 523
529 /* Is the whole lot zeroed? */ 524 /* Is the whole lot zeroed? */
530 if (!first_blk) { 525 if (!first_blk) {
531 /* Linux XFS shouldn't generate totally zeroed logs - 526 /* Linux XFS shouldn't generate totally zeroed logs -
532 * mkfs etc write a dummy unmount record to a fresh 527 * mkfs etc write a dummy unmount record to a fresh
533 * log so we can store the uuid in there 528 * log so we can store the uuid in there
534 */ 529 */
535 xlog_warn("XFS: totally zeroed log"); 530 xlog_warn("XFS: totally zeroed log");
536 } 531 }
537 532
538 return 0; 533 return 0;
539 } else if (error) { 534 } else if (error) {
540 xlog_warn("XFS: empty log check failed"); 535 xlog_warn("XFS: empty log check failed");
541 return error; 536 return error;
542 } 537 }
543 538
544 first_blk = 0; /* get cycle # of 1st block */ 539 first_blk = 0; /* get cycle # of 1st block */
545 bp = xlog_get_bp(log, 1); 540 bp = xlog_get_bp(log, 1);
546 if (!bp) 541 if (!bp)
547 return ENOMEM; 542 return ENOMEM;
548 if ((error = xlog_bread(log, 0, 1, bp))) 543 if ((error = xlog_bread(log, 0, 1, bp)))
549 goto bp_err; 544 goto bp_err;
550 offset = xlog_align(log, 0, 1, bp); 545 offset = xlog_align(log, 0, 1, bp);
551 first_half_cycle = xlog_get_cycle(offset); 546 first_half_cycle = xlog_get_cycle(offset);
552 547
553 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 548 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
554 if ((error = xlog_bread(log, last_blk, 1, bp))) 549 if ((error = xlog_bread(log, last_blk, 1, bp)))
555 goto bp_err; 550 goto bp_err;
556 offset = xlog_align(log, last_blk, 1, bp); 551 offset = xlog_align(log, last_blk, 1, bp);
557 last_half_cycle = xlog_get_cycle(offset); 552 last_half_cycle = xlog_get_cycle(offset);
558 ASSERT(last_half_cycle != 0); 553 ASSERT(last_half_cycle != 0);
559 554
560 /* 555 /*
561 * If the 1st half cycle number is equal to the last half cycle number, 556 * If the 1st half cycle number is equal to the last half cycle number,
562 * then the entire log is stamped with the same cycle number. In this 557 * then the entire log is stamped with the same cycle number. In this
563 * case, head_blk can't be set to zero (which makes sense). The below 558 * case, head_blk can't be set to zero (which makes sense). The below
564 * math doesn't work out properly with head_blk equal to zero. Instead, 559 * math doesn't work out properly with head_blk equal to zero. Instead,
565 * we set it to log_bbnum which is an invalid block number, but this 560 * we set it to log_bbnum which is an invalid block number, but this
566 * value makes the math correct. If head_blk doesn't change through 561 * value makes the math correct. If head_blk doesn't change through
567 * all the tests below, *head_blk is set to zero at the very end rather 562 * all the tests below, *head_blk is set to zero at the very end rather
568 * than log_bbnum. In a sense, log_bbnum and zero are the same block 563 * than log_bbnum. In a sense, log_bbnum and zero are the same block
569 * in a circular file. 564 * in a circular file.
570 */ 565 */
571 if (first_half_cycle == last_half_cycle) { 566 if (first_half_cycle == last_half_cycle) {
572 /* 567 /*
573 * In this case we believe that the entire log should have 568 * In this case we believe that the entire log should have
574 * cycle number last_half_cycle. We need to scan backwards 569 * cycle number last_half_cycle. We need to scan backwards
575 * from the end verifying that there are no holes still 570 * from the end verifying that there are no holes still
576 * containing last_half_cycle - 1. If we find such a hole, 571 * containing last_half_cycle - 1. If we find such a hole,
577 * then the start of that hole will be the new head. The 572 * then the start of that hole will be the new head. The
578 * simple case looks like 573 * simple case looks like
579 * x | x ... | x - 1 | x 574 * x | x ... | x - 1 | x
580 * Another case that fits this picture would be 575 * Another case that fits this picture would be
581 * x | x + 1 | x ... | x 576 * x | x + 1 | x ... | x
582 * In this case the head really is somewhere at the end of the 577 * In this case the head really is somewhere at the end of the
583 * log, as one of the latest writes at the beginning was 578 * log, as one of the latest writes at the beginning was
584 * incomplete. 579 * incomplete.
585 * One more case is 580 * One more case is
586 * x | x + 1 | x ... | x - 1 | x 581 * x | x + 1 | x ... | x - 1 | x
587 * This is really the combination of the above two cases, and 582 * This is really the combination of the above two cases, and
588 * the head has to end up at the start of the x-1 hole at the 583 * the head has to end up at the start of the x-1 hole at the
589 * end of the log. 584 * end of the log.
590 * 585 *
591 * In the 256k log case, we will read from the beginning to the 586 * In the 256k log case, we will read from the beginning to the
592 * end of the log and search for cycle numbers equal to x-1. 587 * end of the log and search for cycle numbers equal to x-1.
593 * We don't worry about the x+1 blocks that we encounter, 588 * We don't worry about the x+1 blocks that we encounter,
594 * because we know that they cannot be the head since the log 589 * because we know that they cannot be the head since the log
595 * started with x. 590 * started with x.
596 */ 591 */
597 head_blk = log_bbnum; 592 head_blk = log_bbnum;
598 stop_on_cycle = last_half_cycle - 1; 593 stop_on_cycle = last_half_cycle - 1;
599 } else { 594 } else {
600 /* 595 /*
601 * In this case we want to find the first block with cycle 596 * In this case we want to find the first block with cycle
602 * number matching last_half_cycle. We expect the log to be 597 * number matching last_half_cycle. We expect the log to be
603 * some variation on 598 * some variation on
604 * x + 1 ... | x ... 599 * x + 1 ... | x ...
605 * The first block with cycle number x (last_half_cycle) will 600 * The first block with cycle number x (last_half_cycle) will
606 * be where the new head belongs. First we do a binary search 601 * be where the new head belongs. First we do a binary search
607 * for the first occurrence of last_half_cycle. The binary 602 * for the first occurrence of last_half_cycle. The binary
608 * search may not be totally accurate, so then we scan back 603 * search may not be totally accurate, so then we scan back
609 * from there looking for occurrences of last_half_cycle before 604 * from there looking for occurrences of last_half_cycle before
610 * us. If that backwards scan wraps around the beginning of 605 * us. If that backwards scan wraps around the beginning of
611 * the log, then we look for occurrences of last_half_cycle - 1 606 * the log, then we look for occurrences of last_half_cycle - 1
612 * at the end of the log. The cases we're looking for look 607 * at the end of the log. The cases we're looking for look
613 * like 608 * like
614 * x + 1 ... | x | x + 1 | x ... 609 * x + 1 ... | x | x + 1 | x ...
615 * ^ binary search stopped here 610 * ^ binary search stopped here
616 * or 611 * or
617 * x + 1 ... | x ... | x - 1 | x 612 * x + 1 ... | x ... | x - 1 | x
618 * <---------> less than scan distance 613 * <---------> less than scan distance
619 */ 614 */
620 stop_on_cycle = last_half_cycle; 615 stop_on_cycle = last_half_cycle;
621 if ((error = xlog_find_cycle_start(log, bp, first_blk, 616 if ((error = xlog_find_cycle_start(log, bp, first_blk,
622 &head_blk, last_half_cycle))) 617 &head_blk, last_half_cycle)))
623 goto bp_err; 618 goto bp_err;
624 } 619 }
625 620
626 /* 621 /*
627 * Now validate the answer. Scan back some number of maximum possible 622 * Now validate the answer. Scan back some number of maximum possible
628 * blocks and make sure each one has the expected cycle number. The 623 * blocks and make sure each one has the expected cycle number. The
629 * maximum is determined by the total possible amount of buffering 624 * maximum is determined by the total possible amount of buffering
630 * in the in-core log. The following number can be made tighter if 625 * in the in-core log. The following number can be made tighter if
631 * we actually look at the block size of the filesystem. 626 * we actually look at the block size of the filesystem.
632 */ 627 */
633 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 628 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
634 if (head_blk >= num_scan_bblks) { 629 if (head_blk >= num_scan_bblks) {
635 /* 630 /*
636 * We are guaranteed that the entire check can be performed 631 * We are guaranteed that the entire check can be performed
637 * in one buffer. 632 * in one buffer.
638 */ 633 */
639 start_blk = head_blk - num_scan_bblks; 634 start_blk = head_blk - num_scan_bblks;
640 if ((error = xlog_find_verify_cycle(log, 635 if ((error = xlog_find_verify_cycle(log,
641 start_blk, num_scan_bblks, 636 start_blk, num_scan_bblks,
642 stop_on_cycle, &new_blk))) 637 stop_on_cycle, &new_blk)))
643 goto bp_err; 638 goto bp_err;
644 if (new_blk != -1) 639 if (new_blk != -1)
645 head_blk = new_blk; 640 head_blk = new_blk;
646 } else { /* need to read 2 parts of log */ 641 } else { /* need to read 2 parts of log */
647 /* 642 /*
648 * We are going to scan backwards in the log in two parts. 643 * We are going to scan backwards in the log in two parts.
649 * First we scan the physical end of the log. In this part 644 * First we scan the physical end of the log. In this part
650 * of the log, we are looking for blocks with cycle number 645 * of the log, we are looking for blocks with cycle number
651 * last_half_cycle - 1. 646 * last_half_cycle - 1.
652 * If we find one, then we know that the log starts there, as 647 * If we find one, then we know that the log starts there, as
653 * we've found a hole that didn't get written in going around 648 * we've found a hole that didn't get written in going around
654 * the end of the physical log. The simple case for this is 649 * the end of the physical log. The simple case for this is
655 * x + 1 ... | x ... | x - 1 | x 650 * x + 1 ... | x ... | x - 1 | x
656 * <---------> less than scan distance 651 * <---------> less than scan distance
657 * If all of the blocks at the end of the log have cycle number 652 * If all of the blocks at the end of the log have cycle number
658 * last_half_cycle, then we check the blocks at the start of 653 * last_half_cycle, then we check the blocks at the start of
659 * the log looking for occurrences of last_half_cycle. If we 654 * the log looking for occurrences of last_half_cycle. If we
660 * find one, then our current estimate for the location of the 655 * find one, then our current estimate for the location of the
661 * first occurrence of last_half_cycle is wrong and we move 656 * first occurrence of last_half_cycle is wrong and we move
662 * back to the hole we've found. This case looks like 657 * back to the hole we've found. This case looks like
663 * x + 1 ... | x | x + 1 | x ... 658 * x + 1 ... | x | x + 1 | x ...
664 * ^ binary search stopped here 659 * ^ binary search stopped here
665 * Another case we need to handle that only occurs in 256k 660 * Another case we need to handle that only occurs in 256k
666 * logs is 661 * logs is
667 * x + 1 ... | x ... | x+1 | x ... 662 * x + 1 ... | x ... | x+1 | x ...
668 * ^ binary search stops here 663 * ^ binary search stops here
669 * In a 256k log, the scan at the end of the log will see the 664 * In a 256k log, the scan at the end of the log will see the
670 * x + 1 blocks. We need to skip past those since that is 665 * x + 1 blocks. We need to skip past those since that is
671 * certainly not the head of the log. By searching for 666 * certainly not the head of the log. By searching for
672 * last_half_cycle-1 we accomplish that. 667 * last_half_cycle-1 we accomplish that.
673 */ 668 */
674 start_blk = log_bbnum - num_scan_bblks + head_blk; 669 start_blk = log_bbnum - num_scan_bblks + head_blk;
675 ASSERT(head_blk <= INT_MAX && 670 ASSERT(head_blk <= INT_MAX &&
676 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 671 (xfs_daddr_t) num_scan_bblks - head_blk >= 0);
677 if ((error = xlog_find_verify_cycle(log, start_blk, 672 if ((error = xlog_find_verify_cycle(log, start_blk,
678 num_scan_bblks - (int)head_blk, 673 num_scan_bblks - (int)head_blk,
679 (stop_on_cycle - 1), &new_blk))) 674 (stop_on_cycle - 1), &new_blk)))
680 goto bp_err; 675 goto bp_err;
681 if (new_blk != -1) { 676 if (new_blk != -1) {
682 head_blk = new_blk; 677 head_blk = new_blk;
683 goto bad_blk; 678 goto bad_blk;
684 } 679 }
685 680
686 /* 681 /*
687 * Scan beginning of log now. The last part of the physical 682 * Scan beginning of log now. The last part of the physical
688 * log is good. This scan needs to verify that it doesn't find 683 * log is good. This scan needs to verify that it doesn't find
689 * the last_half_cycle. 684 * the last_half_cycle.
690 */ 685 */
691 start_blk = 0; 686 start_blk = 0;
692 ASSERT(head_blk <= INT_MAX); 687 ASSERT(head_blk <= INT_MAX);
693 if ((error = xlog_find_verify_cycle(log, 688 if ((error = xlog_find_verify_cycle(log,
694 start_blk, (int)head_blk, 689 start_blk, (int)head_blk,
695 stop_on_cycle, &new_blk))) 690 stop_on_cycle, &new_blk)))
696 goto bp_err; 691 goto bp_err;
697 if (new_blk != -1) 692 if (new_blk != -1)
698 head_blk = new_blk; 693 head_blk = new_blk;
699 } 694 }
700 695
701 bad_blk: 696 bad_blk:
702 /* 697 /*
703 * Now we need to make sure head_blk is not pointing to a block in 698 * Now we need to make sure head_blk is not pointing to a block in
704 * the middle of a log record. 699 * the middle of a log record.
705 */ 700 */
706 num_scan_bblks = XLOG_REC_SHIFT(log); 701 num_scan_bblks = XLOG_REC_SHIFT(log);
707 if (head_blk >= num_scan_bblks) { 702 if (head_blk >= num_scan_bblks) {
708 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ 703 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
709 704
710 /* start ptr at last block ptr before head_blk */ 705 /* start ptr at last block ptr before head_blk */
711 if ((error = xlog_find_verify_log_record(log, start_blk, 706 if ((error = xlog_find_verify_log_record(log, start_blk,
712 &head_blk, 0)) == -1) { 707 &head_blk, 0)) == -1) {
713 error = XFS_ERROR(EIO); 708 error = XFS_ERROR(EIO);
714 goto bp_err; 709 goto bp_err;
715 } else if (error) 710 } else if (error)
716 goto bp_err; 711 goto bp_err;
717 } else { 712 } else {
718 start_blk = 0; 713 start_blk = 0;
719 ASSERT(head_blk <= INT_MAX); 714 ASSERT(head_blk <= INT_MAX);
720 if ((error = xlog_find_verify_log_record(log, start_blk, 715 if ((error = xlog_find_verify_log_record(log, start_blk,
721 &head_blk, 0)) == -1) { 716 &head_blk, 0)) == -1) {
722 /* We hit the beginning of the log during our search */ 717 /* We hit the beginning of the log during our search */
723 start_blk = log_bbnum - num_scan_bblks + head_blk; 718 start_blk = log_bbnum - num_scan_bblks + head_blk;
724 new_blk = log_bbnum; 719 new_blk = log_bbnum;
725 ASSERT(start_blk <= INT_MAX && 720 ASSERT(start_blk <= INT_MAX &&
726 (xfs_daddr_t) log_bbnum-start_blk >= 0); 721 (xfs_daddr_t) log_bbnum-start_blk >= 0);
727 ASSERT(head_blk <= INT_MAX); 722 ASSERT(head_blk <= INT_MAX);
728 if ((error = xlog_find_verify_log_record(log, 723 if ((error = xlog_find_verify_log_record(log,
729 start_blk, &new_blk, 724 start_blk, &new_blk,
730 (int)head_blk)) == -1) { 725 (int)head_blk)) == -1) {
731 error = XFS_ERROR(EIO); 726 error = XFS_ERROR(EIO);
732 goto bp_err; 727 goto bp_err;
733 } else if (error) 728 } else if (error)
734 goto bp_err; 729 goto bp_err;
735 if (new_blk != log_bbnum) 730 if (new_blk != log_bbnum)
736 head_blk = new_blk; 731 head_blk = new_blk;
737 } else if (error) 732 } else if (error)
738 goto bp_err; 733 goto bp_err;
739 } 734 }
740 735
741 xlog_put_bp(bp); 736 xlog_put_bp(bp);
742 if (head_blk == log_bbnum) 737 if (head_blk == log_bbnum)
743 *return_head_blk = 0; 738 *return_head_blk = 0;
744 else 739 else
745 *return_head_blk = head_blk; 740 *return_head_blk = head_blk;
746 /* 741 /*
747 * When returning here, we have a good block number. Bad block 742 * When returning here, we have a good block number. Bad block
748 * means that during a previous crash, we didn't have a clean break 743 * means that during a previous crash, we didn't have a clean break
749 * from cycle number N to cycle number N-1. In this case, we need 744 * from cycle number N to cycle number N-1. In this case, we need
750 * to find the first block with cycle number N-1. 745 * to find the first block with cycle number N-1.
751 */ 746 */
752 return 0; 747 return 0;
753 748
754 bp_err: 749 bp_err:
755 xlog_put_bp(bp); 750 xlog_put_bp(bp);
756 751
757 if (error) 752 if (error)
758 xlog_warn("XFS: failed to find log head"); 753 xlog_warn("XFS: failed to find log head");
759 return error; 754 return error;
760 } 755 }
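
To make the head-search reasoning above easier to follow, here is a minimal user-space sketch of the same idea on a toy in-memory model of the log: every block carries a cycle number, and the head is the first block still stamped with the previous pass. The array, the toy_* name and the purely linear scans are illustrative assumptions; the kernel code above uses a binary search plus the bounded verification scans instead.

    #include <stdio.h>

    static int toy_find_head(const unsigned *cycle, int nblocks)
    {
            unsigned first = cycle[0];
            unsigned last = cycle[nblocks - 1];
            int i;

            if (first == last) {
                    /* Whole log stamped with one cycle: any block still holding
                     * (last - 1) is a hole, and its first block is the head. */
                    for (i = 0; i < nblocks; i++)
                            if (cycle[i] == last - 1)
                                    return i;
                    return 0;       /* no hole: head wraps to block 0 */
            }
            /* Otherwise the head is the first block still carrying the older
             * cycle number (the kernel binary-searches for this instead). */
            for (i = 0; i < nblocks; i++)
                    if (cycle[i] == last)
                            return i;
            return 0;
    }

    int main(void)
    {
            /* "x + 1 ... | x ..." case from the comments: cycle 5 wrapped part
             * way around, so blocks 0-2 are cycle 5 and the rest still cycle 4. */
            unsigned cycle[] = { 5, 5, 5, 4, 4, 4, 4, 4 };

            printf("head block = %d\n", toy_find_head(cycle, 8));  /* 3 */
            return 0;
    }
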
761 756
762 /* 757 /*
763 * Find the sync block number or the tail of the log. 758 * Find the sync block number or the tail of the log.
764 * 759 *
765 * This will be the block number of the last record to have its 760 * This will be the block number of the last record to have its
766 * associated buffers synced to disk. Every log record header has 761 * associated buffers synced to disk. Every log record header has
767 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy 762 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
768 * to get a sync block number. The only concern is to figure out which 763 * to get a sync block number. The only concern is to figure out which
769 * log record header to believe. 764 * log record header to believe.
770 * 765 *
771 * The following algorithm uses the log record header with the largest 766 * The following algorithm uses the log record header with the largest
772 * lsn. The entire log record does not need to be valid. We only care 767 * lsn. The entire log record does not need to be valid. We only care
773 * that the header is valid. 768 * that the header is valid.
774 * 769 *
775 * We could speed up the search by using the current head_blk buffer, but it is not 770 * We could speed up the search by using the current head_blk buffer, but it is not
776 * available. 771 * available.
777 */ 772 */
778 int 773 int
779 xlog_find_tail( 774 xlog_find_tail(
780 xlog_t *log, 775 xlog_t *log,
781 xfs_daddr_t *head_blk, 776 xfs_daddr_t *head_blk,
782 xfs_daddr_t *tail_blk) 777 xfs_daddr_t *tail_blk)
783 { 778 {
784 xlog_rec_header_t *rhead; 779 xlog_rec_header_t *rhead;
785 xlog_op_header_t *op_head; 780 xlog_op_header_t *op_head;
786 xfs_caddr_t offset = NULL; 781 xfs_caddr_t offset = NULL;
787 xfs_buf_t *bp; 782 xfs_buf_t *bp;
788 int error, i, found; 783 int error, i, found;
789 xfs_daddr_t umount_data_blk; 784 xfs_daddr_t umount_data_blk;
790 xfs_daddr_t after_umount_blk; 785 xfs_daddr_t after_umount_blk;
791 xfs_lsn_t tail_lsn; 786 xfs_lsn_t tail_lsn;
792 int hblks; 787 int hblks;
793 788
794 found = 0; 789 found = 0;
795 790
796 /* 791 /*
797 * Find previous log record 792 * Find previous log record
798 */ 793 */
799 if ((error = xlog_find_head(log, head_blk))) 794 if ((error = xlog_find_head(log, head_blk)))
800 return error; 795 return error;
801 796
802 bp = xlog_get_bp(log, 1); 797 bp = xlog_get_bp(log, 1);
803 if (!bp) 798 if (!bp)
804 return ENOMEM; 799 return ENOMEM;
805 if (*head_blk == 0) { /* special case */ 800 if (*head_blk == 0) { /* special case */
806 if ((error = xlog_bread(log, 0, 1, bp))) 801 if ((error = xlog_bread(log, 0, 1, bp)))
807 goto bread_err; 802 goto bread_err;
808 offset = xlog_align(log, 0, 1, bp); 803 offset = xlog_align(log, 0, 1, bp);
809 if (xlog_get_cycle(offset) == 0) { 804 if (xlog_get_cycle(offset) == 0) {
810 *tail_blk = 0; 805 *tail_blk = 0;
811 /* leave all other log inited values alone */ 806 /* leave all other log inited values alone */
812 goto exit; 807 goto exit;
813 } 808 }
814 } 809 }
815 810
816 /* 811 /*
817 * Search backwards looking for log record header block 812 * Search backwards looking for log record header block
818 */ 813 */
819 ASSERT(*head_blk < INT_MAX); 814 ASSERT(*head_blk < INT_MAX);
820 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 815 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
821 if ((error = xlog_bread(log, i, 1, bp))) 816 if ((error = xlog_bread(log, i, 1, bp)))
822 goto bread_err; 817 goto bread_err;
823 offset = xlog_align(log, i, 1, bp); 818 offset = xlog_align(log, i, 1, bp);
824 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 819 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
825 found = 1; 820 found = 1;
826 break; 821 break;
827 } 822 }
828 } 823 }
829 /* 824 /*
830 * If we haven't found the log record header block, start looking 825 * If we haven't found the log record header block, start looking
831 * again from the end of the physical log. XXXmiken: There should be 826 * again from the end of the physical log. XXXmiken: There should be
832 * a check here to make sure we didn't search more than N blocks in 827 * a check here to make sure we didn't search more than N blocks in
833 * the previous code. 828 * the previous code.
834 */ 829 */
835 if (!found) { 830 if (!found) {
836 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 831 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
837 if ((error = xlog_bread(log, i, 1, bp))) 832 if ((error = xlog_bread(log, i, 1, bp)))
838 goto bread_err; 833 goto bread_err;
839 offset = xlog_align(log, i, 1, bp); 834 offset = xlog_align(log, i, 1, bp);
840 if (XLOG_HEADER_MAGIC_NUM == 835 if (XLOG_HEADER_MAGIC_NUM ==
841 be32_to_cpu(*(__be32 *)offset)) { 836 be32_to_cpu(*(__be32 *)offset)) {
842 found = 2; 837 found = 2;
843 break; 838 break;
844 } 839 }
845 } 840 }
846 } 841 }
847 if (!found) { 842 if (!found) {
848 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 843 xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
849 ASSERT(0); 844 ASSERT(0);
850 return XFS_ERROR(EIO); 845 return XFS_ERROR(EIO);
851 } 846 }
852 847
853 /* find blk_no of tail of log */ 848 /* find blk_no of tail of log */
854 rhead = (xlog_rec_header_t *)offset; 849 rhead = (xlog_rec_header_t *)offset;
855 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); 850 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
856 851
857 /* 852 /*
858 * Reset log values according to the state of the log when we 853 * Reset log values according to the state of the log when we
859 * crashed. In the case where head_blk == 0, we bump curr_cycle 854 * crashed. In the case where head_blk == 0, we bump curr_cycle
860 * by one because the next write starts a new cycle rather than 855 * by one because the next write starts a new cycle rather than
861 * continuing the cycle of the last good log record. At this 856 * continuing the cycle of the last good log record. At this
862 * point we have guaranteed that all partial log records have been 857 * point we have guaranteed that all partial log records have been
863 * accounted for. Therefore, we know that the last good log record 858 * accounted for. Therefore, we know that the last good log record
864 * written was complete and ended exactly on the end boundary 859 * written was complete and ended exactly on the end boundary
865 * of the physical log. 860 * of the physical log.
866 */ 861 */
867 log->l_prev_block = i; 862 log->l_prev_block = i;
868 log->l_curr_block = (int)*head_blk; 863 log->l_curr_block = (int)*head_blk;
869 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 864 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
870 if (found == 2) 865 if (found == 2)
871 log->l_curr_cycle++; 866 log->l_curr_cycle++;
872 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 867 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
873 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 868 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
874 log->l_grant_reserve_cycle = log->l_curr_cycle; 869 log->l_grant_reserve_cycle = log->l_curr_cycle;
875 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 870 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
876 log->l_grant_write_cycle = log->l_curr_cycle; 871 log->l_grant_write_cycle = log->l_curr_cycle;
877 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 872 log->l_grant_write_bytes = BBTOB(log->l_curr_block);
878 873
879 /* 874 /*
880 * Look for unmount record. If we find it, then we know there 875 * Look for unmount record. If we find it, then we know there
881 * was a clean unmount. Since 'i' could be the last block in 876 * was a clean unmount. Since 'i' could be the last block in
882 * the physical log, we convert to a log block before comparing 877 * the physical log, we convert to a log block before comparing
883 * to the head_blk. 878 * to the head_blk.
884 * 879 *
885 * Save the current tail lsn to pass to 880 * Save the current tail lsn to pass to
886 * xlog_clear_stale_blocks() below. We won't want to clear the 881 * xlog_clear_stale_blocks() below. We won't want to clear the
887 * unmount record if there is one, so we pass the lsn of the 882 * unmount record if there is one, so we pass the lsn of the
888 * unmount record rather than the block after it. 883 * unmount record rather than the block after it.
889 */ 884 */
890 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 885 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
891 int h_size = be32_to_cpu(rhead->h_size); 886 int h_size = be32_to_cpu(rhead->h_size);
892 int h_version = be32_to_cpu(rhead->h_version); 887 int h_version = be32_to_cpu(rhead->h_version);
893 888
894 if ((h_version & XLOG_VERSION_2) && 889 if ((h_version & XLOG_VERSION_2) &&
895 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 890 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
896 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 891 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
897 if (h_size % XLOG_HEADER_CYCLE_SIZE) 892 if (h_size % XLOG_HEADER_CYCLE_SIZE)
898 hblks++; 893 hblks++;
899 } else { 894 } else {
900 hblks = 1; 895 hblks = 1;
901 } 896 }
902 } else { 897 } else {
903 hblks = 1; 898 hblks = 1;
904 } 899 }
905 after_umount_blk = (i + hblks + (int) 900 after_umount_blk = (i + hblks + (int)
906 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 901 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
907 tail_lsn = log->l_tail_lsn; 902 tail_lsn = log->l_tail_lsn;
908 if (*head_blk == after_umount_blk && 903 if (*head_blk == after_umount_blk &&
909 be32_to_cpu(rhead->h_num_logops) == 1) { 904 be32_to_cpu(rhead->h_num_logops) == 1) {
910 umount_data_blk = (i + hblks) % log->l_logBBsize; 905 umount_data_blk = (i + hblks) % log->l_logBBsize;
911 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { 906 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
912 goto bread_err; 907 goto bread_err;
913 } 908 }
914 offset = xlog_align(log, umount_data_blk, 1, bp); 909 offset = xlog_align(log, umount_data_blk, 1, bp);
915 op_head = (xlog_op_header_t *)offset; 910 op_head = (xlog_op_header_t *)offset;
916 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 911 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
917 /* 912 /*
918 * Set tail and last sync so that newly written 913 * Set tail and last sync so that newly written
919 * log records will point recovery to after the 914 * log records will point recovery to after the
920 * current unmount record. 915 * current unmount record.
921 */ 916 */
922 log->l_tail_lsn = 917 log->l_tail_lsn =
923 xlog_assign_lsn(log->l_curr_cycle, 918 xlog_assign_lsn(log->l_curr_cycle,
924 after_umount_blk); 919 after_umount_blk);
925 log->l_last_sync_lsn = 920 log->l_last_sync_lsn =
926 xlog_assign_lsn(log->l_curr_cycle, 921 xlog_assign_lsn(log->l_curr_cycle,
927 after_umount_blk); 922 after_umount_blk);
928 *tail_blk = after_umount_blk; 923 *tail_blk = after_umount_blk;
929 924
930 /* 925 /*
931 * Note that the unmount was clean. If the unmount 926 * Note that the unmount was clean. If the unmount
932 * was not clean, we need to know this to rebuild the 927 * was not clean, we need to know this to rebuild the
933 * superblock counters from the perag headers if we 928 * superblock counters from the perag headers if we
934 * have a filesystem using non-persistent counters. 929 * have a filesystem using non-persistent counters.
935 */ 930 */
936 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; 931 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
937 } 932 }
938 } 933 }
939 934
940 /* 935 /*
941 * Make sure that there are no blocks in front of the head 936 * Make sure that there are no blocks in front of the head
942 * with the same cycle number as the head. This can happen 937 * with the same cycle number as the head. This can happen
943 * because we allow multiple outstanding log writes concurrently, 938 * because we allow multiple outstanding log writes concurrently,
944 * and the later writes might make it out before earlier ones. 939 * and the later writes might make it out before earlier ones.
945 * 940 *
946 * We use the lsn from before modifying it so that we'll never 941 * We use the lsn from before modifying it so that we'll never
947 * overwrite the unmount record after a clean unmount. 942 * overwrite the unmount record after a clean unmount.
948 * 943 *
949 * Do this only if we are going to recover the filesystem 944 * Do this only if we are going to recover the filesystem
950 * 945 *
951 * NOTE: This used to say "if (!readonly)" 946 * NOTE: This used to say "if (!readonly)"
952 * However on Linux, we can & do recover a read-only filesystem. 947 * However on Linux, we can & do recover a read-only filesystem.
953 * We only skip recovery if NORECOVERY is specified on mount, 948 * We only skip recovery if NORECOVERY is specified on mount,
954 * in which case we would not be here. 949 * in which case we would not be here.
955 * 950 *
956 * But... if the -device- itself is readonly, just skip this. 951 * But... if the -device- itself is readonly, just skip this.
957 * We can't recover this device anyway, so it won't matter. 952 * We can't recover this device anyway, so it won't matter.
958 */ 953 */
959 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 954 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
960 error = xlog_clear_stale_blocks(log, tail_lsn); 955 error = xlog_clear_stale_blocks(log, tail_lsn);
961 } 956 }
962 957
963 bread_err: 958 bread_err:
964 exit: 959 exit:
965 xlog_put_bp(bp); 960 xlog_put_bp(bp);
966 961
967 if (error) 962 if (error)
968 xlog_warn("XFS: failed to locate log tail"); 963 xlog_warn("XFS: failed to locate log tail");
969 return error; 964 return error;
970 } 965 }
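
xlog_find_tail() above leans on two bits of arithmetic: LSNs that pack a (cycle, block) pair, and block numbers that wrap modulo the physical log size when locating the unmount record. A small stand-alone sketch of both, assuming the conventional layout implied by xlog_assign_lsn() (cycle in the high 32 bits, basic block number in the low 32 bits); the toy_* names are made up:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t toy_lsn_t;

    static toy_lsn_t toy_assign_lsn(uint32_t cycle, uint32_t block)
    {
            return ((toy_lsn_t)cycle << 32) | block;
    }

    static uint32_t toy_cycle_lsn(toy_lsn_t lsn) { return (uint32_t)(lsn >> 32); }
    static uint32_t toy_block_lsn(toy_lsn_t lsn) { return (uint32_t)lsn; }

    int main(void)
    {
            toy_lsn_t lsn = toy_assign_lsn(7, 123);

            printf("cycle=%u block=%u\n", toy_cycle_lsn(lsn), toy_block_lsn(lsn));

            /* Block numbers wrap around the physical end of the log, as in
             * the after_umount_blk calculation above (toy sizes). */
            int log_bbsize = 1000, i = 995, hblks = 1, len_bblocks = 8;

            printf("after_umount_blk=%d\n",
                   (i + hblks + len_bblocks) % log_bbsize);   /* 4 */
            return 0;
    }
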
971 966
972 /* 967 /*
973 * Is the log zeroed at all? 968 * Is the log zeroed at all?
974 * 969 *
975 * The last binary search should be changed to perform an X block read 970 * The last binary search should be changed to perform an X block read
976 * once X becomes small enough. You can then search linearly through 971 * once X becomes small enough. You can then search linearly through
977 * the X blocks. This will cut down on the number of reads we need to do. 972 * the X blocks. This will cut down on the number of reads we need to do.
978 * 973 *
979 * If the log is partially zeroed, this routine will pass back the blkno 974 * If the log is partially zeroed, this routine will pass back the blkno
980 * of the first block with cycle number 0. It won't have a complete LR 975 * of the first block with cycle number 0. It won't have a complete LR
981 * preceding it. 976 * preceding it.
982 * 977 *
983 * Return: 978 * Return:
984 * 0 => the log is completely written to 979 * 0 => the log is completely written to
985 * -1 => use *blk_no as the first block of the log 980 * -1 => use *blk_no as the first block of the log
986 * >0 => error has occurred 981 * >0 => error has occurred
987 */ 982 */
988 STATIC int 983 STATIC int
989 xlog_find_zeroed( 984 xlog_find_zeroed(
990 xlog_t *log, 985 xlog_t *log,
991 xfs_daddr_t *blk_no) 986 xfs_daddr_t *blk_no)
992 { 987 {
993 xfs_buf_t *bp; 988 xfs_buf_t *bp;
994 xfs_caddr_t offset; 989 xfs_caddr_t offset;
995 uint first_cycle, last_cycle; 990 uint first_cycle, last_cycle;
996 xfs_daddr_t new_blk, last_blk, start_blk; 991 xfs_daddr_t new_blk, last_blk, start_blk;
997 xfs_daddr_t num_scan_bblks; 992 xfs_daddr_t num_scan_bblks;
998 int error, log_bbnum = log->l_logBBsize; 993 int error, log_bbnum = log->l_logBBsize;
999 994
1000 *blk_no = 0; 995 *blk_no = 0;
1001 996
1002 /* check totally zeroed log */ 997 /* check totally zeroed log */
1003 bp = xlog_get_bp(log, 1); 998 bp = xlog_get_bp(log, 1);
1004 if (!bp) 999 if (!bp)
1005 return ENOMEM; 1000 return ENOMEM;
1006 if ((error = xlog_bread(log, 0, 1, bp))) 1001 if ((error = xlog_bread(log, 0, 1, bp)))
1007 goto bp_err; 1002 goto bp_err;
1008 offset = xlog_align(log, 0, 1, bp); 1003 offset = xlog_align(log, 0, 1, bp);
1009 first_cycle = xlog_get_cycle(offset); 1004 first_cycle = xlog_get_cycle(offset);
1010 if (first_cycle == 0) { /* completely zeroed log */ 1005 if (first_cycle == 0) { /* completely zeroed log */
1011 *blk_no = 0; 1006 *blk_no = 0;
1012 xlog_put_bp(bp); 1007 xlog_put_bp(bp);
1013 return -1; 1008 return -1;
1014 } 1009 }
1015 1010
1016 /* check partially zeroed log */ 1011 /* check partially zeroed log */
1017 if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) 1012 if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
1018 goto bp_err; 1013 goto bp_err;
1019 offset = xlog_align(log, log_bbnum-1, 1, bp); 1014 offset = xlog_align(log, log_bbnum-1, 1, bp);
1020 last_cycle = xlog_get_cycle(offset); 1015 last_cycle = xlog_get_cycle(offset);
1021 if (last_cycle != 0) { /* log completely written to */ 1016 if (last_cycle != 0) { /* log completely written to */
1022 xlog_put_bp(bp); 1017 xlog_put_bp(bp);
1023 return 0; 1018 return 0;
1024 } else if (first_cycle != 1) { 1019 } else if (first_cycle != 1) {
1025 /* 1020 /*
1026 * If the cycle of the last block is zero, the cycle of 1021 * If the cycle of the last block is zero, the cycle of
1027 * the first block must be 1. If it's not, maybe we're 1022 * the first block must be 1. If it's not, maybe we're
1028 * not looking at a log... Bail out. 1023 * not looking at a log... Bail out.
1029 */ 1024 */
1030 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1025 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
1031 return XFS_ERROR(EINVAL); 1026 return XFS_ERROR(EINVAL);
1032 } 1027 }
1033 1028
1034 /* we have a partially zeroed log */ 1029 /* we have a partially zeroed log */
1035 last_blk = log_bbnum-1; 1030 last_blk = log_bbnum-1;
1036 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) 1031 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1037 goto bp_err; 1032 goto bp_err;
1038 1033
1039 /* 1034 /*
1040 * Validate the answer. Because there is no way to guarantee that 1035 * Validate the answer. Because there is no way to guarantee that
1041 * the entire log is made up of log records which are the same size, 1036 * the entire log is made up of log records which are the same size,
1042 * we scan over the defined maximum blocks. At this point, the maximum 1037 * we scan over the defined maximum blocks. At this point, the maximum
1043 * is not chosen to mean anything special. XXXmiken 1038 * is not chosen to mean anything special. XXXmiken
1044 */ 1039 */
1045 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 1040 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1046 ASSERT(num_scan_bblks <= INT_MAX); 1041 ASSERT(num_scan_bblks <= INT_MAX);
1047 1042
1048 if (last_blk < num_scan_bblks) 1043 if (last_blk < num_scan_bblks)
1049 num_scan_bblks = last_blk; 1044 num_scan_bblks = last_blk;
1050 start_blk = last_blk - num_scan_bblks; 1045 start_blk = last_blk - num_scan_bblks;
1051 1046
1052 /* 1047 /*
1053 * We search for any instances of cycle number 0 that occur before 1048 * We search for any instances of cycle number 0 that occur before
1054 * our current estimate of the head. What we're trying to detect is 1049 * our current estimate of the head. What we're trying to detect is
1055 * 1 ... | 0 | 1 | 0... 1050 * 1 ... | 0 | 1 | 0...
1056 * ^ binary search ends here 1051 * ^ binary search ends here
1057 */ 1052 */
1058 if ((error = xlog_find_verify_cycle(log, start_blk, 1053 if ((error = xlog_find_verify_cycle(log, start_blk,
1059 (int)num_scan_bblks, 0, &new_blk))) 1054 (int)num_scan_bblks, 0, &new_blk)))
1060 goto bp_err; 1055 goto bp_err;
1061 if (new_blk != -1) 1056 if (new_blk != -1)
1062 last_blk = new_blk; 1057 last_blk = new_blk;
1063 1058
1064 /* 1059 /*
1065 * Potentially back up over a partial log record write. We don't need 1060 * Potentially back up over a partial log record write. We don't need
1066 * to search the end of the log because we know it is zero. 1061 * to search the end of the log because we know it is zero.
1067 */ 1062 */
1068 if ((error = xlog_find_verify_log_record(log, start_blk, 1063 if ((error = xlog_find_verify_log_record(log, start_blk,
1069 &last_blk, 0)) == -1) { 1064 &last_blk, 0)) == -1) {
1070 error = XFS_ERROR(EIO); 1065 error = XFS_ERROR(EIO);
1071 goto bp_err; 1066 goto bp_err;
1072 } else if (error) 1067 } else if (error)
1073 goto bp_err; 1068 goto bp_err;
1074 1069
1075 *blk_no = last_blk; 1070 *blk_no = last_blk;
1076 bp_err: 1071 bp_err:
1077 xlog_put_bp(bp); 1072 xlog_put_bp(bp);
1078 if (error) 1073 if (error)
1079 return error; 1074 return error;
1080 return -1; 1075 return -1;
1081 } 1076 }
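
As the comment before xlog_find_zeroed() notes, the function has a three-way return convention: 0, -1, or a positive error. A hedged sketch of how a caller is meant to interpret it, with a stub standing in for the real function:

    #include <stdio.h>

    typedef long long toy_daddr_t;

    /* Stand-in for xlog_find_zeroed(); it just pretends the log is
     * partially zeroed starting at block 42. */
    static int toy_find_zeroed(toy_daddr_t *blk_no)
    {
            *blk_no = 42;
            return -1;
    }

    int main(void)
    {
            toy_daddr_t head = 0;
            int ret = toy_find_zeroed(&head);

            if (ret == 0)
                    printf("log fully written; find the head normally\n");
            else if (ret == -1)
                    printf("log (partially) zeroed; head at block %lld\n", head);
            else
                    printf("error %d reading the log\n", ret);
            return 0;
    }
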
1082 1077
1083 /* 1078 /*
1084 * These are simple subroutines used by xlog_clear_stale_blocks() below 1079 * These are simple subroutines used by xlog_clear_stale_blocks() below
1085 * to initialize a buffer full of empty log record headers and write 1080 * to initialize a buffer full of empty log record headers and write
1086 * them into the log. 1081 * them into the log.
1087 */ 1082 */
1088 STATIC void 1083 STATIC void
1089 xlog_add_record( 1084 xlog_add_record(
1090 xlog_t *log, 1085 xlog_t *log,
1091 xfs_caddr_t buf, 1086 xfs_caddr_t buf,
1092 int cycle, 1087 int cycle,
1093 int block, 1088 int block,
1094 int tail_cycle, 1089 int tail_cycle,
1095 int tail_block) 1090 int tail_block)
1096 { 1091 {
1097 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; 1092 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1098 1093
1099 memset(buf, 0, BBSIZE); 1094 memset(buf, 0, BBSIZE);
1100 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1095 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1101 recp->h_cycle = cpu_to_be32(cycle); 1096 recp->h_cycle = cpu_to_be32(cycle);
1102 recp->h_version = cpu_to_be32( 1097 recp->h_version = cpu_to_be32(
1103 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); 1098 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1104 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); 1099 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1105 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); 1100 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1106 recp->h_fmt = cpu_to_be32(XLOG_FMT); 1101 recp->h_fmt = cpu_to_be32(XLOG_FMT);
1107 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); 1102 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1108 } 1103 }
1109 1104
1110 STATIC int 1105 STATIC int
1111 xlog_write_log_records( 1106 xlog_write_log_records(
1112 xlog_t *log, 1107 xlog_t *log,
1113 int cycle, 1108 int cycle,
1114 int start_block, 1109 int start_block,
1115 int blocks, 1110 int blocks,
1116 int tail_cycle, 1111 int tail_cycle,
1117 int tail_block) 1112 int tail_block)
1118 { 1113 {
1119 xfs_caddr_t offset; 1114 xfs_caddr_t offset;
1120 xfs_buf_t *bp; 1115 xfs_buf_t *bp;
1121 int balign, ealign; 1116 int balign, ealign;
1122 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1117 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
1123 int end_block = start_block + blocks; 1118 int end_block = start_block + blocks;
1124 int bufblks; 1119 int bufblks;
1125 int error = 0; 1120 int error = 0;
1126 int i, j = 0; 1121 int i, j = 0;
1127 1122
1128 bufblks = 1 << ffs(blocks); 1123 bufblks = 1 << ffs(blocks);
1129 while (!(bp = xlog_get_bp(log, bufblks))) { 1124 while (!(bp = xlog_get_bp(log, bufblks))) {
1130 bufblks >>= 1; 1125 bufblks >>= 1;
1131 if (bufblks <= log->l_sectbb_log) 1126 if (bufblks <= log->l_sectbb_log)
1132 return ENOMEM; 1127 return ENOMEM;
1133 } 1128 }
1134 1129
1135 /* We may need to do a read at the start to fill in part of 1130 /* We may need to do a read at the start to fill in part of
1136 * the buffer in the starting sector not covered by the first 1131 * the buffer in the starting sector not covered by the first
1137 * write below. 1132 * write below.
1138 */ 1133 */
1139 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1134 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1140 if (balign != start_block) { 1135 if (balign != start_block) {
1141 if ((error = xlog_bread(log, start_block, 1, bp))) { 1136 if ((error = xlog_bread(log, start_block, 1, bp))) {
1142 xlog_put_bp(bp); 1137 xlog_put_bp(bp);
1143 return error; 1138 return error;
1144 } 1139 }
1145 j = start_block - balign; 1140 j = start_block - balign;
1146 } 1141 }
1147 1142
1148 for (i = start_block; i < end_block; i += bufblks) { 1143 for (i = start_block; i < end_block; i += bufblks) {
1149 int bcount, endcount; 1144 int bcount, endcount;
1150 1145
1151 bcount = min(bufblks, end_block - start_block); 1146 bcount = min(bufblks, end_block - start_block);
1152 endcount = bcount - j; 1147 endcount = bcount - j;
1153 1148
1154 /* We may need to do a read at the end to fill in part of 1149 /* We may need to do a read at the end to fill in part of
1155 * the buffer in the final sector not covered by the write. 1150 * the buffer in the final sector not covered by the write.
1156 * If this is the same sector as the above read, skip it. 1151 * If this is the same sector as the above read, skip it.
1157 */ 1152 */
1158 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1153 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
1159 if (j == 0 && (start_block + endcount > ealign)) { 1154 if (j == 0 && (start_block + endcount > ealign)) {
1160 offset = XFS_BUF_PTR(bp); 1155 offset = XFS_BUF_PTR(bp);
1161 balign = BBTOB(ealign - start_block); 1156 balign = BBTOB(ealign - start_block);
1162 error = XFS_BUF_SET_PTR(bp, offset + balign, 1157 error = XFS_BUF_SET_PTR(bp, offset + balign,
1163 BBTOB(sectbb)); 1158 BBTOB(sectbb));
1164 if (!error) 1159 if (!error)
1165 error = xlog_bread(log, ealign, sectbb, bp); 1160 error = xlog_bread(log, ealign, sectbb, bp);
1166 if (!error) 1161 if (!error)
1167 error = XFS_BUF_SET_PTR(bp, offset, bufblks); 1162 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1168 if (error) 1163 if (error)
1169 break; 1164 break;
1170 } 1165 }
1171 1166
1172 offset = xlog_align(log, start_block, endcount, bp); 1167 offset = xlog_align(log, start_block, endcount, bp);
1173 for (; j < endcount; j++) { 1168 for (; j < endcount; j++) {
1174 xlog_add_record(log, offset, cycle, i+j, 1169 xlog_add_record(log, offset, cycle, i+j,
1175 tail_cycle, tail_block); 1170 tail_cycle, tail_block);
1176 offset += BBSIZE; 1171 offset += BBSIZE;
1177 } 1172 }
1178 error = xlog_bwrite(log, start_block, endcount, bp); 1173 error = xlog_bwrite(log, start_block, endcount, bp);
1179 if (error) 1174 if (error)
1180 break; 1175 break;
1181 start_block += endcount; 1176 start_block += endcount;
1182 j = 0; 1177 j = 0;
1183 } 1178 }
1184 xlog_put_bp(bp); 1179 xlog_put_bp(bp);
1185 return error; 1180 return error;
1186 } 1181 }
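
xlog_write_log_records() above has to cope with a log device whose sector size is larger than one basic block: a write that does not start on a sector boundary must first read the enclosing sector so the blocks it is not rewriting are preserved. A simplified sketch of just that alignment bookkeeping (hypothetical helper; the real code uses XLOG_SECTOR_ROUNDDOWN_BLKNO and friends):

    #include <stdio.h>

    /* Round a basic-block number down to its containing hardware sector. */
    static int round_down_to_sector(int block, int sectbb)
    {
            return block - (block % sectbb);
    }

    int main(void)
    {
            int sectbb = 8;                 /* 4k sectors, 512-byte basic blocks */
            int start_block = 21;
            int balign = round_down_to_sector(start_block, sectbb);
            int j = start_block - balign;   /* leading blocks to pre-read/skip */

            printf("aligned start %d, skip %d blocks of the first buffer\n",
                   balign, j);              /* aligned start 16, skip 5 */
            return 0;
    }
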
1187 1182
1188 /* 1183 /*
1189 * This routine is called to blow away any incomplete log writes out 1184 * This routine is called to blow away any incomplete log writes out
1190 * in front of the log head. We do this so that we won't become confused 1185 * in front of the log head. We do this so that we won't become confused
1191 * if we come up, write only a little bit more, and then crash again. 1186 * if we come up, write only a little bit more, and then crash again.
1192 * If we leave the partial log records out there, this situation could 1187 * If we leave the partial log records out there, this situation could
1193 * cause us to think those partial writes are valid blocks since they 1188 * cause us to think those partial writes are valid blocks since they
1194 * have the current cycle number. We get rid of them by overwriting them 1189 * have the current cycle number. We get rid of them by overwriting them
1195 * with empty log records with the old cycle number rather than the 1190 * with empty log records with the old cycle number rather than the
1196 * current one. 1191 * current one.
1197 * 1192 *
1198 * The tail lsn is passed in rather than taken from 1193 * The tail lsn is passed in rather than taken from
1199 * the log so that we will not write over the unmount record after a 1194 * the log so that we will not write over the unmount record after a
1200 * clean unmount in a 512 block log. Doing so would leave the log without 1195 * clean unmount in a 512 block log. Doing so would leave the log without
1201 * any valid log records in it until a new one was written. If we crashed 1196 * any valid log records in it until a new one was written. If we crashed
1202 * during that time we would not be able to recover. 1197 * during that time we would not be able to recover.
1203 */ 1198 */
1204 STATIC int 1199 STATIC int
1205 xlog_clear_stale_blocks( 1200 xlog_clear_stale_blocks(
1206 xlog_t *log, 1201 xlog_t *log,
1207 xfs_lsn_t tail_lsn) 1202 xfs_lsn_t tail_lsn)
1208 { 1203 {
1209 int tail_cycle, head_cycle; 1204 int tail_cycle, head_cycle;
1210 int tail_block, head_block; 1205 int tail_block, head_block;
1211 int tail_distance, max_distance; 1206 int tail_distance, max_distance;
1212 int distance; 1207 int distance;
1213 int error; 1208 int error;
1214 1209
1215 tail_cycle = CYCLE_LSN(tail_lsn); 1210 tail_cycle = CYCLE_LSN(tail_lsn);
1216 tail_block = BLOCK_LSN(tail_lsn); 1211 tail_block = BLOCK_LSN(tail_lsn);
1217 head_cycle = log->l_curr_cycle; 1212 head_cycle = log->l_curr_cycle;
1218 head_block = log->l_curr_block; 1213 head_block = log->l_curr_block;
1219 1214
1220 /* 1215 /*
1221 * Figure out the distance between the new head of the log 1216 * Figure out the distance between the new head of the log
1222 * and the tail. We want to write over any blocks beyond the 1217 * and the tail. We want to write over any blocks beyond the
1223 * head that we may have written just before the crash, but 1218 * head that we may have written just before the crash, but
1224 * we don't want to overwrite the tail of the log. 1219 * we don't want to overwrite the tail of the log.
1225 */ 1220 */
1226 if (head_cycle == tail_cycle) { 1221 if (head_cycle == tail_cycle) {
1227 /* 1222 /*
1228 * The tail is behind the head in the physical log, 1223 * The tail is behind the head in the physical log,
1229 * so the distance from the head to the tail is the 1224 * so the distance from the head to the tail is the
1230 * distance from the head to the end of the log plus 1225 * distance from the head to the end of the log plus
1231 * the distance from the beginning of the log to the 1226 * the distance from the beginning of the log to the
1232 * tail. 1227 * tail.
1233 */ 1228 */
1234 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { 1229 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1235 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", 1230 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1236 XFS_ERRLEVEL_LOW, log->l_mp); 1231 XFS_ERRLEVEL_LOW, log->l_mp);
1237 return XFS_ERROR(EFSCORRUPTED); 1232 return XFS_ERROR(EFSCORRUPTED);
1238 } 1233 }
1239 tail_distance = tail_block + (log->l_logBBsize - head_block); 1234 tail_distance = tail_block + (log->l_logBBsize - head_block);
1240 } else { 1235 } else {
1241 /* 1236 /*
1242 * The head is behind the tail in the physical log, 1237 * The head is behind the tail in the physical log,
1243 * so the distance from the head to the tail is just 1238 * so the distance from the head to the tail is just
1244 * the tail block minus the head block. 1239 * the tail block minus the head block.
1245 */ 1240 */
1246 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ 1241 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1247 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", 1242 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1248 XFS_ERRLEVEL_LOW, log->l_mp); 1243 XFS_ERRLEVEL_LOW, log->l_mp);
1249 return XFS_ERROR(EFSCORRUPTED); 1244 return XFS_ERROR(EFSCORRUPTED);
1250 } 1245 }
1251 tail_distance = tail_block - head_block; 1246 tail_distance = tail_block - head_block;
1252 } 1247 }
1253 1248
1254 /* 1249 /*
1255 * If the head is right up against the tail, we can't clear 1250 * If the head is right up against the tail, we can't clear
1256 * anything. 1251 * anything.
1257 */ 1252 */
1258 if (tail_distance <= 0) { 1253 if (tail_distance <= 0) {
1259 ASSERT(tail_distance == 0); 1254 ASSERT(tail_distance == 0);
1260 return 0; 1255 return 0;
1261 } 1256 }
1262 1257
1263 max_distance = XLOG_TOTAL_REC_SHIFT(log); 1258 max_distance = XLOG_TOTAL_REC_SHIFT(log);
1264 /* 1259 /*
1265 * Take the smaller of the maximum amount of outstanding I/O 1260 * Take the smaller of the maximum amount of outstanding I/O
1266 * we could have and the distance to the tail to clear out. 1261 * we could have and the distance to the tail to clear out.
1267 * We take the smaller so that we don't overwrite the tail and 1262 * We take the smaller so that we don't overwrite the tail and
1268 * we don't waste all day writing from the head to the tail 1263 * we don't waste all day writing from the head to the tail
1269 * for no reason. 1264 * for no reason.
1270 */ 1265 */
1271 max_distance = MIN(max_distance, tail_distance); 1266 max_distance = MIN(max_distance, tail_distance);
1272 1267
1273 if ((head_block + max_distance) <= log->l_logBBsize) { 1268 if ((head_block + max_distance) <= log->l_logBBsize) {
1274 /* 1269 /*
1275 * We can stomp all the blocks we need to without 1270 * We can stomp all the blocks we need to without
1276 * wrapping around the end of the log. Just do it 1271 * wrapping around the end of the log. Just do it
1277 * in a single write. Use the cycle number of the 1272 * in a single write. Use the cycle number of the
1278 * current cycle minus one so that the log will look like: 1273 * current cycle minus one so that the log will look like:
1279 * n ... | n - 1 ... 1274 * n ... | n - 1 ...
1280 */ 1275 */
1281 error = xlog_write_log_records(log, (head_cycle - 1), 1276 error = xlog_write_log_records(log, (head_cycle - 1),
1282 head_block, max_distance, tail_cycle, 1277 head_block, max_distance, tail_cycle,
1283 tail_block); 1278 tail_block);
1284 if (error) 1279 if (error)
1285 return error; 1280 return error;
1286 } else { 1281 } else {
1287 /* 1282 /*
1288 * We need to wrap around the end of the physical log in 1283 * We need to wrap around the end of the physical log in
1289 * order to clear all the blocks. Do it in two separate 1284 * order to clear all the blocks. Do it in two separate
1290 * I/Os. The first write should be from the head to the 1285 * I/Os. The first write should be from the head to the
1291 * end of the physical log, and it should use the current 1286 * end of the physical log, and it should use the current
1292 * cycle number minus one just like above. 1287 * cycle number minus one just like above.
1293 */ 1288 */
1294 distance = log->l_logBBsize - head_block; 1289 distance = log->l_logBBsize - head_block;
1295 error = xlog_write_log_records(log, (head_cycle - 1), 1290 error = xlog_write_log_records(log, (head_cycle - 1),
1296 head_block, distance, tail_cycle, 1291 head_block, distance, tail_cycle,
1297 tail_block); 1292 tail_block);
1298 1293
1299 if (error) 1294 if (error)
1300 return error; 1295 return error;
1301 1296
1302 /* 1297 /*
1303 * Now write the blocks at the start of the physical log. 1298 * Now write the blocks at the start of the physical log.
1304 * This writes the remainder of the blocks we want to clear. 1299 * This writes the remainder of the blocks we want to clear.
1305 * It uses the current cycle number since we're now on the 1300 * It uses the current cycle number since we're now on the
1306 * same cycle as the head so that we get: 1301 * same cycle as the head so that we get:
1307 * n ... n ... | n - 1 ... 1302 * n ... n ... | n - 1 ...
1308 * ^^^^^ blocks we're writing 1303 * ^^^^^ blocks we're writing
1309 */ 1304 */
1310 distance = max_distance - (log->l_logBBsize - head_block); 1305 distance = max_distance - (log->l_logBBsize - head_block);
1311 error = xlog_write_log_records(log, head_cycle, 0, distance, 1306 error = xlog_write_log_records(log, head_cycle, 0, distance,
1312 tail_cycle, tail_block); 1307 tail_cycle, tail_block);
1313 if (error) 1308 if (error)
1314 return error; 1309 return error;
1315 } 1310 }
1316 1311
1317 return 0; 1312 return 0;
1318 } 1313 }
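
The clearing logic above issues either one write or two, depending on whether the range starting at the head wraps past the physical end of the log. A tiny sketch of that split on a toy circular log; the names and printf-based "writes" are illustrative only:

    #include <stdio.h>

    /* Report the one or two contiguous extents needed to clear
     * [head_block, head_block + max_distance) on a circular log. */
    static void toy_clear_range(int head_block, int max_distance, int log_size)
    {
            if (head_block + max_distance <= log_size) {
                    printf("one write: blocks %d..%d\n",
                           head_block, head_block + max_distance - 1);
                    return;
            }
            /* First write runs to the physical end of the log ... */
            printf("write 1: blocks %d..%d\n", head_block, log_size - 1);
            /* ... the second wraps to the start for the remainder. */
            printf("write 2: blocks 0..%d\n",
                   max_distance - (log_size - head_block) - 1);
    }

    int main(void)
    {
            toy_clear_range(900, 50, 1000); /* fits: one write            */
            toy_clear_range(980, 50, 1000); /* wraps: 980..999 then 0..29 */
            return 0;
    }
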
1319 1314
1320 /****************************************************************************** 1315 /******************************************************************************
1321 * 1316 *
1322 * Log recover routines 1317 * Log recover routines
1323 * 1318 *
1324 ****************************************************************************** 1319 ******************************************************************************
1325 */ 1320 */
1326 1321
1327 STATIC xlog_recover_t * 1322 STATIC xlog_recover_t *
1328 xlog_recover_find_tid( 1323 xlog_recover_find_tid(
1329 xlog_recover_t *q, 1324 xlog_recover_t *q,
1330 xlog_tid_t tid) 1325 xlog_tid_t tid)
1331 { 1326 {
1332 xlog_recover_t *p = q; 1327 xlog_recover_t *p = q;
1333 1328
1334 while (p != NULL) { 1329 while (p != NULL) {
1335 if (p->r_log_tid == tid) 1330 if (p->r_log_tid == tid)
1336 break; 1331 break;
1337 p = p->r_next; 1332 p = p->r_next;
1338 } 1333 }
1339 return p; 1334 return p;
1340 } 1335 }
1341 1336
1342 STATIC void 1337 STATIC void
1343 xlog_recover_put_hashq( 1338 xlog_recover_put_hashq(
1344 xlog_recover_t **q, 1339 xlog_recover_t **q,
1345 xlog_recover_t *trans) 1340 xlog_recover_t *trans)
1346 { 1341 {
1347 trans->r_next = *q; 1342 trans->r_next = *q;
1348 *q = trans; 1343 *q = trans;
1349 } 1344 }
1350 1345
1351 STATIC void 1346 STATIC void
1352 xlog_recover_add_item( 1347 xlog_recover_add_item(
1353 xlog_recover_item_t **itemq) 1348 xlog_recover_item_t **itemq)
1354 { 1349 {
1355 xlog_recover_item_t *item; 1350 xlog_recover_item_t *item;
1356 1351
1357 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1352 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1358 xlog_recover_insert_item_backq(itemq, item); 1353 xlog_recover_insert_item_backq(itemq, item);
1359 } 1354 }
1360 1355
1361 STATIC int 1356 STATIC int
1362 xlog_recover_add_to_cont_trans( 1357 xlog_recover_add_to_cont_trans(
1363 xlog_recover_t *trans, 1358 xlog_recover_t *trans,
1364 xfs_caddr_t dp, 1359 xfs_caddr_t dp,
1365 int len) 1360 int len)
1366 { 1361 {
1367 xlog_recover_item_t *item; 1362 xlog_recover_item_t *item;
1368 xfs_caddr_t ptr, old_ptr; 1363 xfs_caddr_t ptr, old_ptr;
1369 int old_len; 1364 int old_len;
1370 1365
1371 item = trans->r_itemq; 1366 item = trans->r_itemq;
1372 if (item == NULL) { 1367 if (item == NULL) {
1373 /* finish copying rest of trans header */ 1368 /* finish copying rest of trans header */
1374 xlog_recover_add_item(&trans->r_itemq); 1369 xlog_recover_add_item(&trans->r_itemq);
1375 ptr = (xfs_caddr_t) &trans->r_theader + 1370 ptr = (xfs_caddr_t) &trans->r_theader +
1376 sizeof(xfs_trans_header_t) - len; 1371 sizeof(xfs_trans_header_t) - len;
1377 memcpy(ptr, dp, len); /* d, s, l */ 1372 memcpy(ptr, dp, len); /* d, s, l */
1378 return 0; 1373 return 0;
1379 } 1374 }
1380 item = item->ri_prev; 1375 item = item->ri_prev;
1381 1376
1382 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1377 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1383 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1378 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1384 1379
1385 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); 1380 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
1386 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1381 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1387 item->ri_buf[item->ri_cnt-1].i_len += len; 1382 item->ri_buf[item->ri_cnt-1].i_len += len;
1388 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1383 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1389 return 0; 1384 return 0;
1390 } 1385 }
1391 1386
1392 /* 1387 /*
1393 * The next region to add is the start of a new region. It could be 1388 * The next region to add is the start of a new region. It could be
1394 * a whole region or it could be the first part of a new region. Because 1389 * a whole region or it could be the first part of a new region. Because
1395 * of this, the assumption here is that the type and size fields of all 1390 * of this, the assumption here is that the type and size fields of all
1396 * format structures fit into the first 32 bits of the structure. 1391 * format structures fit into the first 32 bits of the structure.
1397 * 1392 *
1398 * This works because all regions must be 32 bit aligned. Therefore, we 1393 * This works because all regions must be 32 bit aligned. Therefore, we
1399 * either have both fields or we have neither field. In the case we have 1394 * either have both fields or we have neither field. In the case we have
1400 * neither field, the data part of the region is zero length. We only have 1395 * neither field, the data part of the region is zero length. We only have
1401 * a log_op_header and can throw away the header since a new one will appear 1396 * a log_op_header and can throw away the header since a new one will appear
1402 * later. If we have at least 4 bytes, then we can determine how many regions 1397 * later. If we have at least 4 bytes, then we can determine how many regions
1403 * will appear in the current log item. 1398 * will appear in the current log item.
1404 */ 1399 */
1405 STATIC int 1400 STATIC int
1406 xlog_recover_add_to_trans( 1401 xlog_recover_add_to_trans(
1407 xlog_recover_t *trans, 1402 xlog_recover_t *trans,
1408 xfs_caddr_t dp, 1403 xfs_caddr_t dp,
1409 int len) 1404 int len)
1410 { 1405 {
1411 xfs_inode_log_format_t *in_f; /* any will do */ 1406 xfs_inode_log_format_t *in_f; /* any will do */
1412 xlog_recover_item_t *item; 1407 xlog_recover_item_t *item;
1413 xfs_caddr_t ptr; 1408 xfs_caddr_t ptr;
1414 1409
1415 if (!len) 1410 if (!len)
1416 return 0; 1411 return 0;
1417 item = trans->r_itemq; 1412 item = trans->r_itemq;
1418 if (item == NULL) { 1413 if (item == NULL) {
1419 /* we need to catch log corruptions here */ 1414 /* we need to catch log corruptions here */
1420 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1415 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1421 xlog_warn("XFS: xlog_recover_add_to_trans: " 1416 xlog_warn("XFS: xlog_recover_add_to_trans: "
1422 "bad header magic number"); 1417 "bad header magic number");
1423 ASSERT(0); 1418 ASSERT(0);
1424 return XFS_ERROR(EIO); 1419 return XFS_ERROR(EIO);
1425 } 1420 }
1426 if (len == sizeof(xfs_trans_header_t)) 1421 if (len == sizeof(xfs_trans_header_t))
1427 xlog_recover_add_item(&trans->r_itemq); 1422 xlog_recover_add_item(&trans->r_itemq);
1428 memcpy(&trans->r_theader, dp, len); /* d, s, l */ 1423 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1429 return 0; 1424 return 0;
1430 } 1425 }
1431 1426
1432 ptr = kmem_alloc(len, KM_SLEEP); 1427 ptr = kmem_alloc(len, KM_SLEEP);
1433 memcpy(ptr, dp, len); 1428 memcpy(ptr, dp, len);
1434 in_f = (xfs_inode_log_format_t *)ptr; 1429 in_f = (xfs_inode_log_format_t *)ptr;
1435 1430
1436 if (item->ri_prev->ri_total != 0 && 1431 if (item->ri_prev->ri_total != 0 &&
1437 item->ri_prev->ri_total == item->ri_prev->ri_cnt) { 1432 item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
1438 xlog_recover_add_item(&trans->r_itemq); 1433 xlog_recover_add_item(&trans->r_itemq);
1439 } 1434 }
1440 item = trans->r_itemq; 1435 item = trans->r_itemq;
1441 item = item->ri_prev; 1436 item = item->ri_prev;
1442 1437
1443 if (item->ri_total == 0) { /* first region to be added */ 1438 if (item->ri_total == 0) { /* first region to be added */
1444 item->ri_total = in_f->ilf_size; 1439 item->ri_total = in_f->ilf_size;
1445 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); 1440 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
1446 item->ri_buf = kmem_zalloc((item->ri_total * 1441 item->ri_buf = kmem_zalloc((item->ri_total *
1447 sizeof(xfs_log_iovec_t)), KM_SLEEP); 1442 sizeof(xfs_log_iovec_t)), KM_SLEEP);
1448 } 1443 }
1449 ASSERT(item->ri_total > item->ri_cnt); 1444 ASSERT(item->ri_total > item->ri_cnt);
1450 /* Description region is ri_buf[0] */ 1445 /* Description region is ri_buf[0] */
1451 item->ri_buf[item->ri_cnt].i_addr = ptr; 1446 item->ri_buf[item->ri_cnt].i_addr = ptr;
1452 item->ri_buf[item->ri_cnt].i_len = len; 1447 item->ri_buf[item->ri_cnt].i_len = len;
1453 item->ri_cnt++; 1448 item->ri_cnt++;
1454 return 0; 1449 return 0;
1455 } 1450 }
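
The comment ahead of xlog_recover_add_to_trans() relies on every log item format structure starting with a 16-bit type and a 16-bit region count, so the first four bytes of a copied region are enough to learn how many regions the item will carry. A small sketch of that peek, using a made-up structure in place of xfs_inode_log_format_t and its siblings:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative only: models the shared 16-bit type + 16-bit region-count
     * prefix that the recovery code depends on. */
    struct toy_log_format {
            uint16_t lf_type;       /* log item type */
            uint16_t lf_size;       /* number of regions in this item */
            /* ... item-specific fields follow in the real structures ... */
    };

    int main(void)
    {
            unsigned char region[64];
            struct toy_log_format f = { .lf_type = 0x1234, .lf_size = 3 };
            struct toy_log_format peek;

            memcpy(region, &f, sizeof(f));          /* as copied out of the log */
            memcpy(&peek, region, sizeof(peek));    /* the 4-byte peek */
            printf("item type 0x%x, expecting %u regions\n",
                   (unsigned)peek.lf_type, (unsigned)peek.lf_size);
            return 0;
    }
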
1456 1451
1457 STATIC void 1452 STATIC void
1458 xlog_recover_new_tid( 1453 xlog_recover_new_tid(
1459 xlog_recover_t **q, 1454 xlog_recover_t **q,
1460 xlog_tid_t tid, 1455 xlog_tid_t tid,
1461 xfs_lsn_t lsn) 1456 xfs_lsn_t lsn)
1462 { 1457 {
1463 xlog_recover_t *trans; 1458 xlog_recover_t *trans;
1464 1459
1465 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); 1460 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1466 trans->r_log_tid = tid; 1461 trans->r_log_tid = tid;
1467 trans->r_lsn = lsn; 1462 trans->r_lsn = lsn;
1468 xlog_recover_put_hashq(q, trans); 1463 xlog_recover_put_hashq(q, trans);
1469 } 1464 }
1470 1465
1471 STATIC int 1466 STATIC int
1472 xlog_recover_unlink_tid( 1467 xlog_recover_unlink_tid(
1473 xlog_recover_t **q, 1468 xlog_recover_t **q,
1474 xlog_recover_t *trans) 1469 xlog_recover_t *trans)
1475 { 1470 {
1476 xlog_recover_t *tp; 1471 xlog_recover_t *tp;
1477 int found = 0; 1472 int found = 0;
1478 1473
1479 ASSERT(trans != NULL); 1474 ASSERT(trans != NULL);
1480 if (trans == *q) { 1475 if (trans == *q) {
1481 *q = (*q)->r_next; 1476 *q = (*q)->r_next;
1482 } else { 1477 } else {
1483 tp = *q; 1478 tp = *q;
1484 while (tp) { 1479 while (tp) {
1485 if (tp->r_next == trans) { 1480 if (tp->r_next == trans) {
1486 found = 1; 1481 found = 1;
1487 break; 1482 break;
1488 } 1483 }
1489 tp = tp->r_next; 1484 tp = tp->r_next;
1490 } 1485 }
1491 if (!found) { 1486 if (!found) {
1492 xlog_warn( 1487 xlog_warn(
1493 "XFS: xlog_recover_unlink_tid: trans not found"); 1488 "XFS: xlog_recover_unlink_tid: trans not found");
1494 ASSERT(0); 1489 ASSERT(0);
1495 return XFS_ERROR(EIO); 1490 return XFS_ERROR(EIO);
1496 } 1491 }
1497 tp->r_next = tp->r_next->r_next; 1492 tp->r_next = tp->r_next->r_next;
1498 } 1493 }
1499 return 0; 1494 return 0;
1500 } 1495 }
1501 1496
1502 STATIC void 1497 STATIC void
1503 xlog_recover_insert_item_backq( 1498 xlog_recover_insert_item_backq(
1504 xlog_recover_item_t **q, 1499 xlog_recover_item_t **q,
1505 xlog_recover_item_t *item) 1500 xlog_recover_item_t *item)
1506 { 1501 {
1507 if (*q == NULL) { 1502 if (*q == NULL) {
1508 item->ri_prev = item->ri_next = item; 1503 item->ri_prev = item->ri_next = item;
1509 *q = item; 1504 *q = item;
1510 } else { 1505 } else {
1511 item->ri_next = *q; 1506 item->ri_next = *q;
1512 item->ri_prev = (*q)->ri_prev; 1507 item->ri_prev = (*q)->ri_prev;
1513 (*q)->ri_prev = item; 1508 (*q)->ri_prev = item;
1514 item->ri_prev->ri_next = item; 1509 item->ri_prev->ri_next = item;
1515 } 1510 }
1516 } 1511 }
1517 1512
1518 STATIC void 1513 STATIC void
1519 xlog_recover_insert_item_frontq( 1514 xlog_recover_insert_item_frontq(
1520 xlog_recover_item_t **q, 1515 xlog_recover_item_t **q,
1521 xlog_recover_item_t *item) 1516 xlog_recover_item_t *item)
1522 { 1517 {
1523 xlog_recover_insert_item_backq(q, item); 1518 xlog_recover_insert_item_backq(q, item);
1524 *q = item; 1519 *q = item;
1525 } 1520 }
1526 1521
1527 STATIC int 1522 STATIC int
1528 xlog_recover_reorder_trans( 1523 xlog_recover_reorder_trans(
1529 xlog_recover_t *trans) 1524 xlog_recover_t *trans)
1530 { 1525 {
1531 xlog_recover_item_t *first_item, *itemq, *itemq_next; 1526 xlog_recover_item_t *first_item, *itemq, *itemq_next;
1532 xfs_buf_log_format_t *buf_f; 1527 xfs_buf_log_format_t *buf_f;
1533 ushort flags = 0; 1528 ushort flags = 0;
1534 1529
1535 first_item = itemq = trans->r_itemq; 1530 first_item = itemq = trans->r_itemq;
1536 trans->r_itemq = NULL; 1531 trans->r_itemq = NULL;
1537 do { 1532 do {
1538 itemq_next = itemq->ri_next; 1533 itemq_next = itemq->ri_next;
1539 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; 1534 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1540 1535
1541 switch (ITEM_TYPE(itemq)) { 1536 switch (ITEM_TYPE(itemq)) {
1542 case XFS_LI_BUF: 1537 case XFS_LI_BUF:
1543 flags = buf_f->blf_flags; 1538 flags = buf_f->blf_flags;
1544 if (!(flags & XFS_BLI_CANCEL)) { 1539 if (!(flags & XFS_BLI_CANCEL)) {
1545 xlog_recover_insert_item_frontq(&trans->r_itemq, 1540 xlog_recover_insert_item_frontq(&trans->r_itemq,
1546 itemq); 1541 itemq);
1547 break; 1542 break;
1548 } 1543 }
1549 case XFS_LI_INODE: 1544 case XFS_LI_INODE:
1550 case XFS_LI_DQUOT: 1545 case XFS_LI_DQUOT:
1551 case XFS_LI_QUOTAOFF: 1546 case XFS_LI_QUOTAOFF:
1552 case XFS_LI_EFD: 1547 case XFS_LI_EFD:
1553 case XFS_LI_EFI: 1548 case XFS_LI_EFI:
1554 xlog_recover_insert_item_backq(&trans->r_itemq, itemq); 1549 xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
1555 break; 1550 break;
1556 default: 1551 default:
1557 xlog_warn( 1552 xlog_warn(
1558 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1553 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
1559 ASSERT(0); 1554 ASSERT(0);
1560 return XFS_ERROR(EIO); 1555 return XFS_ERROR(EIO);
1561 } 1556 }
1562 itemq = itemq_next; 1557 itemq = itemq_next;
1563 } while (first_item != itemq); 1558 } while (first_item != itemq);
1564 return 0; 1559 return 0;
1565 } 1560 }
1566 1561
1567 /* 1562 /*
1568 * Build up the table of buf cancel records so that we don't replay 1563 * Build up the table of buf cancel records so that we don't replay
1569 * cancelled data in the second pass. For buffer records that are 1564 * cancelled data in the second pass. For buffer records that are
1570 * not cancel records, there is nothing to do here so we just return. 1565 * not cancel records, there is nothing to do here so we just return.
1571 * 1566 *
1572 * If we get a cancel record which is already in the table, this indicates 1567 * If we get a cancel record which is already in the table, this indicates
1573 * that the buffer was cancelled multiple times. In order to ensure 1568 * that the buffer was cancelled multiple times. In order to ensure
1574 * that during pass 2 we keep the record in the table until we reach its 1569 * that during pass 2 we keep the record in the table until we reach its
1575 * last occurrence in the log, we keep a reference count in the cancel 1570 * last occurrence in the log, we keep a reference count in the cancel
1576 * record in the table to tell us how many times we expect to see this 1571 * record in the table to tell us how many times we expect to see this
1577 * record during the second pass. 1572 * record during the second pass.
1578 */ 1573 */
1579 STATIC void 1574 STATIC void
1580 xlog_recover_do_buffer_pass1( 1575 xlog_recover_do_buffer_pass1(
1581 xlog_t *log, 1576 xlog_t *log,
1582 xfs_buf_log_format_t *buf_f) 1577 xfs_buf_log_format_t *buf_f)
1583 { 1578 {
1584 xfs_buf_cancel_t *bcp; 1579 xfs_buf_cancel_t *bcp;
1585 xfs_buf_cancel_t *nextp; 1580 xfs_buf_cancel_t *nextp;
1586 xfs_buf_cancel_t *prevp; 1581 xfs_buf_cancel_t *prevp;
1587 xfs_buf_cancel_t **bucket; 1582 xfs_buf_cancel_t **bucket;
1588 xfs_daddr_t blkno = 0; 1583 xfs_daddr_t blkno = 0;
1589 uint len = 0; 1584 uint len = 0;
1590 ushort flags = 0; 1585 ushort flags = 0;
1591 1586
1592 switch (buf_f->blf_type) { 1587 switch (buf_f->blf_type) {
1593 case XFS_LI_BUF: 1588 case XFS_LI_BUF:
1594 blkno = buf_f->blf_blkno; 1589 blkno = buf_f->blf_blkno;
1595 len = buf_f->blf_len; 1590 len = buf_f->blf_len;
1596 flags = buf_f->blf_flags; 1591 flags = buf_f->blf_flags;
1597 break; 1592 break;
1598 } 1593 }
1599 1594
1600 /* 1595 /*
1601 * If this isn't a cancel buffer item, then just return. 1596 * If this isn't a cancel buffer item, then just return.
1602 */ 1597 */
1603 if (!(flags & XFS_BLI_CANCEL)) 1598 if (!(flags & XFS_BLI_CANCEL))
1604 return; 1599 return;
1605 1600
1606 /* 1601 /*
1607 * Insert an xfs_buf_cancel record into the hash table of 1602 * Insert an xfs_buf_cancel record into the hash table of
1608 * cancel records. If there is already an identical record, bump 1603 * cancel records. If there is already an identical record, bump
1609 * its reference count. 1604 * its reference count.
1610 */ 1605 */
1611 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % 1606 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1612 XLOG_BC_TABLE_SIZE]; 1607 XLOG_BC_TABLE_SIZE];
1613 /* 1608 /*
1614 * If the hash bucket is empty then just insert a new record into 1609 * If the hash bucket is empty then just insert a new record into
1615 * the bucket. 1610 * the bucket.
1616 */ 1611 */
1617 if (*bucket == NULL) { 1612 if (*bucket == NULL) {
1618 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1613 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1619 KM_SLEEP); 1614 KM_SLEEP);
1620 bcp->bc_blkno = blkno; 1615 bcp->bc_blkno = blkno;
1621 bcp->bc_len = len; 1616 bcp->bc_len = len;
1622 bcp->bc_refcount = 1; 1617 bcp->bc_refcount = 1;
1623 bcp->bc_next = NULL; 1618 bcp->bc_next = NULL;
1624 *bucket = bcp; 1619 *bucket = bcp;
1625 return; 1620 return;
1626 } 1621 }
1627 1622
1628 /* 1623 /*
1629 * The hash bucket is not empty, so search for duplicates of our 1624 * The hash bucket is not empty, so search for duplicates of our
1630 * record. If we find one then just bump its refcount. If not 1625 * record. If we find one then just bump its refcount. If not
1631 * then add us at the end of the list. 1626 * then add us at the end of the list.
1632 */ 1627 */
1633 prevp = NULL; 1628 prevp = NULL;
1634 nextp = *bucket; 1629 nextp = *bucket;
1635 while (nextp != NULL) { 1630 while (nextp != NULL) {
1636 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1631 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1637 nextp->bc_refcount++; 1632 nextp->bc_refcount++;
1638 return; 1633 return;
1639 } 1634 }
1640 prevp = nextp; 1635 prevp = nextp;
1641 nextp = nextp->bc_next; 1636 nextp = nextp->bc_next;
1642 } 1637 }
1643 ASSERT(prevp != NULL); 1638 ASSERT(prevp != NULL);
1644 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1639 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1645 KM_SLEEP); 1640 KM_SLEEP);
1646 bcp->bc_blkno = blkno; 1641 bcp->bc_blkno = blkno;
1647 bcp->bc_len = len; 1642 bcp->bc_len = len;
1648 bcp->bc_refcount = 1; 1643 bcp->bc_refcount = 1;
1649 bcp->bc_next = NULL; 1644 bcp->bc_next = NULL;
1650 prevp->bc_next = bcp; 1645 prevp->bc_next = bcp;
1651 } 1646 }
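
The pass-1 bookkeeping above boils down to: hash the cancelled buffer's start block into a bucket of the cancel table, then either append a new xfs_buf_cancel record or bump bc_refcount when the same (blkno, len) pair is cancelled again. A stand-alone sketch of the bucket arithmetic, assuming the table size of 64 that xfs_log_recover.h defines for XLOG_BC_TABLE_SIZE (the constant's value is an assumption here, not something shown in this diff):

	/* Illustrative sketch; ASSUMED_XLOG_BC_TABLE_SIZE is an assumed value. */
	#include <stdint.h>
	#include <stdio.h>

	#define ASSUMED_XLOG_BC_TABLE_SIZE	64

	int main(void)
	{
		uint64_t	blkno = 1000;

		/* mirrors (__uint64_t)blkno % XLOG_BC_TABLE_SIZE in the function above */
		printf("cancel record for block %llu lands in bucket %llu\n",
		       (unsigned long long)blkno,
		       (unsigned long long)(blkno % ASSUMED_XLOG_BC_TABLE_SIZE));	/* bucket 40 */
		return 0;
	}

A second XFS_BLI_CANCEL record for the same block and length hashes to the same bucket and only increments bc_refcount, which is what lets pass 2 keep the entry alive until its last occurrence in the log.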
1652 1647
1653 /* 1648 /*
1654 * Check to see whether the buffer being recovered has a corresponding 1649 * Check to see whether the buffer being recovered has a corresponding
1655 * entry in the buffer cancel record table. If it does then return 1 1650 * entry in the buffer cancel record table. If it does then return 1
1656 * so that it will be cancelled, otherwise return 0. If the buffer is 1651 * so that it will be cancelled, otherwise return 0. If the buffer is
1657 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1652 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1658 * the refcount on the entry in the table and remove it from the table 1653 * the refcount on the entry in the table and remove it from the table
1659 * if this is the last reference. 1654 * if this is the last reference.
1660 * 1655 *
1661 * We remove the cancel record from the table when we encounter its 1656 * We remove the cancel record from the table when we encounter its
1662 * last occurrence in the log so that if the same buffer is re-used 1657 * last occurrence in the log so that if the same buffer is re-used
1663 * again after its last cancellation we actually replay the changes 1658 * again after its last cancellation we actually replay the changes
1664 * made at that point. 1659 * made at that point.
1665 */ 1660 */
1666 STATIC int 1661 STATIC int
1667 xlog_check_buffer_cancelled( 1662 xlog_check_buffer_cancelled(
1668 xlog_t *log, 1663 xlog_t *log,
1669 xfs_daddr_t blkno, 1664 xfs_daddr_t blkno,
1670 uint len, 1665 uint len,
1671 ushort flags) 1666 ushort flags)
1672 { 1667 {
1673 xfs_buf_cancel_t *bcp; 1668 xfs_buf_cancel_t *bcp;
1674 xfs_buf_cancel_t *prevp; 1669 xfs_buf_cancel_t *prevp;
1675 xfs_buf_cancel_t **bucket; 1670 xfs_buf_cancel_t **bucket;
1676 1671
1677 if (log->l_buf_cancel_table == NULL) { 1672 if (log->l_buf_cancel_table == NULL) {
1678 /* 1673 /*
1679 * There is nothing in the table built in pass one, 1674 * There is nothing in the table built in pass one,
1680 * so this buffer must not be cancelled. 1675 * so this buffer must not be cancelled.
1681 */ 1676 */
1682 ASSERT(!(flags & XFS_BLI_CANCEL)); 1677 ASSERT(!(flags & XFS_BLI_CANCEL));
1683 return 0; 1678 return 0;
1684 } 1679 }
1685 1680
1686 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % 1681 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1687 XLOG_BC_TABLE_SIZE]; 1682 XLOG_BC_TABLE_SIZE];
1688 bcp = *bucket; 1683 bcp = *bucket;
1689 if (bcp == NULL) { 1684 if (bcp == NULL) {
1690 /* 1685 /*
1691 * There is no corresponding entry in the table built 1686 * There is no corresponding entry in the table built
1692 * in pass one, so this buffer has not been cancelled. 1687 * in pass one, so this buffer has not been cancelled.
1693 */ 1688 */
1694 ASSERT(!(flags & XFS_BLI_CANCEL)); 1689 ASSERT(!(flags & XFS_BLI_CANCEL));
1695 return 0; 1690 return 0;
1696 } 1691 }
1697 1692
1698 /* 1693 /*
1699 * Search for an entry in the buffer cancel table that 1694 * Search for an entry in the buffer cancel table that
1700 * matches our buffer. 1695 * matches our buffer.
1701 */ 1696 */
1702 prevp = NULL; 1697 prevp = NULL;
1703 while (bcp != NULL) { 1698 while (bcp != NULL) {
1704 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1699 if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1705 /* 1700 /*
1706 * We've got a match, so return 1 so that the 1701 * We've got a match, so return 1 so that the
1707 * recovery of this buffer is cancelled. 1702 * recovery of this buffer is cancelled.
1708 * If this buffer is actually a buffer cancel 1703 * If this buffer is actually a buffer cancel
1709 * log item, then decrement the refcount on the 1704 * log item, then decrement the refcount on the
1710 * one in the table and remove it if this is the 1705 * one in the table and remove it if this is the
1711 * last reference. 1706 * last reference.
1712 */ 1707 */
1713 if (flags & XFS_BLI_CANCEL) { 1708 if (flags & XFS_BLI_CANCEL) {
1714 bcp->bc_refcount--; 1709 bcp->bc_refcount--;
1715 if (bcp->bc_refcount == 0) { 1710 if (bcp->bc_refcount == 0) {
1716 if (prevp == NULL) { 1711 if (prevp == NULL) {
1717 *bucket = bcp->bc_next; 1712 *bucket = bcp->bc_next;
1718 } else { 1713 } else {
1719 prevp->bc_next = bcp->bc_next; 1714 prevp->bc_next = bcp->bc_next;
1720 } 1715 }
1721 kmem_free(bcp); 1716 kmem_free(bcp);
1722 } 1717 }
1723 } 1718 }
1724 return 1; 1719 return 1;
1725 } 1720 }
1726 prevp = bcp; 1721 prevp = bcp;
1727 bcp = bcp->bc_next; 1722 bcp = bcp->bc_next;
1728 } 1723 }
1729 /* 1724 /*
1730 * We didn't find a corresponding entry in the table, so 1725 * We didn't find a corresponding entry in the table, so
1731 * return 0 so that the buffer is NOT cancelled. 1726 * return 0 so that the buffer is NOT cancelled.
1732 */ 1727 */
1733 ASSERT(!(flags & XFS_BLI_CANCEL)); 1728 ASSERT(!(flags & XFS_BLI_CANCEL));
1734 return 0; 1729 return 0;
1735 } 1730 }
1736 1731
1737 STATIC int 1732 STATIC int
1738 xlog_recover_do_buffer_pass2( 1733 xlog_recover_do_buffer_pass2(
1739 xlog_t *log, 1734 xlog_t *log,
1740 xfs_buf_log_format_t *buf_f) 1735 xfs_buf_log_format_t *buf_f)
1741 { 1736 {
1742 xfs_daddr_t blkno = 0; 1737 xfs_daddr_t blkno = 0;
1743 ushort flags = 0; 1738 ushort flags = 0;
1744 uint len = 0; 1739 uint len = 0;
1745 1740
1746 switch (buf_f->blf_type) { 1741 switch (buf_f->blf_type) {
1747 case XFS_LI_BUF: 1742 case XFS_LI_BUF:
1748 blkno = buf_f->blf_blkno; 1743 blkno = buf_f->blf_blkno;
1749 flags = buf_f->blf_flags; 1744 flags = buf_f->blf_flags;
1750 len = buf_f->blf_len; 1745 len = buf_f->blf_len;
1751 break; 1746 break;
1752 } 1747 }
1753 1748
1754 return xlog_check_buffer_cancelled(log, blkno, len, flags); 1749 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1755 } 1750 }
1756 1751
1757 /* 1752 /*
1758 * Perform recovery for a buffer full of inodes. In these buffers, 1753 * Perform recovery for a buffer full of inodes. In these buffers,
1759 * the only data which should be recovered is that which corresponds 1754 * the only data which should be recovered is that which corresponds
1760 * to the di_next_unlinked pointers in the on disk inode structures. 1755 * to the di_next_unlinked pointers in the on disk inode structures.
1761 * The rest of the data for the inodes is always logged through the 1756 * The rest of the data for the inodes is always logged through the
1762 * inodes themselves rather than the inode buffer and is recovered 1757 * inodes themselves rather than the inode buffer and is recovered
1763 * in xlog_recover_do_inode_trans(). 1758 * in xlog_recover_do_inode_trans().
1764 * 1759 *
1765 * The only time when buffers full of inodes are fully recovered is 1760 * The only time when buffers full of inodes are fully recovered is
1766 * when the buffer is full of newly allocated inodes. In this case 1761 * when the buffer is full of newly allocated inodes. In this case
1767 * the buffer will not be marked as an inode buffer and so will be 1762 * the buffer will not be marked as an inode buffer and so will be
1768 * sent to xlog_recover_do_reg_buffer() below during recovery. 1763 * sent to xlog_recover_do_reg_buffer() below during recovery.
1769 */ 1764 */
1770 STATIC int 1765 STATIC int
1771 xlog_recover_do_inode_buffer( 1766 xlog_recover_do_inode_buffer(
1772 xfs_mount_t *mp, 1767 xfs_mount_t *mp,
1773 xlog_recover_item_t *item, 1768 xlog_recover_item_t *item,
1774 xfs_buf_t *bp, 1769 xfs_buf_t *bp,
1775 xfs_buf_log_format_t *buf_f) 1770 xfs_buf_log_format_t *buf_f)
1776 { 1771 {
1777 int i; 1772 int i;
1778 int item_index; 1773 int item_index;
1779 int bit; 1774 int bit;
1780 int nbits; 1775 int nbits;
1781 int reg_buf_offset; 1776 int reg_buf_offset;
1782 int reg_buf_bytes; 1777 int reg_buf_bytes;
1783 int next_unlinked_offset; 1778 int next_unlinked_offset;
1784 int inodes_per_buf; 1779 int inodes_per_buf;
1785 xfs_agino_t *logged_nextp; 1780 xfs_agino_t *logged_nextp;
1786 xfs_agino_t *buffer_nextp; 1781 xfs_agino_t *buffer_nextp;
1787 unsigned int *data_map = NULL; 1782 unsigned int *data_map = NULL;
1788 unsigned int map_size = 0; 1783 unsigned int map_size = 0;
1789 1784
1790 switch (buf_f->blf_type) { 1785 switch (buf_f->blf_type) {
1791 case XFS_LI_BUF: 1786 case XFS_LI_BUF:
1792 data_map = buf_f->blf_data_map; 1787 data_map = buf_f->blf_data_map;
1793 map_size = buf_f->blf_map_size; 1788 map_size = buf_f->blf_map_size;
1794 break; 1789 break;
1795 } 1790 }
1796 /* 1791 /*
1797 * Set the variables corresponding to the current region to 1792 * Set the variables corresponding to the current region to
1798 * 0 so that we'll initialize them on the first pass through 1793 * 0 so that we'll initialize them on the first pass through
1799 * the loop. 1794 * the loop.
1800 */ 1795 */
1801 reg_buf_offset = 0; 1796 reg_buf_offset = 0;
1802 reg_buf_bytes = 0; 1797 reg_buf_bytes = 0;
1803 bit = 0; 1798 bit = 0;
1804 nbits = 0; 1799 nbits = 0;
1805 item_index = 0; 1800 item_index = 0;
1806 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1801 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1807 for (i = 0; i < inodes_per_buf; i++) { 1802 for (i = 0; i < inodes_per_buf; i++) {
1808 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1803 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1809 offsetof(xfs_dinode_t, di_next_unlinked); 1804 offsetof(xfs_dinode_t, di_next_unlinked);
1810 1805
1811 while (next_unlinked_offset >= 1806 while (next_unlinked_offset >=
1812 (reg_buf_offset + reg_buf_bytes)) { 1807 (reg_buf_offset + reg_buf_bytes)) {
1813 /* 1808 /*
1814 * The next di_next_unlinked field is beyond 1809 * The next di_next_unlinked field is beyond
1815 * the current logged region. Find the next 1810 * the current logged region. Find the next
1816 * logged region that contains or is beyond 1811 * logged region that contains or is beyond
1817 * the current di_next_unlinked field. 1812 * the current di_next_unlinked field.
1818 */ 1813 */
1819 bit += nbits; 1814 bit += nbits;
1820 bit = xfs_next_bit(data_map, map_size, bit); 1815 bit = xfs_next_bit(data_map, map_size, bit);
1821 1816
1822 /* 1817 /*
1823 * If there are no more logged regions in the 1818 * If there are no more logged regions in the
1824 * buffer, then we're done. 1819 * buffer, then we're done.
1825 */ 1820 */
1826 if (bit == -1) { 1821 if (bit == -1) {
1827 return 0; 1822 return 0;
1828 } 1823 }
1829 1824
1830 nbits = xfs_contig_bits(data_map, map_size, 1825 nbits = xfs_contig_bits(data_map, map_size,
1831 bit); 1826 bit);
1832 ASSERT(nbits > 0); 1827 ASSERT(nbits > 0);
1833 reg_buf_offset = bit << XFS_BLI_SHIFT; 1828 reg_buf_offset = bit << XFS_BLI_SHIFT;
1834 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1829 reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1835 item_index++; 1830 item_index++;
1836 } 1831 }
1837 1832
1838 /* 1833 /*
1839 * If the current logged region starts after the current 1834 * If the current logged region starts after the current
1840 * di_next_unlinked field, then move on to the next 1835 * di_next_unlinked field, then move on to the next
1841 * di_next_unlinked field. 1836 * di_next_unlinked field.
1842 */ 1837 */
1843 if (next_unlinked_offset < reg_buf_offset) { 1838 if (next_unlinked_offset < reg_buf_offset) {
1844 continue; 1839 continue;
1845 } 1840 }
1846 1841
1847 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1842 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1848 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1843 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1849 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1844 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1850 1845
1851 /* 1846 /*
1852 * The current logged region contains a copy of the 1847 * The current logged region contains a copy of the
1853 * current di_next_unlinked field. Extract its value 1848 * current di_next_unlinked field. Extract its value
1854 * and copy it to the buffer copy. 1849 * and copy it to the buffer copy.
1855 */ 1850 */
1856 logged_nextp = (xfs_agino_t *) 1851 logged_nextp = (xfs_agino_t *)
1857 ((char *)(item->ri_buf[item_index].i_addr) + 1852 ((char *)(item->ri_buf[item_index].i_addr) +
1858 (next_unlinked_offset - reg_buf_offset)); 1853 (next_unlinked_offset - reg_buf_offset));
1859 if (unlikely(*logged_nextp == 0)) { 1854 if (unlikely(*logged_nextp == 0)) {
1860 xfs_fs_cmn_err(CE_ALERT, mp, 1855 xfs_fs_cmn_err(CE_ALERT, mp,
1861 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1856 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
1862 item, bp); 1857 item, bp);
1863 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1858 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1864 XFS_ERRLEVEL_LOW, mp); 1859 XFS_ERRLEVEL_LOW, mp);
1865 return XFS_ERROR(EFSCORRUPTED); 1860 return XFS_ERROR(EFSCORRUPTED);
1866 } 1861 }
1867 1862
1868 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, 1863 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1869 next_unlinked_offset); 1864 next_unlinked_offset);
1870 *buffer_nextp = *logged_nextp; 1865 *buffer_nextp = *logged_nextp;
1871 } 1866 }
1872 1867
1873 return 0; 1868 return 0;
1874 } 1869 }
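
The per-inode walk above only needs one field out of each on-disk inode: for inode i in the buffer, the bytes of interest start at i * sb_inodesize plus offsetof(xfs_dinode_t, di_next_unlinked), and the inner while loop advances bit/nbits until a logged region covers that offset. A small sketch of the offset arithmetic, with a 256-byte inode size picked purely for illustration (sb_inodesize is per-filesystem):

	/* Illustrative sketch; the inode size is an assumed example value. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int	inodesize = 256;	/* assumed sb_inodesize */
		int		i;

		for (i = 0; i < 4; i++)
			printf("inode %d starts at buffer offset %u; the real code adds "
			       "offsetof(xfs_dinode_t, di_next_unlinked) on top\n",
			       i, i * inodesize);
		return 0;
	}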
1875 1870
1876 /* 1871 /*
1877 * Perform a 'normal' buffer recovery. Each logged region of the 1872 * Perform a 'normal' buffer recovery. Each logged region of the
1878 * buffer should be copied over the corresponding region in the 1873 * buffer should be copied over the corresponding region in the
1879 * given buffer. The bitmap in the buf log format structure indicates 1874 * given buffer. The bitmap in the buf log format structure indicates
1880 * where to place the logged data. 1875 * where to place the logged data.
1881 */ 1876 */
1882 /*ARGSUSED*/ 1877 /*ARGSUSED*/
1883 STATIC void 1878 STATIC void
1884 xlog_recover_do_reg_buffer( 1879 xlog_recover_do_reg_buffer(
1885 xlog_recover_item_t *item, 1880 xlog_recover_item_t *item,
1886 xfs_buf_t *bp, 1881 xfs_buf_t *bp,
1887 xfs_buf_log_format_t *buf_f) 1882 xfs_buf_log_format_t *buf_f)
1888 { 1883 {
1889 int i; 1884 int i;
1890 int bit; 1885 int bit;
1891 int nbits; 1886 int nbits;
1892 unsigned int *data_map = NULL; 1887 unsigned int *data_map = NULL;
1893 unsigned int map_size = 0; 1888 unsigned int map_size = 0;
1894 int error; 1889 int error;
1895 1890
1896 switch (buf_f->blf_type) { 1891 switch (buf_f->blf_type) {
1897 case XFS_LI_BUF: 1892 case XFS_LI_BUF:
1898 data_map = buf_f->blf_data_map; 1893 data_map = buf_f->blf_data_map;
1899 map_size = buf_f->blf_map_size; 1894 map_size = buf_f->blf_map_size;
1900 break; 1895 break;
1901 } 1896 }
1902 bit = 0; 1897 bit = 0;
1903 i = 1; /* 0 is the buf format structure */ 1898 i = 1; /* 0 is the buf format structure */
1904 while (1) { 1899 while (1) {
1905 bit = xfs_next_bit(data_map, map_size, bit); 1900 bit = xfs_next_bit(data_map, map_size, bit);
1906 if (bit == -1) 1901 if (bit == -1)
1907 break; 1902 break;
1908 nbits = xfs_contig_bits(data_map, map_size, bit); 1903 nbits = xfs_contig_bits(data_map, map_size, bit);
1909 ASSERT(nbits > 0); 1904 ASSERT(nbits > 0);
1910 ASSERT(item->ri_buf[i].i_addr != NULL); 1905 ASSERT(item->ri_buf[i].i_addr != NULL);
1911 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1906 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1912 ASSERT(XFS_BUF_COUNT(bp) >= 1907 ASSERT(XFS_BUF_COUNT(bp) >=
1913 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1908 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1914 1909
1915 /* 1910 /*
1916 * Do a sanity check if this is a dquot buffer. Just checking 1911 * Do a sanity check if this is a dquot buffer. Just checking
1917 * the first dquot in the buffer should do. XXX This is 1912 * the first dquot in the buffer should do. XXX This is
1918 * probably a good thing to do for other buf types also. 1913 * probably a good thing to do for other buf types also.
1919 */ 1914 */
1920 error = 0; 1915 error = 0;
1921 if (buf_f->blf_flags & 1916 if (buf_f->blf_flags &
1922 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1917 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1923 error = xfs_qm_dqcheck((xfs_disk_dquot_t *) 1918 error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1924 item->ri_buf[i].i_addr, 1919 item->ri_buf[i].i_addr,
1925 -1, 0, XFS_QMOPT_DOWARN, 1920 -1, 0, XFS_QMOPT_DOWARN,
1926 "dquot_buf_recover"); 1921 "dquot_buf_recover");
1927 } 1922 }
1928 if (!error) 1923 if (!error)
1929 memcpy(xfs_buf_offset(bp, 1924 memcpy(xfs_buf_offset(bp,
1930 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1925 (uint)bit << XFS_BLI_SHIFT), /* dest */
1931 item->ri_buf[i].i_addr, /* source */ 1926 item->ri_buf[i].i_addr, /* source */
1932 nbits<<XFS_BLI_SHIFT); /* length */ 1927 nbits<<XFS_BLI_SHIFT); /* length */
1933 i++; 1928 i++;
1934 bit += nbits; 1929 bit += nbits;
1935 } 1930 }
1936 1931
1937 /* Shouldn't be any more regions */ 1932 /* Shouldn't be any more regions */
1938 ASSERT(i == item->ri_total); 1933 ASSERT(i == item->ri_total);
1939 } 1934 }
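
The bit positions that xfs_next_bit() and xfs_contig_bits() hand back are in buffer-log chunks, not bytes; the shifts above turn a run of set bits into a byte offset and length inside the buffer. Assuming the 128-byte chunk size of this era (XFS_BLI_CHUNK 128, XFS_BLI_SHIFT 7 in xfs_buf_item.h; values assumed, not quoted from this diff), a run of three bits starting at bit 2 copies 384 bytes to offset 256:

	/* Illustrative sketch; the shift value is assumed, see above. */
	#include <stdio.h>

	#define ASSUMED_XFS_BLI_SHIFT	7	/* 128-byte chunks */

	int main(void)
	{
		unsigned int	bit = 2, nbits = 3;

		printf("copy %u bytes to buffer offset %u\n",
		       nbits << ASSUMED_XFS_BLI_SHIFT,	/* 384 */
		       bit << ASSUMED_XFS_BLI_SHIFT);	/* 256 */
		return 0;
	}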
1940 1935
1941 /* 1936 /*
1942 * Do some primitive error checking on ondisk dquot data structures. 1937 * Do some primitive error checking on ondisk dquot data structures.
1943 */ 1938 */
1944 int 1939 int
1945 xfs_qm_dqcheck( 1940 xfs_qm_dqcheck(
1946 xfs_disk_dquot_t *ddq, 1941 xfs_disk_dquot_t *ddq,
1947 xfs_dqid_t id, 1942 xfs_dqid_t id,
1948 uint type, /* used only when IO_dorepair is true */ 1943 uint type, /* used only when IO_dorepair is true */
1949 uint flags, 1944 uint flags,
1950 char *str) 1945 char *str)
1951 { 1946 {
1952 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; 1947 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
1953 int errs = 0; 1948 int errs = 0;
1954 1949
1955 /* 1950 /*
1956 * We can encounter an uninitialized dquot buffer for 2 reasons: 1951 * We can encounter an uninitialized dquot buffer for 2 reasons:
1957 * 1. If we crash while deleting the quotainode(s), and those blks got 1952 * 1. If we crash while deleting the quotainode(s), and those blks got
1958 * used for user data. This is because we take the path of regular 1953 * used for user data. This is because we take the path of regular
1959 * file deletion; however, the size field of quotainodes is never 1954 * file deletion; however, the size field of quotainodes is never
1960 * updated, so all the tricks that we play in itruncate_finish 1955 * updated, so all the tricks that we play in itruncate_finish
1961 * don't quite matter. 1956 * don't quite matter.
1962 * 1957 *
1963 * 2. We don't play the quota buffers when there's a quotaoff logitem. 1958 * 2. We don't play the quota buffers when there's a quotaoff logitem.
1964 * But the allocation will be replayed so we'll end up with an 1959 * But the allocation will be replayed so we'll end up with an
1965 * uninitialized quota block. 1960 * uninitialized quota block.
1966 * 1961 *
1967 * This is all fine; things are still consistent, and we haven't lost 1962 * This is all fine; things are still consistent, and we haven't lost
1968 * any quota information. Just don't complain about bad dquot blks. 1963 * any quota information. Just don't complain about bad dquot blks.
1969 */ 1964 */
1970 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1965 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1971 if (flags & XFS_QMOPT_DOWARN) 1966 if (flags & XFS_QMOPT_DOWARN)
1972 cmn_err(CE_ALERT, 1967 cmn_err(CE_ALERT,
1973 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1968 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1974 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1969 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1975 errs++; 1970 errs++;
1976 } 1971 }
1977 if (ddq->d_version != XFS_DQUOT_VERSION) { 1972 if (ddq->d_version != XFS_DQUOT_VERSION) {
1978 if (flags & XFS_QMOPT_DOWARN) 1973 if (flags & XFS_QMOPT_DOWARN)
1979 cmn_err(CE_ALERT, 1974 cmn_err(CE_ALERT,
1980 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1975 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1981 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1976 str, id, ddq->d_version, XFS_DQUOT_VERSION);
1982 errs++; 1977 errs++;
1983 } 1978 }
1984 1979
1985 if (ddq->d_flags != XFS_DQ_USER && 1980 if (ddq->d_flags != XFS_DQ_USER &&
1986 ddq->d_flags != XFS_DQ_PROJ && 1981 ddq->d_flags != XFS_DQ_PROJ &&
1987 ddq->d_flags != XFS_DQ_GROUP) { 1982 ddq->d_flags != XFS_DQ_GROUP) {
1988 if (flags & XFS_QMOPT_DOWARN) 1983 if (flags & XFS_QMOPT_DOWARN)
1989 cmn_err(CE_ALERT, 1984 cmn_err(CE_ALERT,
1990 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1985 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1991 str, id, ddq->d_flags); 1986 str, id, ddq->d_flags);
1992 errs++; 1987 errs++;
1993 } 1988 }
1994 1989
1995 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1990 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1996 if (flags & XFS_QMOPT_DOWARN) 1991 if (flags & XFS_QMOPT_DOWARN)
1997 cmn_err(CE_ALERT, 1992 cmn_err(CE_ALERT,
1998 "%s : ondisk-dquot 0x%p, ID mismatch: " 1993 "%s : ondisk-dquot 0x%p, ID mismatch: "
1999 "0x%x expected, found id 0x%x", 1994 "0x%x expected, found id 0x%x",
2000 str, ddq, id, be32_to_cpu(ddq->d_id)); 1995 str, ddq, id, be32_to_cpu(ddq->d_id));
2001 errs++; 1996 errs++;
2002 } 1997 }
2003 1998
2004 if (!errs && ddq->d_id) { 1999 if (!errs && ddq->d_id) {
2005 if (ddq->d_blk_softlimit && 2000 if (ddq->d_blk_softlimit &&
2006 be64_to_cpu(ddq->d_bcount) >= 2001 be64_to_cpu(ddq->d_bcount) >=
2007 be64_to_cpu(ddq->d_blk_softlimit)) { 2002 be64_to_cpu(ddq->d_blk_softlimit)) {
2008 if (!ddq->d_btimer) { 2003 if (!ddq->d_btimer) {
2009 if (flags & XFS_QMOPT_DOWARN) 2004 if (flags & XFS_QMOPT_DOWARN)
2010 cmn_err(CE_ALERT, 2005 cmn_err(CE_ALERT,
2011 "%s : Dquot ID 0x%x (0x%p) " 2006 "%s : Dquot ID 0x%x (0x%p) "
2012 "BLK TIMER NOT STARTED", 2007 "BLK TIMER NOT STARTED",
2013 str, (int)be32_to_cpu(ddq->d_id), ddq); 2008 str, (int)be32_to_cpu(ddq->d_id), ddq);
2014 errs++; 2009 errs++;
2015 } 2010 }
2016 } 2011 }
2017 if (ddq->d_ino_softlimit && 2012 if (ddq->d_ino_softlimit &&
2018 be64_to_cpu(ddq->d_icount) >= 2013 be64_to_cpu(ddq->d_icount) >=
2019 be64_to_cpu(ddq->d_ino_softlimit)) { 2014 be64_to_cpu(ddq->d_ino_softlimit)) {
2020 if (!ddq->d_itimer) { 2015 if (!ddq->d_itimer) {
2021 if (flags & XFS_QMOPT_DOWARN) 2016 if (flags & XFS_QMOPT_DOWARN)
2022 cmn_err(CE_ALERT, 2017 cmn_err(CE_ALERT,
2023 "%s : Dquot ID 0x%x (0x%p) " 2018 "%s : Dquot ID 0x%x (0x%p) "
2024 "INODE TIMER NOT STARTED", 2019 "INODE TIMER NOT STARTED",
2025 str, (int)be32_to_cpu(ddq->d_id), ddq); 2020 str, (int)be32_to_cpu(ddq->d_id), ddq);
2026 errs++; 2021 errs++;
2027 } 2022 }
2028 } 2023 }
2029 if (ddq->d_rtb_softlimit && 2024 if (ddq->d_rtb_softlimit &&
2030 be64_to_cpu(ddq->d_rtbcount) >= 2025 be64_to_cpu(ddq->d_rtbcount) >=
2031 be64_to_cpu(ddq->d_rtb_softlimit)) { 2026 be64_to_cpu(ddq->d_rtb_softlimit)) {
2032 if (!ddq->d_rtbtimer) { 2027 if (!ddq->d_rtbtimer) {
2033 if (flags & XFS_QMOPT_DOWARN) 2028 if (flags & XFS_QMOPT_DOWARN)
2034 cmn_err(CE_ALERT, 2029 cmn_err(CE_ALERT,
2035 "%s : Dquot ID 0x%x (0x%p) " 2030 "%s : Dquot ID 0x%x (0x%p) "
2036 "RTBLK TIMER NOT STARTED", 2031 "RTBLK TIMER NOT STARTED",
2037 str, (int)be32_to_cpu(ddq->d_id), ddq); 2032 str, (int)be32_to_cpu(ddq->d_id), ddq);
2038 errs++; 2033 errs++;
2039 } 2034 }
2040 } 2035 }
2041 } 2036 }
2042 2037
2043 if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) 2038 if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2044 return errs; 2039 return errs;
2045 2040
2046 if (flags & XFS_QMOPT_DOWARN) 2041 if (flags & XFS_QMOPT_DOWARN)
2047 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2042 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2048 2043
2049 /* 2044 /*
2050 * Typically, a repair is only requested by quotacheck. 2045 * Typically, a repair is only requested by quotacheck.
2051 */ 2046 */
2052 ASSERT(id != -1); 2047 ASSERT(id != -1);
2053 ASSERT(flags & XFS_QMOPT_DQREPAIR); 2048 ASSERT(flags & XFS_QMOPT_DQREPAIR);
2054 memset(d, 0, sizeof(xfs_dqblk_t)); 2049 memset(d, 0, sizeof(xfs_dqblk_t));
2055 2050
2056 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 2051 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2057 d->dd_diskdq.d_version = XFS_DQUOT_VERSION; 2052 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2058 d->dd_diskdq.d_flags = type; 2053 d->dd_diskdq.d_flags = type;
2059 d->dd_diskdq.d_id = cpu_to_be32(id); 2054 d->dd_diskdq.d_id = cpu_to_be32(id);
2060 2055
2061 return errs; 2056 return errs;
2062 } 2057 }
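
In the recovery path this routine is used purely as a sanity check: xlog_recover_do_reg_buffer() above calls it with id == -1 (so the ID-match test is skipped), type == 0, and only XFS_QMOPT_DOWARN set, so it warns and counts problems but never reaches the XFS_QMOPT_DQREPAIR re-initialisation at the end. The call as it appears in that function, reflowed here for reference:

	error = xfs_qm_dqcheck((xfs_disk_dquot_t *)item->ri_buf[i].i_addr,
			       -1, 0, XFS_QMOPT_DOWARN,
			       "dquot_buf_recover");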
2063 2058
2064 /* 2059 /*
2065 * Perform a dquot buffer recovery. 2060 * Perform a dquot buffer recovery.
2066 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type 2061 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2067 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2062 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2068 * Else, treat it as a regular buffer and do recovery. 2063 * Else, treat it as a regular buffer and do recovery.
2069 */ 2064 */
2070 STATIC void 2065 STATIC void
2071 xlog_recover_do_dquot_buffer( 2066 xlog_recover_do_dquot_buffer(
2072 xfs_mount_t *mp, 2067 xfs_mount_t *mp,
2073 xlog_t *log, 2068 xlog_t *log,
2074 xlog_recover_item_t *item, 2069 xlog_recover_item_t *item,
2075 xfs_buf_t *bp, 2070 xfs_buf_t *bp,
2076 xfs_buf_log_format_t *buf_f) 2071 xfs_buf_log_format_t *buf_f)
2077 { 2072 {
2078 uint type; 2073 uint type;
2079 2074
2080 /* 2075 /*
2081 * Filesystems are required to send in quota flags at mount time. 2076 * Filesystems are required to send in quota flags at mount time.
2082 */ 2077 */
2083 if (mp->m_qflags == 0) { 2078 if (mp->m_qflags == 0) {
2084 return; 2079 return;
2085 } 2080 }
2086 2081
2087 type = 0; 2082 type = 0;
2088 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2083 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2089 type |= XFS_DQ_USER; 2084 type |= XFS_DQ_USER;
2090 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2085 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2091 type |= XFS_DQ_PROJ; 2086 type |= XFS_DQ_PROJ;
2092 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2087 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2093 type |= XFS_DQ_GROUP; 2088 type |= XFS_DQ_GROUP;
2094 /* 2089 /*
2095 * This type of quota was turned off, so ignore this buffer 2090 * This type of quota was turned off, so ignore this buffer
2096 */ 2091 */
2097 if (log->l_quotaoffs_flag & type) 2092 if (log->l_quotaoffs_flag & type)
2098 return; 2093 return;
2099 2094
2100 xlog_recover_do_reg_buffer(item, bp, buf_f); 2095 xlog_recover_do_reg_buffer(item, bp, buf_f);
2101 } 2096 }
2102 2097
2103 /* 2098 /*
2104 * This routine replays a modification made to a buffer at runtime. 2099 * This routine replays a modification made to a buffer at runtime.
2105 * There are actually two types of buffer, regular and inode, which 2100 * There are actually two types of buffer, regular and inode, which
2106 * are handled differently. Inode buffers are handled differently 2101 * are handled differently. Inode buffers are handled differently
2107 * in that we only recover a specific set of data from them, namely 2102 * in that we only recover a specific set of data from them, namely
2108 * the inode di_next_unlinked fields. This is because all other inode 2103 * the inode di_next_unlinked fields. This is because all other inode
2109 * data is actually logged via inode records and any data we replay 2104 * data is actually logged via inode records and any data we replay
2110 * here which overlaps that may be stale. 2105 * here which overlaps that may be stale.
2111 * 2106 *
2112 * When meta-data buffers are freed at run time we log a buffer item 2107 * When meta-data buffers are freed at run time we log a buffer item
2113 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2108 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2114 * of the buffer in the log should not be replayed at recovery time. 2109 * of the buffer in the log should not be replayed at recovery time.
2115 * This is so that if the blocks covered by the buffer are reused for 2110 * This is so that if the blocks covered by the buffer are reused for
2116 * file data before we crash we don't end up replaying old, freed 2111 * file data before we crash we don't end up replaying old, freed
2117 * meta-data into a user's file. 2112 * meta-data into a user's file.
2118 * 2113 *
2119 * To handle the cancellation of buffer log items, we make two passes 2114 * To handle the cancellation of buffer log items, we make two passes
2120 * over the log during recovery. During the first we build a table of 2115 * over the log during recovery. During the first we build a table of
2121 * those buffers which have been cancelled, and during the second we 2116 * those buffers which have been cancelled, and during the second we
2122 * only replay those buffers which do not have corresponding cancel 2117 * only replay those buffers which do not have corresponding cancel
2123 * records in the table. See xlog_recover_do_buffer_pass[1,2] above 2118 * records in the table. See xlog_recover_do_buffer_pass[1,2] above
2124 * for more details on the implementation of the table of cancel records. 2119 * for more details on the implementation of the table of cancel records.
2125 */ 2120 */
2126 STATIC int 2121 STATIC int
2127 xlog_recover_do_buffer_trans( 2122 xlog_recover_do_buffer_trans(
2128 xlog_t *log, 2123 xlog_t *log,
2129 xlog_recover_item_t *item, 2124 xlog_recover_item_t *item,
2130 int pass) 2125 int pass)
2131 { 2126 {
2132 xfs_buf_log_format_t *buf_f; 2127 xfs_buf_log_format_t *buf_f;
2133 xfs_mount_t *mp; 2128 xfs_mount_t *mp;
2134 xfs_buf_t *bp; 2129 xfs_buf_t *bp;
2135 int error; 2130 int error;
2136 int cancel; 2131 int cancel;
2137 xfs_daddr_t blkno; 2132 xfs_daddr_t blkno;
2138 int len; 2133 int len;
2139 ushort flags; 2134 ushort flags;
2140 2135
2141 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; 2136 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2142 2137
2143 if (pass == XLOG_RECOVER_PASS1) { 2138 if (pass == XLOG_RECOVER_PASS1) {
2144 /* 2139 /*
2145 * In this pass we're only looking for buf items 2140 * In this pass we're only looking for buf items
2146 * with the XFS_BLI_CANCEL bit set. 2141 * with the XFS_BLI_CANCEL bit set.
2147 */ 2142 */
2148 xlog_recover_do_buffer_pass1(log, buf_f); 2143 xlog_recover_do_buffer_pass1(log, buf_f);
2149 return 0; 2144 return 0;
2150 } else { 2145 } else {
2151 /* 2146 /*
2152 * In this pass we want to recover all the buffers 2147 * In this pass we want to recover all the buffers
2153 * which have not been cancelled and are not 2148 * which have not been cancelled and are not
2154 * cancellation buffers themselves. The routine 2149 * cancellation buffers themselves. The routine
2155 * we call here will tell us whether or not to 2150 * we call here will tell us whether or not to
2156 * continue with the replay of this buffer. 2151 * continue with the replay of this buffer.
2157 */ 2152 */
2158 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2153 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2159 if (cancel) { 2154 if (cancel) {
2160 return 0; 2155 return 0;
2161 } 2156 }
2162 } 2157 }
2163 switch (buf_f->blf_type) { 2158 switch (buf_f->blf_type) {
2164 case XFS_LI_BUF: 2159 case XFS_LI_BUF:
2165 blkno = buf_f->blf_blkno; 2160 blkno = buf_f->blf_blkno;
2166 len = buf_f->blf_len; 2161 len = buf_f->blf_len;
2167 flags = buf_f->blf_flags; 2162 flags = buf_f->blf_flags;
2168 break; 2163 break;
2169 default: 2164 default:
2170 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 2165 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2171 "xfs_log_recover: unknown buffer type 0x%x, logdev %s", 2166 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2172 buf_f->blf_type, log->l_mp->m_logname ? 2167 buf_f->blf_type, log->l_mp->m_logname ?
2173 log->l_mp->m_logname : "internal"); 2168 log->l_mp->m_logname : "internal");
2174 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", 2169 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2175 XFS_ERRLEVEL_LOW, log->l_mp); 2170 XFS_ERRLEVEL_LOW, log->l_mp);
2176 return XFS_ERROR(EFSCORRUPTED); 2171 return XFS_ERROR(EFSCORRUPTED);
2177 } 2172 }
2178 2173
2179 mp = log->l_mp; 2174 mp = log->l_mp;
2180 if (flags & XFS_BLI_INODE_BUF) { 2175 if (flags & XFS_BLI_INODE_BUF) {
2181 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, 2176 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
2182 XFS_BUF_LOCK); 2177 XFS_BUF_LOCK);
2183 } else { 2178 } else {
2184 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); 2179 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
2185 } 2180 }
2186 if (XFS_BUF_ISERROR(bp)) { 2181 if (XFS_BUF_ISERROR(bp)) {
2187 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2182 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2188 bp, blkno); 2183 bp, blkno);
2189 error = XFS_BUF_GETERROR(bp); 2184 error = XFS_BUF_GETERROR(bp);
2190 xfs_buf_relse(bp); 2185 xfs_buf_relse(bp);
2191 return error; 2186 return error;
2192 } 2187 }
2193 2188
2194 error = 0; 2189 error = 0;
2195 if (flags & XFS_BLI_INODE_BUF) { 2190 if (flags & XFS_BLI_INODE_BUF) {
2196 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2191 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2197 } else if (flags & 2192 } else if (flags &
2198 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2193 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2199 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2194 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2200 } else { 2195 } else {
2201 xlog_recover_do_reg_buffer(item, bp, buf_f); 2196 xlog_recover_do_reg_buffer(item, bp, buf_f);
2202 } 2197 }
2203 if (error) 2198 if (error)
2204 return XFS_ERROR(error); 2199 return XFS_ERROR(error);
2205 2200
2206 /* 2201 /*
2207 * Perform delayed write on the buffer. Asynchronous writes will be 2202 * Perform delayed write on the buffer. Asynchronous writes will be
2208 * slower when taking into account all the buffers to be flushed. 2203 * slower when taking into account all the buffers to be flushed.
2209 * 2204 *
2210 * Also make sure that only inode buffers with good sizes stay in 2205 * Also make sure that only inode buffers with good sizes stay in
2211 * the buffer cache. The kernel moves inodes in buffers of 1 block 2206 * the buffer cache. The kernel moves inodes in buffers of 1 block
2212 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode 2207 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2213 * buffers in the log can be a different size if the log was generated 2208 * buffers in the log can be a different size if the log was generated
2214 * by an older kernel using unclustered inode buffers or a newer kernel 2209 * by an older kernel using unclustered inode buffers or a newer kernel
2215 * running with a different inode cluster size. Regardless, if the 2210 * running with a different inode cluster size. Regardless, if the
2216 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) 2211 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2217 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep 2212 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2218 * the buffer out of the buffer cache so that the buffer won't 2213 * the buffer out of the buffer cache so that the buffer won't
2219 * overlap with future reads of those inodes. 2214 * overlap with future reads of those inodes.
2220 */ 2215 */
2221 if (XFS_DINODE_MAGIC == 2216 if (XFS_DINODE_MAGIC ==
2222 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2217 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2223 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, 2218 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2224 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { 2219 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2225 XFS_BUF_STALE(bp); 2220 XFS_BUF_STALE(bp);
2226 error = xfs_bwrite(mp, bp); 2221 error = xfs_bwrite(mp, bp);
2227 } else { 2222 } else {
2228 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2223 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2229 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2224 bp->b_mount = mp;
2230 XFS_BUF_SET_FSPRIVATE(bp, mp);
2231 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2225 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2232 xfs_bdwrite(mp, bp); 2226 xfs_bdwrite(mp, bp);
2233 } 2227 }
2234 2228
2235 return (error); 2229 return (error);
2236 } 2230 }
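
The else branch in the hunk above is where this file picks up the buffer's new typed field: instead of going through the untyped XFS_BUF_SET_FSPRIVATE() accessor, the mount is stored directly in bp->b_mount. Pulled out of the diff columns, the new delayed-write path reads:

	ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
	bp->b_mount = mp;
	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
	xfs_bdwrite(mp, bp);

where the old code asserted on XFS_BUF_FSPRIVATE(bp, void *) and then called XFS_BUF_SET_FSPRIVATE(bp, mp) to stash the same pointer behind a cast.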
2237 2231
2238 STATIC int 2232 STATIC int
2239 xlog_recover_do_inode_trans( 2233 xlog_recover_do_inode_trans(
2240 xlog_t *log, 2234 xlog_t *log,
2241 xlog_recover_item_t *item, 2235 xlog_recover_item_t *item,
2242 int pass) 2236 int pass)
2243 { 2237 {
2244 xfs_inode_log_format_t *in_f; 2238 xfs_inode_log_format_t *in_f;
2245 xfs_mount_t *mp; 2239 xfs_mount_t *mp;
2246 xfs_buf_t *bp; 2240 xfs_buf_t *bp;
2247 xfs_dinode_t *dip; 2241 xfs_dinode_t *dip;
2248 xfs_ino_t ino; 2242 xfs_ino_t ino;
2249 int len; 2243 int len;
2250 xfs_caddr_t src; 2244 xfs_caddr_t src;
2251 xfs_caddr_t dest; 2245 xfs_caddr_t dest;
2252 int error; 2246 int error;
2253 int attr_index; 2247 int attr_index;
2254 uint fields; 2248 uint fields;
2255 xfs_icdinode_t *dicp; 2249 xfs_icdinode_t *dicp;
2256 int need_free = 0; 2250 int need_free = 0;
2257 2251
2258 if (pass == XLOG_RECOVER_PASS1) { 2252 if (pass == XLOG_RECOVER_PASS1) {
2259 return 0; 2253 return 0;
2260 } 2254 }
2261 2255
2262 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2256 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2263 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; 2257 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2264 } else { 2258 } else {
2265 in_f = (xfs_inode_log_format_t *)kmem_alloc( 2259 in_f = (xfs_inode_log_format_t *)kmem_alloc(
2266 sizeof(xfs_inode_log_format_t), KM_SLEEP); 2260 sizeof(xfs_inode_log_format_t), KM_SLEEP);
2267 need_free = 1; 2261 need_free = 1;
2268 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2262 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2269 if (error) 2263 if (error)
2270 goto error; 2264 goto error;
2271 } 2265 }
2272 ino = in_f->ilf_ino; 2266 ino = in_f->ilf_ino;
2273 mp = log->l_mp; 2267 mp = log->l_mp;
2274 2268
2275 /* 2269 /*
2276 * Inode buffers can be freed, look out for it, 2270 * Inode buffers can be freed, look out for it,
2277 * and do not replay the inode. 2271 * and do not replay the inode.
2278 */ 2272 */
2279 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2273 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2280 in_f->ilf_len, 0)) { 2274 in_f->ilf_len, 0)) {
2281 error = 0; 2275 error = 0;
2282 goto error; 2276 goto error;
2283 } 2277 }
2284 2278
2285 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno, 2279 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
2286 in_f->ilf_len, XFS_BUF_LOCK); 2280 in_f->ilf_len, XFS_BUF_LOCK);
2287 if (XFS_BUF_ISERROR(bp)) { 2281 if (XFS_BUF_ISERROR(bp)) {
2288 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2282 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2289 bp, in_f->ilf_blkno); 2283 bp, in_f->ilf_blkno);
2290 error = XFS_BUF_GETERROR(bp); 2284 error = XFS_BUF_GETERROR(bp);
2291 xfs_buf_relse(bp); 2285 xfs_buf_relse(bp);
2292 goto error; 2286 goto error;
2293 } 2287 }
2294 error = 0; 2288 error = 0;
2295 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2289 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2296 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); 2290 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2297 2291
2298 /* 2292 /*
2299 * Make sure the place we're flushing out to really looks 2293 * Make sure the place we're flushing out to really looks
2300 * like an inode! 2294 * like an inode!
2301 */ 2295 */
2302 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2296 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2303 xfs_buf_relse(bp); 2297 xfs_buf_relse(bp);
2304 xfs_fs_cmn_err(CE_ALERT, mp, 2298 xfs_fs_cmn_err(CE_ALERT, mp,
2305 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2299 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2306 dip, bp, ino); 2300 dip, bp, ino);
2307 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2301 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2308 XFS_ERRLEVEL_LOW, mp); 2302 XFS_ERRLEVEL_LOW, mp);
2309 error = EFSCORRUPTED; 2303 error = EFSCORRUPTED;
2310 goto error; 2304 goto error;
2311 } 2305 }
2312 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); 2306 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
2313 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2307 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2314 xfs_buf_relse(bp); 2308 xfs_buf_relse(bp);
2315 xfs_fs_cmn_err(CE_ALERT, mp, 2309 xfs_fs_cmn_err(CE_ALERT, mp,
2316 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2310 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2317 item, ino); 2311 item, ino);
2318 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2312 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2319 XFS_ERRLEVEL_LOW, mp); 2313 XFS_ERRLEVEL_LOW, mp);
2320 error = EFSCORRUPTED; 2314 error = EFSCORRUPTED;
2321 goto error; 2315 goto error;
2322 } 2316 }
2323 2317
2324 /* Skip replay when the on disk inode is newer than the log one */ 2318 /* Skip replay when the on disk inode is newer than the log one */
2325 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 2319 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2326 /* 2320 /*
2327 * Deal with the wrap case, DI_MAX_FLUSH is less 2321 * Deal with the wrap case, DI_MAX_FLUSH is less
2328 * than smaller numbers 2322 * than smaller numbers
2329 */ 2323 */
2330 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && 2324 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2331 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2325 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2332 /* do nothing */ 2326 /* do nothing */
2333 } else { 2327 } else {
2334 xfs_buf_relse(bp); 2328 xfs_buf_relse(bp);
2335 error = 0; 2329 error = 0;
2336 goto error; 2330 goto error;
2337 } 2331 }
2338 } 2332 }
2339 /* Take the opportunity to reset the flush iteration count */ 2333 /* Take the opportunity to reset the flush iteration count */
2340 dicp->di_flushiter = 0; 2334 dicp->di_flushiter = 0;
2341 2335
2342 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2336 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2343 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2337 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2344 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2338 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2345 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2339 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2346 XFS_ERRLEVEL_LOW, mp, dicp); 2340 XFS_ERRLEVEL_LOW, mp, dicp);
2347 xfs_buf_relse(bp); 2341 xfs_buf_relse(bp);
2348 xfs_fs_cmn_err(CE_ALERT, mp, 2342 xfs_fs_cmn_err(CE_ALERT, mp,
2349 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2343 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2350 item, dip, bp, ino); 2344 item, dip, bp, ino);
2351 error = EFSCORRUPTED; 2345 error = EFSCORRUPTED;
2352 goto error; 2346 goto error;
2353 } 2347 }
2354 } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { 2348 } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2355 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2349 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2356 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2350 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2357 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2351 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2358 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2352 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2359 XFS_ERRLEVEL_LOW, mp, dicp); 2353 XFS_ERRLEVEL_LOW, mp, dicp);
2360 xfs_buf_relse(bp); 2354 xfs_buf_relse(bp);
2361 xfs_fs_cmn_err(CE_ALERT, mp, 2355 xfs_fs_cmn_err(CE_ALERT, mp,
2362 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2356 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2363 item, dip, bp, ino); 2357 item, dip, bp, ino);
2364 error = EFSCORRUPTED; 2358 error = EFSCORRUPTED;
2365 goto error; 2359 goto error;
2366 } 2360 }
2367 } 2361 }
2368 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2362 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2369 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2363 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2370 XFS_ERRLEVEL_LOW, mp, dicp); 2364 XFS_ERRLEVEL_LOW, mp, dicp);
2371 xfs_buf_relse(bp); 2365 xfs_buf_relse(bp);
2372 xfs_fs_cmn_err(CE_ALERT, mp, 2366 xfs_fs_cmn_err(CE_ALERT, mp,
2373 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2374 item, dip, bp, ino, 2368 item, dip, bp, ino,
2375 dicp->di_nextents + dicp->di_anextents, 2369 dicp->di_nextents + dicp->di_anextents,
2376 dicp->di_nblocks); 2370 dicp->di_nblocks);
2377 error = EFSCORRUPTED; 2371 error = EFSCORRUPTED;
2378 goto error; 2372 goto error;
2379 } 2373 }
2380 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2374 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2381 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2375 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2382 XFS_ERRLEVEL_LOW, mp, dicp); 2376 XFS_ERRLEVEL_LOW, mp, dicp);
2383 xfs_buf_relse(bp); 2377 xfs_buf_relse(bp);
2384 xfs_fs_cmn_err(CE_ALERT, mp, 2378 xfs_fs_cmn_err(CE_ALERT, mp,
2385 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2379 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2386 item, dip, bp, ino, dicp->di_forkoff); 2380 item, dip, bp, ino, dicp->di_forkoff);
2387 error = EFSCORRUPTED; 2381 error = EFSCORRUPTED;
2388 goto error; 2382 goto error;
2389 } 2383 }
2390 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2384 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2391 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2385 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2392 XFS_ERRLEVEL_LOW, mp, dicp); 2386 XFS_ERRLEVEL_LOW, mp, dicp);
2393 xfs_buf_relse(bp); 2387 xfs_buf_relse(bp);
2394 xfs_fs_cmn_err(CE_ALERT, mp, 2388 xfs_fs_cmn_err(CE_ALERT, mp,
2395 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2389 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2396 item->ri_buf[1].i_len, item); 2390 item->ri_buf[1].i_len, item);
2397 error = EFSCORRUPTED; 2391 error = EFSCORRUPTED;
2398 goto error; 2392 goto error;
2399 } 2393 }
2400 2394
2401 /* The core is in in-core format */ 2395 /* The core is in in-core format */
2402 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); 2396 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2403 2397
2404 /* the rest is in on-disk format */ 2398 /* the rest is in on-disk format */
2405 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { 2399 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2406 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), 2400 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2407 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), 2401 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2408 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); 2402 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2409 } 2403 }
2410 2404
2411 fields = in_f->ilf_fields; 2405 fields = in_f->ilf_fields;
2412 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 2406 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2413 case XFS_ILOG_DEV: 2407 case XFS_ILOG_DEV:
2414 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); 2408 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2415 break; 2409 break;
2416 case XFS_ILOG_UUID: 2410 case XFS_ILOG_UUID:
2417 memcpy(XFS_DFORK_DPTR(dip), 2411 memcpy(XFS_DFORK_DPTR(dip),
2418 &in_f->ilf_u.ilfu_uuid, 2412 &in_f->ilf_u.ilfu_uuid,
2419 sizeof(uuid_t)); 2413 sizeof(uuid_t));
2420 break; 2414 break;
2421 } 2415 }
2422 2416
2423 if (in_f->ilf_size == 2) 2417 if (in_f->ilf_size == 2)
2424 goto write_inode_buffer; 2418 goto write_inode_buffer;
2425 len = item->ri_buf[2].i_len; 2419 len = item->ri_buf[2].i_len;
2426 src = item->ri_buf[2].i_addr; 2420 src = item->ri_buf[2].i_addr;
2427 ASSERT(in_f->ilf_size <= 4); 2421 ASSERT(in_f->ilf_size <= 4);
2428 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); 2422 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2429 ASSERT(!(fields & XFS_ILOG_DFORK) || 2423 ASSERT(!(fields & XFS_ILOG_DFORK) ||
2430 (len == in_f->ilf_dsize)); 2424 (len == in_f->ilf_dsize));
2431 2425
2432 switch (fields & XFS_ILOG_DFORK) { 2426 switch (fields & XFS_ILOG_DFORK) {
2433 case XFS_ILOG_DDATA: 2427 case XFS_ILOG_DDATA:
2434 case XFS_ILOG_DEXT: 2428 case XFS_ILOG_DEXT:
2435 memcpy(XFS_DFORK_DPTR(dip), src, len); 2429 memcpy(XFS_DFORK_DPTR(dip), src, len);
2436 break; 2430 break;
2437 2431
2438 case XFS_ILOG_DBROOT: 2432 case XFS_ILOG_DBROOT:
2439 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 2433 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2440 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), 2434 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2441 XFS_DFORK_DSIZE(dip, mp)); 2435 XFS_DFORK_DSIZE(dip, mp));
2442 break; 2436 break;
2443 2437
2444 default: 2438 default:
2445 /* 2439 /*
2446 * There are no data fork flags set. 2440 * There are no data fork flags set.
2447 */ 2441 */
2448 ASSERT((fields & XFS_ILOG_DFORK) == 0); 2442 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2449 break; 2443 break;
2450 } 2444 }
2451 2445
2452 /* 2446 /*
2453 * If we logged any attribute data, recover it. There may or 2447 * If we logged any attribute data, recover it. There may or
2454 * may not have been any other non-core data logged in this 2448 * may not have been any other non-core data logged in this
2455 * transaction. 2449 * transaction.
2456 */ 2450 */
2457 if (in_f->ilf_fields & XFS_ILOG_AFORK) { 2451 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2458 if (in_f->ilf_fields & XFS_ILOG_DFORK) { 2452 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2459 attr_index = 3; 2453 attr_index = 3;
2460 } else { 2454 } else {
2461 attr_index = 2; 2455 attr_index = 2;
2462 } 2456 }
2463 len = item->ri_buf[attr_index].i_len; 2457 len = item->ri_buf[attr_index].i_len;
2464 src = item->ri_buf[attr_index].i_addr; 2458 src = item->ri_buf[attr_index].i_addr;
2465 ASSERT(len == in_f->ilf_asize); 2459 ASSERT(len == in_f->ilf_asize);
2466 2460
2467 switch (in_f->ilf_fields & XFS_ILOG_AFORK) { 2461 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2468 case XFS_ILOG_ADATA: 2462 case XFS_ILOG_ADATA:
2469 case XFS_ILOG_AEXT: 2463 case XFS_ILOG_AEXT:
2470 dest = XFS_DFORK_APTR(dip); 2464 dest = XFS_DFORK_APTR(dip);
2471 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); 2465 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2472 memcpy(dest, src, len); 2466 memcpy(dest, src, len);
2473 break; 2467 break;
2474 2468
2475 case XFS_ILOG_ABROOT: 2469 case XFS_ILOG_ABROOT:
2476 dest = XFS_DFORK_APTR(dip); 2470 dest = XFS_DFORK_APTR(dip);
2477 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, 2471 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2478 len, (xfs_bmdr_block_t*)dest, 2472 len, (xfs_bmdr_block_t*)dest,
2479 XFS_DFORK_ASIZE(dip, mp)); 2473 XFS_DFORK_ASIZE(dip, mp));
2480 break; 2474 break;
2481 2475
2482 default: 2476 default:
2483 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2477 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2484 ASSERT(0); 2478 ASSERT(0);
2485 xfs_buf_relse(bp); 2479 xfs_buf_relse(bp);
2486 error = EIO; 2480 error = EIO;
2487 goto error; 2481 goto error;
2488 } 2482 }
2489 } 2483 }
2490 2484
2491 write_inode_buffer: 2485 write_inode_buffer:
2492 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2486 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2493 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2487 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2494 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2488 bp->b_mount = mp;
2495 XFS_BUF_SET_FSPRIVATE(bp, mp);
2496 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2489 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2497 xfs_bdwrite(mp, bp); 2490 xfs_bdwrite(mp, bp);
2498 } else { 2491 } else {
2499 XFS_BUF_STALE(bp); 2492 XFS_BUF_STALE(bp);
2500 error = xfs_bwrite(mp, bp); 2493 error = xfs_bwrite(mp, bp);
2501 } 2494 }
2502 2495
2503 error: 2496 error:
2504 if (need_free) 2497 if (need_free)
2505 kmem_free(in_f); 2498 kmem_free(in_f);
2506 return XFS_ERROR(error); 2499 return XFS_ERROR(error);
2507 } 2500 }
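
The write_inode_buffer hunk above shows the change this commit makes at each recovery write-out site: the casting XFS_BUF_FSPRIVATE()/XFS_BUF_SET_FSPRIVATE() accessors give way to the buffer's typed b_mount pointer. Restated vertically for readability (this only re-lays-out the hunk above; it is not extra code in the commit):

    /* before: mount pointer stashed behind untyped b_fspriv accessors */
    ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
           XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
    XFS_BUF_SET_FSPRIVATE(bp, mp);

    /* after: the same assignment through the typed b_mount field */
    ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
    bp->b_mount = mp;

The same substitution appears again below in xlog_recover_do_dquot_trans().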
2508 2501
2509 /* 2502 /*
2510 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t 2503 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2511 * structure, so that we know not to do any dquot item or dquot buffer recovery 2504 * structure, so that we know not to do any dquot item or dquot buffer recovery
2512 * of that type. 2505 * of that type.
2513 */ 2506 */
2514 STATIC int 2507 STATIC int
2515 xlog_recover_do_quotaoff_trans( 2508 xlog_recover_do_quotaoff_trans(
2516 xlog_t *log, 2509 xlog_t *log,
2517 xlog_recover_item_t *item, 2510 xlog_recover_item_t *item,
2518 int pass) 2511 int pass)
2519 { 2512 {
2520 xfs_qoff_logformat_t *qoff_f; 2513 xfs_qoff_logformat_t *qoff_f;
2521 2514
2522 if (pass == XLOG_RECOVER_PASS2) { 2515 if (pass == XLOG_RECOVER_PASS2) {
2523 return (0); 2516 return (0);
2524 } 2517 }
2525 2518
2526 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; 2519 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2527 ASSERT(qoff_f); 2520 ASSERT(qoff_f);
2528 2521
2529 /* 2522 /*
2530 * The logitem format's flag tells us if this was user quotaoff, 2523 * The logitem format's flag tells us if this was user quotaoff,
2531 * group/project quotaoff or both. 2524 * group/project quotaoff or both.
2532 */ 2525 */
2533 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) 2526 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2534 log->l_quotaoffs_flag |= XFS_DQ_USER; 2527 log->l_quotaoffs_flag |= XFS_DQ_USER;
2535 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) 2528 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2536 log->l_quotaoffs_flag |= XFS_DQ_PROJ; 2529 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2537 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 2530 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2538 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 2531 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2539 2532
2540 return (0); 2533 return (0);
2541 } 2534 }
2542 2535
2543 /* 2536 /*
2544 * Recover a dquot record 2537 * Recover a dquot record
2545 */ 2538 */
2546 STATIC int 2539 STATIC int
2547 xlog_recover_do_dquot_trans( 2540 xlog_recover_do_dquot_trans(
2548 xlog_t *log, 2541 xlog_t *log,
2549 xlog_recover_item_t *item, 2542 xlog_recover_item_t *item,
2550 int pass) 2543 int pass)
2551 { 2544 {
2552 xfs_mount_t *mp; 2545 xfs_mount_t *mp;
2553 xfs_buf_t *bp; 2546 xfs_buf_t *bp;
2554 struct xfs_disk_dquot *ddq, *recddq; 2547 struct xfs_disk_dquot *ddq, *recddq;
2555 int error; 2548 int error;
2556 xfs_dq_logformat_t *dq_f; 2549 xfs_dq_logformat_t *dq_f;
2557 uint type; 2550 uint type;
2558 2551
2559 if (pass == XLOG_RECOVER_PASS1) { 2552 if (pass == XLOG_RECOVER_PASS1) {
2560 return 0; 2553 return 0;
2561 } 2554 }
2562 mp = log->l_mp; 2555 mp = log->l_mp;
2563 2556
2564 /* 2557 /*
2565 * Filesystems are required to send in quota flags at mount time. 2558 * Filesystems are required to send in quota flags at mount time.
2566 */ 2559 */
2567 if (mp->m_qflags == 0) 2560 if (mp->m_qflags == 0)
2568 return (0); 2561 return (0);
2569 2562
2570 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; 2563 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2571 ASSERT(recddq); 2564 ASSERT(recddq);
2572 /* 2565 /*
2573 * This type of quota was turned off, so ignore this record. 2566 * This type of quota was turned off, so ignore this record.
2574 */ 2567 */
2575 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 2568 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2576 ASSERT(type); 2569 ASSERT(type);
2577 if (log->l_quotaoffs_flag & type) 2570 if (log->l_quotaoffs_flag & type)
2578 return (0); 2571 return (0);
2579 2572
2580 /* 2573 /*
2581 * At this point we know that quota was _not_ turned off. 2574 * At this point we know that quota was _not_ turned off.
2582 * Since the mount flags are not indicating to us otherwise, this 2575 * Since the mount flags are not indicating to us otherwise, this
2583 * must mean that quota is on, and the dquot needs to be replayed. 2576 * must mean that quota is on, and the dquot needs to be replayed.
2584 * Remember that we may not have fully recovered the superblock yet, 2577 * Remember that we may not have fully recovered the superblock yet,
2585 * so we can't do the usual trick of looking at the SB quota bits. 2578 * so we can't do the usual trick of looking at the SB quota bits.
2586 * 2579 *
2587 * The other possibility, of course, is that the quota subsystem was 2580 * The other possibility, of course, is that the quota subsystem was
2588 * removed since the last mount - ENOSYS. 2581 * removed since the last mount - ENOSYS.
2589 */ 2582 */
2590 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; 2583 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2591 ASSERT(dq_f); 2584 ASSERT(dq_f);
2592 if ((error = xfs_qm_dqcheck(recddq, 2585 if ((error = xfs_qm_dqcheck(recddq,
2593 dq_f->qlf_id, 2586 dq_f->qlf_id,
2594 0, XFS_QMOPT_DOWARN, 2587 0, XFS_QMOPT_DOWARN,
2595 "xlog_recover_do_dquot_trans (log copy)"))) { 2588 "xlog_recover_do_dquot_trans (log copy)"))) {
2596 return XFS_ERROR(EIO); 2589 return XFS_ERROR(EIO);
2597 } 2590 }
2598 ASSERT(dq_f->qlf_len == 1); 2591 ASSERT(dq_f->qlf_len == 1);
2599 2592
2600 error = xfs_read_buf(mp, mp->m_ddev_targp, 2593 error = xfs_read_buf(mp, mp->m_ddev_targp,
2601 dq_f->qlf_blkno, 2594 dq_f->qlf_blkno,
2602 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 2595 XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2603 0, &bp); 2596 0, &bp);
2604 if (error) { 2597 if (error) {
2605 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, 2598 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2606 bp, dq_f->qlf_blkno); 2599 bp, dq_f->qlf_blkno);
2607 return error; 2600 return error;
2608 } 2601 }
2609 ASSERT(bp); 2602 ASSERT(bp);
2610 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); 2603 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2611 2604
2612 /* 2605 /*
2613 * At least the magic num portion should be on disk because this 2606 * At least the magic num portion should be on disk because this
2614 * was among a chunk of dquots created earlier, and we did some 2607 * was among a chunk of dquots created earlier, and we did some
2615 * minimal initialization then. 2608 * minimal initialization then.
2616 */ 2609 */
2617 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2610 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2618 "xlog_recover_do_dquot_trans")) { 2611 "xlog_recover_do_dquot_trans")) {
2619 xfs_buf_relse(bp); 2612 xfs_buf_relse(bp);
2620 return XFS_ERROR(EIO); 2613 return XFS_ERROR(EIO);
2621 } 2614 }
2622 2615
2623 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2616 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2624 2617
2625 ASSERT(dq_f->qlf_size == 2); 2618 ASSERT(dq_f->qlf_size == 2);
2626 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2619 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2627 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2620 bp->b_mount = mp;
2628 XFS_BUF_SET_FSPRIVATE(bp, mp);
2629 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2621 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2630 xfs_bdwrite(mp, bp); 2622 xfs_bdwrite(mp, bp);
2631 2623
2632 return (0); 2624 return (0);
2633 } 2625 }
2634 2626
2635 /* 2627 /*
2636 * This routine is called to create an in-core extent free intent 2628 * This routine is called to create an in-core extent free intent
2637 * item from the efi format structure which was logged on disk. 2629 * item from the efi format structure which was logged on disk.
2638 * It allocates an in-core efi, copies the extents from the format 2630 * It allocates an in-core efi, copies the extents from the format
2639 * structure into it, and adds the efi to the AIL with the given 2631 * structure into it, and adds the efi to the AIL with the given
2640 * LSN. 2632 * LSN.
2641 */ 2633 */
2642 STATIC int 2634 STATIC int
2643 xlog_recover_do_efi_trans( 2635 xlog_recover_do_efi_trans(
2644 xlog_t *log, 2636 xlog_t *log,
2645 xlog_recover_item_t *item, 2637 xlog_recover_item_t *item,
2646 xfs_lsn_t lsn, 2638 xfs_lsn_t lsn,
2647 int pass) 2639 int pass)
2648 { 2640 {
2649 int error; 2641 int error;
2650 xfs_mount_t *mp; 2642 xfs_mount_t *mp;
2651 xfs_efi_log_item_t *efip; 2643 xfs_efi_log_item_t *efip;
2652 xfs_efi_log_format_t *efi_formatp; 2644 xfs_efi_log_format_t *efi_formatp;
2653 2645
2654 if (pass == XLOG_RECOVER_PASS1) { 2646 if (pass == XLOG_RECOVER_PASS1) {
2655 return 0; 2647 return 0;
2656 } 2648 }
2657 2649
2658 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; 2650 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2659 2651
2660 mp = log->l_mp; 2652 mp = log->l_mp;
2661 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2653 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2662 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2654 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2663 &(efip->efi_format)))) { 2655 &(efip->efi_format)))) {
2664 xfs_efi_item_free(efip); 2656 xfs_efi_item_free(efip);
2665 return error; 2657 return error;
2666 } 2658 }
2667 efip->efi_next_extent = efi_formatp->efi_nextents; 2659 efip->efi_next_extent = efi_formatp->efi_nextents;
2668 efip->efi_flags |= XFS_EFI_COMMITTED; 2660 efip->efi_flags |= XFS_EFI_COMMITTED;
2669 2661
2670 spin_lock(&log->l_ailp->xa_lock); 2662 spin_lock(&log->l_ailp->xa_lock);
2671 /* 2663 /*
2672 * xfs_trans_ail_update() drops the AIL lock. 2664 * xfs_trans_ail_update() drops the AIL lock.
2673 */ 2665 */
2674 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2666 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2675 return 0; 2667 return 0;
2676 } 2668 }
2677 2669
2678 2670
2679 /* 2671 /*
2680 * This routine is called when an efd format structure is found in 2672 * This routine is called when an efd format structure is found in
2681 * a committed transaction in the log. Its purpose is to cancel 2673 * a committed transaction in the log. Its purpose is to cancel
2682 * the corresponding efi if it was still in the log. To do this 2674 * the corresponding efi if it was still in the log. To do this
2683 * it searches the AIL for the efi with an id equal to that in the 2675 * it searches the AIL for the efi with an id equal to that in the
2684 * efd format structure. If we find it, we remove the efi from the 2676 * efd format structure. If we find it, we remove the efi from the
2685 * AIL and free it. 2677 * AIL and free it.
2686 */ 2678 */
2687 STATIC void 2679 STATIC void
2688 xlog_recover_do_efd_trans( 2680 xlog_recover_do_efd_trans(
2689 xlog_t *log, 2681 xlog_t *log,
2690 xlog_recover_item_t *item, 2682 xlog_recover_item_t *item,
2691 int pass) 2683 int pass)
2692 { 2684 {
2693 xfs_efd_log_format_t *efd_formatp; 2685 xfs_efd_log_format_t *efd_formatp;
2694 xfs_efi_log_item_t *efip = NULL; 2686 xfs_efi_log_item_t *efip = NULL;
2695 xfs_log_item_t *lip; 2687 xfs_log_item_t *lip;
2696 __uint64_t efi_id; 2688 __uint64_t efi_id;
2697 struct xfs_ail_cursor cur; 2689 struct xfs_ail_cursor cur;
2698 struct xfs_ail *ailp = log->l_ailp; 2690 struct xfs_ail *ailp = log->l_ailp;
2699 2691
2700 if (pass == XLOG_RECOVER_PASS1) { 2692 if (pass == XLOG_RECOVER_PASS1) {
2701 return; 2693 return;
2702 } 2694 }
2703 2695
2704 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; 2696 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2705 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2697 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2706 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2698 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2707 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 2699 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2708 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); 2700 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2709 efi_id = efd_formatp->efd_efi_id; 2701 efi_id = efd_formatp->efd_efi_id;
2710 2702
2711 /* 2703 /*
2712 * Search for the efi with the id in the efd format structure 2704 * Search for the efi with the id in the efd format structure
2713 * in the AIL. 2705 * in the AIL.
2714 */ 2706 */
2715 spin_lock(&ailp->xa_lock); 2707 spin_lock(&ailp->xa_lock);
2716 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 2708 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2717 while (lip != NULL) { 2709 while (lip != NULL) {
2718 if (lip->li_type == XFS_LI_EFI) { 2710 if (lip->li_type == XFS_LI_EFI) {
2719 efip = (xfs_efi_log_item_t *)lip; 2711 efip = (xfs_efi_log_item_t *)lip;
2720 if (efip->efi_format.efi_id == efi_id) { 2712 if (efip->efi_format.efi_id == efi_id) {
2721 /* 2713 /*
2722 * xfs_trans_ail_delete() drops the 2714 * xfs_trans_ail_delete() drops the
2723 * AIL lock. 2715 * AIL lock.
2724 */ 2716 */
2725 xfs_trans_ail_delete(ailp, lip); 2717 xfs_trans_ail_delete(ailp, lip);
2726 xfs_efi_item_free(efip); 2718 xfs_efi_item_free(efip);
2727 spin_lock(&ailp->xa_lock); 2719 spin_lock(&ailp->xa_lock);
2728 break; 2720 break;
2729 } 2721 }
2730 } 2722 }
2731 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2723 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2732 } 2724 }
2733 xfs_trans_ail_cursor_done(ailp, &cur); 2725 xfs_trans_ail_cursor_done(ailp, &cur);
2734 spin_unlock(&ailp->xa_lock); 2726 spin_unlock(&ailp->xa_lock);
2735 } 2727 }
2736 2728
2737 /* 2729 /*
2738 * Perform the transaction 2730 * Perform the transaction
2739 * 2731 *
2740 * If the transaction modifies a buffer or inode, do it now. Otherwise, 2732 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2741 * EFIs and EFDs get queued up by adding entries into the AIL for them. 2733 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2742 */ 2734 */
2743 STATIC int 2735 STATIC int
2744 xlog_recover_do_trans( 2736 xlog_recover_do_trans(
2745 xlog_t *log, 2737 xlog_t *log,
2746 xlog_recover_t *trans, 2738 xlog_recover_t *trans,
2747 int pass) 2739 int pass)
2748 { 2740 {
2749 int error = 0; 2741 int error = 0;
2750 xlog_recover_item_t *item, *first_item; 2742 xlog_recover_item_t *item, *first_item;
2751 2743
2752 if ((error = xlog_recover_reorder_trans(trans))) 2744 if ((error = xlog_recover_reorder_trans(trans)))
2753 return error; 2745 return error;
2754 first_item = item = trans->r_itemq; 2746 first_item = item = trans->r_itemq;
2755 do { 2747 do {
2756 /* 2748 /*
2757 * we don't need to worry about the block number being 2749 * we don't need to worry about the block number being
2758 * truncated in > 1 TB buffers because in user-land, 2750 * truncated in > 1 TB buffers because in user-land,
2759 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and 2751 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and
2760 * the blknos will get through the user-mode buffer 2752 * the blknos will get through the user-mode buffer
2761 * cache properly. The only bad case is o32 kernels 2753 * cache properly. The only bad case is o32 kernels
2762 * where xfs_daddr_t is 32-bits but mount will warn us 2754 * where xfs_daddr_t is 32-bits but mount will warn us
2763 * off a > 1 TB filesystem before we get here. 2755 * off a > 1 TB filesystem before we get here.
2764 */ 2756 */
2765 if ((ITEM_TYPE(item) == XFS_LI_BUF)) { 2757 if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
2766 if ((error = xlog_recover_do_buffer_trans(log, item, 2758 if ((error = xlog_recover_do_buffer_trans(log, item,
2767 pass))) 2759 pass)))
2768 break; 2760 break;
2769 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) { 2761 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2770 if ((error = xlog_recover_do_inode_trans(log, item, 2762 if ((error = xlog_recover_do_inode_trans(log, item,
2771 pass))) 2763 pass)))
2772 break; 2764 break;
2773 } else if (ITEM_TYPE(item) == XFS_LI_EFI) { 2765 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2774 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn, 2766 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2775 pass))) 2767 pass)))
2776 break; 2768 break;
2777 } else if (ITEM_TYPE(item) == XFS_LI_EFD) { 2769 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2778 xlog_recover_do_efd_trans(log, item, pass); 2770 xlog_recover_do_efd_trans(log, item, pass);
2779 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { 2771 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
2780 if ((error = xlog_recover_do_dquot_trans(log, item, 2772 if ((error = xlog_recover_do_dquot_trans(log, item,
2781 pass))) 2773 pass)))
2782 break; 2774 break;
2783 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { 2775 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
2784 if ((error = xlog_recover_do_quotaoff_trans(log, item, 2776 if ((error = xlog_recover_do_quotaoff_trans(log, item,
2785 pass))) 2777 pass)))
2786 break; 2778 break;
2787 } else { 2779 } else {
2788 xlog_warn("XFS: xlog_recover_do_trans"); 2780 xlog_warn("XFS: xlog_recover_do_trans");
2789 ASSERT(0); 2781 ASSERT(0);
2790 error = XFS_ERROR(EIO); 2782 error = XFS_ERROR(EIO);
2791 break; 2783 break;
2792 } 2784 }
2793 item = item->ri_next; 2785 item = item->ri_next;
2794 } while (first_item != item); 2786 } while (first_item != item);
2795 2787
2796 return error; 2788 return error;
2797 } 2789 }
2798 2790
2799 /* 2791 /*
2800 * Free up any resources allocated by the transaction 2792 * Free up any resources allocated by the transaction
2801 * 2793 *
2802 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 2794 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2803 */ 2795 */
2804 STATIC void 2796 STATIC void
2805 xlog_recover_free_trans( 2797 xlog_recover_free_trans(
2806 xlog_recover_t *trans) 2798 xlog_recover_t *trans)
2807 { 2799 {
2808 xlog_recover_item_t *first_item, *item, *free_item; 2800 xlog_recover_item_t *first_item, *item, *free_item;
2809 int i; 2801 int i;
2810 2802
2811 item = first_item = trans->r_itemq; 2803 item = first_item = trans->r_itemq;
2812 do { 2804 do {
2813 free_item = item; 2805 free_item = item;
2814 item = item->ri_next; 2806 item = item->ri_next;
2815 /* Free the regions in the item. */ 2807 /* Free the regions in the item. */
2816 for (i = 0; i < free_item->ri_cnt; i++) { 2808 for (i = 0; i < free_item->ri_cnt; i++) {
2817 kmem_free(free_item->ri_buf[i].i_addr); 2809 kmem_free(free_item->ri_buf[i].i_addr);
2818 } 2810 }
2819 /* Free the item itself */ 2811 /* Free the item itself */
2820 kmem_free(free_item->ri_buf); 2812 kmem_free(free_item->ri_buf);
2821 kmem_free(free_item); 2813 kmem_free(free_item);
2822 } while (first_item != item); 2814 } while (first_item != item);
2823 /* Free the transaction recover structure */ 2815 /* Free the transaction recover structure */
2824 kmem_free(trans); 2816 kmem_free(trans);
2825 } 2817 }
2826 2818
2827 STATIC int 2819 STATIC int
2828 xlog_recover_commit_trans( 2820 xlog_recover_commit_trans(
2829 xlog_t *log, 2821 xlog_t *log,
2830 xlog_recover_t **q, 2822 xlog_recover_t **q,
2831 xlog_recover_t *trans, 2823 xlog_recover_t *trans,
2832 int pass) 2824 int pass)
2833 { 2825 {
2834 int error; 2826 int error;
2835 2827
2836 if ((error = xlog_recover_unlink_tid(q, trans))) 2828 if ((error = xlog_recover_unlink_tid(q, trans)))
2837 return error; 2829 return error;
2838 if ((error = xlog_recover_do_trans(log, trans, pass))) 2830 if ((error = xlog_recover_do_trans(log, trans, pass)))
2839 return error; 2831 return error;
2840 xlog_recover_free_trans(trans); /* no error */ 2832 xlog_recover_free_trans(trans); /* no error */
2841 return 0; 2833 return 0;
2842 } 2834 }
2843 2835
2844 STATIC int 2836 STATIC int
2845 xlog_recover_unmount_trans( 2837 xlog_recover_unmount_trans(
2846 xlog_recover_t *trans) 2838 xlog_recover_t *trans)
2847 { 2839 {
2848 /* Do nothing now */ 2840 /* Do nothing now */
2849 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2841 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2850 return 0; 2842 return 0;
2851 } 2843 }
2852 2844
2853 /* 2845 /*
2854 * There are two valid states of the r_state field. 0 indicates that the 2846 * There are two valid states of the r_state field. 0 indicates that the
2855 * transaction structure is in a normal state. We have either seen the 2847 * transaction structure is in a normal state. We have either seen the
2856 * start of the transaction or the last operation we added was not a partial 2848 * start of the transaction or the last operation we added was not a partial
2857 * operation. If the last operation we added to the transaction was a 2849 * operation. If the last operation we added to the transaction was a
2858 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. 2850 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2859 * 2851 *
2860 * NOTE: skip LRs with 0 data length. 2852 * NOTE: skip LRs with 0 data length.
2861 */ 2853 */
2862 STATIC int 2854 STATIC int
2863 xlog_recover_process_data( 2855 xlog_recover_process_data(
2864 xlog_t *log, 2856 xlog_t *log,
2865 xlog_recover_t *rhash[], 2857 xlog_recover_t *rhash[],
2866 xlog_rec_header_t *rhead, 2858 xlog_rec_header_t *rhead,
2867 xfs_caddr_t dp, 2859 xfs_caddr_t dp,
2868 int pass) 2860 int pass)
2869 { 2861 {
2870 xfs_caddr_t lp; 2862 xfs_caddr_t lp;
2871 int num_logops; 2863 int num_logops;
2872 xlog_op_header_t *ohead; 2864 xlog_op_header_t *ohead;
2873 xlog_recover_t *trans; 2865 xlog_recover_t *trans;
2874 xlog_tid_t tid; 2866 xlog_tid_t tid;
2875 int error; 2867 int error;
2876 unsigned long hash; 2868 unsigned long hash;
2877 uint flags; 2869 uint flags;
2878 2870
2879 lp = dp + be32_to_cpu(rhead->h_len); 2871 lp = dp + be32_to_cpu(rhead->h_len);
2880 num_logops = be32_to_cpu(rhead->h_num_logops); 2872 num_logops = be32_to_cpu(rhead->h_num_logops);
2881 2873
2882 /* check the log format matches our own - else we can't recover */ 2874 /* check the log format matches our own - else we can't recover */
2883 if (xlog_header_check_recover(log->l_mp, rhead)) 2875 if (xlog_header_check_recover(log->l_mp, rhead))
2884 return (XFS_ERROR(EIO)); 2876 return (XFS_ERROR(EIO));
2885 2877
2886 while ((dp < lp) && num_logops) { 2878 while ((dp < lp) && num_logops) {
2887 ASSERT(dp + sizeof(xlog_op_header_t) <= lp); 2879 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2888 ohead = (xlog_op_header_t *)dp; 2880 ohead = (xlog_op_header_t *)dp;
2889 dp += sizeof(xlog_op_header_t); 2881 dp += sizeof(xlog_op_header_t);
2890 if (ohead->oh_clientid != XFS_TRANSACTION && 2882 if (ohead->oh_clientid != XFS_TRANSACTION &&
2891 ohead->oh_clientid != XFS_LOG) { 2883 ohead->oh_clientid != XFS_LOG) {
2892 xlog_warn( 2884 xlog_warn(
2893 "XFS: xlog_recover_process_data: bad clientid"); 2885 "XFS: xlog_recover_process_data: bad clientid");
2894 ASSERT(0); 2886 ASSERT(0);
2895 return (XFS_ERROR(EIO)); 2887 return (XFS_ERROR(EIO));
2896 } 2888 }
2897 tid = be32_to_cpu(ohead->oh_tid); 2889 tid = be32_to_cpu(ohead->oh_tid);
2898 hash = XLOG_RHASH(tid); 2890 hash = XLOG_RHASH(tid);
2899 trans = xlog_recover_find_tid(rhash[hash], tid); 2891 trans = xlog_recover_find_tid(rhash[hash], tid);
2900 if (trans == NULL) { /* not found; add new tid */ 2892 if (trans == NULL) { /* not found; add new tid */
2901 if (ohead->oh_flags & XLOG_START_TRANS) 2893 if (ohead->oh_flags & XLOG_START_TRANS)
2902 xlog_recover_new_tid(&rhash[hash], tid, 2894 xlog_recover_new_tid(&rhash[hash], tid,
2903 be64_to_cpu(rhead->h_lsn)); 2895 be64_to_cpu(rhead->h_lsn));
2904 } else { 2896 } else {
2905 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2897 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2906 xlog_warn( 2898 xlog_warn(
2907 "XFS: xlog_recover_process_data: bad length"); 2899 "XFS: xlog_recover_process_data: bad length");
2908 WARN_ON(1); 2900 WARN_ON(1);
2909 return (XFS_ERROR(EIO)); 2901 return (XFS_ERROR(EIO));
2910 } 2902 }
2911 flags = ohead->oh_flags & ~XLOG_END_TRANS; 2903 flags = ohead->oh_flags & ~XLOG_END_TRANS;
2912 if (flags & XLOG_WAS_CONT_TRANS) 2904 if (flags & XLOG_WAS_CONT_TRANS)
2913 flags &= ~XLOG_CONTINUE_TRANS; 2905 flags &= ~XLOG_CONTINUE_TRANS;
2914 switch (flags) { 2906 switch (flags) {
2915 case XLOG_COMMIT_TRANS: 2907 case XLOG_COMMIT_TRANS:
2916 error = xlog_recover_commit_trans(log, 2908 error = xlog_recover_commit_trans(log,
2917 &rhash[hash], trans, pass); 2909 &rhash[hash], trans, pass);
2918 break; 2910 break;
2919 case XLOG_UNMOUNT_TRANS: 2911 case XLOG_UNMOUNT_TRANS:
2920 error = xlog_recover_unmount_trans(trans); 2912 error = xlog_recover_unmount_trans(trans);
2921 break; 2913 break;
2922 case XLOG_WAS_CONT_TRANS: 2914 case XLOG_WAS_CONT_TRANS:
2923 error = xlog_recover_add_to_cont_trans(trans, 2915 error = xlog_recover_add_to_cont_trans(trans,
2924 dp, be32_to_cpu(ohead->oh_len)); 2916 dp, be32_to_cpu(ohead->oh_len));
2925 break; 2917 break;
2926 case XLOG_START_TRANS: 2918 case XLOG_START_TRANS:
2927 xlog_warn( 2919 xlog_warn(
2928 "XFS: xlog_recover_process_data: bad transaction"); 2920 "XFS: xlog_recover_process_data: bad transaction");
2929 ASSERT(0); 2921 ASSERT(0);
2930 error = XFS_ERROR(EIO); 2922 error = XFS_ERROR(EIO);
2931 break; 2923 break;
2932 case 0: 2924 case 0:
2933 case XLOG_CONTINUE_TRANS: 2925 case XLOG_CONTINUE_TRANS:
2934 error = xlog_recover_add_to_trans(trans, 2926 error = xlog_recover_add_to_trans(trans,
2935 dp, be32_to_cpu(ohead->oh_len)); 2927 dp, be32_to_cpu(ohead->oh_len));
2936 break; 2928 break;
2937 default: 2929 default:
2938 xlog_warn( 2930 xlog_warn(
2939 "XFS: xlog_recover_process_data: bad flag"); 2931 "XFS: xlog_recover_process_data: bad flag");
2940 ASSERT(0); 2932 ASSERT(0);
2941 error = XFS_ERROR(EIO); 2933 error = XFS_ERROR(EIO);
2942 break; 2934 break;
2943 } 2935 }
2944 if (error) 2936 if (error)
2945 return error; 2937 return error;
2946 } 2938 }
2947 dp += be32_to_cpu(ohead->oh_len); 2939 dp += be32_to_cpu(ohead->oh_len);
2948 num_logops--; 2940 num_logops--;
2949 } 2941 }
2950 return 0; 2942 return 0;
2951 } 2943 }
2952 2944
2953 /* 2945 /*
2954 * Process an extent free intent item that was recovered from 2946 * Process an extent free intent item that was recovered from
2955 * the log. We need to free the extents that it describes. 2947 * the log. We need to free the extents that it describes.
2956 */ 2948 */
2957 STATIC int 2949 STATIC int
2958 xlog_recover_process_efi( 2950 xlog_recover_process_efi(
2959 xfs_mount_t *mp, 2951 xfs_mount_t *mp,
2960 xfs_efi_log_item_t *efip) 2952 xfs_efi_log_item_t *efip)
2961 { 2953 {
2962 xfs_efd_log_item_t *efdp; 2954 xfs_efd_log_item_t *efdp;
2963 xfs_trans_t *tp; 2955 xfs_trans_t *tp;
2964 int i; 2956 int i;
2965 int error = 0; 2957 int error = 0;
2966 xfs_extent_t *extp; 2958 xfs_extent_t *extp;
2967 xfs_fsblock_t startblock_fsb; 2959 xfs_fsblock_t startblock_fsb;
2968 2960
2969 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2961 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
2970 2962
2971 /* 2963 /*
2972 * First check the validity of the extents described by the 2964 * First check the validity of the extents described by the
2973 * EFI. If any are bad, then assume that all are bad and 2965 * EFI. If any are bad, then assume that all are bad and
2974 * just toss the EFI. 2966 * just toss the EFI.
2975 */ 2967 */
2976 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 2968 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2977 extp = &(efip->efi_format.efi_extents[i]); 2969 extp = &(efip->efi_format.efi_extents[i]);
2978 startblock_fsb = XFS_BB_TO_FSB(mp, 2970 startblock_fsb = XFS_BB_TO_FSB(mp,
2979 XFS_FSB_TO_DADDR(mp, extp->ext_start)); 2971 XFS_FSB_TO_DADDR(mp, extp->ext_start));
2980 if ((startblock_fsb == 0) || 2972 if ((startblock_fsb == 0) ||
2981 (extp->ext_len == 0) || 2973 (extp->ext_len == 0) ||
2982 (startblock_fsb >= mp->m_sb.sb_dblocks) || 2974 (startblock_fsb >= mp->m_sb.sb_dblocks) ||
2983 (extp->ext_len >= mp->m_sb.sb_agblocks)) { 2975 (extp->ext_len >= mp->m_sb.sb_agblocks)) {
2984 /* 2976 /*
2985 * This will pull the EFI from the AIL and 2977 * This will pull the EFI from the AIL and
2986 * free the memory associated with it. 2978 * free the memory associated with it.
2987 */ 2979 */
2988 xfs_efi_release(efip, efip->efi_format.efi_nextents); 2980 xfs_efi_release(efip, efip->efi_format.efi_nextents);
2989 return XFS_ERROR(EIO); 2981 return XFS_ERROR(EIO);
2990 } 2982 }
2991 } 2983 }
2992 2984
2993 tp = xfs_trans_alloc(mp, 0); 2985 tp = xfs_trans_alloc(mp, 0);
2994 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); 2986 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
2995 if (error) 2987 if (error)
2996 goto abort_error; 2988 goto abort_error;
2997 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 2989 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
2998 2990
2999 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 2991 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3000 extp = &(efip->efi_format.efi_extents[i]); 2992 extp = &(efip->efi_format.efi_extents[i]);
3001 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); 2993 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3002 if (error) 2994 if (error)
3003 goto abort_error; 2995 goto abort_error;
3004 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, 2996 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3005 extp->ext_len); 2997 extp->ext_len);
3006 } 2998 }
3007 2999
3008 efip->efi_flags |= XFS_EFI_RECOVERED; 3000 efip->efi_flags |= XFS_EFI_RECOVERED;
3009 error = xfs_trans_commit(tp, 0); 3001 error = xfs_trans_commit(tp, 0);
3010 return error; 3002 return error;
3011 3003
3012 abort_error: 3004 abort_error:
3013 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3005 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3014 return error; 3006 return error;
3015 } 3007 }
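
xlog_recover_process_efi() above refuses to replay an EFI whose extents look implausible: a zero start block or length, a start beyond the end of the filesystem, or a length of at least a whole AG causes the entire EFI to be released untouched. A standalone sketch of that per-extent test (demo_geom and demo_efi_extent_ok are illustrative names, not XFS API):

    #include <stdbool.h>
    #include <stdint.h>

    struct demo_geom {
        uint64_t dblocks;   /* filesystem size in blocks, like sb_dblocks */
        uint32_t agblocks;  /* blocks per AG, like sb_agblocks            */
    };

    /* mirrors the checks in the validation loop above */
    static bool demo_efi_extent_ok(const struct demo_geom *g,
                                   uint64_t startblock_fsb, uint32_t len)
    {
        return startblock_fsb != 0 && len != 0 &&
               startblock_fsb < g->dblocks && len < g->agblocks;
    }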
3016 3008
3017 /* 3009 /*
3018 * When this is called, all of the EFIs which did not have 3010 * When this is called, all of the EFIs which did not have
3019 * corresponding EFDs should be in the AIL. What we do now 3011 * corresponding EFDs should be in the AIL. What we do now
3020 * is free the extents associated with each one. 3012 * is free the extents associated with each one.
3021 * 3013 *
3022 * Since we process the EFIs in normal transactions, they 3014 * Since we process the EFIs in normal transactions, they
3023 * will be removed at some point after the commit. This prevents 3015 * will be removed at some point after the commit. This prevents
3024 * us from just walking down the list processing each one. 3016 * us from just walking down the list processing each one.
3025 * We'll use a flag in the EFI to skip those that we've already 3017 * We'll use a flag in the EFI to skip those that we've already
3026 * processed and use the AIL iteration mechanism's generation 3018 * processed and use the AIL iteration mechanism's generation
3027 * count to try to speed this up at least a bit. 3019 * count to try to speed this up at least a bit.
3028 * 3020 *
3029 * When we start, we know that the EFIs are the only things in 3021 * When we start, we know that the EFIs are the only things in
3030 * the AIL. As we process them, however, other items are added 3022 * the AIL. As we process them, however, other items are added
3031 * to the AIL. Since everything added to the AIL must come after 3023 * to the AIL. Since everything added to the AIL must come after
3032 * everything already in the AIL, we stop processing as soon as 3024 * everything already in the AIL, we stop processing as soon as
3033 * we see something other than an EFI in the AIL. 3025 * we see something other than an EFI in the AIL.
3034 */ 3026 */
3035 STATIC int 3027 STATIC int
3036 xlog_recover_process_efis( 3028 xlog_recover_process_efis(
3037 xlog_t *log) 3029 xlog_t *log)
3038 { 3030 {
3039 xfs_log_item_t *lip; 3031 xfs_log_item_t *lip;
3040 xfs_efi_log_item_t *efip; 3032 xfs_efi_log_item_t *efip;
3041 int error = 0; 3033 int error = 0;
3042 struct xfs_ail_cursor cur; 3034 struct xfs_ail_cursor cur;
3043 struct xfs_ail *ailp; 3035 struct xfs_ail *ailp;
3044 3036
3045 ailp = log->l_ailp; 3037 ailp = log->l_ailp;
3046 spin_lock(&ailp->xa_lock); 3038 spin_lock(&ailp->xa_lock);
3047 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 3039 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3048 while (lip != NULL) { 3040 while (lip != NULL) {
3049 /* 3041 /*
3050 * We're done when we see something other than an EFI. 3042 * We're done when we see something other than an EFI.
3051 * There should be no EFIs left in the AIL now. 3043 * There should be no EFIs left in the AIL now.
3052 */ 3044 */
3053 if (lip->li_type != XFS_LI_EFI) { 3045 if (lip->li_type != XFS_LI_EFI) {
3054 #ifdef DEBUG 3046 #ifdef DEBUG
3055 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) 3047 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3056 ASSERT(lip->li_type != XFS_LI_EFI); 3048 ASSERT(lip->li_type != XFS_LI_EFI);
3057 #endif 3049 #endif
3058 break; 3050 break;
3059 } 3051 }
3060 3052
3061 /* 3053 /*
3062 * Skip EFIs that we've already processed. 3054 * Skip EFIs that we've already processed.
3063 */ 3055 */
3064 efip = (xfs_efi_log_item_t *)lip; 3056 efip = (xfs_efi_log_item_t *)lip;
3065 if (efip->efi_flags & XFS_EFI_RECOVERED) { 3057 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3066 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3058 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3067 continue; 3059 continue;
3068 } 3060 }
3069 3061
3070 spin_unlock(&ailp->xa_lock); 3062 spin_unlock(&ailp->xa_lock);
3071 error = xlog_recover_process_efi(log->l_mp, efip); 3063 error = xlog_recover_process_efi(log->l_mp, efip);
3072 spin_lock(&ailp->xa_lock); 3064 spin_lock(&ailp->xa_lock);
3073 if (error) 3065 if (error)
3074 goto out; 3066 goto out;
3075 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3067 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3076 } 3068 }
3077 out: 3069 out:
3078 xfs_trans_ail_cursor_done(ailp, &cur); 3070 xfs_trans_ail_cursor_done(ailp, &cur);
3079 spin_unlock(&ailp->xa_lock); 3071 spin_unlock(&ailp->xa_lock);
3080 return error; 3072 return error;
3081 } 3073 }
3082 3074
3083 /* 3075 /*
3084 * This routine performs a transaction to null out a bad inode pointer 3076 * This routine performs a transaction to null out a bad inode pointer
3085 * in an agi unlinked inode hash bucket. 3077 * in an agi unlinked inode hash bucket.
3086 */ 3078 */
3087 STATIC void 3079 STATIC void
3088 xlog_recover_clear_agi_bucket( 3080 xlog_recover_clear_agi_bucket(
3089 xfs_mount_t *mp, 3081 xfs_mount_t *mp,
3090 xfs_agnumber_t agno, 3082 xfs_agnumber_t agno,
3091 int bucket) 3083 int bucket)
3092 { 3084 {
3093 xfs_trans_t *tp; 3085 xfs_trans_t *tp;
3094 xfs_agi_t *agi; 3086 xfs_agi_t *agi;
3095 xfs_buf_t *agibp; 3087 xfs_buf_t *agibp;
3096 int offset; 3088 int offset;
3097 int error; 3089 int error;
3098 3090
3099 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3091 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3100 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 3092 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3101 0, 0, 0); 3093 0, 0, 0);
3102 if (error) 3094 if (error)
3103 goto out_abort; 3095 goto out_abort;
3104 3096
3105 error = xfs_read_agi(mp, tp, agno, &agibp); 3097 error = xfs_read_agi(mp, tp, agno, &agibp);
3106 if (error) 3098 if (error)
3107 goto out_abort; 3099 goto out_abort;
3108 3100
3109 agi = XFS_BUF_TO_AGI(agibp); 3101 agi = XFS_BUF_TO_AGI(agibp);
3110 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3102 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3111 offset = offsetof(xfs_agi_t, agi_unlinked) + 3103 offset = offsetof(xfs_agi_t, agi_unlinked) +
3112 (sizeof(xfs_agino_t) * bucket); 3104 (sizeof(xfs_agino_t) * bucket);
3113 xfs_trans_log_buf(tp, agibp, offset, 3105 xfs_trans_log_buf(tp, agibp, offset,
3114 (offset + sizeof(xfs_agino_t) - 1)); 3106 (offset + sizeof(xfs_agino_t) - 1));
3115 3107
3116 error = xfs_trans_commit(tp, 0); 3108 error = xfs_trans_commit(tp, 0);
3117 if (error) 3109 if (error)
3118 goto out_error; 3110 goto out_error;
3119 return; 3111 return;
3120 3112
3121 out_abort: 3113 out_abort:
3122 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3114 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3123 out_error: 3115 out_error:
3124 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3116 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
3125 "failed to clear agi %d. Continuing.", agno); 3117 "failed to clear agi %d. Continuing.", agno);
3126 return; 3118 return;
3127 } 3119 }
3128 3120
3129 STATIC xfs_agino_t 3121 STATIC xfs_agino_t
3130 xlog_recover_process_one_iunlink( 3122 xlog_recover_process_one_iunlink(
3131 struct xfs_mount *mp, 3123 struct xfs_mount *mp,
3132 xfs_agnumber_t agno, 3124 xfs_agnumber_t agno,
3133 xfs_agino_t agino, 3125 xfs_agino_t agino,
3134 int bucket) 3126 int bucket)
3135 { 3127 {
3136 struct xfs_buf *ibp; 3128 struct xfs_buf *ibp;
3137 struct xfs_dinode *dip; 3129 struct xfs_dinode *dip;
3138 struct xfs_inode *ip; 3130 struct xfs_inode *ip;
3139 xfs_ino_t ino; 3131 xfs_ino_t ino;
3140 int error; 3132 int error;
3141 3133
3142 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3134 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3143 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); 3135 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3144 if (error) 3136 if (error)
3145 goto fail; 3137 goto fail;
3146 3138
3147 /* 3139 /*
3148 * Get the on disk inode to find the next inode in the bucket. 3140 * Get the on disk inode to find the next inode in the bucket.
3149 */ 3141 */
3150 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); 3142 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
3151 if (error) 3143 if (error)
3152 goto fail_iput; 3144 goto fail_iput;
3153 3145
3154 ASSERT(ip->i_d.di_nlink == 0); 3146 ASSERT(ip->i_d.di_nlink == 0);
3155 ASSERT(ip->i_d.di_mode != 0); 3147 ASSERT(ip->i_d.di_mode != 0);
3156 3148
3157 /* setup for the next pass */ 3149 /* setup for the next pass */
3158 agino = be32_to_cpu(dip->di_next_unlinked); 3150 agino = be32_to_cpu(dip->di_next_unlinked);
3159 xfs_buf_relse(ibp); 3151 xfs_buf_relse(ibp);
3160 3152
3161 /* 3153 /*
3162 * Prevent any DMAPI event from being sent when the reference on 3154 * Prevent any DMAPI event from being sent when the reference on
3163 * the inode is dropped. 3155 * the inode is dropped.
3164 */ 3156 */
3165 ip->i_d.di_dmevmask = 0; 3157 ip->i_d.di_dmevmask = 0;
3166 3158
3167 IRELE(ip); 3159 IRELE(ip);
3168 return agino; 3160 return agino;
3169 3161
3170 fail_iput: 3162 fail_iput:
3171 IRELE(ip); 3163 IRELE(ip);
3172 fail: 3164 fail:
3173 /* 3165 /*
3174 * We can't read in the inode this bucket points to, or this inode 3166 * We can't read in the inode this bucket points to, or this inode
3175 * is messed up. Just ditch this bucket of inodes. We will lose 3167 * is messed up. Just ditch this bucket of inodes. We will lose
3176 * some inodes and space, but at least we won't hang. 3168 * some inodes and space, but at least we won't hang.
3177 * 3169 *
3178 * Call xlog_recover_clear_agi_bucket() to perform a transaction to 3170 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3179 * clear the inode pointer in the bucket. 3171 * clear the inode pointer in the bucket.
3180 */ 3172 */
3181 xlog_recover_clear_agi_bucket(mp, agno, bucket); 3173 xlog_recover_clear_agi_bucket(mp, agno, bucket);
3182 return NULLAGINO; 3174 return NULLAGINO;
3183 } 3175 }
3184 3176
3185 /* 3177 /*
3186 * xlog_iunlink_recover 3178 * xlog_iunlink_recover
3187 * 3179 *
3188 * This is called during recovery to process any inodes which 3180 * This is called during recovery to process any inodes which
3189 * we unlinked but did not free when the system crashed. These 3181 * we unlinked but did not free when the system crashed. These
3190 * inodes will be on the lists in the AGI blocks. What we do 3182 * inodes will be on the lists in the AGI blocks. What we do
3191 * here is scan all the AGIs and fully truncate and free any 3183 * here is scan all the AGIs and fully truncate and free any
3192 * inodes found on the lists. Each inode is removed from the 3184 * inodes found on the lists. Each inode is removed from the
3193 * lists when it has been fully truncated and is freed. The 3185 * lists when it has been fully truncated and is freed. The
3194 * freeing of the inode and its removal from the list must be 3186 * freeing of the inode and its removal from the list must be
3195 * atomic. 3187 * atomic.
3196 */ 3188 */
3197 void 3189 void
3198 xlog_recover_process_iunlinks( 3190 xlog_recover_process_iunlinks(
3199 xlog_t *log) 3191 xlog_t *log)
3200 { 3192 {
3201 xfs_mount_t *mp; 3193 xfs_mount_t *mp;
3202 xfs_agnumber_t agno; 3194 xfs_agnumber_t agno;
3203 xfs_agi_t *agi; 3195 xfs_agi_t *agi;
3204 xfs_buf_t *agibp; 3196 xfs_buf_t *agibp;
3205 xfs_agino_t agino; 3197 xfs_agino_t agino;
3206 int bucket; 3198 int bucket;
3207 int error; 3199 int error;
3208 uint mp_dmevmask; 3200 uint mp_dmevmask;
3209 3201
3210 mp = log->l_mp; 3202 mp = log->l_mp;
3211 3203
3212 /* 3204 /*
3213 * Prevent any DMAPI event from being sent while in this function. 3205 * Prevent any DMAPI event from being sent while in this function.
3214 */ 3206 */
3215 mp_dmevmask = mp->m_dmevmask; 3207 mp_dmevmask = mp->m_dmevmask;
3216 mp->m_dmevmask = 0; 3208 mp->m_dmevmask = 0;
3217 3209
3218 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3210 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3219 /* 3211 /*
3220 * Find the agi for this ag. 3212 * Find the agi for this ag.
3221 */ 3213 */
3222 error = xfs_read_agi(mp, NULL, agno, &agibp); 3214 error = xfs_read_agi(mp, NULL, agno, &agibp);
3223 if (error) { 3215 if (error) {
3224 /* 3216 /*
3225 * AGI is b0rked. Don't process it. 3217 * AGI is b0rked. Don't process it.
3226 * 3218 *
3227 * We should probably mark the filesystem as corrupt 3219 * We should probably mark the filesystem as corrupt
3228 * after we've recovered all the ag's we can.... 3220 * after we've recovered all the ag's we can....
3229 */ 3221 */
3230 continue; 3222 continue;
3231 } 3223 }
3232 agi = XFS_BUF_TO_AGI(agibp); 3224 agi = XFS_BUF_TO_AGI(agibp);
3233 3225
3234 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3226 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3235 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3227 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3236 while (agino != NULLAGINO) { 3228 while (agino != NULLAGINO) {
3237 /* 3229 /*
3238 * Release the agi buffer so that it can 3230 * Release the agi buffer so that it can
3239 * be acquired in the normal course of the 3231 * be acquired in the normal course of the
3240 * transaction to truncate and free the inode. 3232 * transaction to truncate and free the inode.
3241 */ 3233 */
3242 xfs_buf_relse(agibp); 3234 xfs_buf_relse(agibp);
3243 3235
3244 agino = xlog_recover_process_one_iunlink(mp, 3236 agino = xlog_recover_process_one_iunlink(mp,
3245 agno, agino, bucket); 3237 agno, agino, bucket);
3246 3238
3247 /* 3239 /*
3248 * Reacquire the agibuffer and continue around 3240 * Reacquire the agibuffer and continue around
3249 * the loop. This should never fail as we know 3241 * the loop. This should never fail as we know
3250 * the buffer was good earlier on. 3242 * the buffer was good earlier on.
3251 */ 3243 */
3252 error = xfs_read_agi(mp, NULL, agno, &agibp); 3244 error = xfs_read_agi(mp, NULL, agno, &agibp);
3253 ASSERT(error == 0); 3245 ASSERT(error == 0);
3254 agi = XFS_BUF_TO_AGI(agibp); 3246 agi = XFS_BUF_TO_AGI(agibp);
3255 } 3247 }
3256 } 3248 }
3257 3249
3258 /* 3250 /*
3259 * Release the buffer for the current agi so we can 3251 * Release the buffer for the current agi so we can
3260 * go on to the next one. 3252 * go on to the next one.
3261 */ 3253 */
3262 xfs_buf_relse(agibp); 3254 xfs_buf_relse(agibp);
3263 } 3255 }
3264 3256
3265 mp->m_dmevmask = mp_dmevmask; 3257 mp->m_dmevmask = mp_dmevmask;
3266 } 3258 }
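
xlog_recover_process_iunlinks() above is, structurally, a nested walk: for every AG, for each of the AGI's unlinked-hash buckets, follow the on-disk di_next_unlinked chain until NULLAGINO, freeing each inode along the way. A self-contained toy model of that walk (everything here — the demo_* names, the array-backed "disk", the bucket and inode counts — is invented for illustration and is not XFS code):

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_AGS        2
    #define DEMO_BUCKETS    4            /* stands in for XFS_AGI_UNLINKED_BUCKETS */
    #define DEMO_INODES     8
    #define DEMO_NULL       0xffffffffu  /* stands in for NULLAGINO */

    /* toy "disk": per-AG bucket heads plus each inode's successor link */
    static uint32_t bucket_head[DEMO_AGS][DEMO_BUCKETS];
    static uint32_t di_next_unlinked[DEMO_AGS][DEMO_INODES];

    static void demo_process_iunlinks(void)
    {
        for (int agno = 0; agno < DEMO_AGS; agno++) {
            for (int bucket = 0; bucket < DEMO_BUCKETS; bucket++) {
                uint32_t agino = bucket_head[agno][bucket];

                while (agino != DEMO_NULL) {
                    /* "truncate and free" the inode, then follow its link */
                    printf("ag %d: freeing unlinked inode %u\n", agno, agino);
                    agino = di_next_unlinked[agno][agino];
                }
            }
        }
    }

    int main(void)
    {
        /* empty buckets and unlinked lists everywhere ... */
        for (int a = 0; a < DEMO_AGS; a++) {
            for (int b = 0; b < DEMO_BUCKETS; b++)
                bucket_head[a][b] = DEMO_NULL;
            for (int i = 0; i < DEMO_INODES; i++)
                di_next_unlinked[a][i] = DEMO_NULL;
        }

        /* ... except one short chain in AG 0, bucket 1: inode 3 -> 5 */
        bucket_head[0][1] = 3;
        di_next_unlinked[0][3] = 5;

        demo_process_iunlinks();
        return 0;
    }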
3267 3259
3268 3260
3269 #ifdef DEBUG 3261 #ifdef DEBUG
3270 STATIC void 3262 STATIC void
3271 xlog_pack_data_checksum( 3263 xlog_pack_data_checksum(
3272 xlog_t *log, 3264 xlog_t *log,
3273 xlog_in_core_t *iclog, 3265 xlog_in_core_t *iclog,
3274 int size) 3266 int size)
3275 { 3267 {
3276 int i; 3268 int i;
3277 __be32 *up; 3269 __be32 *up;
3278 uint chksum = 0; 3270 uint chksum = 0;
3279 3271
3280 up = (__be32 *)iclog->ic_datap; 3272 up = (__be32 *)iclog->ic_datap;
3281 /* divide length by 4 to get # words */ 3273 /* divide length by 4 to get # words */
3282 for (i = 0; i < (size >> 2); i++) { 3274 for (i = 0; i < (size >> 2); i++) {
3283 chksum ^= be32_to_cpu(*up); 3275 chksum ^= be32_to_cpu(*up);
3284 up++; 3276 up++;
3285 } 3277 }
3286 iclog->ic_header.h_chksum = cpu_to_be32(chksum); 3278 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3287 } 3279 }
3288 #else 3280 #else
3289 #define xlog_pack_data_checksum(log, iclog, size) 3281 #define xlog_pack_data_checksum(log, iclog, size)
3290 #endif 3282 #endif
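
When DEBUG is set, the record checksum computed above is nothing more than a running XOR over the 32-bit words of the record payload; its counterpart, xlog_unpack_data_checksum() further down, recomputes the same value and only warns on a mismatch. A standalone sketch (demo_log_checksum is an illustrative name; the kernel code additionally byte-swaps each word via be32_to_cpu(), omitted here):

    #include <stdint.h>

    static uint32_t demo_log_checksum(const uint32_t *words, int len_bytes)
    {
        uint32_t chksum = 0;

        /* divide the byte length by 4 to get the word count */
        for (int i = 0; i < (len_bytes >> 2); i++)
            chksum ^= words[i];
        return chksum;
    }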
3291 3283
3292 /* 3284 /*
3293 * Stamp cycle number in every block 3285 * Stamp cycle number in every block
3294 */ 3286 */
3295 void 3287 void
3296 xlog_pack_data( 3288 xlog_pack_data(
3297 xlog_t *log, 3289 xlog_t *log,
3298 xlog_in_core_t *iclog, 3290 xlog_in_core_t *iclog,
3299 int roundoff) 3291 int roundoff)
3300 { 3292 {
3301 int i, j, k; 3293 int i, j, k;
3302 int size = iclog->ic_offset + roundoff; 3294 int size = iclog->ic_offset + roundoff;
3303 __be32 cycle_lsn; 3295 __be32 cycle_lsn;
3304 xfs_caddr_t dp; 3296 xfs_caddr_t dp;
3305 3297
3306 xlog_pack_data_checksum(log, iclog, size); 3298 xlog_pack_data_checksum(log, iclog, size);
3307 3299
3308 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 3300 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3309 3301
3310 dp = iclog->ic_datap; 3302 dp = iclog->ic_datap;
3311 for (i = 0; i < BTOBB(size) && 3303 for (i = 0; i < BTOBB(size) &&
3312 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3304 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3313 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; 3305 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3314 *(__be32 *)dp = cycle_lsn; 3306 *(__be32 *)dp = cycle_lsn;
3315 dp += BBSIZE; 3307 dp += BBSIZE;
3316 } 3308 }
3317 3309
3318 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3310 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3319 xlog_in_core_2_t *xhdr = iclog->ic_data; 3311 xlog_in_core_2_t *xhdr = iclog->ic_data;
3320 3312
3321 for ( ; i < BTOBB(size); i++) { 3313 for ( ; i < BTOBB(size); i++) {
3322 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3314 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3323 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3315 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3324 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; 3316 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3325 *(__be32 *)dp = cycle_lsn; 3317 *(__be32 *)dp = cycle_lsn;
3326 dp += BBSIZE; 3318 dp += BBSIZE;
3327 } 3319 }
3328 3320
3329 for (i = 1; i < log->l_iclog_heads; i++) { 3321 for (i = 1; i < log->l_iclog_heads; i++) {
3330 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 3322 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3331 } 3323 }
3332 } 3324 }
3333 } 3325 }
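
xlog_pack_data() above is the log's torn-write protection: before an in-core log buffer goes to disk, the first 32-bit word of every 512-byte basic block is saved into the record header's h_cycle_data[] (or the extended v2 headers) and replaced with the current cycle number, so recovery can tell how far a record actually made it to disk; xlog_unpack_data() below restores the saved words. A minimal standalone sketch of that round trip (the demo_* names and the 64-entry header array are stand-ins; the v2 extended headers and the checksum are omitted):

    #include <stdint.h>
    #include <string.h>

    #define DEMO_BBSIZE      512u  /* basic block size, like BBSIZE */
    #define DEMO_HDR_BLOCKS   64u  /* like XLOG_HEADER_CYCLE_SIZE / BBSIZE */

    /* saved first words, mirroring h_cycle_data in the record header */
    struct demo_rec_header {
        uint32_t h_cycle_data[DEMO_HDR_BLOCKS];
    };

    /* pack: stash each block's first word, then stamp the cycle number */
    static void demo_pack(struct demo_rec_header *hdr, unsigned char *data,
                          size_t len, uint32_t cycle_be)
    {
        size_t nblocks = (len + DEMO_BBSIZE - 1) / DEMO_BBSIZE;

        for (size_t i = 0; i < nblocks && i < DEMO_HDR_BLOCKS; i++) {
            memcpy(&hdr->h_cycle_data[i], data + i * DEMO_BBSIZE, 4);
            memcpy(data + i * DEMO_BBSIZE, &cycle_be, 4);
        }
    }

    /* unpack: restore the saved words once the record has been read back */
    static void demo_unpack(const struct demo_rec_header *hdr,
                            unsigned char *data, size_t len)
    {
        size_t nblocks = (len + DEMO_BBSIZE - 1) / DEMO_BBSIZE;

        for (size_t i = 0; i < nblocks && i < DEMO_HDR_BLOCKS; i++)
            memcpy(data + i * DEMO_BBSIZE, &hdr->h_cycle_data[i], 4);
    }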
3334 3326
3335 #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) 3327 #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3336 STATIC void 3328 STATIC void
3337 xlog_unpack_data_checksum( 3329 xlog_unpack_data_checksum(
3338 xlog_rec_header_t *rhead, 3330 xlog_rec_header_t *rhead,
3339 xfs_caddr_t dp, 3331 xfs_caddr_t dp,
3340 xlog_t *log) 3332 xlog_t *log)
3341 { 3333 {
3342 __be32 *up = (__be32 *)dp; 3334 __be32 *up = (__be32 *)dp;
3343 uint chksum = 0; 3335 uint chksum = 0;
3344 int i; 3336 int i;
3345 3337
3346 /* divide length by 4 to get # words */ 3338 /* divide length by 4 to get # words */
3347 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) { 3339 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3348 chksum ^= be32_to_cpu(*up); 3340 chksum ^= be32_to_cpu(*up);
3349 up++; 3341 up++;
3350 } 3342 }
3351 if (chksum != be32_to_cpu(rhead->h_chksum)) { 3343 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3352 if (rhead->h_chksum || 3344 if (rhead->h_chksum ||
3353 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { 3345 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3354 cmn_err(CE_DEBUG, 3346 cmn_err(CE_DEBUG,
3355 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", 3347 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3356 be32_to_cpu(rhead->h_chksum), chksum); 3348 be32_to_cpu(rhead->h_chksum), chksum);
3357 cmn_err(CE_DEBUG, 3349 cmn_err(CE_DEBUG,
3358 "XFS: Disregard message if filesystem was created with non-DEBUG kernel"); 3350 "XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3359 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3351 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3360 cmn_err(CE_DEBUG, 3352 cmn_err(CE_DEBUG,
3361 "XFS: LogR this is a LogV2 filesystem\n"); 3353 "XFS: LogR this is a LogV2 filesystem\n");
3362 } 3354 }
3363 log->l_flags |= XLOG_CHKSUM_MISMATCH; 3355 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3364 } 3356 }
3365 } 3357 }
3366 } 3358 }
3367 #else 3359 #else
3368 #define xlog_unpack_data_checksum(rhead, dp, log) 3360 #define xlog_unpack_data_checksum(rhead, dp, log)
3369 #endif 3361 #endif
3370 3362
3371 STATIC void 3363 STATIC void
3372 xlog_unpack_data( 3364 xlog_unpack_data(
3373 xlog_rec_header_t *rhead, 3365 xlog_rec_header_t *rhead,
3374 xfs_caddr_t dp, 3366 xfs_caddr_t dp,
3375 xlog_t *log) 3367 xlog_t *log)
3376 { 3368 {
3377 int i, j, k; 3369 int i, j, k;
3378 3370
3379 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3371 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3380 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3372 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3381 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; 3373 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3382 dp += BBSIZE; 3374 dp += BBSIZE;
3383 } 3375 }
3384 3376
3385 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3377 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3386 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; 3378 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3387 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 3379 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3388 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3380 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3389 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3381 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3390 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; 3382 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3391 dp += BBSIZE; 3383 dp += BBSIZE;
3392 } 3384 }
3393 } 3385 }
3394 3386
3395 xlog_unpack_data_checksum(rhead, dp, log); 3387 xlog_unpack_data_checksum(rhead, dp, log);
3396 } 3388 }
3397 3389
3398 STATIC int 3390 STATIC int
3399 xlog_valid_rec_header( 3391 xlog_valid_rec_header(
3400 xlog_t *log, 3392 xlog_t *log,
3401 xlog_rec_header_t *rhead, 3393 xlog_rec_header_t *rhead,
3402 xfs_daddr_t blkno) 3394 xfs_daddr_t blkno)
3403 { 3395 {
3404 int hlen; 3396 int hlen;
3405 3397
3406 if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { 3398 if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
3407 XFS_ERROR_REPORT("xlog_valid_rec_header(1)", 3399 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3408 XFS_ERRLEVEL_LOW, log->l_mp); 3400 XFS_ERRLEVEL_LOW, log->l_mp);
3409 return XFS_ERROR(EFSCORRUPTED); 3401 return XFS_ERROR(EFSCORRUPTED);
3410 } 3402 }
3411 if (unlikely( 3403 if (unlikely(
3412 (!rhead->h_version || 3404 (!rhead->h_version ||
3413 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3405 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3414 xlog_warn("XFS: %s: unrecognised log version (%d).", 3406 xlog_warn("XFS: %s: unrecognised log version (%d).",
3415 __func__, be32_to_cpu(rhead->h_version)); 3407 __func__, be32_to_cpu(rhead->h_version));
3416 return XFS_ERROR(EIO); 3408 return XFS_ERROR(EIO);
3417 } 3409 }
3418 3410
3419 /* LR body must have data or it wouldn't have been written */ 3411 /* LR body must have data or it wouldn't have been written */
3420 hlen = be32_to_cpu(rhead->h_len); 3412 hlen = be32_to_cpu(rhead->h_len);
3421 if (unlikely( hlen <= 0 || hlen > INT_MAX )) { 3413 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3422 XFS_ERROR_REPORT("xlog_valid_rec_header(2)", 3414 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3423 XFS_ERRLEVEL_LOW, log->l_mp); 3415 XFS_ERRLEVEL_LOW, log->l_mp);
3424 return XFS_ERROR(EFSCORRUPTED); 3416 return XFS_ERROR(EFSCORRUPTED);
3425 } 3417 }
3426 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { 3418 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3427 XFS_ERROR_REPORT("xlog_valid_rec_header(3)", 3419 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3428 XFS_ERRLEVEL_LOW, log->l_mp); 3420 XFS_ERRLEVEL_LOW, log->l_mp);
3429 return XFS_ERROR(EFSCORRUPTED); 3421 return XFS_ERROR(EFSCORRUPTED);
3430 } 3422 }
3431 return 0; 3423 return 0;
3432 } 3424 }
3433 3425
3434 /* 3426 /*
3435 * Read the log from tail to head and process the log records found. 3427 * Read the log from tail to head and process the log records found.
3436 * Handle the two cases where the tail and head are in the same cycle 3428 * Handle the two cases where the tail and head are in the same cycle
3437 * and where the active portion of the log wraps around the end of 3429 * and where the active portion of the log wraps around the end of
3438 * the physical log separately. The pass parameter is passed through 3430 * the physical log separately. The pass parameter is passed through
3439 * to the routines called to process the data and is not looked at 3431 * to the routines called to process the data and is not looked at
3440 * here. 3432 * here.
3441 */ 3433 */
3442 STATIC int 3434 STATIC int
3443 xlog_do_recovery_pass( 3435 xlog_do_recovery_pass(
3444 xlog_t *log, 3436 xlog_t *log,
3445 xfs_daddr_t head_blk, 3437 xfs_daddr_t head_blk,
3446 xfs_daddr_t tail_blk, 3438 xfs_daddr_t tail_blk,
3447 int pass) 3439 int pass)
3448 { 3440 {
3449 xlog_rec_header_t *rhead; 3441 xlog_rec_header_t *rhead;
3450 xfs_daddr_t blk_no; 3442 xfs_daddr_t blk_no;
3451 xfs_caddr_t bufaddr, offset; 3443 xfs_caddr_t bufaddr, offset;
3452 xfs_buf_t *hbp, *dbp; 3444 xfs_buf_t *hbp, *dbp;
3453 int error = 0, h_size; 3445 int error = 0, h_size;
3454 int bblks, split_bblks; 3446 int bblks, split_bblks;
3455 int hblks, split_hblks, wrapped_hblks; 3447 int hblks, split_hblks, wrapped_hblks;
3456 xlog_recover_t *rhash[XLOG_RHASH_SIZE]; 3448 xlog_recover_t *rhash[XLOG_RHASH_SIZE];
3457 3449
3458 ASSERT(head_blk != tail_blk); 3450 ASSERT(head_blk != tail_blk);
3459 3451
3460 /* 3452 /*
3461 * Read the header of the tail block and get the iclog buffer size from 3453 * Read the header of the tail block and get the iclog buffer size from
3462 * h_size. Use this to tell how many sectors make up the log header. 3454 * h_size. Use this to tell how many sectors make up the log header.
3463 */ 3455 */
3464 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3456 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3465 /* 3457 /*
3466 * When using variable length iclogs, read first sector of 3458 * When using variable length iclogs, read first sector of
3467 * iclog header and extract the header size from it. Get a 3459 * iclog header and extract the header size from it. Get a
3468 * new hbp that is the correct size. 3460 * new hbp that is the correct size.
3469 */ 3461 */
3470 hbp = xlog_get_bp(log, 1); 3462 hbp = xlog_get_bp(log, 1);
3471 if (!hbp) 3463 if (!hbp)
3472 return ENOMEM; 3464 return ENOMEM;
3473 if ((error = xlog_bread(log, tail_blk, 1, hbp))) 3465 if ((error = xlog_bread(log, tail_blk, 1, hbp)))
3474 goto bread_err1; 3466 goto bread_err1;
3475 offset = xlog_align(log, tail_blk, 1, hbp); 3467 offset = xlog_align(log, tail_blk, 1, hbp);
3476 rhead = (xlog_rec_header_t *)offset; 3468 rhead = (xlog_rec_header_t *)offset;
3477 error = xlog_valid_rec_header(log, rhead, tail_blk); 3469 error = xlog_valid_rec_header(log, rhead, tail_blk);
3478 if (error) 3470 if (error)
3479 goto bread_err1; 3471 goto bread_err1;
3480 h_size = be32_to_cpu(rhead->h_size); 3472 h_size = be32_to_cpu(rhead->h_size);
3481 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && 3473 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3482 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 3474 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3483 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 3475 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3484 if (h_size % XLOG_HEADER_CYCLE_SIZE) 3476 if (h_size % XLOG_HEADER_CYCLE_SIZE)
3485 hblks++; 3477 hblks++;
3486 xlog_put_bp(hbp); 3478 xlog_put_bp(hbp);
3487 hbp = xlog_get_bp(log, hblks); 3479 hbp = xlog_get_bp(log, hblks);
3488 } else { 3480 } else {
3489 hblks = 1; 3481 hblks = 1;
3490 } 3482 }
3491 } else { 3483 } else {
3492 ASSERT(log->l_sectbb_log == 0); 3484 ASSERT(log->l_sectbb_log == 0);
3493 hblks = 1; 3485 hblks = 1;
3494 hbp = xlog_get_bp(log, 1); 3486 hbp = xlog_get_bp(log, 1);
3495 h_size = XLOG_BIG_RECORD_BSIZE; 3487 h_size = XLOG_BIG_RECORD_BSIZE;
3496 } 3488 }
3497 3489
3498 if (!hbp) 3490 if (!hbp)
3499 return ENOMEM; 3491 return ENOMEM;
3500 dbp = xlog_get_bp(log, BTOBB(h_size)); 3492 dbp = xlog_get_bp(log, BTOBB(h_size));
3501 if (!dbp) { 3493 if (!dbp) {
3502 xlog_put_bp(hbp); 3494 xlog_put_bp(hbp);
3503 return ENOMEM; 3495 return ENOMEM;
3504 } 3496 }
3505 3497
3506 memset(rhash, 0, sizeof(rhash)); 3498 memset(rhash, 0, sizeof(rhash));
3507 if (tail_blk <= head_blk) { 3499 if (tail_blk <= head_blk) {
3508 for (blk_no = tail_blk; blk_no < head_blk; ) { 3500 for (blk_no = tail_blk; blk_no < head_blk; ) {
3509 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3501 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3510 goto bread_err2; 3502 goto bread_err2;
3511 offset = xlog_align(log, blk_no, hblks, hbp); 3503 offset = xlog_align(log, blk_no, hblks, hbp);
3512 rhead = (xlog_rec_header_t *)offset; 3504 rhead = (xlog_rec_header_t *)offset;
3513 error = xlog_valid_rec_header(log, rhead, blk_no); 3505 error = xlog_valid_rec_header(log, rhead, blk_no);
3514 if (error) 3506 if (error)
3515 goto bread_err2; 3507 goto bread_err2;
3516 3508
3517 /* blocks in data section */ 3509 /* blocks in data section */
3518 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3510 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3519 error = xlog_bread(log, blk_no + hblks, bblks, dbp); 3511 error = xlog_bread(log, blk_no + hblks, bblks, dbp);
3520 if (error) 3512 if (error)
3521 goto bread_err2; 3513 goto bread_err2;
3522 offset = xlog_align(log, blk_no + hblks, bblks, dbp); 3514 offset = xlog_align(log, blk_no + hblks, bblks, dbp);
3523 xlog_unpack_data(rhead, offset, log); 3515 xlog_unpack_data(rhead, offset, log);
3524 if ((error = xlog_recover_process_data(log, 3516 if ((error = xlog_recover_process_data(log,
3525 rhash, rhead, offset, pass))) 3517 rhash, rhead, offset, pass)))
3526 goto bread_err2; 3518 goto bread_err2;
3527 blk_no += bblks + hblks; 3519 blk_no += bblks + hblks;
3528 } 3520 }
3529 } else { 3521 } else {
3530 /* 3522 /*
3531 * Perform recovery around the end of the physical log. 3523 * Perform recovery around the end of the physical log.
3532 * When the head is not on the same cycle number as the tail, 3524 * When the head is not on the same cycle number as the tail,
3533 * we can't do a sequential recovery as above. 3525 * we can't do a sequential recovery as above.
3534 */ 3526 */
3535 blk_no = tail_blk; 3527 blk_no = tail_blk;
3536 while (blk_no < log->l_logBBsize) { 3528 while (blk_no < log->l_logBBsize) {
3537 /* 3529 /*
3538 * Check for header wrapping around physical end-of-log 3530 * Check for header wrapping around physical end-of-log
3539 */ 3531 */
3540 offset = NULL; 3532 offset = NULL;
3541 split_hblks = 0; 3533 split_hblks = 0;
3542 wrapped_hblks = 0; 3534 wrapped_hblks = 0;
3543 if (blk_no + hblks <= log->l_logBBsize) { 3535 if (blk_no + hblks <= log->l_logBBsize) {
3544 /* Read header in one read */ 3536 /* Read header in one read */
3545 error = xlog_bread(log, blk_no, hblks, hbp); 3537 error = xlog_bread(log, blk_no, hblks, hbp);
3546 if (error) 3538 if (error)
3547 goto bread_err2; 3539 goto bread_err2;
3548 offset = xlog_align(log, blk_no, hblks, hbp); 3540 offset = xlog_align(log, blk_no, hblks, hbp);
3549 } else { 3541 } else {
3550 /* This LR is split across physical log end */ 3542 /* This LR is split across physical log end */
3551 if (blk_no != log->l_logBBsize) { 3543 if (blk_no != log->l_logBBsize) {
3552 /* some data before physical log end */ 3544 /* some data before physical log end */
3553 ASSERT(blk_no <= INT_MAX); 3545 ASSERT(blk_no <= INT_MAX);
3554 split_hblks = log->l_logBBsize - (int)blk_no; 3546 split_hblks = log->l_logBBsize - (int)blk_no;
3555 ASSERT(split_hblks > 0); 3547 ASSERT(split_hblks > 0);
3556 if ((error = xlog_bread(log, blk_no, 3548 if ((error = xlog_bread(log, blk_no,
3557 split_hblks, hbp))) 3549 split_hblks, hbp)))
3558 goto bread_err2; 3550 goto bread_err2;
3559 offset = xlog_align(log, blk_no, 3551 offset = xlog_align(log, blk_no,
3560 split_hblks, hbp); 3552 split_hblks, hbp);
3561 } 3553 }
3562 /* 3554 /*
3563 * Note: this black magic still works with 3555 * Note: this black magic still works with
3564 * large sector sizes (non-512) only because: 3556 * large sector sizes (non-512) only because:
3565 * - we increased the buffer size originally 3557 * - we increased the buffer size originally
3566 * by 1 sector giving us enough extra space 3558 * by 1 sector giving us enough extra space
3567 * for the second read; 3559 * for the second read;
3568 * - the log start is guaranteed to be sector 3560 * - the log start is guaranteed to be sector
3569 * aligned; 3561 * aligned;
3570 * - we read the log end (LR header start) 3562 * - we read the log end (LR header start)
3571 * _first_, then the log start (LR header end) 3563 * _first_, then the log start (LR header end)
3572 * - order is important. 3564 * - order is important.
3573 */ 3565 */
3574 wrapped_hblks = hblks - split_hblks; 3566 wrapped_hblks = hblks - split_hblks;
3575 bufaddr = XFS_BUF_PTR(hbp); 3567 bufaddr = XFS_BUF_PTR(hbp);
3576 error = XFS_BUF_SET_PTR(hbp, 3568 error = XFS_BUF_SET_PTR(hbp,
3577 bufaddr + BBTOB(split_hblks), 3569 bufaddr + BBTOB(split_hblks),
3578 BBTOB(hblks - split_hblks)); 3570 BBTOB(hblks - split_hblks));
3579 if (!error) 3571 if (!error)
3580 error = xlog_bread(log, 0, 3572 error = xlog_bread(log, 0,
3581 wrapped_hblks, hbp); 3573 wrapped_hblks, hbp);
3582 if (!error) 3574 if (!error)
3583 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3575 error = XFS_BUF_SET_PTR(hbp, bufaddr,
3584 BBTOB(hblks)); 3576 BBTOB(hblks));
3585 if (error) 3577 if (error)
3586 goto bread_err2; 3578 goto bread_err2;
3587 if (!offset) 3579 if (!offset)
3588 offset = xlog_align(log, 0, 3580 offset = xlog_align(log, 0,
3589 wrapped_hblks, hbp); 3581 wrapped_hblks, hbp);
3590 } 3582 }
3591 rhead = (xlog_rec_header_t *)offset; 3583 rhead = (xlog_rec_header_t *)offset;
3592 error = xlog_valid_rec_header(log, rhead, 3584 error = xlog_valid_rec_header(log, rhead,
3593 split_hblks ? blk_no : 0); 3585 split_hblks ? blk_no : 0);
3594 if (error) 3586 if (error)
3595 goto bread_err2; 3587 goto bread_err2;
3596 3588
3597 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3589 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3598 blk_no += hblks; 3590 blk_no += hblks;
3599 3591
3600 /* Read in data for log record */ 3592 /* Read in data for log record */
3601 if (blk_no + bblks <= log->l_logBBsize) { 3593 if (blk_no + bblks <= log->l_logBBsize) {
3602 error = xlog_bread(log, blk_no, bblks, dbp); 3594 error = xlog_bread(log, blk_no, bblks, dbp);
3603 if (error) 3595 if (error)
3604 goto bread_err2; 3596 goto bread_err2;
3605 offset = xlog_align(log, blk_no, bblks, dbp); 3597 offset = xlog_align(log, blk_no, bblks, dbp);
3606 } else { 3598 } else {
3607 /* This log record is split across the 3599 /* This log record is split across the
3608 * physical end of log */ 3600 * physical end of log */
3609 offset = NULL; 3601 offset = NULL;
3610 split_bblks = 0; 3602 split_bblks = 0;
3611 if (blk_no != log->l_logBBsize) { 3603 if (blk_no != log->l_logBBsize) {
3612 /* some data is before the physical 3604 /* some data is before the physical
3613 * end of log */ 3605 * end of log */
3614 ASSERT(!wrapped_hblks); 3606 ASSERT(!wrapped_hblks);
3615 ASSERT(blk_no <= INT_MAX); 3607 ASSERT(blk_no <= INT_MAX);
3616 split_bblks = 3608 split_bblks =
3617 log->l_logBBsize - (int)blk_no; 3609 log->l_logBBsize - (int)blk_no;
3618 ASSERT(split_bblks > 0); 3610 ASSERT(split_bblks > 0);
3619 if ((error = xlog_bread(log, blk_no, 3611 if ((error = xlog_bread(log, blk_no,
3620 split_bblks, dbp))) 3612 split_bblks, dbp)))
3621 goto bread_err2; 3613 goto bread_err2;
3622 offset = xlog_align(log, blk_no, 3614 offset = xlog_align(log, blk_no,
3623 split_bblks, dbp); 3615 split_bblks, dbp);
3624 } 3616 }
3625 /* 3617 /*
3626 * Note: this black magic still works with 3618 * Note: this black magic still works with
3627 * large sector sizes (non-512) only because: 3619 * large sector sizes (non-512) only because:
3628 * - we increased the buffer size originally 3620 * - we increased the buffer size originally
3629 * by 1 sector giving us enough extra space 3621 * by 1 sector giving us enough extra space
3630 * for the second read; 3622 * for the second read;
3631 * - the log start is guaranteed to be sector 3623 * - the log start is guaranteed to be sector
3632 * aligned; 3624 * aligned;
3633 * - we read the log end (LR header start) 3625 * - we read the log end (LR header start)
3634 * _first_, then the log start (LR header end) 3626 * _first_, then the log start (LR header end)
3635 * - order is important. 3627 * - order is important.
3636 */ 3628 */
3637 bufaddr = XFS_BUF_PTR(dbp); 3629 bufaddr = XFS_BUF_PTR(dbp);
3638 error = XFS_BUF_SET_PTR(dbp, 3630 error = XFS_BUF_SET_PTR(dbp,
3639 bufaddr + BBTOB(split_bblks), 3631 bufaddr + BBTOB(split_bblks),
3640 BBTOB(bblks - split_bblks)); 3632 BBTOB(bblks - split_bblks));
3641 if (!error) 3633 if (!error)
3642 error = xlog_bread(log, wrapped_hblks, 3634 error = xlog_bread(log, wrapped_hblks,
3643 bblks - split_bblks, 3635 bblks - split_bblks,
3644 dbp); 3636 dbp);
3645 if (!error) 3637 if (!error)
3646 error = XFS_BUF_SET_PTR(dbp, bufaddr, 3638 error = XFS_BUF_SET_PTR(dbp, bufaddr,
3647 h_size); 3639 h_size);
3648 if (error) 3640 if (error)
3649 goto bread_err2; 3641 goto bread_err2;
3650 if (!offset) 3642 if (!offset)
3651 offset = xlog_align(log, wrapped_hblks, 3643 offset = xlog_align(log, wrapped_hblks,
3652 bblks - split_bblks, dbp); 3644 bblks - split_bblks, dbp);
3653 } 3645 }
3654 xlog_unpack_data(rhead, offset, log); 3646 xlog_unpack_data(rhead, offset, log);
3655 if ((error = xlog_recover_process_data(log, rhash, 3647 if ((error = xlog_recover_process_data(log, rhash,
3656 rhead, offset, pass))) 3648 rhead, offset, pass)))
3657 goto bread_err2; 3649 goto bread_err2;
3658 blk_no += bblks; 3650 blk_no += bblks;
3659 } 3651 }
3660 3652
3661 ASSERT(blk_no >= log->l_logBBsize); 3653 ASSERT(blk_no >= log->l_logBBsize);
3662 blk_no -= log->l_logBBsize; 3654 blk_no -= log->l_logBBsize;
3663 3655
3664 /* read first part of physical log */ 3656 /* read first part of physical log */
3665 while (blk_no < head_blk) { 3657 while (blk_no < head_blk) {
3666 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3658 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3667 goto bread_err2; 3659 goto bread_err2;
3668 offset = xlog_align(log, blk_no, hblks, hbp); 3660 offset = xlog_align(log, blk_no, hblks, hbp);
3669 rhead = (xlog_rec_header_t *)offset; 3661 rhead = (xlog_rec_header_t *)offset;
3670 error = xlog_valid_rec_header(log, rhead, blk_no); 3662 error = xlog_valid_rec_header(log, rhead, blk_no);
3671 if (error) 3663 if (error)
3672 goto bread_err2; 3664 goto bread_err2;
3673 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3665 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3674 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) 3666 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
3675 goto bread_err2; 3667 goto bread_err2;
3676 offset = xlog_align(log, blk_no+hblks, bblks, dbp); 3668 offset = xlog_align(log, blk_no+hblks, bblks, dbp);
3677 xlog_unpack_data(rhead, offset, log); 3669 xlog_unpack_data(rhead, offset, log);
3678 if ((error = xlog_recover_process_data(log, rhash, 3670 if ((error = xlog_recover_process_data(log, rhash,
3679 rhead, offset, pass))) 3671 rhead, offset, pass)))
3680 goto bread_err2; 3672 goto bread_err2;
3681 blk_no += bblks + hblks; 3673 blk_no += bblks + hblks;
3682 } 3674 }
3683 } 3675 }
3684 3676
3685 bread_err2: 3677 bread_err2:
3686 xlog_put_bp(dbp); 3678 xlog_put_bp(dbp);
3687 bread_err1: 3679 bread_err1:
3688 xlog_put_bp(hbp); 3680 xlog_put_bp(hbp);
3689 return error; 3681 return error;
3690 } 3682 }
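The "black magic" the comments in xlog_do_recovery_pass() warn about is just a split read: when a header or body of N blocks starts at blk_no but the physical log ends before blk_no + N, the first log_size - blk_no blocks come from the end of the log and the remaining blocks come from block 0, placed into the tail of the same buffer. A rough user-space sketch of the split, operating on an in-memory log image instead of xlog_bread():

#include <string.h>
#include <stddef.h>

#define BBSIZE 512

/*
 * Read 'nblocks' log blocks starting at 'blk_no' from an in-memory
 * log image that is 'log_size' blocks long, wrapping at the physical
 * end of the log the same way xlog_do_recovery_pass() does: the piece
 * before the end is read first, then the piece from block 0 is read
 * into the tail of the same buffer.
 */
static void read_wrapped(unsigned char *dst, const unsigned char *log_img,
                         size_t log_size, size_t blk_no, size_t nblocks)
{
    if (blk_no + nblocks <= log_size) {            /* no wrap needed */
        memcpy(dst, log_img + blk_no * BBSIZE, nblocks * BBSIZE);
        return;
    }

    /* the record straddles the physical end of the log */
    size_t split = log_size - blk_no;              /* blocks before the end */
    size_t wrapped = nblocks - split;              /* blocks from the start */

    memcpy(dst, log_img + blk_no * BBSIZE, split * BBSIZE);
    memcpy(dst + split * BBSIZE, log_img, wrapped * BBSIZE);
}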
3691 3683
3692 /* 3684 /*
3693 * Do the recovery of the log. We actually do this in two phases. 3685 * Do the recovery of the log. We actually do this in two phases.
3694 * The two passes are necessary in order to implement the function 3686 * The two passes are necessary in order to implement the function
3695 * of cancelling a record written into the log. The first pass 3687 * of cancelling a record written into the log. The first pass
3696 * determines those things which have been cancelled, and the 3688 * determines those things which have been cancelled, and the
3697 * second pass replays log items normally except for those which 3689 * second pass replays log items normally except for those which
3698 * have been cancelled. The handling of the replay and cancellations 3690 * have been cancelled. The handling of the replay and cancellations
3699 * takes place in the log item type specific routines. 3691 * takes place in the log item type specific routines.
3700 * 3692 *
3701 * The table of items which have cancel records in the log is allocated 3693 * The table of items which have cancel records in the log is allocated
3702 * and freed at this level, since only here do we know when all of 3694 * and freed at this level, since only here do we know when all of
3703 * the log recovery has been completed. 3695 * the log recovery has been completed.
3704 */ 3696 */
3705 STATIC int 3697 STATIC int
3706 xlog_do_log_recovery( 3698 xlog_do_log_recovery(
3707 xlog_t *log, 3699 xlog_t *log,
3708 xfs_daddr_t head_blk, 3700 xfs_daddr_t head_blk,
3709 xfs_daddr_t tail_blk) 3701 xfs_daddr_t tail_blk)
3710 { 3702 {
3711 int error; 3703 int error;
3712 3704
3713 ASSERT(head_blk != tail_blk); 3705 ASSERT(head_blk != tail_blk);
3714 3706
3715 /* 3707 /*
3716 * First do a pass to find all of the cancelled buf log items. 3708 * First do a pass to find all of the cancelled buf log items.
3717 * Store them in the buf_cancel_table for use in the second pass. 3709 * Store them in the buf_cancel_table for use in the second pass.
3718 */ 3710 */
3719 log->l_buf_cancel_table = 3711 log->l_buf_cancel_table =
3720 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3712 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3721 sizeof(xfs_buf_cancel_t*), 3713 sizeof(xfs_buf_cancel_t*),
3722 KM_SLEEP); 3714 KM_SLEEP);
3723 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3715 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3724 XLOG_RECOVER_PASS1); 3716 XLOG_RECOVER_PASS1);
3725 if (error != 0) { 3717 if (error != 0) {
3726 kmem_free(log->l_buf_cancel_table); 3718 kmem_free(log->l_buf_cancel_table);
3727 log->l_buf_cancel_table = NULL; 3719 log->l_buf_cancel_table = NULL;
3728 return error; 3720 return error;
3729 } 3721 }
3730 /* 3722 /*
3731 * Then do a second pass to actually recover the items in the log. 3723 * Then do a second pass to actually recover the items in the log.
3732 * When it is complete free the table of buf cancel items. 3724 * When it is complete free the table of buf cancel items.
3733 */ 3725 */
3734 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3726 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3735 XLOG_RECOVER_PASS2); 3727 XLOG_RECOVER_PASS2);
3736 #ifdef DEBUG 3728 #ifdef DEBUG
3737 if (!error) { 3729 if (!error) {
3738 int i; 3730 int i;
3739 3731
3740 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3732 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3741 ASSERT(log->l_buf_cancel_table[i] == NULL); 3733 ASSERT(log->l_buf_cancel_table[i] == NULL);
3742 } 3734 }
3743 #endif /* DEBUG */ 3735 #endif /* DEBUG */
3744 3736
3745 kmem_free(log->l_buf_cancel_table); 3737 kmem_free(log->l_buf_cancel_table);
3746 log->l_buf_cancel_table = NULL; 3738 log->l_buf_cancel_table = NULL;
3747 3739
3748 return error; 3740 return error;
3749 } 3741 }
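The comment above xlog_do_log_recovery() describes the two-pass scheme; the essential idea is that pass one only records which buffers have cancel records, and pass two replays every item except those. A deliberately simplified sketch of that idea, with a flat array standing in for the real l_buf_cancel_table hash table:

#include <stdbool.h>
#include <stddef.h>

/* A log item as seen by this toy replay loop. */
struct item {
    unsigned long long blkno;      /* buffer the item modifies */
    bool               is_cancel;  /* cancellation record for that buffer */
};

#define MAX_CANCELLED 64

/* Pass 1: remember which buffers have cancel records. */
static size_t collect_cancelled(const struct item *items, size_t n,
                                unsigned long long *cancelled)
{
    size_t i, ncancel = 0;

    for (i = 0; i < n && ncancel < MAX_CANCELLED; i++)
        if (items[i].is_cancel)
            cancelled[ncancel++] = items[i].blkno;
    return ncancel;
}

static bool is_cancelled(unsigned long long blkno,
                         const unsigned long long *cancelled, size_t ncancel)
{
    size_t i;

    for (i = 0; i < ncancel; i++)
        if (cancelled[i] == blkno)
            return true;
    return false;
}

/* Pass 2: replay every item whose target buffer was not cancelled. */
static size_t replay_items(const struct item *items, size_t n,
                           const unsigned long long *cancelled, size_t ncancel)
{
    size_t i, replayed = 0;

    for (i = 0; i < n; i++)
        if (!items[i].is_cancel &&
            !is_cancelled(items[i].blkno, cancelled, ncancel))
            replayed++;                /* apply the update here */
    return replayed;
}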
3750 3742
3751 /* 3743 /*
3752 * Do the actual recovery 3744 * Do the actual recovery
3753 */ 3745 */
3754 STATIC int 3746 STATIC int
3755 xlog_do_recover( 3747 xlog_do_recover(
3756 xlog_t *log, 3748 xlog_t *log,
3757 xfs_daddr_t head_blk, 3749 xfs_daddr_t head_blk,
3758 xfs_daddr_t tail_blk) 3750 xfs_daddr_t tail_blk)
3759 { 3751 {
3760 int error; 3752 int error;
3761 xfs_buf_t *bp; 3753 xfs_buf_t *bp;
3762 xfs_sb_t *sbp; 3754 xfs_sb_t *sbp;
3763 3755
3764 /* 3756 /*
3765 * First replay the images in the log. 3757 * First replay the images in the log.
3766 */ 3758 */
3767 error = xlog_do_log_recovery(log, head_blk, tail_blk); 3759 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3768 if (error) { 3760 if (error) {
3769 return error; 3761 return error;
3770 } 3762 }
3771 3763
3772 XFS_bflush(log->l_mp->m_ddev_targp); 3764 XFS_bflush(log->l_mp->m_ddev_targp);
3773 3765
3774 /* 3766 /*
3775 * If IO errors happened during recovery, bail out. 3767 * If IO errors happened during recovery, bail out.
3776 */ 3768 */
3777 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 3769 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3778 return (EIO); 3770 return (EIO);
3779 } 3771 }
3780 3772
3781 /* 3773 /*
3782 * We now update the tail_lsn since much of the recovery has completed 3774 * We now update the tail_lsn since much of the recovery has completed
3783 * and there may be space available to use. If there were no extent 3775 * and there may be space available to use. If there were no extent
3784 * or iunlinks, we can free up the entire log and set the tail_lsn to 3776 * or iunlinks, we can free up the entire log and set the tail_lsn to
3785 * be the last_sync_lsn. This was set in xlog_find_tail to be the 3777 * be the last_sync_lsn. This was set in xlog_find_tail to be the
3786 * lsn of the last known good LR on disk. If there are extent frees 3778 * lsn of the last known good LR on disk. If there are extent frees
3787 * or iunlinks they will have some entries in the AIL; so we look at 3779 * or iunlinks they will have some entries in the AIL; so we look at
3788 * the AIL to determine how to set the tail_lsn. 3780 * the AIL to determine how to set the tail_lsn.
3789 */ 3781 */
3790 xlog_assign_tail_lsn(log->l_mp); 3782 xlog_assign_tail_lsn(log->l_mp);
3791 3783
3792 /* 3784 /*
3793 * Now that we've finished replaying all buffer and inode 3785 * Now that we've finished replaying all buffer and inode
3794 * updates, re-read in the superblock. 3786 * updates, re-read in the superblock.
3795 */ 3787 */
3796 bp = xfs_getsb(log->l_mp, 0); 3788 bp = xfs_getsb(log->l_mp, 0);
3797 XFS_BUF_UNDONE(bp); 3789 XFS_BUF_UNDONE(bp);
3798 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3790 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3799 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 3791 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3800 XFS_BUF_READ(bp); 3792 XFS_BUF_READ(bp);
3801 XFS_BUF_UNASYNC(bp); 3793 XFS_BUF_UNASYNC(bp);
3802 xfsbdstrat(log->l_mp, bp); 3794 xfsbdstrat(log->l_mp, bp);
3803 error = xfs_iowait(bp); 3795 error = xfs_iowait(bp);
3804 if (error) { 3796 if (error) {
3805 xfs_ioerror_alert("xlog_do_recover", 3797 xfs_ioerror_alert("xlog_do_recover",
3806 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3798 log->l_mp, bp, XFS_BUF_ADDR(bp));
3807 ASSERT(0); 3799 ASSERT(0);
3808 xfs_buf_relse(bp); 3800 xfs_buf_relse(bp);
3809 return error; 3801 return error;
3810 } 3802 }
3811 3803
3812 /* Convert superblock from on-disk format */ 3804 /* Convert superblock from on-disk format */
3813 sbp = &log->l_mp->m_sb; 3805 sbp = &log->l_mp->m_sb;
3814 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 3806 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3815 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3807 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3816 ASSERT(xfs_sb_good_version(sbp)); 3808 ASSERT(xfs_sb_good_version(sbp));
3817 xfs_buf_relse(bp); 3809 xfs_buf_relse(bp);
3818 3810
3819 /* We've re-read the superblock so re-initialize per-cpu counters */ 3811 /* We've re-read the superblock so re-initialize per-cpu counters */
3820 xfs_icsb_reinit_counters(log->l_mp); 3812 xfs_icsb_reinit_counters(log->l_mp);
3821 3813
3822 xlog_recover_check_summary(log); 3814 xlog_recover_check_summary(log);
3823 3815
3824 /* Normal transactions can now occur */ 3816 /* Normal transactions can now occur */
3825 log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 3817 log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3826 return 0; 3818 return 0;
3827 } 3819 }
3828 3820
3829 /* 3821 /*
3830 * Perform recovery and re-initialize some log variables in xlog_find_tail. 3822 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3831 * 3823 *
3832 * Return error or zero. 3824 * Return error or zero.
3833 */ 3825 */
3834 int 3826 int
3835 xlog_recover( 3827 xlog_recover(
3836 xlog_t *log) 3828 xlog_t *log)
3837 { 3829 {
3838 xfs_daddr_t head_blk, tail_blk; 3830 xfs_daddr_t head_blk, tail_blk;
3839 int error; 3831 int error;
3840 3832
3841 /* find the tail of the log */ 3833 /* find the tail of the log */
3842 if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) 3834 if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3843 return error; 3835 return error;
3844 3836
3845 if (tail_blk != head_blk) { 3837 if (tail_blk != head_blk) {
3846 /* There used to be a comment here: 3838 /* There used to be a comment here:
3847 * 3839 *
3848 * disallow recovery on read-only mounts. note -- mount 3840 * disallow recovery on read-only mounts. note -- mount
3849 * checks for ENOSPC and turns it into an intelligent 3841 * checks for ENOSPC and turns it into an intelligent
3850 * error message. 3842 * error message.
3851 * ...but this is no longer true. Now, unless you specify 3843 * ...but this is no longer true. Now, unless you specify
3852 * NORECOVERY (in which case this function would never be 3844 * NORECOVERY (in which case this function would never be
3853 * called), we just go ahead and recover. We do this all 3845 * called), we just go ahead and recover. We do this all
3854 * under the vfs layer, so we can get away with it unless 3846 * under the vfs layer, so we can get away with it unless
3855 * the device itself is read-only, in which case we fail. 3847 * the device itself is read-only, in which case we fail.
3856 */ 3848 */
3857 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { 3849 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3858 return error; 3850 return error;
3859 } 3851 }
3860 3852
3861 cmn_err(CE_NOTE, 3853 cmn_err(CE_NOTE,
3862 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3854 "Starting XFS recovery on filesystem: %s (logdev: %s)",
3863 log->l_mp->m_fsname, log->l_mp->m_logname ? 3855 log->l_mp->m_fsname, log->l_mp->m_logname ?
3864 log->l_mp->m_logname : "internal"); 3856 log->l_mp->m_logname : "internal");
3865 3857
3866 error = xlog_do_recover(log, head_blk, tail_blk); 3858 error = xlog_do_recover(log, head_blk, tail_blk);
3867 log->l_flags |= XLOG_RECOVERY_NEEDED; 3859 log->l_flags |= XLOG_RECOVERY_NEEDED;
3868 } 3860 }
3869 return error; 3861 return error;
3870 } 3862 }
3871 3863
3872 /* 3864 /*
3873 * In the first part of recovery we replay inodes and buffers and build 3865 * In the first part of recovery we replay inodes and buffers and build
3874 * up the list of extent free items which need to be processed. Here 3866 * up the list of extent free items which need to be processed. Here
3875 * we process the extent free items and clean up the on disk unlinked 3867 * we process the extent free items and clean up the on disk unlinked
3876 * inode lists. This is separated from the first part of recovery so 3868 * inode lists. This is separated from the first part of recovery so
3877 * that the root and real-time bitmap inodes can be read in from disk in 3869 * that the root and real-time bitmap inodes can be read in from disk in
3878 * between the two stages. This is necessary so that we can free space 3870 * between the two stages. This is necessary so that we can free space
3879 * in the real-time portion of the file system. 3871 * in the real-time portion of the file system.
3880 */ 3872 */
3881 int 3873 int
3882 xlog_recover_finish( 3874 xlog_recover_finish(
3883 xlog_t *log) 3875 xlog_t *log)
3884 { 3876 {
3885 /* 3877 /*
3886 * Now we're ready to do the transactions needed for the 3878 * Now we're ready to do the transactions needed for the
3887 * rest of recovery. Start with completing all the extent 3879 * rest of recovery. Start with completing all the extent
3888 * free intent records and then process the unlinked inode 3880 * free intent records and then process the unlinked inode
3889 * lists. At this point, we essentially run in normal mode 3881 * lists. At this point, we essentially run in normal mode
3890 * except that we're still performing recovery actions 3882 * except that we're still performing recovery actions
3891 * rather than accepting new requests. 3883 * rather than accepting new requests.
3892 */ 3884 */
3893 if (log->l_flags & XLOG_RECOVERY_NEEDED) { 3885 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3894 int error; 3886 int error;
3895 error = xlog_recover_process_efis(log); 3887 error = xlog_recover_process_efis(log);
3896 if (error) { 3888 if (error) {
3897 cmn_err(CE_ALERT, 3889 cmn_err(CE_ALERT,
3898 "Failed to recover EFIs on filesystem: %s", 3890 "Failed to recover EFIs on filesystem: %s",
3899 log->l_mp->m_fsname); 3891 log->l_mp->m_fsname);
3900 return error; 3892 return error;
3901 } 3893 }
3902 /* 3894 /*
3903 * Sync the log to get all the EFIs out of the AIL. 3895 * Sync the log to get all the EFIs out of the AIL.
3904 * This isn't absolutely necessary, but it helps in 3896 * This isn't absolutely necessary, but it helps in
3905 * case the unlink transactions would have problems 3897 * case the unlink transactions would have problems
3906 * pushing the EFIs out of the way. 3898 * pushing the EFIs out of the way.
3907 */ 3899 */
3908 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3900 xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3909 (XFS_LOG_FORCE | XFS_LOG_SYNC)); 3901 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3910 3902
3911 xlog_recover_process_iunlinks(log); 3903 xlog_recover_process_iunlinks(log);
3912 3904
3913 xlog_recover_check_summary(log); 3905 xlog_recover_check_summary(log);
3914 3906
3915 cmn_err(CE_NOTE, 3907 cmn_err(CE_NOTE,
3916 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3908 "Ending XFS recovery on filesystem: %s (logdev: %s)",
3917 log->l_mp->m_fsname, log->l_mp->m_logname ? 3909 log->l_mp->m_fsname, log->l_mp->m_logname ?
3918 log->l_mp->m_logname : "internal"); 3910 log->l_mp->m_logname : "internal");
3919 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3911 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3920 } else { 3912 } else {
3921 cmn_err(CE_DEBUG, 3913 cmn_err(CE_DEBUG,
3922 "!Ending clean XFS mount for filesystem: %s\n", 3914 "!Ending clean XFS mount for filesystem: %s\n",
3923 log->l_mp->m_fsname); 3915 log->l_mp->m_fsname);
3924 } 3916 }
3925 return 0; 3917 return 0;
3926 } 3918 }
3927 3919
3928 3920
3929 #if defined(DEBUG) 3921 #if defined(DEBUG)
3930 /* 3922 /*
3931 * Read all of the agf and agi counters and check that they 3923 * Read all of the agf and agi counters and check that they
3932 * are consistent with the superblock counters. 3924 * are consistent with the superblock counters.
3933 */ 3925 */
3934 void 3926 void
3935 xlog_recover_check_summary( 3927 xlog_recover_check_summary(
3936 xlog_t *log) 3928 xlog_t *log)
3937 { 3929 {
3938 xfs_mount_t *mp; 3930 xfs_mount_t *mp;
3939 xfs_agf_t *agfp; 3931 xfs_agf_t *agfp;
3940 xfs_buf_t *agfbp; 3932 xfs_buf_t *agfbp;
3941 xfs_buf_t *agibp; 3933 xfs_buf_t *agibp;
3942 xfs_buf_t *sbbp; 3934 xfs_buf_t *sbbp;
3943 #ifdef XFS_LOUD_RECOVERY 3935 #ifdef XFS_LOUD_RECOVERY
3944 xfs_sb_t *sbp; 3936 xfs_sb_t *sbp;
3945 #endif 3937 #endif
3946 xfs_agnumber_t agno; 3938 xfs_agnumber_t agno;
3947 __uint64_t freeblks; 3939 __uint64_t freeblks;
3948 __uint64_t itotal; 3940 __uint64_t itotal;
3949 __uint64_t ifree; 3941 __uint64_t ifree;
3950 int error; 3942 int error;
3951 3943
3952 mp = log->l_mp; 3944 mp = log->l_mp;
3953 3945
3954 freeblks = 0LL; 3946 freeblks = 0LL;
3955 itotal = 0LL; 3947 itotal = 0LL;
3956 ifree = 0LL; 3948 ifree = 0LL;
3957 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3949 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3958 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3950 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3959 if (error) { 3951 if (error) {
3960 xfs_fs_cmn_err(CE_ALERT, mp, 3952 xfs_fs_cmn_err(CE_ALERT, mp,
3961 "xlog_recover_check_summary(agf)" 3953 "xlog_recover_check_summary(agf)"
3962 "agf read failed agno %d error %d", 3954 "agf read failed agno %d error %d",
3963 agno, error); 3955 agno, error);
3964 } else { 3956 } else {
3965 agfp = XFS_BUF_TO_AGF(agfbp); 3957 agfp = XFS_BUF_TO_AGF(agfbp);
3966 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3958 freeblks += be32_to_cpu(agfp->agf_freeblks) +
3967 be32_to_cpu(agfp->agf_flcount); 3959 be32_to_cpu(agfp->agf_flcount);
3968 xfs_buf_relse(agfbp); 3960 xfs_buf_relse(agfbp);
3969 } 3961 }
3970 3962
3971 error = xfs_read_agi(mp, NULL, agno, &agibp); 3963 error = xfs_read_agi(mp, NULL, agno, &agibp);
3972 if (!error) { 3964 if (!error) {
3973 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3965 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3974 3966
3975 itotal += be32_to_cpu(agi->agi_count); 3967 itotal += be32_to_cpu(agi->agi_count);
3976 ifree += be32_to_cpu(agi->agi_freecount); 3968 ifree += be32_to_cpu(agi->agi_freecount);
3977 xfs_buf_relse(agibp); 3969 xfs_buf_relse(agibp);
3978 } 3970 }
3979 } 3971 }
3980 3972
3981 sbbp = xfs_getsb(mp, 0); 3973 sbbp = xfs_getsb(mp, 0);
3982 #ifdef XFS_LOUD_RECOVERY 3974 #ifdef XFS_LOUD_RECOVERY
3983 sbp = &mp->m_sb; 3975 sbp = &mp->m_sb;
3984 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); 3976 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3985 cmn_err(CE_NOTE, 3977 cmn_err(CE_NOTE,
3986 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", 3978 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3987 sbp->sb_icount, itotal); 3979 sbp->sb_icount, itotal);
3988 cmn_err(CE_NOTE, 3980 cmn_err(CE_NOTE,
3989 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", 3981 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3990 sbp->sb_ifree, ifree); 3982 sbp->sb_ifree, ifree);
3991 cmn_err(CE_NOTE, 3983 cmn_err(CE_NOTE,
3992 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", 3984 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
3993 sbp->sb_fdblocks, freeblks); 3985 sbp->sb_fdblocks, freeblks);
3994 #if 0 3986 #if 0
3995 /* 3987 /*
3996 * This is turned off until I account for the allocation 3988 * This is turned off until I account for the allocation
3997 * btree blocks which live in free space. 3989 * btree blocks which live in free space.
3998 */ 3990 */
3999 ASSERT(sbp->sb_icount == itotal); 3991 ASSERT(sbp->sb_icount == itotal);
4000 ASSERT(sbp->sb_ifree == ifree); 3992 ASSERT(sbp->sb_ifree == ifree);
4001 ASSERT(sbp->sb_fdblocks == freeblks); 3993 ASSERT(sbp->sb_fdblocks == freeblks);
4002 #endif 3994 #endif
4003 #endif 3995 #endif
4004 xfs_buf_relse(sbbp); 3996 xfs_buf_relse(sbbp);
4005 } 3997 }
4006 #endif /* DEBUG */ 3998 #endif /* DEBUG */
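xlog_recover_check_summary() is a straight aggregation: free blocks are the sum of agf_freeblks plus agf_flcount over all allocation groups, and the inode totals come from agi_count and agi_freecount; the sums are then compared against the superblock counters. The same bookkeeping as a stand-alone sketch over an in-memory array of per-AG counters:

#include <stdint.h>
#include <stddef.h>

/* Per-AG counters as read from the AGF and AGI headers. */
struct ag_counts {
    uint32_t freeblks;   /* agf_freeblks */
    uint32_t flcount;    /* agf_flcount (free-list blocks) */
    uint32_t icount;     /* agi_count */
    uint32_t ifree;      /* agi_freecount */
};

struct fs_summary {
    uint64_t freeblks;
    uint64_t itotal;
    uint64_t ifree;
};

static struct fs_summary sum_ag_counters(const struct ag_counts *ag,
                                         size_t agcount)
{
    struct fs_summary s = { 0, 0, 0 };
    size_t agno;

    for (agno = 0; agno < agcount; agno++) {
        s.freeblks += (uint64_t)ag[agno].freeblks + ag[agno].flcount;
        s.itotal   += ag[agno].icount;
        s.ifree    += ag[agno].ifree;
    }
    return s;    /* compare against sb_fdblocks, sb_icount, sb_ifree */
}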
4007 3999
fs/xfs/xfs_rw.c
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dir2.h" 27 #include "xfs_dir2.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_mount.h" 29 #include "xfs_mount.h"
30 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_btree.h"
31 #include "xfs_alloc_btree.h" 31 #include "xfs_alloc_btree.h"
32 #include "xfs_ialloc_btree.h" 32 #include "xfs_ialloc_btree.h"
33 #include "xfs_dir2_sf.h" 33 #include "xfs_dir2_sf.h"
34 #include "xfs_attr_sf.h" 34 #include "xfs_attr_sf.h"
35 #include "xfs_dinode.h" 35 #include "xfs_dinode.h"
36 #include "xfs_inode.h" 36 #include "xfs_inode.h"
37 #include "xfs_inode_item.h" 37 #include "xfs_inode_item.h"
38 #include "xfs_itable.h" 38 #include "xfs_itable.h"
39 #include "xfs_btree.h" 39 #include "xfs_btree.h"
40 #include "xfs_alloc.h" 40 #include "xfs_alloc.h"
41 #include "xfs_ialloc.h" 41 #include "xfs_ialloc.h"
42 #include "xfs_attr.h" 42 #include "xfs_attr.h"
43 #include "xfs_bmap.h" 43 #include "xfs_bmap.h"
44 #include "xfs_acl.h" 44 #include "xfs_acl.h"
45 #include "xfs_error.h" 45 #include "xfs_error.h"
46 #include "xfs_buf_item.h" 46 #include "xfs_buf_item.h"
47 #include "xfs_rw.h" 47 #include "xfs_rw.h"
48 48
49 /* 49 /*
50 * This is a subroutine for xfs_write() and other writers (xfs_ioctl) 50 * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
51 * which clears the setuid and setgid bits when a file is written. 51 * which clears the setuid and setgid bits when a file is written.
52 */ 52 */
53 int 53 int
54 xfs_write_clear_setuid( 54 xfs_write_clear_setuid(
55 xfs_inode_t *ip) 55 xfs_inode_t *ip)
56 { 56 {
57 xfs_mount_t *mp; 57 xfs_mount_t *mp;
58 xfs_trans_t *tp; 58 xfs_trans_t *tp;
59 int error; 59 int error;
60 60
61 mp = ip->i_mount; 61 mp = ip->i_mount;
62 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); 62 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
63 if ((error = xfs_trans_reserve(tp, 0, 63 if ((error = xfs_trans_reserve(tp, 0,
64 XFS_WRITEID_LOG_RES(mp), 64 XFS_WRITEID_LOG_RES(mp),
65 0, 0, 0))) { 65 0, 0, 0))) {
66 xfs_trans_cancel(tp, 0); 66 xfs_trans_cancel(tp, 0);
67 return error; 67 return error;
68 } 68 }
69 xfs_ilock(ip, XFS_ILOCK_EXCL); 69 xfs_ilock(ip, XFS_ILOCK_EXCL);
70 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 70 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
71 xfs_trans_ihold(tp, ip); 71 xfs_trans_ihold(tp, ip);
72 ip->i_d.di_mode &= ~S_ISUID; 72 ip->i_d.di_mode &= ~S_ISUID;
73 73
74 /* 74 /*
75 * Note that we don't have to worry about mandatory 75 * Note that we don't have to worry about mandatory
76 * file locking being disabled here because we only 76 * file locking being disabled here because we only
77 * clear the S_ISGID bit if the Group execute bit is 77 * clear the S_ISGID bit if the Group execute bit is
78 * on, but if it was on then mandatory locking wouldn't 78 * on, but if it was on then mandatory locking wouldn't
79 * have been enabled. 79 * have been enabled.
80 */ 80 */
81 if (ip->i_d.di_mode & S_IXGRP) { 81 if (ip->i_d.di_mode & S_IXGRP) {
82 ip->i_d.di_mode &= ~S_ISGID; 82 ip->i_d.di_mode &= ~S_ISGID;
83 } 83 }
84 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 84 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
85 xfs_trans_set_sync(tp); 85 xfs_trans_set_sync(tp);
86 error = xfs_trans_commit(tp, 0); 86 error = xfs_trans_commit(tp, 0);
87 xfs_iunlock(ip, XFS_ILOCK_EXCL); 87 xfs_iunlock(ip, XFS_ILOCK_EXCL);
88 return 0; 88 return 0;
89 } 89 }
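The rule xfs_write_clear_setuid() implements is the usual one: a write always drops S_ISUID, while S_ISGID is only dropped when the group-execute bit is set, for the reason given in the comment above. A stand-alone sketch of just the mode arithmetic:

#include <sys/stat.h>    /* S_ISUID, S_ISGID, S_IXGRP */

/* Mode a file should carry after an untrusted write to it. */
static mode_t mode_after_write(mode_t mode)
{
    mode &= ~S_ISUID;          /* always drop setuid */
    if (mode & S_IXGRP)        /* setgid only meaningful with group-exec */
        mode &= ~S_ISGID;
    return mode;
}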
90 90
91 /* 91 /*
92 * Handle logging requirements of various synchronous types of write. 92 * Handle logging requirements of various synchronous types of write.
93 */ 93 */
94 int 94 int
95 xfs_write_sync_logforce( 95 xfs_write_sync_logforce(
96 xfs_mount_t *mp, 96 xfs_mount_t *mp,
97 xfs_inode_t *ip) 97 xfs_inode_t *ip)
98 { 98 {
99 int error = 0; 99 int error = 0;
100 100
101 /* 101 /*
102 * If we're treating this as O_DSYNC and we have not updated the 102 * If we're treating this as O_DSYNC and we have not updated the
103 * size, force the log. 103 * size, force the log.
104 */ 104 */
105 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && 105 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
106 !(ip->i_update_size)) { 106 !(ip->i_update_size)) {
107 xfs_inode_log_item_t *iip = ip->i_itemp; 107 xfs_inode_log_item_t *iip = ip->i_itemp;
108 108
109 /* 109 /*
110 * If an allocation transaction occurred 110 * If an allocation transaction occurred
111 * without extending the size, then we have to force 111 * without extending the size, then we have to force
112 * the log up the proper point to ensure that the 112 * the log up the proper point to ensure that the
113 * allocation is permanent. We can't count on 113 * allocation is permanent. We can't count on
114 * the fact that buffered writes lock out direct I/O 114 * the fact that buffered writes lock out direct I/O
115 * writes - the direct I/O write could have extended 115 * writes - the direct I/O write could have extended
116 * the size nontransactionally, then finished before 116 * the size nontransactionally, then finished before
117 * we started. xfs_write_file will think that the file 117 * we started. xfs_write_file will think that the file
118 * didn't grow but the update isn't safe unless the 118 * didn't grow but the update isn't safe unless the
119 * size change is logged. 119 * size change is logged.
120 * 120 *
121 * Force the log if we've committed a transaction 121 * Force the log if we've committed a transaction
122 * against the inode or if someone else has and 122 * against the inode or if someone else has and
123 * the commit record hasn't gone to disk (e.g. 123 * the commit record hasn't gone to disk (e.g.
124 * the inode is pinned). This guarantees that 124 * the inode is pinned). This guarantees that
125 * all changes affecting the inode are permanent 125 * all changes affecting the inode are permanent
126 * when we return. 126 * when we return.
127 */ 127 */
128 if (iip && iip->ili_last_lsn) { 128 if (iip && iip->ili_last_lsn) {
129 error = _xfs_log_force(mp, iip->ili_last_lsn, 129 error = _xfs_log_force(mp, iip->ili_last_lsn,
130 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); 130 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
131 } else if (xfs_ipincount(ip) > 0) { 131 } else if (xfs_ipincount(ip) > 0) {
132 error = _xfs_log_force(mp, (xfs_lsn_t)0, 132 error = _xfs_log_force(mp, (xfs_lsn_t)0,
133 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); 133 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
134 } 134 }
135 135
136 } else { 136 } else {
137 xfs_trans_t *tp; 137 xfs_trans_t *tp;
138 138
139 /* 139 /*
140 * O_SYNC or O_DSYNC _with_ a size update are handled 140 * O_SYNC or O_DSYNC _with_ a size update are handled
141 * the same way. 141 * the same way.
142 * 142 *
143 * If the write was synchronous then we need to make 143 * If the write was synchronous then we need to make
144 * sure that the inode modification time is permanent. 144 * sure that the inode modification time is permanent.
145 * We'll have updated the timestamp above, so here 145 * We'll have updated the timestamp above, so here
146 * we use a synchronous transaction to log the inode. 146 * we use a synchronous transaction to log the inode.
147 * It's not fast, but it's necessary. 147 * It's not fast, but it's necessary.
148 * 148 *
149 * If this is a dsync write and the size got changed 149 * If this is a dsync write and the size got changed
150 * non-transactionally, then we need to ensure that 150 * non-transactionally, then we need to ensure that
151 * the size change gets logged in a synchronous 151 * the size change gets logged in a synchronous
152 * transaction. 152 * transaction.
153 */ 153 */
154 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); 154 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
155 if ((error = xfs_trans_reserve(tp, 0, 155 if ((error = xfs_trans_reserve(tp, 0,
156 XFS_SWRITE_LOG_RES(mp), 156 XFS_SWRITE_LOG_RES(mp),
157 0, 0, 0))) { 157 0, 0, 0))) {
158 /* Transaction reserve failed */ 158 /* Transaction reserve failed */
159 xfs_trans_cancel(tp, 0); 159 xfs_trans_cancel(tp, 0);
160 } else { 160 } else {
161 /* Transaction reserve successful */ 161 /* Transaction reserve successful */
162 xfs_ilock(ip, XFS_ILOCK_EXCL); 162 xfs_ilock(ip, XFS_ILOCK_EXCL);
163 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 163 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
164 xfs_trans_ihold(tp, ip); 164 xfs_trans_ihold(tp, ip);
165 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 165 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
166 xfs_trans_set_sync(tp); 166 xfs_trans_set_sync(tp);
167 error = xfs_trans_commit(tp, 0); 167 error = xfs_trans_commit(tp, 0);
168 xfs_iunlock(ip, XFS_ILOCK_EXCL); 168 xfs_iunlock(ip, XFS_ILOCK_EXCL);
169 } 169 }
170 } 170 }
171 171
172 return error; 172 return error;
173 } 173 }
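xfs_write_sync_logforce() is essentially a three-way decision, per its comments: for an O_DSYNC-style write with no size change, force the log (to the inode's last commit LSN if one is recorded, otherwise force everything if the inode is pinned); for a synchronous write that did change the size, log the inode in a synchronous transaction instead. The shape of that decision with the XFS specifics abstracted into booleans (an illustrative sketch, not the kernel interface):

#include <stdbool.h>

enum sync_action {
    SYNC_NOTHING,        /* nothing committed or pinned; no force needed */
    SYNC_FORCE_TO_LSN,   /* force the log up to the inode's last commit LSN */
    SYNC_FORCE_ALL,      /* inode pinned but no LSN recorded; force everything */
    SYNC_LOG_INODE       /* size changed: log the inode in a sync transaction */
};

static enum sync_action
choose_sync_action(bool osync_is_osync, bool size_updated,
                   bool have_last_lsn, bool inode_pinned)
{
    if (!osync_is_osync && !size_updated) {
        if (have_last_lsn)
            return SYNC_FORCE_TO_LSN;
        if (inode_pinned)
            return SYNC_FORCE_ALL;
        return SYNC_NOTHING;
    }
    return SYNC_LOG_INODE;
}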
174 174
175 /* 175 /*
176 * Force a shutdown of the filesystem instantly while keeping 176 * Force a shutdown of the filesystem instantly while keeping
177 * the filesystem consistent. We don't do an unmount here; just shutdown 177 * the filesystem consistent. We don't do an unmount here; just shutdown
178 * the shop, make sure that absolutely nothing persistent happens to 178 * the shop, make sure that absolutely nothing persistent happens to
179 * this filesystem after this point. 179 * this filesystem after this point.
180 */ 180 */
181 void 181 void
182 xfs_do_force_shutdown( 182 xfs_do_force_shutdown(
183 xfs_mount_t *mp, 183 xfs_mount_t *mp,
184 int flags, 184 int flags,
185 char *fname, 185 char *fname,
186 int lnnum) 186 int lnnum)
187 { 187 {
188 int logerror; 188 int logerror;
189 189
190 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 190 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
191 191
192 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 192 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
193 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 193 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from "
194 "line %d of file %s. Return address = 0x%p", 194 "line %d of file %s. Return address = 0x%p",
195 mp->m_fsname, flags, lnnum, fname, __return_address); 195 mp->m_fsname, flags, lnnum, fname, __return_address);
196 } 196 }
197 /* 197 /*
198 * No need to duplicate efforts. 198 * No need to duplicate efforts.
199 */ 199 */
200 if (XFS_FORCED_SHUTDOWN(mp) && !logerror) 200 if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
201 return; 201 return;
202 202
203 /* 203 /*
204 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't 204 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
205 * queue up anybody new on the log reservations, and wakes up 205 * queue up anybody new on the log reservations, and wakes up
206 * everybody who's sleeping on log reservations to tell them 206 * everybody who's sleeping on log reservations to tell them
207 * the bad news. 207 * the bad news.
208 */ 208 */
209 if (xfs_log_force_umount(mp, logerror)) 209 if (xfs_log_force_umount(mp, logerror))
210 return; 210 return;
211 211
212 if (flags & SHUTDOWN_CORRUPT_INCORE) { 212 if (flags & SHUTDOWN_CORRUPT_INCORE) {
213 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 213 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp,
214 "Corruption of in-memory data detected. Shutting down filesystem: %s", 214 "Corruption of in-memory data detected. Shutting down filesystem: %s",
215 mp->m_fsname); 215 mp->m_fsname);
216 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) { 216 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
217 xfs_stack_trace(); 217 xfs_stack_trace();
218 } 218 }
219 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 219 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
220 if (logerror) { 220 if (logerror) {
221 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 221 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp,
222 "Log I/O Error Detected. Shutting down filesystem: %s", 222 "Log I/O Error Detected. Shutting down filesystem: %s",
223 mp->m_fsname); 223 mp->m_fsname);
224 } else if (flags & SHUTDOWN_DEVICE_REQ) { 224 } else if (flags & SHUTDOWN_DEVICE_REQ) {
225 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 225 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
226 "All device paths lost. Shutting down filesystem: %s", 226 "All device paths lost. Shutting down filesystem: %s",
227 mp->m_fsname); 227 mp->m_fsname);
228 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 228 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
229 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 229 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
230 "I/O Error Detected. Shutting down filesystem: %s", 230 "I/O Error Detected. Shutting down filesystem: %s",
231 mp->m_fsname); 231 mp->m_fsname);
232 } 232 }
233 } 233 }
234 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 234 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
235 cmn_err(CE_ALERT, "Please umount the filesystem, " 235 cmn_err(CE_ALERT, "Please umount the filesystem, "
236 "and rectify the problem(s)"); 236 "and rectify the problem(s)");
237 } 237 }
238 } 238 }
239 239
240 240
241 /* 241 /*
242 * Called when we want to stop a buffer from getting written or read. 242 * Called when we want to stop a buffer from getting written or read.
243 * We attach the EIO error, muck with its flags, and call biodone 243 * We attach the EIO error, muck with its flags, and call biodone
244 * so that the proper iodone callbacks get called. 244 * so that the proper iodone callbacks get called.
245 */ 245 */
246 int 246 int
247 xfs_bioerror( 247 xfs_bioerror(
248 xfs_buf_t *bp) 248 xfs_buf_t *bp)
249 { 249 {
250 250
251 #ifdef XFSERRORDEBUG 251 #ifdef XFSERRORDEBUG
252 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); 252 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
253 #endif 253 #endif
254 254
255 /* 255 /*
256 * No need to wait until the buffer is unpinned. 256 * No need to wait until the buffer is unpinned.
257 * We aren't flushing it. 257 * We aren't flushing it.
258 */ 258 */
259 xfs_buftrace("XFS IOERROR", bp); 259 xfs_buftrace("XFS IOERROR", bp);
260 XFS_BUF_ERROR(bp, EIO); 260 XFS_BUF_ERROR(bp, EIO);
261 /* 261 /*
262 * We're calling biodone, so delete B_DONE flag. Either way 262 * We're calling biodone, so delete B_DONE flag. Either way
263 * we have to call the iodone callback, and calling biodone 263 * we have to call the iodone callback, and calling biodone
264 * probably is the best way since it takes care of 264 * probably is the best way since it takes care of
265 * GRIO as well. 265 * GRIO as well.
266 */ 266 */
267 XFS_BUF_UNREAD(bp); 267 XFS_BUF_UNREAD(bp);
268 XFS_BUF_UNDELAYWRITE(bp); 268 XFS_BUF_UNDELAYWRITE(bp);
269 XFS_BUF_UNDONE(bp); 269 XFS_BUF_UNDONE(bp);
270 XFS_BUF_STALE(bp); 270 XFS_BUF_STALE(bp);
271 271
272 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 272 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
273 xfs_biodone(bp); 273 xfs_biodone(bp);
274 274
275 return (EIO); 275 return (EIO);
276 } 276 }
277 277
278 /* 278 /*
279 * Same as xfs_bioerror, except that we are releasing the buffer 279 * Same as xfs_bioerror, except that we are releasing the buffer
280 * here ourselves, and avoiding the biodone call. 280 * here ourselves, and avoiding the biodone call.
281 * This is meant for userdata errors; metadata bufs come with 281 * This is meant for userdata errors; metadata bufs come with
282 * iodone functions attached, so that we can track down errors. 282 * iodone functions attached, so that we can track down errors.
283 */ 283 */
284 int 284 int
285 xfs_bioerror_relse( 285 xfs_bioerror_relse(
286 xfs_buf_t *bp) 286 xfs_buf_t *bp)
287 { 287 {
288 int64_t fl; 288 int64_t fl;
289 289
290 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks); 290 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
291 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone); 291 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
292 292
293 xfs_buftrace("XFS IOERRELSE", bp); 293 xfs_buftrace("XFS IOERRELSE", bp);
294 fl = XFS_BUF_BFLAGS(bp); 294 fl = XFS_BUF_BFLAGS(bp);
295 /* 295 /*
296 * No need to wait until the buffer is unpinned. 296 * No need to wait until the buffer is unpinned.
297 * We aren't flushing it. 297 * We aren't flushing it.
298 * 298 *
299 * chunkhold expects B_DONE to be set, whether 299 * chunkhold expects B_DONE to be set, whether
300 * we actually finish the I/O or not. We don't want to 300 * we actually finish the I/O or not. We don't want to
301 * change that interface. 301 * change that interface.
302 */ 302 */
303 XFS_BUF_UNREAD(bp); 303 XFS_BUF_UNREAD(bp);
304 XFS_BUF_UNDELAYWRITE(bp); 304 XFS_BUF_UNDELAYWRITE(bp);
305 XFS_BUF_DONE(bp); 305 XFS_BUF_DONE(bp);
306 XFS_BUF_STALE(bp); 306 XFS_BUF_STALE(bp);
307 XFS_BUF_CLR_IODONE_FUNC(bp); 307 XFS_BUF_CLR_IODONE_FUNC(bp);
308 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 308 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
309 if (!(fl & XFS_B_ASYNC)) { 309 if (!(fl & XFS_B_ASYNC)) {
310 /* 310 /*
311 * Mark b_error and B_ERROR _both_. 311 * Mark b_error and B_ERROR _both_.
312 * Lots of chunkcache code assumes that. 312 * Lots of chunkcache code assumes that.
313 * There's no reason to mark error for 313 * There's no reason to mark error for
314 * ASYNC buffers. 314 * ASYNC buffers.
315 */ 315 */
316 XFS_BUF_ERROR(bp, EIO); 316 XFS_BUF_ERROR(bp, EIO);
317 XFS_BUF_FINISH_IOWAIT(bp); 317 XFS_BUF_FINISH_IOWAIT(bp);
318 } else { 318 } else {
319 xfs_buf_relse(bp); 319 xfs_buf_relse(bp);
320 } 320 }
321 return (EIO); 321 return (EIO);
322 } 322 }
323 323
324 /* 324 /*
325 * Prints out an ALERT message about I/O error. 325 * Prints out an ALERT message about I/O error.
326 */ 326 */
327 void 327 void
328 xfs_ioerror_alert( 328 xfs_ioerror_alert(
329 char *func, 329 char *func,
330 struct xfs_mount *mp, 330 struct xfs_mount *mp,
331 xfs_buf_t *bp, 331 xfs_buf_t *bp,
332 xfs_daddr_t blkno) 332 xfs_daddr_t blkno)
333 { 333 {
334 cmn_err(CE_ALERT, 334 cmn_err(CE_ALERT,
335 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 335 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx"
336 " (\"%s\") error %d buf count %zd", 336 " (\"%s\") error %d buf count %zd",
337 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname, 337 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
338 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 338 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
339 (__uint64_t)blkno, func, 339 (__uint64_t)blkno, func,
340 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 340 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
341 } 341 }
342 342
343 /* 343 /*
344 * This isn't an absolute requirement, but it is 344 * This isn't an absolute requirement, but it is
345 * just a good idea to call xfs_read_buf instead of 345 * just a good idea to call xfs_read_buf instead of
346 * directly doing a read_buf call. For one, we shouldn't 346 * directly doing a read_buf call. For one, we shouldn't
347 * be doing this disk read if we are in SHUTDOWN state anyway, 347 * be doing this disk read if we are in SHUTDOWN state anyway,
348 * so this stops that from happening. Secondly, this does all 348 * so this stops that from happening. Secondly, this does all
349 * the error checking stuff and the brelse if appropriate for 349 * the error checking stuff and the brelse if appropriate for
350 * the caller, so the code can be a little leaner. 350 * the caller, so the code can be a little leaner.
351 */ 351 */
352 352
353 int 353 int
354 xfs_read_buf( 354 xfs_read_buf(
355 struct xfs_mount *mp, 355 struct xfs_mount *mp,
356 xfs_buftarg_t *target, 356 xfs_buftarg_t *target,
357 xfs_daddr_t blkno, 357 xfs_daddr_t blkno,
358 int len, 358 int len,
359 uint flags, 359 uint flags,
360 xfs_buf_t **bpp) 360 xfs_buf_t **bpp)
361 { 361 {
362 xfs_buf_t *bp; 362 xfs_buf_t *bp;
363 int error; 363 int error;
364 364
365 if (flags) 365 if (flags)
366 bp = xfs_buf_read_flags(target, blkno, len, flags); 366 bp = xfs_buf_read_flags(target, blkno, len, flags);
367 else 367 else
368 bp = xfs_buf_read(target, blkno, len, flags); 368 bp = xfs_buf_read(target, blkno, len, flags);
369 if (!bp) 369 if (!bp)
370 return XFS_ERROR(EIO); 370 return XFS_ERROR(EIO);
371 error = XFS_BUF_GETERROR(bp); 371 error = XFS_BUF_GETERROR(bp);
372 if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { 372 if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
373 *bpp = bp; 373 *bpp = bp;
374 } else { 374 } else {
375 *bpp = NULL; 375 *bpp = NULL;
376 if (error) { 376 if (error) {
377 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); 377 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
378 } else { 378 } else {
379 error = XFS_ERROR(EIO); 379 error = XFS_ERROR(EIO);
380 } 380 }
381 if (bp) { 381 if (bp) {
382 XFS_BUF_UNDONE(bp); 382 XFS_BUF_UNDONE(bp);
383 XFS_BUF_UNDELAYWRITE(bp); 383 XFS_BUF_UNDELAYWRITE(bp);
384 XFS_BUF_STALE(bp); 384 XFS_BUF_STALE(bp);
385 /* 385 /*
386 * brelse clears B_ERROR and b_error 386 * brelse clears B_ERROR and b_error
387 */ 387 */
388 xfs_buf_relse(bp); 388 xfs_buf_relse(bp);
389 } 389 }
390 } 390 }
391 return (error); 391 return (error);
392 } 392 }
393 393
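As a usage sketch (hypothetical caller, not part of this commit), a read through xfs_read_buf() leaves the shutdown check, the error alert and the brelse() on failure to the helper, as the comment above describes. The function name and the use of mp->m_ddev_targp as the buffer target are assumptions for illustration:

STATIC int
xfs_example_read_block(
	struct xfs_mount	*mp,
	xfs_daddr_t		blkno)
{
	xfs_buf_t		*bp;
	int			error;

	/* read one basic block from the (assumed) data device target */
	error = xfs_read_buf(mp, mp->m_ddev_targp, blkno, 1, 0, &bp);
	if (error)
		return error;	/* buffer already released on error */

	/* ... inspect the buffer contents here ... */

	xfs_buf_relse(bp);
	return 0;
}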
394 /* 394 /*
395 * Wrapper around bwrite() so that we can trap 395 * Wrapper around bwrite() so that we can trap
396 * write errors, and act accordingly. 396 * write errors, and act accordingly.
397 */ 397 */
398 int 398 int
399 xfs_bwrite( 399 xfs_bwrite(
400 struct xfs_mount *mp, 400 struct xfs_mount *mp,
401 struct xfs_buf *bp) 401 struct xfs_buf *bp)
402 { 402 {
403 int error; 403 int error;
404 404
405 /* 405 /*
406 * XXXsup how does this work for quotas. 406 * XXXsup how does this work for quotas.
407 */ 407 */
408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
409 XFS_BUF_SET_FSPRIVATE3(bp, mp); 409 bp->b_mount = mp;
410 XFS_BUF_WRITE(bp); 410 XFS_BUF_WRITE(bp);
411 411
412 if ((error = XFS_bwrite(bp))) { 412 if ((error = XFS_bwrite(bp))) {
413 ASSERT(mp); 413 ASSERT(mp);
414 /* 414 /*
415 * Cannot put a buftrace here since if the buffer is not 415 * Cannot put a buftrace here since if the buffer is not
416 * B_HOLD then we will brelse() the buffer before returning 416 * B_HOLD then we will brelse() the buffer before returning
417 * from bwrite and we could be tracing a buffer that has 417 * from bwrite and we could be tracing a buffer that has
418 * been reused. 418 * been reused.
419 */ 419 */
420 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 420 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
421 } 421 }
422 return (error); 422 return (error);
423 } 423 }
424 424
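And a corresponding write-side sketch (again hypothetical, not part of this commit): after this change xfs_bwrite() records the mount in bp->b_mount itself rather than through XFS_BUF_SET_FSPRIVATE3(), and on failure it has already forced a shutdown, so a caller only needs to propagate the error. Per the comment above, the buffer is released by the write path unless it was held, so it must not be touched after the call:

STATIC int
xfs_example_write_buf(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	int			error;

	error = xfs_bwrite(mp, bp);	/* sets bp->b_mount and issues the write */
	if (error) {
		/* xfs_bwrite() has already called xfs_force_shutdown() */
		return error;
	}
	return 0;
}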