Commit 15ac08a8b2c129abccf1be47b6ab09491e013db2

Authored by Christoph Hellwig
Committed by Lachlan McIlroy
1 parent e055f13a6d

[XFS] replace b_fspriv with b_mount

Replace the b_fspriv pointer and it's ugly accessors with a properly types
xfs_mount pointer.  Also switch log reocvery over to it instead of using
b_fspriv for the mount pointer.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>

Showing 6 changed files with 19 additions and 32 deletions Inline Diff

fs/xfs/linux-2.6/xfs_buf.c
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include <linux/stddef.h> 19 #include <linux/stddef.h>
20 #include <linux/errno.h> 20 #include <linux/errno.h>
21 #include <linux/slab.h> 21 #include <linux/slab.h>
22 #include <linux/pagemap.h> 22 #include <linux/pagemap.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/vmalloc.h> 24 #include <linux/vmalloc.h>
25 #include <linux/bio.h> 25 #include <linux/bio.h>
26 #include <linux/sysctl.h> 26 #include <linux/sysctl.h>
27 #include <linux/proc_fs.h> 27 #include <linux/proc_fs.h>
28 #include <linux/workqueue.h> 28 #include <linux/workqueue.h>
29 #include <linux/percpu.h> 29 #include <linux/percpu.h>
30 #include <linux/blkdev.h> 30 #include <linux/blkdev.h>
31 #include <linux/hash.h> 31 #include <linux/hash.h>
32 #include <linux/kthread.h> 32 #include <linux/kthread.h>
33 #include <linux/migrate.h> 33 #include <linux/migrate.h>
34 #include <linux/backing-dev.h> 34 #include <linux/backing-dev.h>
35 #include <linux/freezer.h> 35 #include <linux/freezer.h>
36 36
37 static kmem_zone_t *xfs_buf_zone; 37 static kmem_zone_t *xfs_buf_zone;
38 STATIC int xfsbufd(void *); 38 STATIC int xfsbufd(void *);
39 STATIC int xfsbufd_wakeup(int, gfp_t); 39 STATIC int xfsbufd_wakeup(int, gfp_t);
40 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); 40 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
41 static struct shrinker xfs_buf_shake = { 41 static struct shrinker xfs_buf_shake = {
42 .shrink = xfsbufd_wakeup, 42 .shrink = xfsbufd_wakeup,
43 .seeks = DEFAULT_SEEKS, 43 .seeks = DEFAULT_SEEKS,
44 }; 44 };
45 45
46 static struct workqueue_struct *xfslogd_workqueue; 46 static struct workqueue_struct *xfslogd_workqueue;
47 struct workqueue_struct *xfsdatad_workqueue; 47 struct workqueue_struct *xfsdatad_workqueue;
48 48
49 #ifdef XFS_BUF_TRACE 49 #ifdef XFS_BUF_TRACE
50 void 50 void
51 xfs_buf_trace( 51 xfs_buf_trace(
52 xfs_buf_t *bp, 52 xfs_buf_t *bp,
53 char *id, 53 char *id,
54 void *data, 54 void *data,
55 void *ra) 55 void *ra)
56 { 56 {
57 ktrace_enter(xfs_buf_trace_buf, 57 ktrace_enter(xfs_buf_trace_buf,
58 bp, id, 58 bp, id,
59 (void *)(unsigned long)bp->b_flags, 59 (void *)(unsigned long)bp->b_flags,
60 (void *)(unsigned long)bp->b_hold.counter, 60 (void *)(unsigned long)bp->b_hold.counter,
61 (void *)(unsigned long)bp->b_sema.count, 61 (void *)(unsigned long)bp->b_sema.count,
62 (void *)current, 62 (void *)current,
63 data, ra, 63 data, ra,
64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff), 64 (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
65 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff), 65 (void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
66 (void *)(unsigned long)bp->b_buffer_length, 66 (void *)(unsigned long)bp->b_buffer_length,
67 NULL, NULL, NULL, NULL, NULL); 67 NULL, NULL, NULL, NULL, NULL);
68 } 68 }
69 ktrace_t *xfs_buf_trace_buf; 69 ktrace_t *xfs_buf_trace_buf;
70 #define XFS_BUF_TRACE_SIZE 4096 70 #define XFS_BUF_TRACE_SIZE 4096
71 #define XB_TRACE(bp, id, data) \ 71 #define XB_TRACE(bp, id, data) \
72 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0)) 72 xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
73 #else 73 #else
74 #define XB_TRACE(bp, id, data) do { } while (0) 74 #define XB_TRACE(bp, id, data) do { } while (0)
75 #endif 75 #endif
76 76
77 #ifdef XFS_BUF_LOCK_TRACKING 77 #ifdef XFS_BUF_LOCK_TRACKING
78 # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid) 78 # define XB_SET_OWNER(bp) ((bp)->b_last_holder = current->pid)
79 # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1) 79 # define XB_CLEAR_OWNER(bp) ((bp)->b_last_holder = -1)
80 # define XB_GET_OWNER(bp) ((bp)->b_last_holder) 80 # define XB_GET_OWNER(bp) ((bp)->b_last_holder)
81 #else 81 #else
82 # define XB_SET_OWNER(bp) do { } while (0) 82 # define XB_SET_OWNER(bp) do { } while (0)
83 # define XB_CLEAR_OWNER(bp) do { } while (0) 83 # define XB_CLEAR_OWNER(bp) do { } while (0)
84 # define XB_GET_OWNER(bp) do { } while (0) 84 # define XB_GET_OWNER(bp) do { } while (0)
85 #endif 85 #endif
86 86
87 #define xb_to_gfp(flags) \ 87 #define xb_to_gfp(flags) \
88 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \ 88 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
89 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN) 89 ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
90 90
91 #define xb_to_km(flags) \ 91 #define xb_to_km(flags) \
92 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP) 92 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
93 93
94 #define xfs_buf_allocate(flags) \ 94 #define xfs_buf_allocate(flags) \
95 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags)) 95 kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
96 #define xfs_buf_deallocate(bp) \ 96 #define xfs_buf_deallocate(bp) \
97 kmem_zone_free(xfs_buf_zone, (bp)); 97 kmem_zone_free(xfs_buf_zone, (bp));
98 98
99 /* 99 /*
100 * Page Region interfaces. 100 * Page Region interfaces.
101 * 101 *
102 * For pages in filesystems where the blocksize is smaller than the 102 * For pages in filesystems where the blocksize is smaller than the
103 * pagesize, we use the page->private field (long) to hold a bitmap 103 * pagesize, we use the page->private field (long) to hold a bitmap
104 * of uptodate regions within the page. 104 * of uptodate regions within the page.
105 * 105 *
106 * Each such region is "bytes per page / bits per long" bytes long. 106 * Each such region is "bytes per page / bits per long" bytes long.
107 * 107 *
108 * NBPPR == number-of-bytes-per-page-region 108 * NBPPR == number-of-bytes-per-page-region
109 * BTOPR == bytes-to-page-region (rounded up) 109 * BTOPR == bytes-to-page-region (rounded up)
110 * BTOPRT == bytes-to-page-region-truncated (rounded down) 110 * BTOPRT == bytes-to-page-region-truncated (rounded down)
111 */ 111 */
112 #if (BITS_PER_LONG == 32) 112 #if (BITS_PER_LONG == 32)
113 #define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ 113 #define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
114 #elif (BITS_PER_LONG == 64) 114 #elif (BITS_PER_LONG == 64)
115 #define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ 115 #define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
116 #else 116 #else
117 #error BITS_PER_LONG must be 32 or 64 117 #error BITS_PER_LONG must be 32 or 64
118 #endif 118 #endif
119 #define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) 119 #define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
120 #define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) 120 #define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
121 #define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) 121 #define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
122 122
123 STATIC unsigned long 123 STATIC unsigned long
124 page_region_mask( 124 page_region_mask(
125 size_t offset, 125 size_t offset,
126 size_t length) 126 size_t length)
127 { 127 {
128 unsigned long mask; 128 unsigned long mask;
129 int first, final; 129 int first, final;
130 130
131 first = BTOPR(offset); 131 first = BTOPR(offset);
132 final = BTOPRT(offset + length - 1); 132 final = BTOPRT(offset + length - 1);
133 first = min(first, final); 133 first = min(first, final);
134 134
135 mask = ~0UL; 135 mask = ~0UL;
136 mask <<= BITS_PER_LONG - (final - first); 136 mask <<= BITS_PER_LONG - (final - first);
137 mask >>= BITS_PER_LONG - (final); 137 mask >>= BITS_PER_LONG - (final);
138 138
139 ASSERT(offset + length <= PAGE_CACHE_SIZE); 139 ASSERT(offset + length <= PAGE_CACHE_SIZE);
140 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); 140 ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
141 141
142 return mask; 142 return mask;
143 } 143 }
144 144
145 STATIC_INLINE void 145 STATIC_INLINE void
146 set_page_region( 146 set_page_region(
147 struct page *page, 147 struct page *page,
148 size_t offset, 148 size_t offset,
149 size_t length) 149 size_t length)
150 { 150 {
151 set_page_private(page, 151 set_page_private(page,
152 page_private(page) | page_region_mask(offset, length)); 152 page_private(page) | page_region_mask(offset, length));
153 if (page_private(page) == ~0UL) 153 if (page_private(page) == ~0UL)
154 SetPageUptodate(page); 154 SetPageUptodate(page);
155 } 155 }
156 156
157 STATIC_INLINE int 157 STATIC_INLINE int
158 test_page_region( 158 test_page_region(
159 struct page *page, 159 struct page *page,
160 size_t offset, 160 size_t offset,
161 size_t length) 161 size_t length)
162 { 162 {
163 unsigned long mask = page_region_mask(offset, length); 163 unsigned long mask = page_region_mask(offset, length);
164 164
165 return (mask && (page_private(page) & mask) == mask); 165 return (mask && (page_private(page) & mask) == mask);
166 } 166 }
167 167
168 /* 168 /*
169 * Mapping of multi-page buffers into contiguous virtual space 169 * Mapping of multi-page buffers into contiguous virtual space
170 */ 170 */
171 171
172 typedef struct a_list { 172 typedef struct a_list {
173 void *vm_addr; 173 void *vm_addr;
174 struct a_list *next; 174 struct a_list *next;
175 } a_list_t; 175 } a_list_t;
176 176
177 static a_list_t *as_free_head; 177 static a_list_t *as_free_head;
178 static int as_list_len; 178 static int as_list_len;
179 static DEFINE_SPINLOCK(as_lock); 179 static DEFINE_SPINLOCK(as_lock);
180 180
181 /* 181 /*
182 * Try to batch vunmaps because they are costly. 182 * Try to batch vunmaps because they are costly.
183 */ 183 */
184 STATIC void 184 STATIC void
185 free_address( 185 free_address(
186 void *addr) 186 void *addr)
187 { 187 {
188 a_list_t *aentry; 188 a_list_t *aentry;
189 189
190 #ifdef CONFIG_XEN 190 #ifdef CONFIG_XEN
191 /* 191 /*
192 * Xen needs to be able to make sure it can get an exclusive 192 * Xen needs to be able to make sure it can get an exclusive
193 * RO mapping of pages it wants to turn into a pagetable. If 193 * RO mapping of pages it wants to turn into a pagetable. If
194 * a newly allocated page is also still being vmap()ed by xfs, 194 * a newly allocated page is also still being vmap()ed by xfs,
195 * it will cause pagetable construction to fail. This is a 195 * it will cause pagetable construction to fail. This is a
196 * quick workaround to always eagerly unmap pages so that Xen 196 * quick workaround to always eagerly unmap pages so that Xen
197 * is happy. 197 * is happy.
198 */ 198 */
199 vunmap(addr); 199 vunmap(addr);
200 return; 200 return;
201 #endif 201 #endif
202 202
203 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); 203 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
204 if (likely(aentry)) { 204 if (likely(aentry)) {
205 spin_lock(&as_lock); 205 spin_lock(&as_lock);
206 aentry->next = as_free_head; 206 aentry->next = as_free_head;
207 aentry->vm_addr = addr; 207 aentry->vm_addr = addr;
208 as_free_head = aentry; 208 as_free_head = aentry;
209 as_list_len++; 209 as_list_len++;
210 spin_unlock(&as_lock); 210 spin_unlock(&as_lock);
211 } else { 211 } else {
212 vunmap(addr); 212 vunmap(addr);
213 } 213 }
214 } 214 }
215 215
216 STATIC void 216 STATIC void
217 purge_addresses(void) 217 purge_addresses(void)
218 { 218 {
219 a_list_t *aentry, *old; 219 a_list_t *aentry, *old;
220 220
221 if (as_free_head == NULL) 221 if (as_free_head == NULL)
222 return; 222 return;
223 223
224 spin_lock(&as_lock); 224 spin_lock(&as_lock);
225 aentry = as_free_head; 225 aentry = as_free_head;
226 as_free_head = NULL; 226 as_free_head = NULL;
227 as_list_len = 0; 227 as_list_len = 0;
228 spin_unlock(&as_lock); 228 spin_unlock(&as_lock);
229 229
230 while ((old = aentry) != NULL) { 230 while ((old = aentry) != NULL) {
231 vunmap(aentry->vm_addr); 231 vunmap(aentry->vm_addr);
232 aentry = aentry->next; 232 aentry = aentry->next;
233 kfree(old); 233 kfree(old);
234 } 234 }
235 } 235 }
236 236
237 /* 237 /*
238 * Internal xfs_buf_t object manipulation 238 * Internal xfs_buf_t object manipulation
239 */ 239 */
240 240
241 STATIC void 241 STATIC void
242 _xfs_buf_initialize( 242 _xfs_buf_initialize(
243 xfs_buf_t *bp, 243 xfs_buf_t *bp,
244 xfs_buftarg_t *target, 244 xfs_buftarg_t *target,
245 xfs_off_t range_base, 245 xfs_off_t range_base,
246 size_t range_length, 246 size_t range_length,
247 xfs_buf_flags_t flags) 247 xfs_buf_flags_t flags)
248 { 248 {
249 /* 249 /*
250 * We don't want certain flags to appear in b_flags. 250 * We don't want certain flags to appear in b_flags.
251 */ 251 */
252 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD); 252 flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
253 253
254 memset(bp, 0, sizeof(xfs_buf_t)); 254 memset(bp, 0, sizeof(xfs_buf_t));
255 atomic_set(&bp->b_hold, 1); 255 atomic_set(&bp->b_hold, 1);
256 init_completion(&bp->b_iowait); 256 init_completion(&bp->b_iowait);
257 INIT_LIST_HEAD(&bp->b_list); 257 INIT_LIST_HEAD(&bp->b_list);
258 INIT_LIST_HEAD(&bp->b_hash_list); 258 INIT_LIST_HEAD(&bp->b_hash_list);
259 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ 259 init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
260 XB_SET_OWNER(bp); 260 XB_SET_OWNER(bp);
261 bp->b_target = target; 261 bp->b_target = target;
262 bp->b_file_offset = range_base; 262 bp->b_file_offset = range_base;
263 /* 263 /*
264 * Set buffer_length and count_desired to the same value initially. 264 * Set buffer_length and count_desired to the same value initially.
265 * I/O routines should use count_desired, which will be the same in 265 * I/O routines should use count_desired, which will be the same in
266 * most cases but may be reset (e.g. XFS recovery). 266 * most cases but may be reset (e.g. XFS recovery).
267 */ 267 */
268 bp->b_buffer_length = bp->b_count_desired = range_length; 268 bp->b_buffer_length = bp->b_count_desired = range_length;
269 bp->b_flags = flags; 269 bp->b_flags = flags;
270 bp->b_bn = XFS_BUF_DADDR_NULL; 270 bp->b_bn = XFS_BUF_DADDR_NULL;
271 atomic_set(&bp->b_pin_count, 0); 271 atomic_set(&bp->b_pin_count, 0);
272 init_waitqueue_head(&bp->b_waiters); 272 init_waitqueue_head(&bp->b_waiters);
273 273
274 XFS_STATS_INC(xb_create); 274 XFS_STATS_INC(xb_create);
275 XB_TRACE(bp, "initialize", target); 275 XB_TRACE(bp, "initialize", target);
276 } 276 }
277 277
278 /* 278 /*
279 * Allocate a page array capable of holding a specified number 279 * Allocate a page array capable of holding a specified number
280 * of pages, and point the page buf at it. 280 * of pages, and point the page buf at it.
281 */ 281 */
282 STATIC int 282 STATIC int
283 _xfs_buf_get_pages( 283 _xfs_buf_get_pages(
284 xfs_buf_t *bp, 284 xfs_buf_t *bp,
285 int page_count, 285 int page_count,
286 xfs_buf_flags_t flags) 286 xfs_buf_flags_t flags)
287 { 287 {
288 /* Make sure that we have a page list */ 288 /* Make sure that we have a page list */
289 if (bp->b_pages == NULL) { 289 if (bp->b_pages == NULL) {
290 bp->b_offset = xfs_buf_poff(bp->b_file_offset); 290 bp->b_offset = xfs_buf_poff(bp->b_file_offset);
291 bp->b_page_count = page_count; 291 bp->b_page_count = page_count;
292 if (page_count <= XB_PAGES) { 292 if (page_count <= XB_PAGES) {
293 bp->b_pages = bp->b_page_array; 293 bp->b_pages = bp->b_page_array;
294 } else { 294 } else {
295 bp->b_pages = kmem_alloc(sizeof(struct page *) * 295 bp->b_pages = kmem_alloc(sizeof(struct page *) *
296 page_count, xb_to_km(flags)); 296 page_count, xb_to_km(flags));
297 if (bp->b_pages == NULL) 297 if (bp->b_pages == NULL)
298 return -ENOMEM; 298 return -ENOMEM;
299 } 299 }
300 memset(bp->b_pages, 0, sizeof(struct page *) * page_count); 300 memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
301 } 301 }
302 return 0; 302 return 0;
303 } 303 }
304 304
305 /* 305 /*
306 * Frees b_pages if it was allocated. 306 * Frees b_pages if it was allocated.
307 */ 307 */
308 STATIC void 308 STATIC void
309 _xfs_buf_free_pages( 309 _xfs_buf_free_pages(
310 xfs_buf_t *bp) 310 xfs_buf_t *bp)
311 { 311 {
312 if (bp->b_pages != bp->b_page_array) { 312 if (bp->b_pages != bp->b_page_array) {
313 kmem_free(bp->b_pages); 313 kmem_free(bp->b_pages);
314 } 314 }
315 } 315 }
316 316
317 /* 317 /*
318 * Releases the specified buffer. 318 * Releases the specified buffer.
319 * 319 *
320 * The modification state of any associated pages is left unchanged. 320 * The modification state of any associated pages is left unchanged.
321 * The buffer most not be on any hash - use xfs_buf_rele instead for 321 * The buffer most not be on any hash - use xfs_buf_rele instead for
322 * hashed and refcounted buffers 322 * hashed and refcounted buffers
323 */ 323 */
324 void 324 void
325 xfs_buf_free( 325 xfs_buf_free(
326 xfs_buf_t *bp) 326 xfs_buf_t *bp)
327 { 327 {
328 XB_TRACE(bp, "free", 0); 328 XB_TRACE(bp, "free", 0);
329 329
330 ASSERT(list_empty(&bp->b_hash_list)); 330 ASSERT(list_empty(&bp->b_hash_list));
331 331
332 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { 332 if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
333 uint i; 333 uint i;
334 334
335 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) 335 if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
336 free_address(bp->b_addr - bp->b_offset); 336 free_address(bp->b_addr - bp->b_offset);
337 337
338 for (i = 0; i < bp->b_page_count; i++) { 338 for (i = 0; i < bp->b_page_count; i++) {
339 struct page *page = bp->b_pages[i]; 339 struct page *page = bp->b_pages[i];
340 340
341 if (bp->b_flags & _XBF_PAGE_CACHE) 341 if (bp->b_flags & _XBF_PAGE_CACHE)
342 ASSERT(!PagePrivate(page)); 342 ASSERT(!PagePrivate(page));
343 page_cache_release(page); 343 page_cache_release(page);
344 } 344 }
345 _xfs_buf_free_pages(bp); 345 _xfs_buf_free_pages(bp);
346 } 346 }
347 347
348 xfs_buf_deallocate(bp); 348 xfs_buf_deallocate(bp);
349 } 349 }
350 350
351 /* 351 /*
352 * Finds all pages for buffer in question and builds it's page list. 352 * Finds all pages for buffer in question and builds it's page list.
353 */ 353 */
354 STATIC int 354 STATIC int
355 _xfs_buf_lookup_pages( 355 _xfs_buf_lookup_pages(
356 xfs_buf_t *bp, 356 xfs_buf_t *bp,
357 uint flags) 357 uint flags)
358 { 358 {
359 struct address_space *mapping = bp->b_target->bt_mapping; 359 struct address_space *mapping = bp->b_target->bt_mapping;
360 size_t blocksize = bp->b_target->bt_bsize; 360 size_t blocksize = bp->b_target->bt_bsize;
361 size_t size = bp->b_count_desired; 361 size_t size = bp->b_count_desired;
362 size_t nbytes, offset; 362 size_t nbytes, offset;
363 gfp_t gfp_mask = xb_to_gfp(flags); 363 gfp_t gfp_mask = xb_to_gfp(flags);
364 unsigned short page_count, i; 364 unsigned short page_count, i;
365 pgoff_t first; 365 pgoff_t first;
366 xfs_off_t end; 366 xfs_off_t end;
367 int error; 367 int error;
368 368
369 end = bp->b_file_offset + bp->b_buffer_length; 369 end = bp->b_file_offset + bp->b_buffer_length;
370 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); 370 page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
371 371
372 error = _xfs_buf_get_pages(bp, page_count, flags); 372 error = _xfs_buf_get_pages(bp, page_count, flags);
373 if (unlikely(error)) 373 if (unlikely(error))
374 return error; 374 return error;
375 bp->b_flags |= _XBF_PAGE_CACHE; 375 bp->b_flags |= _XBF_PAGE_CACHE;
376 376
377 offset = bp->b_offset; 377 offset = bp->b_offset;
378 first = bp->b_file_offset >> PAGE_CACHE_SHIFT; 378 first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
379 379
380 for (i = 0; i < bp->b_page_count; i++) { 380 for (i = 0; i < bp->b_page_count; i++) {
381 struct page *page; 381 struct page *page;
382 uint retries = 0; 382 uint retries = 0;
383 383
384 retry: 384 retry:
385 page = find_or_create_page(mapping, first + i, gfp_mask); 385 page = find_or_create_page(mapping, first + i, gfp_mask);
386 if (unlikely(page == NULL)) { 386 if (unlikely(page == NULL)) {
387 if (flags & XBF_READ_AHEAD) { 387 if (flags & XBF_READ_AHEAD) {
388 bp->b_page_count = i; 388 bp->b_page_count = i;
389 for (i = 0; i < bp->b_page_count; i++) 389 for (i = 0; i < bp->b_page_count; i++)
390 unlock_page(bp->b_pages[i]); 390 unlock_page(bp->b_pages[i]);
391 return -ENOMEM; 391 return -ENOMEM;
392 } 392 }
393 393
394 /* 394 /*
395 * This could deadlock. 395 * This could deadlock.
396 * 396 *
397 * But until all the XFS lowlevel code is revamped to 397 * But until all the XFS lowlevel code is revamped to
398 * handle buffer allocation failures we can't do much. 398 * handle buffer allocation failures we can't do much.
399 */ 399 */
400 if (!(++retries % 100)) 400 if (!(++retries % 100))
401 printk(KERN_ERR 401 printk(KERN_ERR
402 "XFS: possible memory allocation " 402 "XFS: possible memory allocation "
403 "deadlock in %s (mode:0x%x)\n", 403 "deadlock in %s (mode:0x%x)\n",
404 __func__, gfp_mask); 404 __func__, gfp_mask);
405 405
406 XFS_STATS_INC(xb_page_retries); 406 XFS_STATS_INC(xb_page_retries);
407 xfsbufd_wakeup(0, gfp_mask); 407 xfsbufd_wakeup(0, gfp_mask);
408 congestion_wait(WRITE, HZ/50); 408 congestion_wait(WRITE, HZ/50);
409 goto retry; 409 goto retry;
410 } 410 }
411 411
412 XFS_STATS_INC(xb_page_found); 412 XFS_STATS_INC(xb_page_found);
413 413
414 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); 414 nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
415 size -= nbytes; 415 size -= nbytes;
416 416
417 ASSERT(!PagePrivate(page)); 417 ASSERT(!PagePrivate(page));
418 if (!PageUptodate(page)) { 418 if (!PageUptodate(page)) {
419 page_count--; 419 page_count--;
420 if (blocksize >= PAGE_CACHE_SIZE) { 420 if (blocksize >= PAGE_CACHE_SIZE) {
421 if (flags & XBF_READ) 421 if (flags & XBF_READ)
422 bp->b_flags |= _XBF_PAGE_LOCKED; 422 bp->b_flags |= _XBF_PAGE_LOCKED;
423 } else if (!PagePrivate(page)) { 423 } else if (!PagePrivate(page)) {
424 if (test_page_region(page, offset, nbytes)) 424 if (test_page_region(page, offset, nbytes))
425 page_count++; 425 page_count++;
426 } 426 }
427 } 427 }
428 428
429 bp->b_pages[i] = page; 429 bp->b_pages[i] = page;
430 offset = 0; 430 offset = 0;
431 } 431 }
432 432
433 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { 433 if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
434 for (i = 0; i < bp->b_page_count; i++) 434 for (i = 0; i < bp->b_page_count; i++)
435 unlock_page(bp->b_pages[i]); 435 unlock_page(bp->b_pages[i]);
436 } 436 }
437 437
438 if (page_count == bp->b_page_count) 438 if (page_count == bp->b_page_count)
439 bp->b_flags |= XBF_DONE; 439 bp->b_flags |= XBF_DONE;
440 440
441 XB_TRACE(bp, "lookup_pages", (long)page_count); 441 XB_TRACE(bp, "lookup_pages", (long)page_count);
442 return error; 442 return error;
443 } 443 }
444 444
445 /* 445 /*
446 * Map buffer into kernel address-space if nessecary. 446 * Map buffer into kernel address-space if nessecary.
447 */ 447 */
448 STATIC int 448 STATIC int
449 _xfs_buf_map_pages( 449 _xfs_buf_map_pages(
450 xfs_buf_t *bp, 450 xfs_buf_t *bp,
451 uint flags) 451 uint flags)
452 { 452 {
453 /* A single page buffer is always mappable */ 453 /* A single page buffer is always mappable */
454 if (bp->b_page_count == 1) { 454 if (bp->b_page_count == 1) {
455 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 455 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
456 bp->b_flags |= XBF_MAPPED; 456 bp->b_flags |= XBF_MAPPED;
457 } else if (flags & XBF_MAPPED) { 457 } else if (flags & XBF_MAPPED) {
458 if (as_list_len > 64) 458 if (as_list_len > 64)
459 purge_addresses(); 459 purge_addresses();
460 bp->b_addr = vmap(bp->b_pages, bp->b_page_count, 460 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
461 VM_MAP, PAGE_KERNEL); 461 VM_MAP, PAGE_KERNEL);
462 if (unlikely(bp->b_addr == NULL)) 462 if (unlikely(bp->b_addr == NULL))
463 return -ENOMEM; 463 return -ENOMEM;
464 bp->b_addr += bp->b_offset; 464 bp->b_addr += bp->b_offset;
465 bp->b_flags |= XBF_MAPPED; 465 bp->b_flags |= XBF_MAPPED;
466 } 466 }
467 467
468 return 0; 468 return 0;
469 } 469 }
470 470
471 /* 471 /*
472 * Finding and Reading Buffers 472 * Finding and Reading Buffers
473 */ 473 */
474 474
475 /* 475 /*
476 * Look up, and creates if absent, a lockable buffer for 476 * Look up, and creates if absent, a lockable buffer for
477 * a given range of an inode. The buffer is returned 477 * a given range of an inode. The buffer is returned
478 * locked. If other overlapping buffers exist, they are 478 * locked. If other overlapping buffers exist, they are
479 * released before the new buffer is created and locked, 479 * released before the new buffer is created and locked,
480 * which may imply that this call will block until those buffers 480 * which may imply that this call will block until those buffers
481 * are unlocked. No I/O is implied by this call. 481 * are unlocked. No I/O is implied by this call.
482 */ 482 */
483 xfs_buf_t * 483 xfs_buf_t *
484 _xfs_buf_find( 484 _xfs_buf_find(
485 xfs_buftarg_t *btp, /* block device target */ 485 xfs_buftarg_t *btp, /* block device target */
486 xfs_off_t ioff, /* starting offset of range */ 486 xfs_off_t ioff, /* starting offset of range */
487 size_t isize, /* length of range */ 487 size_t isize, /* length of range */
488 xfs_buf_flags_t flags, 488 xfs_buf_flags_t flags,
489 xfs_buf_t *new_bp) 489 xfs_buf_t *new_bp)
490 { 490 {
491 xfs_off_t range_base; 491 xfs_off_t range_base;
492 size_t range_length; 492 size_t range_length;
493 xfs_bufhash_t *hash; 493 xfs_bufhash_t *hash;
494 xfs_buf_t *bp, *n; 494 xfs_buf_t *bp, *n;
495 495
496 range_base = (ioff << BBSHIFT); 496 range_base = (ioff << BBSHIFT);
497 range_length = (isize << BBSHIFT); 497 range_length = (isize << BBSHIFT);
498 498
499 /* Check for IOs smaller than the sector size / not sector aligned */ 499 /* Check for IOs smaller than the sector size / not sector aligned */
500 ASSERT(!(range_length < (1 << btp->bt_sshift))); 500 ASSERT(!(range_length < (1 << btp->bt_sshift)));
501 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); 501 ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
502 502
503 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; 503 hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
504 504
505 spin_lock(&hash->bh_lock); 505 spin_lock(&hash->bh_lock);
506 506
507 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 507 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
508 ASSERT(btp == bp->b_target); 508 ASSERT(btp == bp->b_target);
509 if (bp->b_file_offset == range_base && 509 if (bp->b_file_offset == range_base &&
510 bp->b_buffer_length == range_length) { 510 bp->b_buffer_length == range_length) {
511 /* 511 /*
512 * If we look at something, bring it to the 512 * If we look at something, bring it to the
513 * front of the list for next time. 513 * front of the list for next time.
514 */ 514 */
515 atomic_inc(&bp->b_hold); 515 atomic_inc(&bp->b_hold);
516 list_move(&bp->b_hash_list, &hash->bh_list); 516 list_move(&bp->b_hash_list, &hash->bh_list);
517 goto found; 517 goto found;
518 } 518 }
519 } 519 }
520 520
521 /* No match found */ 521 /* No match found */
522 if (new_bp) { 522 if (new_bp) {
523 _xfs_buf_initialize(new_bp, btp, range_base, 523 _xfs_buf_initialize(new_bp, btp, range_base,
524 range_length, flags); 524 range_length, flags);
525 new_bp->b_hash = hash; 525 new_bp->b_hash = hash;
526 list_add(&new_bp->b_hash_list, &hash->bh_list); 526 list_add(&new_bp->b_hash_list, &hash->bh_list);
527 } else { 527 } else {
528 XFS_STATS_INC(xb_miss_locked); 528 XFS_STATS_INC(xb_miss_locked);
529 } 529 }
530 530
531 spin_unlock(&hash->bh_lock); 531 spin_unlock(&hash->bh_lock);
532 return new_bp; 532 return new_bp;
533 533
534 found: 534 found:
535 spin_unlock(&hash->bh_lock); 535 spin_unlock(&hash->bh_lock);
536 536
537 /* Attempt to get the semaphore without sleeping, 537 /* Attempt to get the semaphore without sleeping,
538 * if this does not work then we need to drop the 538 * if this does not work then we need to drop the
539 * spinlock and do a hard attempt on the semaphore. 539 * spinlock and do a hard attempt on the semaphore.
540 */ 540 */
541 if (down_trylock(&bp->b_sema)) { 541 if (down_trylock(&bp->b_sema)) {
542 if (!(flags & XBF_TRYLOCK)) { 542 if (!(flags & XBF_TRYLOCK)) {
543 /* wait for buffer ownership */ 543 /* wait for buffer ownership */
544 XB_TRACE(bp, "get_lock", 0); 544 XB_TRACE(bp, "get_lock", 0);
545 xfs_buf_lock(bp); 545 xfs_buf_lock(bp);
546 XFS_STATS_INC(xb_get_locked_waited); 546 XFS_STATS_INC(xb_get_locked_waited);
547 } else { 547 } else {
548 /* We asked for a trylock and failed, no need 548 /* We asked for a trylock and failed, no need
549 * to look at file offset and length here, we 549 * to look at file offset and length here, we
550 * know that this buffer at least overlaps our 550 * know that this buffer at least overlaps our
551 * buffer and is locked, therefore our buffer 551 * buffer and is locked, therefore our buffer
552 * either does not exist, or is this buffer. 552 * either does not exist, or is this buffer.
553 */ 553 */
554 xfs_buf_rele(bp); 554 xfs_buf_rele(bp);
555 XFS_STATS_INC(xb_busy_locked); 555 XFS_STATS_INC(xb_busy_locked);
556 return NULL; 556 return NULL;
557 } 557 }
558 } else { 558 } else {
559 /* trylock worked */ 559 /* trylock worked */
560 XB_SET_OWNER(bp); 560 XB_SET_OWNER(bp);
561 } 561 }
562 562
563 if (bp->b_flags & XBF_STALE) { 563 if (bp->b_flags & XBF_STALE) {
564 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 564 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
565 bp->b_flags &= XBF_MAPPED; 565 bp->b_flags &= XBF_MAPPED;
566 } 566 }
567 XB_TRACE(bp, "got_lock", 0); 567 XB_TRACE(bp, "got_lock", 0);
568 XFS_STATS_INC(xb_get_locked); 568 XFS_STATS_INC(xb_get_locked);
569 return bp; 569 return bp;
570 } 570 }
571 571
572 /* 572 /*
573 * Assembles a buffer covering the specified range. 573 * Assembles a buffer covering the specified range.
574 * Storage in memory for all portions of the buffer will be allocated, 574 * Storage in memory for all portions of the buffer will be allocated,
575 * although backing storage may not be. 575 * although backing storage may not be.
576 */ 576 */
577 xfs_buf_t * 577 xfs_buf_t *
578 xfs_buf_get_flags( 578 xfs_buf_get_flags(
579 xfs_buftarg_t *target,/* target for buffer */ 579 xfs_buftarg_t *target,/* target for buffer */
580 xfs_off_t ioff, /* starting offset of range */ 580 xfs_off_t ioff, /* starting offset of range */
581 size_t isize, /* length of range */ 581 size_t isize, /* length of range */
582 xfs_buf_flags_t flags) 582 xfs_buf_flags_t flags)
583 { 583 {
584 xfs_buf_t *bp, *new_bp; 584 xfs_buf_t *bp, *new_bp;
585 int error = 0, i; 585 int error = 0, i;
586 586
587 new_bp = xfs_buf_allocate(flags); 587 new_bp = xfs_buf_allocate(flags);
588 if (unlikely(!new_bp)) 588 if (unlikely(!new_bp))
589 return NULL; 589 return NULL;
590 590
591 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); 591 bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
592 if (bp == new_bp) { 592 if (bp == new_bp) {
593 error = _xfs_buf_lookup_pages(bp, flags); 593 error = _xfs_buf_lookup_pages(bp, flags);
594 if (error) 594 if (error)
595 goto no_buffer; 595 goto no_buffer;
596 } else { 596 } else {
597 xfs_buf_deallocate(new_bp); 597 xfs_buf_deallocate(new_bp);
598 if (unlikely(bp == NULL)) 598 if (unlikely(bp == NULL))
599 return NULL; 599 return NULL;
600 } 600 }
601 601
602 for (i = 0; i < bp->b_page_count; i++) 602 for (i = 0; i < bp->b_page_count; i++)
603 mark_page_accessed(bp->b_pages[i]); 603 mark_page_accessed(bp->b_pages[i]);
604 604
605 if (!(bp->b_flags & XBF_MAPPED)) { 605 if (!(bp->b_flags & XBF_MAPPED)) {
606 error = _xfs_buf_map_pages(bp, flags); 606 error = _xfs_buf_map_pages(bp, flags);
607 if (unlikely(error)) { 607 if (unlikely(error)) {
608 printk(KERN_WARNING "%s: failed to map pages\n", 608 printk(KERN_WARNING "%s: failed to map pages\n",
609 __func__); 609 __func__);
610 goto no_buffer; 610 goto no_buffer;
611 } 611 }
612 } 612 }
613 613
614 XFS_STATS_INC(xb_get); 614 XFS_STATS_INC(xb_get);
615 615
616 /* 616 /*
617 * Always fill in the block number now, the mapped cases can do 617 * Always fill in the block number now, the mapped cases can do
618 * their own overlay of this later. 618 * their own overlay of this later.
619 */ 619 */
620 bp->b_bn = ioff; 620 bp->b_bn = ioff;
621 bp->b_count_desired = bp->b_buffer_length; 621 bp->b_count_desired = bp->b_buffer_length;
622 622
623 XB_TRACE(bp, "get", (unsigned long)flags); 623 XB_TRACE(bp, "get", (unsigned long)flags);
624 return bp; 624 return bp;
625 625
626 no_buffer: 626 no_buffer:
627 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 627 if (flags & (XBF_LOCK | XBF_TRYLOCK))
628 xfs_buf_unlock(bp); 628 xfs_buf_unlock(bp);
629 xfs_buf_rele(bp); 629 xfs_buf_rele(bp);
630 return NULL; 630 return NULL;
631 } 631 }
632 632
633 STATIC int 633 STATIC int
634 _xfs_buf_read( 634 _xfs_buf_read(
635 xfs_buf_t *bp, 635 xfs_buf_t *bp,
636 xfs_buf_flags_t flags) 636 xfs_buf_flags_t flags)
637 { 637 {
638 int status; 638 int status;
639 639
640 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags); 640 XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
641 641
642 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); 642 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
643 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 643 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
644 644
645 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 645 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
646 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 646 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
647 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \ 647 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
648 XBF_READ_AHEAD | _XBF_RUN_QUEUES); 648 XBF_READ_AHEAD | _XBF_RUN_QUEUES);
649 649
650 status = xfs_buf_iorequest(bp); 650 status = xfs_buf_iorequest(bp);
651 if (!status && !(flags & XBF_ASYNC)) 651 if (!status && !(flags & XBF_ASYNC))
652 status = xfs_buf_iowait(bp); 652 status = xfs_buf_iowait(bp);
653 return status; 653 return status;
654 } 654 }
655 655
656 xfs_buf_t * 656 xfs_buf_t *
657 xfs_buf_read_flags( 657 xfs_buf_read_flags(
658 xfs_buftarg_t *target, 658 xfs_buftarg_t *target,
659 xfs_off_t ioff, 659 xfs_off_t ioff,
660 size_t isize, 660 size_t isize,
661 xfs_buf_flags_t flags) 661 xfs_buf_flags_t flags)
662 { 662 {
663 xfs_buf_t *bp; 663 xfs_buf_t *bp;
664 664
665 flags |= XBF_READ; 665 flags |= XBF_READ;
666 666
667 bp = xfs_buf_get_flags(target, ioff, isize, flags); 667 bp = xfs_buf_get_flags(target, ioff, isize, flags);
668 if (bp) { 668 if (bp) {
669 if (!XFS_BUF_ISDONE(bp)) { 669 if (!XFS_BUF_ISDONE(bp)) {
670 XB_TRACE(bp, "read", (unsigned long)flags); 670 XB_TRACE(bp, "read", (unsigned long)flags);
671 XFS_STATS_INC(xb_get_read); 671 XFS_STATS_INC(xb_get_read);
672 _xfs_buf_read(bp, flags); 672 _xfs_buf_read(bp, flags);
673 } else if (flags & XBF_ASYNC) { 673 } else if (flags & XBF_ASYNC) {
674 XB_TRACE(bp, "read_async", (unsigned long)flags); 674 XB_TRACE(bp, "read_async", (unsigned long)flags);
675 /* 675 /*
676 * Read ahead call which is already satisfied, 676 * Read ahead call which is already satisfied,
677 * drop the buffer 677 * drop the buffer
678 */ 678 */
679 goto no_buffer; 679 goto no_buffer;
680 } else { 680 } else {
681 XB_TRACE(bp, "read_done", (unsigned long)flags); 681 XB_TRACE(bp, "read_done", (unsigned long)flags);
682 /* We do not want read in the flags */ 682 /* We do not want read in the flags */
683 bp->b_flags &= ~XBF_READ; 683 bp->b_flags &= ~XBF_READ;
684 } 684 }
685 } 685 }
686 686
687 return bp; 687 return bp;
688 688
689 no_buffer: 689 no_buffer:
690 if (flags & (XBF_LOCK | XBF_TRYLOCK)) 690 if (flags & (XBF_LOCK | XBF_TRYLOCK))
691 xfs_buf_unlock(bp); 691 xfs_buf_unlock(bp);
692 xfs_buf_rele(bp); 692 xfs_buf_rele(bp);
693 return NULL; 693 return NULL;
694 } 694 }
695 695
696 /* 696 /*
697 * If we are not low on memory then do the readahead in a deadlock 697 * If we are not low on memory then do the readahead in a deadlock
698 * safe manner. 698 * safe manner.
699 */ 699 */
700 void 700 void
701 xfs_buf_readahead( 701 xfs_buf_readahead(
702 xfs_buftarg_t *target, 702 xfs_buftarg_t *target,
703 xfs_off_t ioff, 703 xfs_off_t ioff,
704 size_t isize, 704 size_t isize,
705 xfs_buf_flags_t flags) 705 xfs_buf_flags_t flags)
706 { 706 {
707 struct backing_dev_info *bdi; 707 struct backing_dev_info *bdi;
708 708
709 bdi = target->bt_mapping->backing_dev_info; 709 bdi = target->bt_mapping->backing_dev_info;
710 if (bdi_read_congested(bdi)) 710 if (bdi_read_congested(bdi))
711 return; 711 return;
712 712
713 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); 713 flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
714 xfs_buf_read_flags(target, ioff, isize, flags); 714 xfs_buf_read_flags(target, ioff, isize, flags);
715 } 715 }
716 716
717 xfs_buf_t * 717 xfs_buf_t *
718 xfs_buf_get_empty( 718 xfs_buf_get_empty(
719 size_t len, 719 size_t len,
720 xfs_buftarg_t *target) 720 xfs_buftarg_t *target)
721 { 721 {
722 xfs_buf_t *bp; 722 xfs_buf_t *bp;
723 723
724 bp = xfs_buf_allocate(0); 724 bp = xfs_buf_allocate(0);
725 if (bp) 725 if (bp)
726 _xfs_buf_initialize(bp, target, 0, len, 0); 726 _xfs_buf_initialize(bp, target, 0, len, 0);
727 return bp; 727 return bp;
728 } 728 }
729 729
730 static inline struct page * 730 static inline struct page *
731 mem_to_page( 731 mem_to_page(
732 void *addr) 732 void *addr)
733 { 733 {
734 if ((!is_vmalloc_addr(addr))) { 734 if ((!is_vmalloc_addr(addr))) {
735 return virt_to_page(addr); 735 return virt_to_page(addr);
736 } else { 736 } else {
737 return vmalloc_to_page(addr); 737 return vmalloc_to_page(addr);
738 } 738 }
739 } 739 }
740 740
741 int 741 int
742 xfs_buf_associate_memory( 742 xfs_buf_associate_memory(
743 xfs_buf_t *bp, 743 xfs_buf_t *bp,
744 void *mem, 744 void *mem,
745 size_t len) 745 size_t len)
746 { 746 {
747 int rval; 747 int rval;
748 int i = 0; 748 int i = 0;
749 unsigned long pageaddr; 749 unsigned long pageaddr;
750 unsigned long offset; 750 unsigned long offset;
751 size_t buflen; 751 size_t buflen;
752 int page_count; 752 int page_count;
753 753
754 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; 754 pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
755 offset = (unsigned long)mem - pageaddr; 755 offset = (unsigned long)mem - pageaddr;
756 buflen = PAGE_CACHE_ALIGN(len + offset); 756 buflen = PAGE_CACHE_ALIGN(len + offset);
757 page_count = buflen >> PAGE_CACHE_SHIFT; 757 page_count = buflen >> PAGE_CACHE_SHIFT;
758 758
759 /* Free any previous set of page pointers */ 759 /* Free any previous set of page pointers */
760 if (bp->b_pages) 760 if (bp->b_pages)
761 _xfs_buf_free_pages(bp); 761 _xfs_buf_free_pages(bp);
762 762
763 bp->b_pages = NULL; 763 bp->b_pages = NULL;
764 bp->b_addr = mem; 764 bp->b_addr = mem;
765 765
766 rval = _xfs_buf_get_pages(bp, page_count, 0); 766 rval = _xfs_buf_get_pages(bp, page_count, 0);
767 if (rval) 767 if (rval)
768 return rval; 768 return rval;
769 769
770 bp->b_offset = offset; 770 bp->b_offset = offset;
771 771
772 for (i = 0; i < bp->b_page_count; i++) { 772 for (i = 0; i < bp->b_page_count; i++) {
773 bp->b_pages[i] = mem_to_page((void *)pageaddr); 773 bp->b_pages[i] = mem_to_page((void *)pageaddr);
774 pageaddr += PAGE_CACHE_SIZE; 774 pageaddr += PAGE_CACHE_SIZE;
775 } 775 }
776 776
777 bp->b_count_desired = len; 777 bp->b_count_desired = len;
778 bp->b_buffer_length = buflen; 778 bp->b_buffer_length = buflen;
779 bp->b_flags |= XBF_MAPPED; 779 bp->b_flags |= XBF_MAPPED;
780 bp->b_flags &= ~_XBF_PAGE_LOCKED; 780 bp->b_flags &= ~_XBF_PAGE_LOCKED;
781 781
782 return 0; 782 return 0;
783 } 783 }
784 784
785 xfs_buf_t * 785 xfs_buf_t *
786 xfs_buf_get_noaddr( 786 xfs_buf_get_noaddr(
787 size_t len, 787 size_t len,
788 xfs_buftarg_t *target) 788 xfs_buftarg_t *target)
789 { 789 {
790 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; 790 unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
791 int error, i; 791 int error, i;
792 xfs_buf_t *bp; 792 xfs_buf_t *bp;
793 793
794 bp = xfs_buf_allocate(0); 794 bp = xfs_buf_allocate(0);
795 if (unlikely(bp == NULL)) 795 if (unlikely(bp == NULL))
796 goto fail; 796 goto fail;
797 _xfs_buf_initialize(bp, target, 0, len, 0); 797 _xfs_buf_initialize(bp, target, 0, len, 0);
798 798
799 error = _xfs_buf_get_pages(bp, page_count, 0); 799 error = _xfs_buf_get_pages(bp, page_count, 0);
800 if (error) 800 if (error)
801 goto fail_free_buf; 801 goto fail_free_buf;
802 802
803 for (i = 0; i < page_count; i++) { 803 for (i = 0; i < page_count; i++) {
804 bp->b_pages[i] = alloc_page(GFP_KERNEL); 804 bp->b_pages[i] = alloc_page(GFP_KERNEL);
805 if (!bp->b_pages[i]) 805 if (!bp->b_pages[i])
806 goto fail_free_mem; 806 goto fail_free_mem;
807 } 807 }
808 bp->b_flags |= _XBF_PAGES; 808 bp->b_flags |= _XBF_PAGES;
809 809
810 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 810 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
811 if (unlikely(error)) { 811 if (unlikely(error)) {
812 printk(KERN_WARNING "%s: failed to map pages\n", 812 printk(KERN_WARNING "%s: failed to map pages\n",
813 __func__); 813 __func__);
814 goto fail_free_mem; 814 goto fail_free_mem;
815 } 815 }
816 816
817 xfs_buf_unlock(bp); 817 xfs_buf_unlock(bp);
818 818
819 XB_TRACE(bp, "no_daddr", len); 819 XB_TRACE(bp, "no_daddr", len);
820 return bp; 820 return bp;
821 821
822 fail_free_mem: 822 fail_free_mem:
823 while (--i >= 0) 823 while (--i >= 0)
824 __free_page(bp->b_pages[i]); 824 __free_page(bp->b_pages[i]);
825 _xfs_buf_free_pages(bp); 825 _xfs_buf_free_pages(bp);
826 fail_free_buf: 826 fail_free_buf:
827 xfs_buf_deallocate(bp); 827 xfs_buf_deallocate(bp);
828 fail: 828 fail:
829 return NULL; 829 return NULL;
830 } 830 }
831 831
832 /* 832 /*
833 * Increment reference count on buffer, to hold the buffer concurrently 833 * Increment reference count on buffer, to hold the buffer concurrently
834 * with another thread which may release (free) the buffer asynchronously. 834 * with another thread which may release (free) the buffer asynchronously.
835 * Must hold the buffer already to call this function. 835 * Must hold the buffer already to call this function.
836 */ 836 */
837 void 837 void
838 xfs_buf_hold( 838 xfs_buf_hold(
839 xfs_buf_t *bp) 839 xfs_buf_t *bp)
840 { 840 {
841 atomic_inc(&bp->b_hold); 841 atomic_inc(&bp->b_hold);
842 XB_TRACE(bp, "hold", 0); 842 XB_TRACE(bp, "hold", 0);
843 } 843 }
844 844
845 /* 845 /*
846 * Releases a hold on the specified buffer. If the 846 * Releases a hold on the specified buffer. If the
847 * the hold count is 1, calls xfs_buf_free. 847 * the hold count is 1, calls xfs_buf_free.
848 */ 848 */
849 void 849 void
850 xfs_buf_rele( 850 xfs_buf_rele(
851 xfs_buf_t *bp) 851 xfs_buf_t *bp)
852 { 852 {
853 xfs_bufhash_t *hash = bp->b_hash; 853 xfs_bufhash_t *hash = bp->b_hash;
854 854
855 XB_TRACE(bp, "rele", bp->b_relse); 855 XB_TRACE(bp, "rele", bp->b_relse);
856 856
857 if (unlikely(!hash)) { 857 if (unlikely(!hash)) {
858 ASSERT(!bp->b_relse); 858 ASSERT(!bp->b_relse);
859 if (atomic_dec_and_test(&bp->b_hold)) 859 if (atomic_dec_and_test(&bp->b_hold))
860 xfs_buf_free(bp); 860 xfs_buf_free(bp);
861 return; 861 return;
862 } 862 }
863 863
864 ASSERT(atomic_read(&bp->b_hold) > 0); 864 ASSERT(atomic_read(&bp->b_hold) > 0);
865 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { 865 if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
866 if (bp->b_relse) { 866 if (bp->b_relse) {
867 atomic_inc(&bp->b_hold); 867 atomic_inc(&bp->b_hold);
868 spin_unlock(&hash->bh_lock); 868 spin_unlock(&hash->bh_lock);
869 (*(bp->b_relse)) (bp); 869 (*(bp->b_relse)) (bp);
870 } else if (bp->b_flags & XBF_FS_MANAGED) { 870 } else if (bp->b_flags & XBF_FS_MANAGED) {
871 spin_unlock(&hash->bh_lock); 871 spin_unlock(&hash->bh_lock);
872 } else { 872 } else {
873 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 873 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
874 list_del_init(&bp->b_hash_list); 874 list_del_init(&bp->b_hash_list);
875 spin_unlock(&hash->bh_lock); 875 spin_unlock(&hash->bh_lock);
876 xfs_buf_free(bp); 876 xfs_buf_free(bp);
877 } 877 }
878 } 878 }
879 } 879 }
880 880
881 881
882 /* 882 /*
883 * Mutual exclusion on buffers. Locking model: 883 * Mutual exclusion on buffers. Locking model:
884 * 884 *
885 * Buffers associated with inodes for which buffer locking 885 * Buffers associated with inodes for which buffer locking
886 * is not enabled are not protected by semaphores, and are 886 * is not enabled are not protected by semaphores, and are
887 * assumed to be exclusively owned by the caller. There is a 887 * assumed to be exclusively owned by the caller. There is a
888 * spinlock in the buffer, used by the caller when concurrent 888 * spinlock in the buffer, used by the caller when concurrent
889 * access is possible. 889 * access is possible.
890 */ 890 */
891 891
892 /* 892 /*
893 * Locks a buffer object, if it is not already locked. 893 * Locks a buffer object, if it is not already locked.
894 * Note that this in no way locks the underlying pages, so it is only 894 * Note that this in no way locks the underlying pages, so it is only
895 * useful for synchronizing concurrent use of buffer objects, not for 895 * useful for synchronizing concurrent use of buffer objects, not for
896 * synchronizing independent access to the underlying pages. 896 * synchronizing independent access to the underlying pages.
897 */ 897 */
898 int 898 int
899 xfs_buf_cond_lock( 899 xfs_buf_cond_lock(
900 xfs_buf_t *bp) 900 xfs_buf_t *bp)
901 { 901 {
902 int locked; 902 int locked;
903 903
904 locked = down_trylock(&bp->b_sema) == 0; 904 locked = down_trylock(&bp->b_sema) == 0;
905 if (locked) { 905 if (locked) {
906 XB_SET_OWNER(bp); 906 XB_SET_OWNER(bp);
907 } 907 }
908 XB_TRACE(bp, "cond_lock", (long)locked); 908 XB_TRACE(bp, "cond_lock", (long)locked);
909 return locked ? 0 : -EBUSY; 909 return locked ? 0 : -EBUSY;
910 } 910 }
911 911
912 #if defined(DEBUG) || defined(XFS_BLI_TRACE) 912 #if defined(DEBUG) || defined(XFS_BLI_TRACE)
913 int 913 int
914 xfs_buf_lock_value( 914 xfs_buf_lock_value(
915 xfs_buf_t *bp) 915 xfs_buf_t *bp)
916 { 916 {
917 return bp->b_sema.count; 917 return bp->b_sema.count;
918 } 918 }
919 #endif 919 #endif
920 920
921 /* 921 /*
922 * Locks a buffer object. 922 * Locks a buffer object.
923 * Note that this in no way locks the underlying pages, so it is only 923 * Note that this in no way locks the underlying pages, so it is only
924 * useful for synchronizing concurrent use of buffer objects, not for 924 * useful for synchronizing concurrent use of buffer objects, not for
925 * synchronizing independent access to the underlying pages. 925 * synchronizing independent access to the underlying pages.
926 */ 926 */
927 void 927 void
928 xfs_buf_lock( 928 xfs_buf_lock(
929 xfs_buf_t *bp) 929 xfs_buf_t *bp)
930 { 930 {
931 XB_TRACE(bp, "lock", 0); 931 XB_TRACE(bp, "lock", 0);
932 if (atomic_read(&bp->b_io_remaining)) 932 if (atomic_read(&bp->b_io_remaining))
933 blk_run_address_space(bp->b_target->bt_mapping); 933 blk_run_address_space(bp->b_target->bt_mapping);
934 down(&bp->b_sema); 934 down(&bp->b_sema);
935 XB_SET_OWNER(bp); 935 XB_SET_OWNER(bp);
936 XB_TRACE(bp, "locked", 0); 936 XB_TRACE(bp, "locked", 0);
937 } 937 }
938 938
939 /* 939 /*
940 * Releases the lock on the buffer object. 940 * Releases the lock on the buffer object.
941 * If the buffer is marked delwri but is not queued, do so before we 941 * If the buffer is marked delwri but is not queued, do so before we
942 * unlock the buffer as we need to set flags correctly. We also need to 942 * unlock the buffer as we need to set flags correctly. We also need to
943 * take a reference for the delwri queue because the unlocker is going to 943 * take a reference for the delwri queue because the unlocker is going to
944 * drop their's and they don't know we just queued it. 944 * drop their's and they don't know we just queued it.
945 */ 945 */
946 void 946 void
947 xfs_buf_unlock( 947 xfs_buf_unlock(
948 xfs_buf_t *bp) 948 xfs_buf_t *bp)
949 { 949 {
950 if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) { 950 if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
951 atomic_inc(&bp->b_hold); 951 atomic_inc(&bp->b_hold);
952 bp->b_flags |= XBF_ASYNC; 952 bp->b_flags |= XBF_ASYNC;
953 xfs_buf_delwri_queue(bp, 0); 953 xfs_buf_delwri_queue(bp, 0);
954 } 954 }
955 955
956 XB_CLEAR_OWNER(bp); 956 XB_CLEAR_OWNER(bp);
957 up(&bp->b_sema); 957 up(&bp->b_sema);
958 XB_TRACE(bp, "unlock", 0); 958 XB_TRACE(bp, "unlock", 0);
959 } 959 }
960 960
961 961
962 /* 962 /*
963 * Pinning Buffer Storage in Memory 963 * Pinning Buffer Storage in Memory
964 * Ensure that no attempt to force a buffer to disk will succeed. 964 * Ensure that no attempt to force a buffer to disk will succeed.
965 */ 965 */
966 void 966 void
967 xfs_buf_pin( 967 xfs_buf_pin(
968 xfs_buf_t *bp) 968 xfs_buf_t *bp)
969 { 969 {
970 atomic_inc(&bp->b_pin_count); 970 atomic_inc(&bp->b_pin_count);
971 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter); 971 XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
972 } 972 }
973 973
974 void 974 void
975 xfs_buf_unpin( 975 xfs_buf_unpin(
976 xfs_buf_t *bp) 976 xfs_buf_t *bp)
977 { 977 {
978 if (atomic_dec_and_test(&bp->b_pin_count)) 978 if (atomic_dec_and_test(&bp->b_pin_count))
979 wake_up_all(&bp->b_waiters); 979 wake_up_all(&bp->b_waiters);
980 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter); 980 XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
981 } 981 }
982 982
983 int 983 int
984 xfs_buf_ispin( 984 xfs_buf_ispin(
985 xfs_buf_t *bp) 985 xfs_buf_t *bp)
986 { 986 {
987 return atomic_read(&bp->b_pin_count); 987 return atomic_read(&bp->b_pin_count);
988 } 988 }
989 989
990 STATIC void 990 STATIC void
991 xfs_buf_wait_unpin( 991 xfs_buf_wait_unpin(
992 xfs_buf_t *bp) 992 xfs_buf_t *bp)
993 { 993 {
994 DECLARE_WAITQUEUE (wait, current); 994 DECLARE_WAITQUEUE (wait, current);
995 995
996 if (atomic_read(&bp->b_pin_count) == 0) 996 if (atomic_read(&bp->b_pin_count) == 0)
997 return; 997 return;
998 998
999 add_wait_queue(&bp->b_waiters, &wait); 999 add_wait_queue(&bp->b_waiters, &wait);
1000 for (;;) { 1000 for (;;) {
1001 set_current_state(TASK_UNINTERRUPTIBLE); 1001 set_current_state(TASK_UNINTERRUPTIBLE);
1002 if (atomic_read(&bp->b_pin_count) == 0) 1002 if (atomic_read(&bp->b_pin_count) == 0)
1003 break; 1003 break;
1004 if (atomic_read(&bp->b_io_remaining)) 1004 if (atomic_read(&bp->b_io_remaining))
1005 blk_run_address_space(bp->b_target->bt_mapping); 1005 blk_run_address_space(bp->b_target->bt_mapping);
1006 schedule(); 1006 schedule();
1007 } 1007 }
1008 remove_wait_queue(&bp->b_waiters, &wait); 1008 remove_wait_queue(&bp->b_waiters, &wait);
1009 set_current_state(TASK_RUNNING); 1009 set_current_state(TASK_RUNNING);
1010 } 1010 }
1011 1011
1012 /* 1012 /*
1013 * Buffer Utility Routines 1013 * Buffer Utility Routines
1014 */ 1014 */
1015 1015
1016 STATIC void 1016 STATIC void
1017 xfs_buf_iodone_work( 1017 xfs_buf_iodone_work(
1018 struct work_struct *work) 1018 struct work_struct *work)
1019 { 1019 {
1020 xfs_buf_t *bp = 1020 xfs_buf_t *bp =
1021 container_of(work, xfs_buf_t, b_iodone_work); 1021 container_of(work, xfs_buf_t, b_iodone_work);
1022 1022
1023 /* 1023 /*
1024 * We can get an EOPNOTSUPP to ordered writes. Here we clear the 1024 * We can get an EOPNOTSUPP to ordered writes. Here we clear the
1025 * ordered flag and reissue them. Because we can't tell the higher 1025 * ordered flag and reissue them. Because we can't tell the higher
1026 * layers directly that they should not issue ordered I/O anymore, they 1026 * layers directly that they should not issue ordered I/O anymore, they
1027 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion. 1027 * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
1028 */ 1028 */
1029 if ((bp->b_error == EOPNOTSUPP) && 1029 if ((bp->b_error == EOPNOTSUPP) &&
1030 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { 1030 (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1031 XB_TRACE(bp, "ordered_retry", bp->b_iodone); 1031 XB_TRACE(bp, "ordered_retry", bp->b_iodone);
1032 bp->b_flags &= ~XBF_ORDERED; 1032 bp->b_flags &= ~XBF_ORDERED;
1033 bp->b_flags |= _XFS_BARRIER_FAILED; 1033 bp->b_flags |= _XFS_BARRIER_FAILED;
1034 xfs_buf_iorequest(bp); 1034 xfs_buf_iorequest(bp);
1035 } else if (bp->b_iodone) 1035 } else if (bp->b_iodone)
1036 (*(bp->b_iodone))(bp); 1036 (*(bp->b_iodone))(bp);
1037 else if (bp->b_flags & XBF_ASYNC) 1037 else if (bp->b_flags & XBF_ASYNC)
1038 xfs_buf_relse(bp); 1038 xfs_buf_relse(bp);
1039 } 1039 }
1040 1040
1041 void 1041 void
1042 xfs_buf_ioend( 1042 xfs_buf_ioend(
1043 xfs_buf_t *bp, 1043 xfs_buf_t *bp,
1044 int schedule) 1044 int schedule)
1045 { 1045 {
1046 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); 1046 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1047 if (bp->b_error == 0) 1047 if (bp->b_error == 0)
1048 bp->b_flags |= XBF_DONE; 1048 bp->b_flags |= XBF_DONE;
1049 1049
1050 XB_TRACE(bp, "iodone", bp->b_iodone); 1050 XB_TRACE(bp, "iodone", bp->b_iodone);
1051 1051
1052 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { 1052 if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
1053 if (schedule) { 1053 if (schedule) {
1054 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); 1054 INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1055 queue_work(xfslogd_workqueue, &bp->b_iodone_work); 1055 queue_work(xfslogd_workqueue, &bp->b_iodone_work);
1056 } else { 1056 } else {
1057 xfs_buf_iodone_work(&bp->b_iodone_work); 1057 xfs_buf_iodone_work(&bp->b_iodone_work);
1058 } 1058 }
1059 } else { 1059 } else {
1060 complete(&bp->b_iowait); 1060 complete(&bp->b_iowait);
1061 } 1061 }
1062 } 1062 }
1063 1063
1064 void 1064 void
1065 xfs_buf_ioerror( 1065 xfs_buf_ioerror(
1066 xfs_buf_t *bp, 1066 xfs_buf_t *bp,
1067 int error) 1067 int error)
1068 { 1068 {
1069 ASSERT(error >= 0 && error <= 0xffff); 1069 ASSERT(error >= 0 && error <= 0xffff);
1070 bp->b_error = (unsigned short)error; 1070 bp->b_error = (unsigned short)error;
1071 XB_TRACE(bp, "ioerror", (unsigned long)error); 1071 XB_TRACE(bp, "ioerror", (unsigned long)error);
1072 } 1072 }
1073 1073
1074 int 1074 int
1075 xfs_bawrite( 1075 xfs_bawrite(
1076 void *mp, 1076 void *mp,
1077 struct xfs_buf *bp) 1077 struct xfs_buf *bp)
1078 { 1078 {
1079 XB_TRACE(bp, "bawrite", 0); 1079 XB_TRACE(bp, "bawrite", 0);
1080 1080
1081 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 1081 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
1082 1082
1083 xfs_buf_delwri_dequeue(bp); 1083 xfs_buf_delwri_dequeue(bp);
1084 1084
1085 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD); 1085 bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
1086 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES); 1086 bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
1087 1087
1088 bp->b_fspriv3 = mp; 1088 bp->b_mount = mp;
1089 bp->b_strat = xfs_bdstrat_cb; 1089 bp->b_strat = xfs_bdstrat_cb;
1090 return xfs_bdstrat_cb(bp); 1090 return xfs_bdstrat_cb(bp);
1091 } 1091 }
1092 1092
1093 void 1093 void
1094 xfs_bdwrite( 1094 xfs_bdwrite(
1095 void *mp, 1095 void *mp,
1096 struct xfs_buf *bp) 1096 struct xfs_buf *bp)
1097 { 1097 {
1098 XB_TRACE(bp, "bdwrite", 0); 1098 XB_TRACE(bp, "bdwrite", 0);
1099 1099
1100 bp->b_strat = xfs_bdstrat_cb; 1100 bp->b_strat = xfs_bdstrat_cb;
1101 bp->b_fspriv3 = mp; 1101 bp->b_mount = mp;
1102 1102
1103 bp->b_flags &= ~XBF_READ; 1103 bp->b_flags &= ~XBF_READ;
1104 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); 1104 bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1105 1105
1106 xfs_buf_delwri_queue(bp, 1); 1106 xfs_buf_delwri_queue(bp, 1);
1107 } 1107 }
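Both write helpers now stash the owning mount in the typed b_mount field before handing the buffer to xfs_bdstrat_cb, so the strategy routine can reach the mount without the old untyped accessor. A minimal, hypothetical caller for illustration (error handling elided; the function name is not part of this change):

	int
	example_flush_buf(
		struct xfs_mount	*mp,
		xfs_buf_t		*bp)
	{
		/* Asynchronous write; bp->b_mount is set inside xfs_bawrite(). */
		return xfs_bawrite(mp, bp);
	}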
1108 1108
1109 STATIC_INLINE void 1109 STATIC_INLINE void
1110 _xfs_buf_ioend( 1110 _xfs_buf_ioend(
1111 xfs_buf_t *bp, 1111 xfs_buf_t *bp,
1112 int schedule) 1112 int schedule)
1113 { 1113 {
1114 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1114 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1115 bp->b_flags &= ~_XBF_PAGE_LOCKED; 1115 bp->b_flags &= ~_XBF_PAGE_LOCKED;
1116 xfs_buf_ioend(bp, schedule); 1116 xfs_buf_ioend(bp, schedule);
1117 } 1117 }
1118 } 1118 }
1119 1119
1120 STATIC void 1120 STATIC void
1121 xfs_buf_bio_end_io( 1121 xfs_buf_bio_end_io(
1122 struct bio *bio, 1122 struct bio *bio,
1123 int error) 1123 int error)
1124 { 1124 {
1125 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; 1125 xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
1126 unsigned int blocksize = bp->b_target->bt_bsize; 1126 unsigned int blocksize = bp->b_target->bt_bsize;
1127 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1127 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1128 1128
1129 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 1129 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1130 bp->b_error = EIO; 1130 bp->b_error = EIO;
1131 1131
1132 do { 1132 do {
1133 struct page *page = bvec->bv_page; 1133 struct page *page = bvec->bv_page;
1134 1134
1135 ASSERT(!PagePrivate(page)); 1135 ASSERT(!PagePrivate(page));
1136 if (unlikely(bp->b_error)) { 1136 if (unlikely(bp->b_error)) {
1137 if (bp->b_flags & XBF_READ) 1137 if (bp->b_flags & XBF_READ)
1138 ClearPageUptodate(page); 1138 ClearPageUptodate(page);
1139 } else if (blocksize >= PAGE_CACHE_SIZE) { 1139 } else if (blocksize >= PAGE_CACHE_SIZE) {
1140 SetPageUptodate(page); 1140 SetPageUptodate(page);
1141 } else if (!PagePrivate(page) && 1141 } else if (!PagePrivate(page) &&
1142 (bp->b_flags & _XBF_PAGE_CACHE)) { 1142 (bp->b_flags & _XBF_PAGE_CACHE)) {
1143 set_page_region(page, bvec->bv_offset, bvec->bv_len); 1143 set_page_region(page, bvec->bv_offset, bvec->bv_len);
1144 } 1144 }
1145 1145
1146 if (--bvec >= bio->bi_io_vec) 1146 if (--bvec >= bio->bi_io_vec)
1147 prefetchw(&bvec->bv_page->flags); 1147 prefetchw(&bvec->bv_page->flags);
1148 1148
1149 if (bp->b_flags & _XBF_PAGE_LOCKED) 1149 if (bp->b_flags & _XBF_PAGE_LOCKED)
1150 unlock_page(page); 1150 unlock_page(page);
1151 } while (bvec >= bio->bi_io_vec); 1151 } while (bvec >= bio->bi_io_vec);
1152 1152
1153 _xfs_buf_ioend(bp, 1); 1153 _xfs_buf_ioend(bp, 1);
1154 bio_put(bio); 1154 bio_put(bio);
1155 } 1155 }
1156 1156
1157 STATIC void 1157 STATIC void
1158 _xfs_buf_ioapply( 1158 _xfs_buf_ioapply(
1159 xfs_buf_t *bp) 1159 xfs_buf_t *bp)
1160 { 1160 {
1161 int rw, map_i, total_nr_pages, nr_pages; 1161 int rw, map_i, total_nr_pages, nr_pages;
1162 struct bio *bio; 1162 struct bio *bio;
1163 int offset = bp->b_offset; 1163 int offset = bp->b_offset;
1164 int size = bp->b_count_desired; 1164 int size = bp->b_count_desired;
1165 sector_t sector = bp->b_bn; 1165 sector_t sector = bp->b_bn;
1166 unsigned int blocksize = bp->b_target->bt_bsize; 1166 unsigned int blocksize = bp->b_target->bt_bsize;
1167 1167
1168 total_nr_pages = bp->b_page_count; 1168 total_nr_pages = bp->b_page_count;
1169 map_i = 0; 1169 map_i = 0;
1170 1170
1171 if (bp->b_flags & XBF_ORDERED) { 1171 if (bp->b_flags & XBF_ORDERED) {
1172 ASSERT(!(bp->b_flags & XBF_READ)); 1172 ASSERT(!(bp->b_flags & XBF_READ));
1173 rw = WRITE_BARRIER; 1173 rw = WRITE_BARRIER;
1174 } else if (bp->b_flags & _XBF_RUN_QUEUES) { 1174 } else if (bp->b_flags & _XBF_RUN_QUEUES) {
1175 ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); 1175 ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1176 bp->b_flags &= ~_XBF_RUN_QUEUES; 1176 bp->b_flags &= ~_XBF_RUN_QUEUES;
1177 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC; 1177 rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
1178 } else { 1178 } else {
1179 rw = (bp->b_flags & XBF_WRITE) ? WRITE : 1179 rw = (bp->b_flags & XBF_WRITE) ? WRITE :
1180 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; 1180 (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1181 } 1181 }
1182 1182
1183 /* Special code path for reading a sub page size buffer in -- 1183 /* Special code path for reading a sub page size buffer in --
1184 * we populate the whole page, and hence the other metadata 1184 * we populate the whole page, and hence the other metadata
1185 * in the same page. This optimization is only valid when the 1185 * in the same page. This optimization is only valid when the
1186 * filesystem block size is not smaller than the page size. 1186 * filesystem block size is not smaller than the page size.
1187 */ 1187 */
1188 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && 1188 if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1189 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == 1189 ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1190 (XBF_READ|_XBF_PAGE_LOCKED)) && 1190 (XBF_READ|_XBF_PAGE_LOCKED)) &&
1191 (blocksize >= PAGE_CACHE_SIZE)) { 1191 (blocksize >= PAGE_CACHE_SIZE)) {
1192 bio = bio_alloc(GFP_NOIO, 1); 1192 bio = bio_alloc(GFP_NOIO, 1);
1193 1193
1194 bio->bi_bdev = bp->b_target->bt_bdev; 1194 bio->bi_bdev = bp->b_target->bt_bdev;
1195 bio->bi_sector = sector - (offset >> BBSHIFT); 1195 bio->bi_sector = sector - (offset >> BBSHIFT);
1196 bio->bi_end_io = xfs_buf_bio_end_io; 1196 bio->bi_end_io = xfs_buf_bio_end_io;
1197 bio->bi_private = bp; 1197 bio->bi_private = bp;
1198 1198
1199 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); 1199 bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1200 size = 0; 1200 size = 0;
1201 1201
1202 atomic_inc(&bp->b_io_remaining); 1202 atomic_inc(&bp->b_io_remaining);
1203 1203
1204 goto submit_io; 1204 goto submit_io;
1205 } 1205 }
1206 1206
1207 next_chunk: 1207 next_chunk:
1208 atomic_inc(&bp->b_io_remaining); 1208 atomic_inc(&bp->b_io_remaining);
1209 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT); 1209 nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1210 if (nr_pages > total_nr_pages) 1210 if (nr_pages > total_nr_pages)
1211 nr_pages = total_nr_pages; 1211 nr_pages = total_nr_pages;
1212 1212
1213 bio = bio_alloc(GFP_NOIO, nr_pages); 1213 bio = bio_alloc(GFP_NOIO, nr_pages);
1214 bio->bi_bdev = bp->b_target->bt_bdev; 1214 bio->bi_bdev = bp->b_target->bt_bdev;
1215 bio->bi_sector = sector; 1215 bio->bi_sector = sector;
1216 bio->bi_end_io = xfs_buf_bio_end_io; 1216 bio->bi_end_io = xfs_buf_bio_end_io;
1217 bio->bi_private = bp; 1217 bio->bi_private = bp;
1218 1218
1219 for (; size && nr_pages; nr_pages--, map_i++) { 1219 for (; size && nr_pages; nr_pages--, map_i++) {
1220 int rbytes, nbytes = PAGE_CACHE_SIZE - offset; 1220 int rbytes, nbytes = PAGE_CACHE_SIZE - offset;
1221 1221
1222 if (nbytes > size) 1222 if (nbytes > size)
1223 nbytes = size; 1223 nbytes = size;
1224 1224
1225 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset); 1225 rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
1226 if (rbytes < nbytes) 1226 if (rbytes < nbytes)
1227 break; 1227 break;
1228 1228
1229 offset = 0; 1229 offset = 0;
1230 sector += nbytes >> BBSHIFT; 1230 sector += nbytes >> BBSHIFT;
1231 size -= nbytes; 1231 size -= nbytes;
1232 total_nr_pages--; 1232 total_nr_pages--;
1233 } 1233 }
1234 1234
1235 submit_io: 1235 submit_io:
1236 if (likely(bio->bi_size)) { 1236 if (likely(bio->bi_size)) {
1237 submit_bio(rw, bio); 1237 submit_bio(rw, bio);
1238 if (size) 1238 if (size)
1239 goto next_chunk; 1239 goto next_chunk;
1240 } else { 1240 } else {
1241 bio_put(bio); 1241 bio_put(bio);
1242 xfs_buf_ioerror(bp, EIO); 1242 xfs_buf_ioerror(bp, EIO);
1243 } 1243 }
1244 } 1244 }
1245 1245
1246 int 1246 int
1247 xfs_buf_iorequest( 1247 xfs_buf_iorequest(
1248 xfs_buf_t *bp) 1248 xfs_buf_t *bp)
1249 { 1249 {
1250 XB_TRACE(bp, "iorequest", 0); 1250 XB_TRACE(bp, "iorequest", 0);
1251 1251
1252 if (bp->b_flags & XBF_DELWRI) { 1252 if (bp->b_flags & XBF_DELWRI) {
1253 xfs_buf_delwri_queue(bp, 1); 1253 xfs_buf_delwri_queue(bp, 1);
1254 return 0; 1254 return 0;
1255 } 1255 }
1256 1256
1257 if (bp->b_flags & XBF_WRITE) { 1257 if (bp->b_flags & XBF_WRITE) {
1258 xfs_buf_wait_unpin(bp); 1258 xfs_buf_wait_unpin(bp);
1259 } 1259 }
1260 1260
1261 xfs_buf_hold(bp); 1261 xfs_buf_hold(bp);
1262 1262
1263 /* Set the count to 1 initially; this will stop an I/O 1263 /* Set the count to 1 initially; this will stop an I/O

1264 * completion callout which happens before we have started 1264 * completion callout which happens before we have started
1265 * all the I/O from calling xfs_buf_ioend too early. 1265 * all the I/O from calling xfs_buf_ioend too early.
1266 */ 1266 */
1267 atomic_set(&bp->b_io_remaining, 1); 1267 atomic_set(&bp->b_io_remaining, 1);
1268 _xfs_buf_ioapply(bp); 1268 _xfs_buf_ioapply(bp);
1269 _xfs_buf_ioend(bp, 0); 1269 _xfs_buf_ioend(bp, 0);
1270 1270
1271 xfs_buf_rele(bp); 1271 xfs_buf_rele(bp);
1272 return 0; 1272 return 0;
1273 } 1273 }
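xfs_buf_iorequest() holds b_io_remaining at 1 across bio submission so the final decrement in _xfs_buf_ioend() cannot complete the buffer before all I/O has been issued; a synchronous caller then pairs it with xfs_buf_iowait(). A minimal sketch of that pattern, assuming the buffer was already set up elsewhere (e.g. via xfs_buf_read_flags()):

	int
	example_read_sync(
		xfs_buf_t	*bp)
	{
		bp->b_flags |= XBF_READ;	/* issue as a read */
		xfs_buf_iorequest(bp);		/* submit all bios */
		return xfs_buf_iowait(bp);	/* 0 or the I/O error code */
	}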
1274 1274
1275 /* 1275 /*
1276 * Waits for I/O to complete on the buffer supplied. 1276 * Waits for I/O to complete on the buffer supplied.
1277 * It returns immediately if no I/O is pending. 1277 * It returns immediately if no I/O is pending.
1278 * It returns the I/O error code, if any, or 0 if there was no error. 1278 * It returns the I/O error code, if any, or 0 if there was no error.
1279 */ 1279 */
1280 int 1280 int
1281 xfs_buf_iowait( 1281 xfs_buf_iowait(
1282 xfs_buf_t *bp) 1282 xfs_buf_t *bp)
1283 { 1283 {
1284 XB_TRACE(bp, "iowait", 0); 1284 XB_TRACE(bp, "iowait", 0);
1285 if (atomic_read(&bp->b_io_remaining)) 1285 if (atomic_read(&bp->b_io_remaining))
1286 blk_run_address_space(bp->b_target->bt_mapping); 1286 blk_run_address_space(bp->b_target->bt_mapping);
1287 wait_for_completion(&bp->b_iowait); 1287 wait_for_completion(&bp->b_iowait);
1288 XB_TRACE(bp, "iowaited", (long)bp->b_error); 1288 XB_TRACE(bp, "iowaited", (long)bp->b_error);
1289 return bp->b_error; 1289 return bp->b_error;
1290 } 1290 }
1291 1291
1292 xfs_caddr_t 1292 xfs_caddr_t
1293 xfs_buf_offset( 1293 xfs_buf_offset(
1294 xfs_buf_t *bp, 1294 xfs_buf_t *bp,
1295 size_t offset) 1295 size_t offset)
1296 { 1296 {
1297 struct page *page; 1297 struct page *page;
1298 1298
1299 if (bp->b_flags & XBF_MAPPED) 1299 if (bp->b_flags & XBF_MAPPED)
1300 return XFS_BUF_PTR(bp) + offset; 1300 return XFS_BUF_PTR(bp) + offset;
1301 1301
1302 offset += bp->b_offset; 1302 offset += bp->b_offset;
1303 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; 1303 page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
1304 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); 1304 return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
1305 } 1305 }
1306 1306
1307 /* 1307 /*
1308 * Move data into or out of a buffer. 1308 * Move data into or out of a buffer.
1309 */ 1309 */
1310 void 1310 void
1311 xfs_buf_iomove( 1311 xfs_buf_iomove(
1312 xfs_buf_t *bp, /* buffer to process */ 1312 xfs_buf_t *bp, /* buffer to process */
1313 size_t boff, /* starting buffer offset */ 1313 size_t boff, /* starting buffer offset */
1314 size_t bsize, /* length to copy */ 1314 size_t bsize, /* length to copy */
1315 caddr_t data, /* data address */ 1315 caddr_t data, /* data address */
1316 xfs_buf_rw_t mode) /* read/write/zero flag */ 1316 xfs_buf_rw_t mode) /* read/write/zero flag */
1317 { 1317 {
1318 size_t bend, cpoff, csize; 1318 size_t bend, cpoff, csize;
1319 struct page *page; 1319 struct page *page;
1320 1320
1321 bend = boff + bsize; 1321 bend = boff + bsize;
1322 while (boff < bend) { 1322 while (boff < bend) {
1323 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; 1323 page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1324 cpoff = xfs_buf_poff(boff + bp->b_offset); 1324 cpoff = xfs_buf_poff(boff + bp->b_offset);
1325 csize = min_t(size_t, 1325 csize = min_t(size_t,
1326 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); 1326 PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
1327 1327
1328 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); 1328 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1329 1329
1330 switch (mode) { 1330 switch (mode) {
1331 case XBRW_ZERO: 1331 case XBRW_ZERO:
1332 memset(page_address(page) + cpoff, 0, csize); 1332 memset(page_address(page) + cpoff, 0, csize);
1333 break; 1333 break;
1334 case XBRW_READ: 1334 case XBRW_READ:
1335 memcpy(data, page_address(page) + cpoff, csize); 1335 memcpy(data, page_address(page) + cpoff, csize);
1336 break; 1336 break;
1337 case XBRW_WRITE: 1337 case XBRW_WRITE:
1338 memcpy(page_address(page) + cpoff, data, csize); 1338 memcpy(page_address(page) + cpoff, data, csize);
1339 } 1339 }
1340 1340
1341 boff += csize; 1341 boff += csize;
1342 data += csize; 1342 data += csize;
1343 } 1343 }
1344 } 1344 }
1345 1345
1346 /* 1346 /*
1347 * Handling of buffer targets (buftargs). 1347 * Handling of buffer targets (buftargs).
1348 */ 1348 */
1349 1349
1350 /* 1350 /*
1351 * Wait for any bufs with callbacks that have been submitted but 1351 * Wait for any bufs with callbacks that have been submitted but
1352 * have not yet returned... walk the hash list for the target. 1352 * have not yet returned... walk the hash list for the target.
1353 */ 1353 */
1354 void 1354 void
1355 xfs_wait_buftarg( 1355 xfs_wait_buftarg(
1356 xfs_buftarg_t *btp) 1356 xfs_buftarg_t *btp)
1357 { 1357 {
1358 xfs_buf_t *bp, *n; 1358 xfs_buf_t *bp, *n;
1359 xfs_bufhash_t *hash; 1359 xfs_bufhash_t *hash;
1360 uint i; 1360 uint i;
1361 1361
1362 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1362 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1363 hash = &btp->bt_hash[i]; 1363 hash = &btp->bt_hash[i];
1364 again: 1364 again:
1365 spin_lock(&hash->bh_lock); 1365 spin_lock(&hash->bh_lock);
1366 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { 1366 list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
1367 ASSERT(btp == bp->b_target); 1367 ASSERT(btp == bp->b_target);
1368 if (!(bp->b_flags & XBF_FS_MANAGED)) { 1368 if (!(bp->b_flags & XBF_FS_MANAGED)) {
1369 spin_unlock(&hash->bh_lock); 1369 spin_unlock(&hash->bh_lock);
1370 /* 1370 /*
1371 * Catch superblock reference count leaks 1371 * Catch superblock reference count leaks
1372 * immediately 1372 * immediately
1373 */ 1373 */
1374 BUG_ON(bp->b_bn == 0); 1374 BUG_ON(bp->b_bn == 0);
1375 delay(100); 1375 delay(100);
1376 goto again; 1376 goto again;
1377 } 1377 }
1378 } 1378 }
1379 spin_unlock(&hash->bh_lock); 1379 spin_unlock(&hash->bh_lock);
1380 } 1380 }
1381 } 1381 }
1382 1382
1383 /* 1383 /*
1384 * Allocate buffer hash table for a given target. 1384 * Allocate buffer hash table for a given target.
1385 * For devices containing metadata (i.e. not the log/realtime devices) 1385 * For devices containing metadata (i.e. not the log/realtime devices)
1386 * we need to allocate a much larger hash table. 1386 * we need to allocate a much larger hash table.
1387 */ 1387 */
1388 STATIC void 1388 STATIC void
1389 xfs_alloc_bufhash( 1389 xfs_alloc_bufhash(
1390 xfs_buftarg_t *btp, 1390 xfs_buftarg_t *btp,
1391 int external) 1391 int external)
1392 { 1392 {
1393 unsigned int i; 1393 unsigned int i;
1394 1394
1395 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ 1395 btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */
1396 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; 1396 btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1397 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) * 1397 btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
1398 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE); 1398 sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
1399 for (i = 0; i < (1 << btp->bt_hashshift); i++) { 1399 for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1400 spin_lock_init(&btp->bt_hash[i].bh_lock); 1400 spin_lock_init(&btp->bt_hash[i].bh_lock);
1401 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); 1401 INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
1402 } 1402 }
1403 } 1403 }
1404 1404
1405 STATIC void 1405 STATIC void
1406 xfs_free_bufhash( 1406 xfs_free_bufhash(
1407 xfs_buftarg_t *btp) 1407 xfs_buftarg_t *btp)
1408 { 1408 {
1409 kmem_free(btp->bt_hash); 1409 kmem_free(btp->bt_hash);
1410 btp->bt_hash = NULL; 1410 btp->bt_hash = NULL;
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * buftarg list for delwrite queue processing 1414 * buftarg list for delwrite queue processing
1415 */ 1415 */
1416 static LIST_HEAD(xfs_buftarg_list); 1416 static LIST_HEAD(xfs_buftarg_list);
1417 static DEFINE_SPINLOCK(xfs_buftarg_lock); 1417 static DEFINE_SPINLOCK(xfs_buftarg_lock);
1418 1418
1419 STATIC void 1419 STATIC void
1420 xfs_register_buftarg( 1420 xfs_register_buftarg(
1421 xfs_buftarg_t *btp) 1421 xfs_buftarg_t *btp)
1422 { 1422 {
1423 spin_lock(&xfs_buftarg_lock); 1423 spin_lock(&xfs_buftarg_lock);
1424 list_add(&btp->bt_list, &xfs_buftarg_list); 1424 list_add(&btp->bt_list, &xfs_buftarg_list);
1425 spin_unlock(&xfs_buftarg_lock); 1425 spin_unlock(&xfs_buftarg_lock);
1426 } 1426 }
1427 1427
1428 STATIC void 1428 STATIC void
1429 xfs_unregister_buftarg( 1429 xfs_unregister_buftarg(
1430 xfs_buftarg_t *btp) 1430 xfs_buftarg_t *btp)
1431 { 1431 {
1432 spin_lock(&xfs_buftarg_lock); 1432 spin_lock(&xfs_buftarg_lock);
1433 list_del(&btp->bt_list); 1433 list_del(&btp->bt_list);
1434 spin_unlock(&xfs_buftarg_lock); 1434 spin_unlock(&xfs_buftarg_lock);
1435 } 1435 }
1436 1436
1437 void 1437 void
1438 xfs_free_buftarg( 1438 xfs_free_buftarg(
1439 xfs_buftarg_t *btp) 1439 xfs_buftarg_t *btp)
1440 { 1440 {
1441 xfs_flush_buftarg(btp, 1); 1441 xfs_flush_buftarg(btp, 1);
1442 xfs_blkdev_issue_flush(btp); 1442 xfs_blkdev_issue_flush(btp);
1443 xfs_free_bufhash(btp); 1443 xfs_free_bufhash(btp);
1444 iput(btp->bt_mapping->host); 1444 iput(btp->bt_mapping->host);
1445 1445
1446 /* Unregister the buftarg first so that we don't get a 1446 /* Unregister the buftarg first so that we don't get a
1447 * wakeup finding a non-existent task 1447 * wakeup finding a non-existent task
1448 */ 1448 */
1449 xfs_unregister_buftarg(btp); 1449 xfs_unregister_buftarg(btp);
1450 kthread_stop(btp->bt_task); 1450 kthread_stop(btp->bt_task);
1451 1451
1452 kmem_free(btp); 1452 kmem_free(btp);
1453 } 1453 }
1454 1454
1455 STATIC int 1455 STATIC int
1456 xfs_setsize_buftarg_flags( 1456 xfs_setsize_buftarg_flags(
1457 xfs_buftarg_t *btp, 1457 xfs_buftarg_t *btp,
1458 unsigned int blocksize, 1458 unsigned int blocksize,
1459 unsigned int sectorsize, 1459 unsigned int sectorsize,
1460 int verbose) 1460 int verbose)
1461 { 1461 {
1462 btp->bt_bsize = blocksize; 1462 btp->bt_bsize = blocksize;
1463 btp->bt_sshift = ffs(sectorsize) - 1; 1463 btp->bt_sshift = ffs(sectorsize) - 1;
1464 btp->bt_smask = sectorsize - 1; 1464 btp->bt_smask = sectorsize - 1;
1465 1465
1466 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1466 if (set_blocksize(btp->bt_bdev, sectorsize)) {
1467 printk(KERN_WARNING 1467 printk(KERN_WARNING
1468 "XFS: Cannot set_blocksize to %u on device %s\n", 1468 "XFS: Cannot set_blocksize to %u on device %s\n",
1469 sectorsize, XFS_BUFTARG_NAME(btp)); 1469 sectorsize, XFS_BUFTARG_NAME(btp));
1470 return EINVAL; 1470 return EINVAL;
1471 } 1471 }
1472 1472
1473 if (verbose && 1473 if (verbose &&
1474 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { 1474 (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1475 printk(KERN_WARNING 1475 printk(KERN_WARNING
1476 "XFS: %u byte sectors in use on device %s. " 1476 "XFS: %u byte sectors in use on device %s. "
1477 "This is suboptimal; %u or greater is ideal.\n", 1477 "This is suboptimal; %u or greater is ideal.\n",
1478 sectorsize, XFS_BUFTARG_NAME(btp), 1478 sectorsize, XFS_BUFTARG_NAME(btp),
1479 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); 1479 (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1480 } 1480 }
1481 1481
1482 return 0; 1482 return 0;
1483 } 1483 }
1484 1484
1485 /* 1485 /*
1486 * When allocating the initial buffer target we have not yet 1486 * When allocating the initial buffer target we have not yet
1487 * read in the superblock, so don't know what sized sectors 1487 * read in the superblock, so don't know what sized sectors
1488 * are being used at this early stage. Play safe. 1488 * are being used at this early stage. Play safe.
1489 */ 1489 */
1490 STATIC int 1490 STATIC int
1491 xfs_setsize_buftarg_early( 1491 xfs_setsize_buftarg_early(
1492 xfs_buftarg_t *btp, 1492 xfs_buftarg_t *btp,
1493 struct block_device *bdev) 1493 struct block_device *bdev)
1494 { 1494 {
1495 return xfs_setsize_buftarg_flags(btp, 1495 return xfs_setsize_buftarg_flags(btp,
1496 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); 1496 PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
1497 } 1497 }
1498 1498
1499 int 1499 int
1500 xfs_setsize_buftarg( 1500 xfs_setsize_buftarg(
1501 xfs_buftarg_t *btp, 1501 xfs_buftarg_t *btp,
1502 unsigned int blocksize, 1502 unsigned int blocksize,
1503 unsigned int sectorsize) 1503 unsigned int sectorsize)
1504 { 1504 {
1505 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1505 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1506 } 1506 }
1507 1507
1508 STATIC int 1508 STATIC int
1509 xfs_mapping_buftarg( 1509 xfs_mapping_buftarg(
1510 xfs_buftarg_t *btp, 1510 xfs_buftarg_t *btp,
1511 struct block_device *bdev) 1511 struct block_device *bdev)
1512 { 1512 {
1513 struct backing_dev_info *bdi; 1513 struct backing_dev_info *bdi;
1514 struct inode *inode; 1514 struct inode *inode;
1515 struct address_space *mapping; 1515 struct address_space *mapping;
1516 static const struct address_space_operations mapping_aops = { 1516 static const struct address_space_operations mapping_aops = {
1517 .sync_page = block_sync_page, 1517 .sync_page = block_sync_page,
1518 .migratepage = fail_migrate_page, 1518 .migratepage = fail_migrate_page,
1519 }; 1519 };
1520 1520
1521 inode = new_inode(bdev->bd_inode->i_sb); 1521 inode = new_inode(bdev->bd_inode->i_sb);
1522 if (!inode) { 1522 if (!inode) {
1523 printk(KERN_WARNING 1523 printk(KERN_WARNING
1524 "XFS: Cannot allocate mapping inode for device %s\n", 1524 "XFS: Cannot allocate mapping inode for device %s\n",
1525 XFS_BUFTARG_NAME(btp)); 1525 XFS_BUFTARG_NAME(btp));
1526 return ENOMEM; 1526 return ENOMEM;
1527 } 1527 }
1528 inode->i_mode = S_IFBLK; 1528 inode->i_mode = S_IFBLK;
1529 inode->i_bdev = bdev; 1529 inode->i_bdev = bdev;
1530 inode->i_rdev = bdev->bd_dev; 1530 inode->i_rdev = bdev->bd_dev;
1531 bdi = blk_get_backing_dev_info(bdev); 1531 bdi = blk_get_backing_dev_info(bdev);
1532 if (!bdi) 1532 if (!bdi)
1533 bdi = &default_backing_dev_info; 1533 bdi = &default_backing_dev_info;
1534 mapping = &inode->i_data; 1534 mapping = &inode->i_data;
1535 mapping->a_ops = &mapping_aops; 1535 mapping->a_ops = &mapping_aops;
1536 mapping->backing_dev_info = bdi; 1536 mapping->backing_dev_info = bdi;
1537 mapping_set_gfp_mask(mapping, GFP_NOFS); 1537 mapping_set_gfp_mask(mapping, GFP_NOFS);
1538 btp->bt_mapping = mapping; 1538 btp->bt_mapping = mapping;
1539 return 0; 1539 return 0;
1540 } 1540 }
1541 1541
1542 STATIC int 1542 STATIC int
1543 xfs_alloc_delwrite_queue( 1543 xfs_alloc_delwrite_queue(
1544 xfs_buftarg_t *btp) 1544 xfs_buftarg_t *btp)
1545 { 1545 {
1546 int error = 0; 1546 int error = 0;
1547 1547
1548 INIT_LIST_HEAD(&btp->bt_list); 1548 INIT_LIST_HEAD(&btp->bt_list);
1549 INIT_LIST_HEAD(&btp->bt_delwrite_queue); 1549 INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1550 spin_lock_init(&btp->bt_delwrite_lock); 1550 spin_lock_init(&btp->bt_delwrite_lock);
1551 btp->bt_flags = 0; 1551 btp->bt_flags = 0;
1552 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd"); 1552 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
1553 if (IS_ERR(btp->bt_task)) { 1553 if (IS_ERR(btp->bt_task)) {
1554 error = PTR_ERR(btp->bt_task); 1554 error = PTR_ERR(btp->bt_task);
1555 goto out_error; 1555 goto out_error;
1556 } 1556 }
1557 xfs_register_buftarg(btp); 1557 xfs_register_buftarg(btp);
1558 out_error: 1558 out_error:
1559 return error; 1559 return error;
1560 } 1560 }
1561 1561
1562 xfs_buftarg_t * 1562 xfs_buftarg_t *
1563 xfs_alloc_buftarg( 1563 xfs_alloc_buftarg(
1564 struct block_device *bdev, 1564 struct block_device *bdev,
1565 int external) 1565 int external)
1566 { 1566 {
1567 xfs_buftarg_t *btp; 1567 xfs_buftarg_t *btp;
1568 1568
1569 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); 1569 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1570 1570
1571 btp->bt_dev = bdev->bd_dev; 1571 btp->bt_dev = bdev->bd_dev;
1572 btp->bt_bdev = bdev; 1572 btp->bt_bdev = bdev;
1573 if (xfs_setsize_buftarg_early(btp, bdev)) 1573 if (xfs_setsize_buftarg_early(btp, bdev))
1574 goto error; 1574 goto error;
1575 if (xfs_mapping_buftarg(btp, bdev)) 1575 if (xfs_mapping_buftarg(btp, bdev))
1576 goto error; 1576 goto error;
1577 if (xfs_alloc_delwrite_queue(btp)) 1577 if (xfs_alloc_delwrite_queue(btp))
1578 goto error; 1578 goto error;
1579 xfs_alloc_bufhash(btp, external); 1579 xfs_alloc_bufhash(btp, external);
1580 return btp; 1580 return btp;
1581 1581
1582 error: 1582 error:
1583 kmem_free(btp); 1583 kmem_free(btp);
1584 return NULL; 1584 return NULL;
1585 } 1585 }
1586 1586
1587 1587
1588 /* 1588 /*
1589 * Delayed write buffer handling 1589 * Delayed write buffer handling
1590 */ 1590 */
1591 STATIC void 1591 STATIC void
1592 xfs_buf_delwri_queue( 1592 xfs_buf_delwri_queue(
1593 xfs_buf_t *bp, 1593 xfs_buf_t *bp,
1594 int unlock) 1594 int unlock)
1595 { 1595 {
1596 struct list_head *dwq = &bp->b_target->bt_delwrite_queue; 1596 struct list_head *dwq = &bp->b_target->bt_delwrite_queue;
1597 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1597 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1598 1598
1599 XB_TRACE(bp, "delwri_q", (long)unlock); 1599 XB_TRACE(bp, "delwri_q", (long)unlock);
1600 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC)); 1600 ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
1601 1601
1602 spin_lock(dwlk); 1602 spin_lock(dwlk);
1603 /* If already in the queue, dequeue and place at tail */ 1603 /* If already in the queue, dequeue and place at tail */
1604 if (!list_empty(&bp->b_list)) { 1604 if (!list_empty(&bp->b_list)) {
1605 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1605 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1606 if (unlock) 1606 if (unlock)
1607 atomic_dec(&bp->b_hold); 1607 atomic_dec(&bp->b_hold);
1608 list_del(&bp->b_list); 1608 list_del(&bp->b_list);
1609 } 1609 }
1610 1610
1611 bp->b_flags |= _XBF_DELWRI_Q; 1611 bp->b_flags |= _XBF_DELWRI_Q;
1612 list_add_tail(&bp->b_list, dwq); 1612 list_add_tail(&bp->b_list, dwq);
1613 bp->b_queuetime = jiffies; 1613 bp->b_queuetime = jiffies;
1614 spin_unlock(dwlk); 1614 spin_unlock(dwlk);
1615 1615
1616 if (unlock) 1616 if (unlock)
1617 xfs_buf_unlock(bp); 1617 xfs_buf_unlock(bp);
1618 } 1618 }
1619 1619
1620 void 1620 void
1621 xfs_buf_delwri_dequeue( 1621 xfs_buf_delwri_dequeue(
1622 xfs_buf_t *bp) 1622 xfs_buf_t *bp)
1623 { 1623 {
1624 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock; 1624 spinlock_t *dwlk = &bp->b_target->bt_delwrite_lock;
1625 int dequeued = 0; 1625 int dequeued = 0;
1626 1626
1627 spin_lock(dwlk); 1627 spin_lock(dwlk);
1628 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) { 1628 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1629 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1629 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1630 list_del_init(&bp->b_list); 1630 list_del_init(&bp->b_list);
1631 dequeued = 1; 1631 dequeued = 1;
1632 } 1632 }
1633 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q); 1633 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1634 spin_unlock(dwlk); 1634 spin_unlock(dwlk);
1635 1635
1636 if (dequeued) 1636 if (dequeued)
1637 xfs_buf_rele(bp); 1637 xfs_buf_rele(bp);
1638 1638
1639 XB_TRACE(bp, "delwri_dq", (long)dequeued); 1639 XB_TRACE(bp, "delwri_dq", (long)dequeued);
1640 } 1640 }
1641 1641
1642 STATIC void 1642 STATIC void
1643 xfs_buf_runall_queues( 1643 xfs_buf_runall_queues(
1644 struct workqueue_struct *queue) 1644 struct workqueue_struct *queue)
1645 { 1645 {
1646 flush_workqueue(queue); 1646 flush_workqueue(queue);
1647 } 1647 }
1648 1648
1649 STATIC int 1649 STATIC int
1650 xfsbufd_wakeup( 1650 xfsbufd_wakeup(
1651 int priority, 1651 int priority,
1652 gfp_t mask) 1652 gfp_t mask)
1653 { 1653 {
1654 xfs_buftarg_t *btp; 1654 xfs_buftarg_t *btp;
1655 1655
1656 spin_lock(&xfs_buftarg_lock); 1656 spin_lock(&xfs_buftarg_lock);
1657 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) { 1657 list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1658 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags)) 1658 if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1659 continue; 1659 continue;
1660 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags); 1660 set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1661 wake_up_process(btp->bt_task); 1661 wake_up_process(btp->bt_task);
1662 } 1662 }
1663 spin_unlock(&xfs_buftarg_lock); 1663 spin_unlock(&xfs_buftarg_lock);
1664 return 0; 1664 return 0;
1665 } 1665 }
1666 1666
1667 /* 1667 /*
1668 * Move as many buffers as specified to the supplied list 1668 * Move as many buffers as specified to the supplied list
1669 * indicating if we skipped any buffers to prevent deadlocks. 1669 * indicating if we skipped any buffers to prevent deadlocks.
1670 */ 1670 */
1671 STATIC int 1671 STATIC int
1672 xfs_buf_delwri_split( 1672 xfs_buf_delwri_split(
1673 xfs_buftarg_t *target, 1673 xfs_buftarg_t *target,
1674 struct list_head *list, 1674 struct list_head *list,
1675 unsigned long age) 1675 unsigned long age)
1676 { 1676 {
1677 xfs_buf_t *bp, *n; 1677 xfs_buf_t *bp, *n;
1678 struct list_head *dwq = &target->bt_delwrite_queue; 1678 struct list_head *dwq = &target->bt_delwrite_queue;
1679 spinlock_t *dwlk = &target->bt_delwrite_lock; 1679 spinlock_t *dwlk = &target->bt_delwrite_lock;
1680 int skipped = 0; 1680 int skipped = 0;
1681 int force; 1681 int force;
1682 1682
1683 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1683 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1684 INIT_LIST_HEAD(list); 1684 INIT_LIST_HEAD(list);
1685 spin_lock(dwlk); 1685 spin_lock(dwlk);
1686 list_for_each_entry_safe(bp, n, dwq, b_list) { 1686 list_for_each_entry_safe(bp, n, dwq, b_list) {
1687 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp)); 1687 XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
1688 ASSERT(bp->b_flags & XBF_DELWRI); 1688 ASSERT(bp->b_flags & XBF_DELWRI);
1689 1689
1690 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) { 1690 if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
1691 if (!force && 1691 if (!force &&
1692 time_before(jiffies, bp->b_queuetime + age)) { 1692 time_before(jiffies, bp->b_queuetime + age)) {
1693 xfs_buf_unlock(bp); 1693 xfs_buf_unlock(bp);
1694 break; 1694 break;
1695 } 1695 }
1696 1696
1697 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q| 1697 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
1698 _XBF_RUN_QUEUES); 1698 _XBF_RUN_QUEUES);
1699 bp->b_flags |= XBF_WRITE; 1699 bp->b_flags |= XBF_WRITE;
1700 list_move_tail(&bp->b_list, list); 1700 list_move_tail(&bp->b_list, list);
1701 } else 1701 } else
1702 skipped++; 1702 skipped++;
1703 } 1703 }
1704 spin_unlock(dwlk); 1704 spin_unlock(dwlk);
1705 1705
1706 return skipped; 1706 return skipped;
1707 1707
1708 } 1708 }
1709 1709
1710 STATIC int 1710 STATIC int
1711 xfsbufd( 1711 xfsbufd(
1712 void *data) 1712 void *data)
1713 { 1713 {
1714 struct list_head tmp; 1714 struct list_head tmp;
1715 xfs_buftarg_t *target = (xfs_buftarg_t *)data; 1715 xfs_buftarg_t *target = (xfs_buftarg_t *)data;
1716 int count; 1716 int count;
1717 xfs_buf_t *bp; 1717 xfs_buf_t *bp;
1718 1718
1719 current->flags |= PF_MEMALLOC; 1719 current->flags |= PF_MEMALLOC;
1720 1720
1721 set_freezable(); 1721 set_freezable();
1722 1722
1723 do { 1723 do {
1724 if (unlikely(freezing(current))) { 1724 if (unlikely(freezing(current))) {
1725 set_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1725 set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1726 refrigerator(); 1726 refrigerator();
1727 } else { 1727 } else {
1728 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags); 1728 clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1729 } 1729 }
1730 1730
1731 schedule_timeout_interruptible( 1731 schedule_timeout_interruptible(
1732 xfs_buf_timer_centisecs * msecs_to_jiffies(10)); 1732 xfs_buf_timer_centisecs * msecs_to_jiffies(10));
1733 1733
1734 xfs_buf_delwri_split(target, &tmp, 1734 xfs_buf_delwri_split(target, &tmp,
1735 xfs_buf_age_centisecs * msecs_to_jiffies(10)); 1735 xfs_buf_age_centisecs * msecs_to_jiffies(10));
1736 1736
1737 count = 0; 1737 count = 0;
1738 while (!list_empty(&tmp)) { 1738 while (!list_empty(&tmp)) {
1739 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1739 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1740 ASSERT(target == bp->b_target); 1740 ASSERT(target == bp->b_target);
1741 1741
1742 list_del_init(&bp->b_list); 1742 list_del_init(&bp->b_list);
1743 xfs_buf_iostrategy(bp); 1743 xfs_buf_iostrategy(bp);
1744 count++; 1744 count++;
1745 } 1745 }
1746 1746
1747 if (as_list_len > 0) 1747 if (as_list_len > 0)
1748 purge_addresses(); 1748 purge_addresses();
1749 if (count) 1749 if (count)
1750 blk_run_address_space(target->bt_mapping); 1750 blk_run_address_space(target->bt_mapping);
1751 1751
1752 } while (!kthread_should_stop()); 1752 } while (!kthread_should_stop());
1753 1753
1754 return 0; 1754 return 0;
1755 } 1755 }
1756 1756
1757 /* 1757 /*
1758 * Go through all incore buffers, and release buffers if they belong to 1758 * Go through all incore buffers, and release buffers if they belong to
1759 * the given device. This is used in filesystem error handling to 1759 * the given device. This is used in filesystem error handling to
1760 * preserve the consistency of its metadata. 1760 * preserve the consistency of its metadata.
1761 */ 1761 */
1762 int 1762 int
1763 xfs_flush_buftarg( 1763 xfs_flush_buftarg(
1764 xfs_buftarg_t *target, 1764 xfs_buftarg_t *target,
1765 int wait) 1765 int wait)
1766 { 1766 {
1767 struct list_head tmp; 1767 struct list_head tmp;
1768 xfs_buf_t *bp, *n; 1768 xfs_buf_t *bp, *n;
1769 int pincount = 0; 1769 int pincount = 0;
1770 1770
1771 xfs_buf_runall_queues(xfsdatad_workqueue); 1771 xfs_buf_runall_queues(xfsdatad_workqueue);
1772 xfs_buf_runall_queues(xfslogd_workqueue); 1772 xfs_buf_runall_queues(xfslogd_workqueue);
1773 1773
1774 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1774 set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1775 pincount = xfs_buf_delwri_split(target, &tmp, 0); 1775 pincount = xfs_buf_delwri_split(target, &tmp, 0);
1776 1776
1777 /* 1777 /*
1778 * Dropped the delayed write list lock, now walk the temporary list 1778 * Dropped the delayed write list lock, now walk the temporary list
1779 */ 1779 */
1780 list_for_each_entry_safe(bp, n, &tmp, b_list) { 1780 list_for_each_entry_safe(bp, n, &tmp, b_list) {
1781 ASSERT(target == bp->b_target); 1781 ASSERT(target == bp->b_target);
1782 if (wait) 1782 if (wait)
1783 bp->b_flags &= ~XBF_ASYNC; 1783 bp->b_flags &= ~XBF_ASYNC;
1784 else 1784 else
1785 list_del_init(&bp->b_list); 1785 list_del_init(&bp->b_list);
1786 1786
1787 xfs_buf_iostrategy(bp); 1787 xfs_buf_iostrategy(bp);
1788 } 1788 }
1789 1789
1790 if (wait) 1790 if (wait)
1791 blk_run_address_space(target->bt_mapping); 1791 blk_run_address_space(target->bt_mapping);
1792 1792
1793 /* 1793 /*
1794 * Remaining list items must be flushed before returning 1794 * Remaining list items must be flushed before returning
1795 */ 1795 */
1796 while (!list_empty(&tmp)) { 1796 while (!list_empty(&tmp)) {
1797 bp = list_entry(tmp.next, xfs_buf_t, b_list); 1797 bp = list_entry(tmp.next, xfs_buf_t, b_list);
1798 1798
1799 list_del_init(&bp->b_list); 1799 list_del_init(&bp->b_list);
1800 xfs_iowait(bp); 1800 xfs_iowait(bp);
1801 xfs_buf_relse(bp); 1801 xfs_buf_relse(bp);
1802 } 1802 }
1803 1803
1804 return pincount; 1804 return pincount;
1805 } 1805 }
1806 1806
1807 int __init 1807 int __init
1808 xfs_buf_init(void) 1808 xfs_buf_init(void)
1809 { 1809 {
1810 #ifdef XFS_BUF_TRACE 1810 #ifdef XFS_BUF_TRACE
1811 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS); 1811 xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
1812 #endif 1812 #endif
1813 1813
1814 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 1814 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1815 KM_ZONE_HWALIGN, NULL); 1815 KM_ZONE_HWALIGN, NULL);
1816 if (!xfs_buf_zone) 1816 if (!xfs_buf_zone)
1817 goto out_free_trace_buf; 1817 goto out_free_trace_buf;
1818 1818
1819 xfslogd_workqueue = create_workqueue("xfslogd"); 1819 xfslogd_workqueue = create_workqueue("xfslogd");
1820 if (!xfslogd_workqueue) 1820 if (!xfslogd_workqueue)
1821 goto out_free_buf_zone; 1821 goto out_free_buf_zone;
1822 1822
1823 xfsdatad_workqueue = create_workqueue("xfsdatad"); 1823 xfsdatad_workqueue = create_workqueue("xfsdatad");
1824 if (!xfsdatad_workqueue) 1824 if (!xfsdatad_workqueue)
1825 goto out_destroy_xfslogd_workqueue; 1825 goto out_destroy_xfslogd_workqueue;
1826 1826
1827 register_shrinker(&xfs_buf_shake); 1827 register_shrinker(&xfs_buf_shake);
1828 return 0; 1828 return 0;
1829 1829
1830 out_destroy_xfslogd_workqueue: 1830 out_destroy_xfslogd_workqueue:
1831 destroy_workqueue(xfslogd_workqueue); 1831 destroy_workqueue(xfslogd_workqueue);
1832 out_free_buf_zone: 1832 out_free_buf_zone:
1833 kmem_zone_destroy(xfs_buf_zone); 1833 kmem_zone_destroy(xfs_buf_zone);
1834 out_free_trace_buf: 1834 out_free_trace_buf:
1835 #ifdef XFS_BUF_TRACE 1835 #ifdef XFS_BUF_TRACE
1836 ktrace_free(xfs_buf_trace_buf); 1836 ktrace_free(xfs_buf_trace_buf);
1837 #endif 1837 #endif
1838 return -ENOMEM; 1838 return -ENOMEM;
1839 } 1839 }
1840 1840
1841 void 1841 void
1842 xfs_buf_terminate(void) 1842 xfs_buf_terminate(void)
1843 { 1843 {
1844 unregister_shrinker(&xfs_buf_shake); 1844 unregister_shrinker(&xfs_buf_shake);
1845 destroy_workqueue(xfsdatad_workqueue); 1845 destroy_workqueue(xfsdatad_workqueue);
1846 destroy_workqueue(xfslogd_workqueue); 1846 destroy_workqueue(xfslogd_workqueue);
1847 kmem_zone_destroy(xfs_buf_zone); 1847 kmem_zone_destroy(xfs_buf_zone);
1848 #ifdef XFS_BUF_TRACE 1848 #ifdef XFS_BUF_TRACE
1849 ktrace_free(xfs_buf_trace_buf); 1849 ktrace_free(xfs_buf_trace_buf);
1850 #endif 1850 #endif
1851 } 1851 }
1852 1852
1853 #ifdef CONFIG_KDB_MODULES 1853 #ifdef CONFIG_KDB_MODULES
1854 struct list_head * 1854 struct list_head *
1855 xfs_get_buftarg_list(void) 1855 xfs_get_buftarg_list(void)
1856 { 1856 {
1857 return &xfs_buftarg_list; 1857 return &xfs_buftarg_list;
1858 } 1858 }
1859 #endif 1859 #endif
1860 1860
fs/xfs/linux-2.6/xfs_buf.h
1 /* 1 /*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #ifndef __XFS_BUF_H__ 18 #ifndef __XFS_BUF_H__
19 #define __XFS_BUF_H__ 19 #define __XFS_BUF_H__
20 20
21 #include <linux/list.h> 21 #include <linux/list.h>
22 #include <linux/types.h> 22 #include <linux/types.h>
23 #include <linux/spinlock.h> 23 #include <linux/spinlock.h>
24 #include <asm/system.h> 24 #include <asm/system.h>
25 #include <linux/mm.h> 25 #include <linux/mm.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/buffer_head.h> 27 #include <linux/buffer_head.h>
28 #include <linux/uio.h> 28 #include <linux/uio.h>
29 29
30 /* 30 /*
31 * Base types 31 * Base types
32 */ 32 */
33 33
34 #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL)) 34 #define XFS_BUF_DADDR_NULL ((xfs_daddr_t) (-1LL))
35 35
36 #define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE) 36 #define xfs_buf_ctob(pp) ((pp) * PAGE_CACHE_SIZE)
37 #define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) 37 #define xfs_buf_btoc(dd) (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
38 #define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT) 38 #define xfs_buf_btoct(dd) ((dd) >> PAGE_CACHE_SHIFT)
39 #define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK) 39 #define xfs_buf_poff(aa) ((aa) & ~PAGE_CACHE_MASK)
40 40
41 typedef enum { 41 typedef enum {
42 XBRW_READ = 1, /* transfer into target memory */ 42 XBRW_READ = 1, /* transfer into target memory */
43 XBRW_WRITE = 2, /* transfer from target memory */ 43 XBRW_WRITE = 2, /* transfer from target memory */
44 XBRW_ZERO = 3, /* Zero target memory */ 44 XBRW_ZERO = 3, /* Zero target memory */
45 } xfs_buf_rw_t; 45 } xfs_buf_rw_t;
46 46
47 typedef enum { 47 typedef enum {
48 XBF_READ = (1 << 0), /* buffer intended for reading from device */ 48 XBF_READ = (1 << 0), /* buffer intended for reading from device */
49 XBF_WRITE = (1 << 1), /* buffer intended for writing to device */ 49 XBF_WRITE = (1 << 1), /* buffer intended for writing to device */
50 XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */ 50 XBF_MAPPED = (1 << 2), /* buffer mapped (b_addr valid) */
51 XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */ 51 XBF_ASYNC = (1 << 4), /* initiator will not wait for completion */
52 XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */ 52 XBF_DONE = (1 << 5), /* all pages in the buffer uptodate */
53 XBF_DELWRI = (1 << 6), /* buffer has dirty pages */ 53 XBF_DELWRI = (1 << 6), /* buffer has dirty pages */
54 XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */ 54 XBF_STALE = (1 << 7), /* buffer has been staled, do not find it */
55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */ 55 XBF_FS_MANAGED = (1 << 8), /* filesystem controls freeing memory */
56 XBF_ORDERED = (1 << 11), /* use ordered writes */ 56 XBF_ORDERED = (1 << 11), /* use ordered writes */
57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */ 57 XBF_READ_AHEAD = (1 << 12), /* asynchronous read-ahead */
58 58
59 /* flags used only as arguments to access routines */ 59 /* flags used only as arguments to access routines */
60 XBF_LOCK = (1 << 14), /* lock requested */ 60 XBF_LOCK = (1 << 14), /* lock requested */
61 XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */ 61 XBF_TRYLOCK = (1 << 15), /* lock requested, but do not wait */
62 XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */ 62 XBF_DONT_BLOCK = (1 << 16), /* do not block in current thread */
63 63
64 /* flags used only internally */ 64 /* flags used only internally */
65 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */ 65 _XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache */
66 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */ 66 _XBF_PAGES = (1 << 18), /* backed by refcounted pages */
67 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */ 67 _XBF_RUN_QUEUES = (1 << 19),/* run block device task queue */
68 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */ 68 _XBF_DELWRI_Q = (1 << 21), /* buffer on delwri queue */
69 69
70 /* 70 /*
71 * Special flag for supporting metadata blocks smaller than a FSB. 71 * Special flag for supporting metadata blocks smaller than a FSB.
72 * 72 *
73 * In this case we can have multiple xfs_buf_t on a single page and 73 * In this case we can have multiple xfs_buf_t on a single page and
74 * need to lock out concurrent xfs_buf_t readers as they only 74 * need to lock out concurrent xfs_buf_t readers as they only
75 * serialise access to the buffer. 75 * serialise access to the buffer.
76 * 76 *
77 * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation 77 * In the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
78 * between reads of the page. Hence we can have one thread read the 78 * between reads of the page. Hence we can have one thread read the
79 * page and modify it, but then race with another thread that thinks 79 * page and modify it, but then race with another thread that thinks
80 * the page is not up-to-date and hence reads it again. 80 * the page is not up-to-date and hence reads it again.
81 * 81 *
82 * The result is that the first modification to the page is lost. 82 * The result is that the first modification to the page is lost.
83 * This sort of AGF/AGI reading race can happen when unlinking inodes 83 * This sort of AGF/AGI reading race can happen when unlinking inodes
84 * that require truncation and results in the AGI unlinked list 84 * that require truncation and results in the AGI unlinked list
85 * modifications being lost. 85 * modifications being lost.
86 */ 86 */
87 _XBF_PAGE_LOCKED = (1 << 22), 87 _XBF_PAGE_LOCKED = (1 << 22),
88 88
89 /* 89 /*
90 * If we try a barrier write, but it fails we have to communicate 90 * If we try a barrier write, but it fails we have to communicate
91 * this to the upper layers. Unfortunately b_error gets overwritten 91 * this to the upper layers. Unfortunately b_error gets overwritten
92 * when the buffer is re-issued so we have to add another flag to 92 * when the buffer is re-issued so we have to add another flag to
93 * keep this information. 93 * keep this information.
94 */ 94 */
95 _XFS_BARRIER_FAILED = (1 << 23), 95 _XFS_BARRIER_FAILED = (1 << 23),
96 } xfs_buf_flags_t; 96 } xfs_buf_flags_t;
97 97
98 typedef enum { 98 typedef enum {
99 XBT_FORCE_SLEEP = 0, 99 XBT_FORCE_SLEEP = 0,
100 XBT_FORCE_FLUSH = 1, 100 XBT_FORCE_FLUSH = 1,
101 } xfs_buftarg_flags_t; 101 } xfs_buftarg_flags_t;
102 102
103 typedef struct xfs_bufhash { 103 typedef struct xfs_bufhash {
104 struct list_head bh_list; 104 struct list_head bh_list;
105 spinlock_t bh_lock; 105 spinlock_t bh_lock;
106 } xfs_bufhash_t; 106 } xfs_bufhash_t;
107 107
108 typedef struct xfs_buftarg { 108 typedef struct xfs_buftarg {
109 dev_t bt_dev; 109 dev_t bt_dev;
110 struct block_device *bt_bdev; 110 struct block_device *bt_bdev;
111 struct address_space *bt_mapping; 111 struct address_space *bt_mapping;
112 unsigned int bt_bsize; 112 unsigned int bt_bsize;
113 unsigned int bt_sshift; 113 unsigned int bt_sshift;
114 size_t bt_smask; 114 size_t bt_smask;
115 115
116 /* per device buffer hash table */ 116 /* per device buffer hash table */
117 uint bt_hashmask; 117 uint bt_hashmask;
118 uint bt_hashshift; 118 uint bt_hashshift;
119 xfs_bufhash_t *bt_hash; 119 xfs_bufhash_t *bt_hash;
120 120
121 /* per device delwri queue */ 121 /* per device delwri queue */
122 struct task_struct *bt_task; 122 struct task_struct *bt_task;
123 struct list_head bt_list; 123 struct list_head bt_list;
124 struct list_head bt_delwrite_queue; 124 struct list_head bt_delwrite_queue;
125 spinlock_t bt_delwrite_lock; 125 spinlock_t bt_delwrite_lock;
126 unsigned long bt_flags; 126 unsigned long bt_flags;
127 } xfs_buftarg_t; 127 } xfs_buftarg_t;
128 128
129 /* 129 /*
130 * xfs_buf_t: Buffer structure for pagecache-based buffers 130 * xfs_buf_t: Buffer structure for pagecache-based buffers
131 * 131 *
132 * This buffer structure is used by the pagecache buffer management routines 132 * This buffer structure is used by the pagecache buffer management routines
133 * to refer to an assembly of pages forming a logical buffer. 133 * to refer to an assembly of pages forming a logical buffer.
134 * 134 *
135 * The buffer structure is used on a temporary basis only, and discarded when 135 * The buffer structure is used on a temporary basis only, and discarded when
136 * released. The real data storage is recorded in the pagecache. Buffers are 136 * released. The real data storage is recorded in the pagecache. Buffers are
137 * hashed to the block device on which the file system resides. 137 * hashed to the block device on which the file system resides.
138 */ 138 */
139 139
140 struct xfs_buf; 140 struct xfs_buf;
141 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); 141 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
142 typedef void (*xfs_buf_relse_t)(struct xfs_buf *); 142 typedef void (*xfs_buf_relse_t)(struct xfs_buf *);
143 typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); 143 typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *);
144 144
145 #define XB_PAGES 2 145 #define XB_PAGES 2
146 146
147 typedef struct xfs_buf { 147 typedef struct xfs_buf {
148 struct semaphore b_sema; /* semaphore for lockables */ 148 struct semaphore b_sema; /* semaphore for lockables */
149 unsigned long b_queuetime; /* time buffer was queued */ 149 unsigned long b_queuetime; /* time buffer was queued */
150 atomic_t b_pin_count; /* pin count */ 150 atomic_t b_pin_count; /* pin count */
151 wait_queue_head_t b_waiters; /* unpin waiters */ 151 wait_queue_head_t b_waiters; /* unpin waiters */
152 struct list_head b_list; 152 struct list_head b_list;
153 xfs_buf_flags_t b_flags; /* status flags */ 153 xfs_buf_flags_t b_flags; /* status flags */
154 struct list_head b_hash_list; /* hash table list */ 154 struct list_head b_hash_list; /* hash table list */
155 xfs_bufhash_t *b_hash; /* hash table list start */ 155 xfs_bufhash_t *b_hash; /* hash table list start */
156 xfs_buftarg_t *b_target; /* buffer target (device) */ 156 xfs_buftarg_t *b_target; /* buffer target (device) */
157 atomic_t b_hold; /* reference count */ 157 atomic_t b_hold; /* reference count */
158 xfs_daddr_t b_bn; /* block number for I/O */ 158 xfs_daddr_t b_bn; /* block number for I/O */
159 xfs_off_t b_file_offset; /* offset in file */ 159 xfs_off_t b_file_offset; /* offset in file */
160 size_t b_buffer_length;/* size of buffer in bytes */ 160 size_t b_buffer_length;/* size of buffer in bytes */
161 size_t b_count_desired;/* desired transfer size */ 161 size_t b_count_desired;/* desired transfer size */
162 void *b_addr; /* virtual address of buffer */ 162 void *b_addr; /* virtual address of buffer */
163 struct work_struct b_iodone_work; 163 struct work_struct b_iodone_work;
164 atomic_t b_io_remaining; /* #outstanding I/O requests */ 164 atomic_t b_io_remaining; /* #outstanding I/O requests */
165 xfs_buf_iodone_t b_iodone; /* I/O completion function */ 165 xfs_buf_iodone_t b_iodone; /* I/O completion function */
166 xfs_buf_relse_t b_relse; /* releasing function */ 166 xfs_buf_relse_t b_relse; /* releasing function */
167 xfs_buf_bdstrat_t b_strat; /* pre-write function */ 167 xfs_buf_bdstrat_t b_strat; /* pre-write function */
168 struct completion b_iowait; /* queue for I/O waiters */ 168 struct completion b_iowait; /* queue for I/O waiters */
169 void *b_fspriv; 169 void *b_fspriv;
170 void *b_fspriv2; 170 void *b_fspriv2;
171 void *b_fspriv3; 171 struct xfs_mount *b_mount;
172 unsigned short b_error; /* error code on I/O */ 172 unsigned short b_error; /* error code on I/O */
173 unsigned int b_page_count; /* size of page array */ 173 unsigned int b_page_count; /* size of page array */
174 unsigned int b_offset; /* page offset in first page */ 174 unsigned int b_offset; /* page offset in first page */
175 struct page **b_pages; /* array of page pointers */ 175 struct page **b_pages; /* array of page pointers */
176 struct page *b_page_array[XB_PAGES]; /* inline pages */ 176 struct page *b_page_array[XB_PAGES]; /* inline pages */
177 #ifdef XFS_BUF_LOCK_TRACKING 177 #ifdef XFS_BUF_LOCK_TRACKING
178 int b_last_holder; 178 int b_last_holder;
179 #endif 179 #endif
180 } xfs_buf_t; 180 } xfs_buf_t;
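With the untyped b_fspriv3 slot replaced by the typed struct xfs_mount *b_mount, code that needs the mount can dereference the field directly instead of casting a void pointer. A hypothetical helper, purely for illustration:

	static inline struct xfs_mount *
	example_buf_mount(
		xfs_buf_t	*bp)
	{
		/* b_mount is set by xfs_bawrite()/xfs_bdwrite() before I/O. */
		return bp->b_mount;
	}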
181 181
182 182
183 /* Finding and Reading Buffers */ 183 /* Finding and Reading Buffers */
184 extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t, 184 extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
185 xfs_buf_flags_t, xfs_buf_t *); 185 xfs_buf_flags_t, xfs_buf_t *);
186 #define xfs_incore(buftarg,blkno,len,lockit) \ 186 #define xfs_incore(buftarg,blkno,len,lockit) \
187 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL) 187 _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
188 188
189 extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t, 189 extern xfs_buf_t *xfs_buf_get_flags(xfs_buftarg_t *, xfs_off_t, size_t,
190 xfs_buf_flags_t); 190 xfs_buf_flags_t);
191 #define xfs_buf_get(target, blkno, len, flags) \ 191 #define xfs_buf_get(target, blkno, len, flags) \
192 xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) 192 xfs_buf_get_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
193 193
194 extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t, 194 extern xfs_buf_t *xfs_buf_read_flags(xfs_buftarg_t *, xfs_off_t, size_t,
195 xfs_buf_flags_t); 195 xfs_buf_flags_t);
196 #define xfs_buf_read(target, blkno, len, flags) \ 196 #define xfs_buf_read(target, blkno, len, flags) \
197 xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED) 197 xfs_buf_read_flags((target), (blkno), (len), XBF_LOCK | XBF_MAPPED)
198 198
199 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); 199 extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
200 extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); 200 extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *);
201 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); 201 extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
202 extern void xfs_buf_hold(xfs_buf_t *); 202 extern void xfs_buf_hold(xfs_buf_t *);
203 extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, 203 extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t,
204 xfs_buf_flags_t); 204 xfs_buf_flags_t);
205 205
206 /* Releasing Buffers */ 206 /* Releasing Buffers */
207 extern void xfs_buf_free(xfs_buf_t *); 207 extern void xfs_buf_free(xfs_buf_t *);
208 extern void xfs_buf_rele(xfs_buf_t *); 208 extern void xfs_buf_rele(xfs_buf_t *);
209 209
210 /* Locking and Unlocking Buffers */ 210 /* Locking and Unlocking Buffers */
211 extern int xfs_buf_cond_lock(xfs_buf_t *); 211 extern int xfs_buf_cond_lock(xfs_buf_t *);
212 extern int xfs_buf_lock_value(xfs_buf_t *); 212 extern int xfs_buf_lock_value(xfs_buf_t *);
213 extern void xfs_buf_lock(xfs_buf_t *); 213 extern void xfs_buf_lock(xfs_buf_t *);
214 extern void xfs_buf_unlock(xfs_buf_t *); 214 extern void xfs_buf_unlock(xfs_buf_t *);
215 215
216 /* Buffer Read and Write Routines */ 216 /* Buffer Read and Write Routines */
217 extern int xfs_bawrite(void *mp, xfs_buf_t *bp); 217 extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
218 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp); 218 extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
219 extern void xfs_buf_ioend(xfs_buf_t *, int); 219 extern void xfs_buf_ioend(xfs_buf_t *, int);
220 extern void xfs_buf_ioerror(xfs_buf_t *, int); 220 extern void xfs_buf_ioerror(xfs_buf_t *, int);
221 extern int xfs_buf_iorequest(xfs_buf_t *); 221 extern int xfs_buf_iorequest(xfs_buf_t *);
222 extern int xfs_buf_iowait(xfs_buf_t *); 222 extern int xfs_buf_iowait(xfs_buf_t *);
223 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t, 223 extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
224 xfs_buf_rw_t); 224 xfs_buf_rw_t);
225 225
226 static inline int xfs_buf_iostrategy(xfs_buf_t *bp) 226 static inline int xfs_buf_iostrategy(xfs_buf_t *bp)
227 { 227 {
228 return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp); 228 return bp->b_strat ? bp->b_strat(bp) : xfs_buf_iorequest(bp);
229 } 229 }
230 230
231 static inline int xfs_buf_geterror(xfs_buf_t *bp) 231 static inline int xfs_buf_geterror(xfs_buf_t *bp)
232 { 232 {
233 return bp ? bp->b_error : ENOMEM; 233 return bp ? bp->b_error : ENOMEM;
234 } 234 }
235 235
236 /* Buffer Utility Routines */ 236 /* Buffer Utility Routines */
237 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 237 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
238 238
239 /* Pinning Buffer Storage in Memory */ 239 /* Pinning Buffer Storage in Memory */
240 extern void xfs_buf_pin(xfs_buf_t *); 240 extern void xfs_buf_pin(xfs_buf_t *);
241 extern void xfs_buf_unpin(xfs_buf_t *); 241 extern void xfs_buf_unpin(xfs_buf_t *);
242 extern int xfs_buf_ispin(xfs_buf_t *); 242 extern int xfs_buf_ispin(xfs_buf_t *);
243 243
244 /* Delayed Write Buffer Routines */ 244 /* Delayed Write Buffer Routines */
245 extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 245 extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
246 246
247 /* Buffer Daemon Setup Routines */ 247 /* Buffer Daemon Setup Routines */
248 extern int xfs_buf_init(void); 248 extern int xfs_buf_init(void);
249 extern void xfs_buf_terminate(void); 249 extern void xfs_buf_terminate(void);
250 250
251 #ifdef XFS_BUF_TRACE 251 #ifdef XFS_BUF_TRACE
252 extern ktrace_t *xfs_buf_trace_buf; 252 extern ktrace_t *xfs_buf_trace_buf;
253 extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *); 253 extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
254 #else 254 #else
255 #define xfs_buf_trace(bp,id,ptr,ra) do { } while (0) 255 #define xfs_buf_trace(bp,id,ptr,ra) do { } while (0)
256 #endif 256 #endif
257 257
258 #define xfs_buf_target_name(target) \ 258 #define xfs_buf_target_name(target) \
259 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; }) 259 ({ char __b[BDEVNAME_SIZE]; bdevname((target)->bt_bdev, __b); __b; })
260 260
261 261
262 #define XFS_B_ASYNC XBF_ASYNC 262 #define XFS_B_ASYNC XBF_ASYNC
263 #define XFS_B_DELWRI XBF_DELWRI 263 #define XFS_B_DELWRI XBF_DELWRI
264 #define XFS_B_READ XBF_READ 264 #define XFS_B_READ XBF_READ
265 #define XFS_B_WRITE XBF_WRITE 265 #define XFS_B_WRITE XBF_WRITE
266 #define XFS_B_STALE XBF_STALE 266 #define XFS_B_STALE XBF_STALE
267 267
268 #define XFS_BUF_TRYLOCK XBF_TRYLOCK 268 #define XFS_BUF_TRYLOCK XBF_TRYLOCK
269 #define XFS_INCORE_TRYLOCK XBF_TRYLOCK 269 #define XFS_INCORE_TRYLOCK XBF_TRYLOCK
270 #define XFS_BUF_LOCK XBF_LOCK 270 #define XFS_BUF_LOCK XBF_LOCK
271 #define XFS_BUF_MAPPED XBF_MAPPED 271 #define XFS_BUF_MAPPED XBF_MAPPED
272 272
273 #define BUF_BUSY XBF_DONT_BLOCK 273 #define BUF_BUSY XBF_DONT_BLOCK
274 274
275 #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags) 275 #define XFS_BUF_BFLAGS(bp) ((bp)->b_flags)
276 #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \ 276 #define XFS_BUF_ZEROFLAGS(bp) ((bp)->b_flags &= \
277 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED)) 277 ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI|XBF_ORDERED))
278 278
279 #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE) 279 #define XFS_BUF_STALE(bp) ((bp)->b_flags |= XFS_B_STALE)
280 #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE) 280 #define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XFS_B_STALE)
281 #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE) 281 #define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XFS_B_STALE)
282 #define XFS_BUF_SUPER_STALE(bp) do { \ 282 #define XFS_BUF_SUPER_STALE(bp) do { \
283 XFS_BUF_STALE(bp); \ 283 XFS_BUF_STALE(bp); \
284 xfs_buf_delwri_dequeue(bp); \ 284 xfs_buf_delwri_dequeue(bp); \
285 XFS_BUF_DONE(bp); \ 285 XFS_BUF_DONE(bp); \
286 } while (0) 286 } while (0)
287 287
288 #define XFS_BUF_MANAGE XBF_FS_MANAGED 288 #define XFS_BUF_MANAGE XBF_FS_MANAGED
289 #define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) 289 #define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED)
290 290
291 #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) 291 #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI)
292 #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) 292 #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp)
293 #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) 293 #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
294 294
295 #define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no) 295 #define XFS_BUF_ERROR(bp,no) xfs_buf_ioerror(bp,no)
296 #define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp) 296 #define XFS_BUF_GETERROR(bp) xfs_buf_geterror(bp)
297 #define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0) 297 #define XFS_BUF_ISERROR(bp) (xfs_buf_geterror(bp) ? 1 : 0)
298 298
299 #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) 299 #define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
300 #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) 300 #define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
301 #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) 301 #define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
302 302
303 #define XFS_BUF_BUSY(bp) do { } while (0) 303 #define XFS_BUF_BUSY(bp) do { } while (0)
304 #define XFS_BUF_UNBUSY(bp) do { } while (0) 304 #define XFS_BUF_UNBUSY(bp) do { } while (0)
305 #define XFS_BUF_ISBUSY(bp) (1) 305 #define XFS_BUF_ISBUSY(bp) (1)
306 306
307 #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC) 307 #define XFS_BUF_ASYNC(bp) ((bp)->b_flags |= XBF_ASYNC)
308 #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC) 308 #define XFS_BUF_UNASYNC(bp) ((bp)->b_flags &= ~XBF_ASYNC)
309 #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC) 309 #define XFS_BUF_ISASYNC(bp) ((bp)->b_flags & XBF_ASYNC)
310 310
311 #define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED) 311 #define XFS_BUF_ORDERED(bp) ((bp)->b_flags |= XBF_ORDERED)
312 #define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED) 312 #define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
313 #define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED) 313 #define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
314 314
315 #define XFS_BUF_SHUT(bp) do { } while (0) 315 #define XFS_BUF_SHUT(bp) do { } while (0)
316 #define XFS_BUF_UNSHUT(bp) do { } while (0) 316 #define XFS_BUF_UNSHUT(bp) do { } while (0)
317 #define XFS_BUF_ISSHUT(bp) (0) 317 #define XFS_BUF_ISSHUT(bp) (0)
318 318
319 #define XFS_BUF_HOLD(bp) xfs_buf_hold(bp) 319 #define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
320 #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ) 320 #define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
321 #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ) 321 #define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
322 #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ) 322 #define XFS_BUF_ISREAD(bp) ((bp)->b_flags & XBF_READ)
323 323
324 #define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE) 324 #define XFS_BUF_WRITE(bp) ((bp)->b_flags |= XBF_WRITE)
325 #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE) 325 #define XFS_BUF_UNWRITE(bp) ((bp)->b_flags &= ~XBF_WRITE)
326 #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE) 326 #define XFS_BUF_ISWRITE(bp) ((bp)->b_flags & XBF_WRITE)
327 327
328 #define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone) 328 #define XFS_BUF_IODONE_FUNC(bp) ((bp)->b_iodone)
329 #define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func)) 329 #define XFS_BUF_SET_IODONE_FUNC(bp, func) ((bp)->b_iodone = (func))
330 #define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL) 330 #define XFS_BUF_CLR_IODONE_FUNC(bp) ((bp)->b_iodone = NULL)
331 #define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func)) 331 #define XFS_BUF_SET_BDSTRAT_FUNC(bp, func) ((bp)->b_strat = (func))
332 #define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL) 332 #define XFS_BUF_CLR_BDSTRAT_FUNC(bp) ((bp)->b_strat = NULL)
333 333
334 #define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv) 334 #define XFS_BUF_FSPRIVATE(bp, type) ((type)(bp)->b_fspriv)
335 #define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val)) 335 #define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
336 #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2) 336 #define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
337 #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val)) 337 #define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
338 #define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3)
339 #define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val))
340 #define XFS_BUF_SET_START(bp) do { } while (0) 338 #define XFS_BUF_SET_START(bp) do { } while (0)
341 #define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func)) 339 #define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
342 340
343 #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr) 341 #define XFS_BUF_PTR(bp) (xfs_caddr_t)((bp)->b_addr)
344 #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt) 342 #define XFS_BUF_SET_PTR(bp, val, cnt) xfs_buf_associate_memory(bp, val, cnt)
345 #define XFS_BUF_ADDR(bp) ((bp)->b_bn) 343 #define XFS_BUF_ADDR(bp) ((bp)->b_bn)
346 #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno)) 344 #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_bn = (xfs_daddr_t)(bno))
347 #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset) 345 #define XFS_BUF_OFFSET(bp) ((bp)->b_file_offset)
348 #define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off)) 346 #define XFS_BUF_SET_OFFSET(bp, off) ((bp)->b_file_offset = (off))
349 #define XFS_BUF_COUNT(bp) ((bp)->b_count_desired) 347 #define XFS_BUF_COUNT(bp) ((bp)->b_count_desired)
350 #define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt)) 348 #define XFS_BUF_SET_COUNT(bp, cnt) ((bp)->b_count_desired = (cnt))
351 #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length) 349 #define XFS_BUF_SIZE(bp) ((bp)->b_buffer_length)
352 #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt)) 350 #define XFS_BUF_SET_SIZE(bp, cnt) ((bp)->b_buffer_length = (cnt))
353 351
354 #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0) 352 #define XFS_BUF_SET_VTYPE_REF(bp, type, ref) do { } while (0)
355 #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0) 353 #define XFS_BUF_SET_VTYPE(bp, type) do { } while (0)
356 #define XFS_BUF_SET_REF(bp, ref) do { } while (0) 354 #define XFS_BUF_SET_REF(bp, ref) do { } while (0)
357 355
358 #define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp) 356 #define XFS_BUF_ISPINNED(bp) xfs_buf_ispin(bp)
359 357
360 #define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp) 358 #define XFS_BUF_VALUSEMA(bp) xfs_buf_lock_value(bp)
361 #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0) 359 #define XFS_BUF_CPSEMA(bp) (xfs_buf_cond_lock(bp) == 0)
362 #define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp) 360 #define XFS_BUF_VSEMA(bp) xfs_buf_unlock(bp)
363 #define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp) 361 #define XFS_BUF_PSEMA(bp,x) xfs_buf_lock(bp)
364 #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait); 362 #define XFS_BUF_FINISH_IOWAIT(bp) complete(&bp->b_iowait);
365 363
366 #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target)) 364 #define XFS_BUF_SET_TARGET(bp, target) ((bp)->b_target = (target))
367 #define XFS_BUF_TARGET(bp) ((bp)->b_target) 365 #define XFS_BUF_TARGET(bp) ((bp)->b_target)
368 #define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target) 366 #define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
369 367
370 static inline void xfs_buf_relse(xfs_buf_t *bp) 368 static inline void xfs_buf_relse(xfs_buf_t *bp)
371 { 369 {
372 if (!bp->b_relse) 370 if (!bp->b_relse)
373 xfs_buf_unlock(bp); 371 xfs_buf_unlock(bp);
374 xfs_buf_rele(bp); 372 xfs_buf_rele(bp);
375 } 373 }
376 374
377 #define xfs_bpin(bp) xfs_buf_pin(bp) 375 #define xfs_bpin(bp) xfs_buf_pin(bp)
378 #define xfs_bunpin(bp) xfs_buf_unpin(bp) 376 #define xfs_bunpin(bp) xfs_buf_unpin(bp)
379 377
380 #define xfs_buftrace(id, bp) \ 378 #define xfs_buftrace(id, bp) \
381 xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0)) 379 xfs_buf_trace(bp, id, NULL, (void *)__builtin_return_address(0))
382 380
383 #define xfs_biodone(bp) xfs_buf_ioend(bp, 0) 381 #define xfs_biodone(bp) xfs_buf_ioend(bp, 0)
384 382
385 #define xfs_biomove(bp, off, len, data, rw) \ 383 #define xfs_biomove(bp, off, len, data, rw) \
386 xfs_buf_iomove((bp), (off), (len), (data), \ 384 xfs_buf_iomove((bp), (off), (len), (data), \
387 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ) 385 ((rw) == XFS_B_WRITE) ? XBRW_WRITE : XBRW_READ)
388 386
389 #define xfs_biozero(bp, off, len) \ 387 #define xfs_biozero(bp, off, len) \
390 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) 388 xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
391 389
392 390
393 static inline int XFS_bwrite(xfs_buf_t *bp) 391 static inline int XFS_bwrite(xfs_buf_t *bp)
394 { 392 {
395 int iowait = (bp->b_flags & XBF_ASYNC) == 0; 393 int iowait = (bp->b_flags & XBF_ASYNC) == 0;
396 int error = 0; 394 int error = 0;
397 395
398 if (!iowait) 396 if (!iowait)
399 bp->b_flags |= _XBF_RUN_QUEUES; 397 bp->b_flags |= _XBF_RUN_QUEUES;
400 398
401 xfs_buf_delwri_dequeue(bp); 399 xfs_buf_delwri_dequeue(bp);
402 xfs_buf_iostrategy(bp); 400 xfs_buf_iostrategy(bp);
403 if (iowait) { 401 if (iowait) {
404 error = xfs_buf_iowait(bp); 402 error = xfs_buf_iowait(bp);
405 xfs_buf_relse(bp); 403 xfs_buf_relse(bp);
406 } 404 }
407 return error; 405 return error;
408 } 406 }
409 407
410 #define XFS_bdstrat(bp) xfs_buf_iorequest(bp) 408 #define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
411 409
412 #define xfs_iowait(bp) xfs_buf_iowait(bp) 410 #define xfs_iowait(bp) xfs_buf_iowait(bp)
413 411
414 #define xfs_baread(target, rablkno, ralen) \ 412 #define xfs_baread(target, rablkno, ralen) \
415 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK) 413 xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK)
416 414
417 415
418 /* 416 /*
419 * Handling of buftargs. 417 * Handling of buftargs.
420 */ 418 */
421 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); 419 extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
422 extern void xfs_free_buftarg(xfs_buftarg_t *); 420 extern void xfs_free_buftarg(xfs_buftarg_t *);
423 extern void xfs_wait_buftarg(xfs_buftarg_t *); 421 extern void xfs_wait_buftarg(xfs_buftarg_t *);
424 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 422 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
425 extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 423 extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
426 #ifdef CONFIG_KDB_MODULES 424 #ifdef CONFIG_KDB_MODULES
427 extern struct list_head *xfs_get_buftarg_list(void); 425 extern struct list_head *xfs_get_buftarg_list(void);
428 #endif 426 #endif
429 427
430 #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 428 #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
431 #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 429 #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
432 430
433 #define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1) 431 #define xfs_binval(buftarg) xfs_flush_buftarg(buftarg, 1)
434 #define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1) 432 #define XFS_bflush(buftarg) xfs_flush_buftarg(buftarg, 1)
435 433
436 #endif /* __XFS_BUF_H__ */ 434 #endif /* __XFS_BUF_H__ */
437 435
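The compat macros and inline helpers above are easiest to see in context, so here is a minimal illustrative sketch of how a caller could drive a synchronous write through this interface. It is not part of this commit: the helper name example_write_block and its arguments are hypothetical, and only calls and macros declared in xfs_buf.h above are used.

/*
 * Illustrative sketch only: get a locked, mapped buffer, mark it for a
 * synchronous write, and let XFS_bwrite() dequeue any delayed write,
 * issue the I/O, wait for completion and release the buffer.
 */
STATIC int
example_write_block(
	xfs_buftarg_t	*target,
	xfs_off_t	blkno,
	size_t		len)
{
	xfs_buf_t	*bp;

	bp = xfs_buf_get(target, blkno, len, XBF_LOCK | XBF_MAPPED);
	if (!bp)
		return ENOMEM;

	XFS_BUF_WRITE(bp);	/* b_flags |= XBF_WRITE */
	XFS_BUF_UNASYNC(bp);	/* keep it synchronous so XFS_bwrite() waits */

	/* propagates any error reported by xfs_buf_iowait() */
	return XFS_bwrite(bp);
}

Because the buffer is not marked XBF_ASYNC, XFS_bwrite() takes the iowait/relse path shown in the inline above; an asynchronous caller would set XFS_BUF_ASYNC(bp) before the call and XFS_bwrite() would return without waiting.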
fs/xfs/linux-2.6/xfs_lrw.c
1 /* 1 /*
2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_bit.h" 20 #include "xfs_bit.h"
21 #include "xfs_log.h" 21 #include "xfs_log.h"
22 #include "xfs_inum.h" 22 #include "xfs_inum.h"
23 #include "xfs_trans.h" 23 #include "xfs_trans.h"
24 #include "xfs_sb.h" 24 #include "xfs_sb.h"
25 #include "xfs_ag.h" 25 #include "xfs_ag.h"
26 #include "xfs_dir2.h" 26 #include "xfs_dir2.h"
27 #include "xfs_alloc.h" 27 #include "xfs_alloc.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_quota.h" 29 #include "xfs_quota.h"
30 #include "xfs_mount.h" 30 #include "xfs_mount.h"
31 #include "xfs_bmap_btree.h" 31 #include "xfs_bmap_btree.h"
32 #include "xfs_alloc_btree.h" 32 #include "xfs_alloc_btree.h"
33 #include "xfs_ialloc_btree.h" 33 #include "xfs_ialloc_btree.h"
34 #include "xfs_dir2_sf.h" 34 #include "xfs_dir2_sf.h"
35 #include "xfs_attr_sf.h" 35 #include "xfs_attr_sf.h"
36 #include "xfs_dinode.h" 36 #include "xfs_dinode.h"
37 #include "xfs_inode.h" 37 #include "xfs_inode.h"
38 #include "xfs_bmap.h" 38 #include "xfs_bmap.h"
39 #include "xfs_btree.h" 39 #include "xfs_btree.h"
40 #include "xfs_ialloc.h" 40 #include "xfs_ialloc.h"
41 #include "xfs_rtalloc.h" 41 #include "xfs_rtalloc.h"
42 #include "xfs_error.h" 42 #include "xfs_error.h"
43 #include "xfs_itable.h" 43 #include "xfs_itable.h"
44 #include "xfs_rw.h" 44 #include "xfs_rw.h"
45 #include "xfs_acl.h" 45 #include "xfs_acl.h"
46 #include "xfs_attr.h" 46 #include "xfs_attr.h"
47 #include "xfs_inode_item.h" 47 #include "xfs_inode_item.h"
48 #include "xfs_buf_item.h" 48 #include "xfs_buf_item.h"
49 #include "xfs_utils.h" 49 #include "xfs_utils.h"
50 #include "xfs_iomap.h" 50 #include "xfs_iomap.h"
51 #include "xfs_vnodeops.h" 51 #include "xfs_vnodeops.h"
52 52
53 #include <linux/capability.h> 53 #include <linux/capability.h>
54 #include <linux/writeback.h> 54 #include <linux/writeback.h>
55 55
56 56
57 #if defined(XFS_RW_TRACE) 57 #if defined(XFS_RW_TRACE)
58 void 58 void
59 xfs_rw_enter_trace( 59 xfs_rw_enter_trace(
60 int tag, 60 int tag,
61 xfs_inode_t *ip, 61 xfs_inode_t *ip,
62 void *data, 62 void *data,
63 size_t segs, 63 size_t segs,
64 loff_t offset, 64 loff_t offset,
65 int ioflags) 65 int ioflags)
66 { 66 {
67 if (ip->i_rwtrace == NULL) 67 if (ip->i_rwtrace == NULL)
68 return; 68 return;
69 ktrace_enter(ip->i_rwtrace, 69 ktrace_enter(ip->i_rwtrace,
70 (void *)(unsigned long)tag, 70 (void *)(unsigned long)tag,
71 (void *)ip, 71 (void *)ip,
72 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)), 72 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
73 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)), 73 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
74 (void *)data, 74 (void *)data,
75 (void *)((unsigned long)segs), 75 (void *)((unsigned long)segs),
76 (void *)((unsigned long)((offset >> 32) & 0xffffffff)), 76 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
77 (void *)((unsigned long)(offset & 0xffffffff)), 77 (void *)((unsigned long)(offset & 0xffffffff)),
78 (void *)((unsigned long)ioflags), 78 (void *)((unsigned long)ioflags),
79 (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)), 79 (void *)((unsigned long)((ip->i_new_size >> 32) & 0xffffffff)),
80 (void *)((unsigned long)(ip->i_new_size & 0xffffffff)), 80 (void *)((unsigned long)(ip->i_new_size & 0xffffffff)),
81 (void *)((unsigned long)current_pid()), 81 (void *)((unsigned long)current_pid()),
82 (void *)NULL, 82 (void *)NULL,
83 (void *)NULL, 83 (void *)NULL,
84 (void *)NULL, 84 (void *)NULL,
85 (void *)NULL); 85 (void *)NULL);
86 } 86 }
87 87
88 void 88 void
89 xfs_inval_cached_trace( 89 xfs_inval_cached_trace(
90 xfs_inode_t *ip, 90 xfs_inode_t *ip,
91 xfs_off_t offset, 91 xfs_off_t offset,
92 xfs_off_t len, 92 xfs_off_t len,
93 xfs_off_t first, 93 xfs_off_t first,
94 xfs_off_t last) 94 xfs_off_t last)
95 { 95 {
96 96
97 if (ip->i_rwtrace == NULL) 97 if (ip->i_rwtrace == NULL)
98 return; 98 return;
99 ktrace_enter(ip->i_rwtrace, 99 ktrace_enter(ip->i_rwtrace,
100 (void *)(__psint_t)XFS_INVAL_CACHED, 100 (void *)(__psint_t)XFS_INVAL_CACHED,
101 (void *)ip, 101 (void *)ip,
102 (void *)((unsigned long)((offset >> 32) & 0xffffffff)), 102 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
103 (void *)((unsigned long)(offset & 0xffffffff)), 103 (void *)((unsigned long)(offset & 0xffffffff)),
104 (void *)((unsigned long)((len >> 32) & 0xffffffff)), 104 (void *)((unsigned long)((len >> 32) & 0xffffffff)),
105 (void *)((unsigned long)(len & 0xffffffff)), 105 (void *)((unsigned long)(len & 0xffffffff)),
106 (void *)((unsigned long)((first >> 32) & 0xffffffff)), 106 (void *)((unsigned long)((first >> 32) & 0xffffffff)),
107 (void *)((unsigned long)(first & 0xffffffff)), 107 (void *)((unsigned long)(first & 0xffffffff)),
108 (void *)((unsigned long)((last >> 32) & 0xffffffff)), 108 (void *)((unsigned long)((last >> 32) & 0xffffffff)),
109 (void *)((unsigned long)(last & 0xffffffff)), 109 (void *)((unsigned long)(last & 0xffffffff)),
110 (void *)((unsigned long)current_pid()), 110 (void *)((unsigned long)current_pid()),
111 (void *)NULL, 111 (void *)NULL,
112 (void *)NULL, 112 (void *)NULL,
113 (void *)NULL, 113 (void *)NULL,
114 (void *)NULL, 114 (void *)NULL,
115 (void *)NULL); 115 (void *)NULL);
116 } 116 }
117 #endif 117 #endif
118 118
119 /* 119 /*
120 * xfs_iozero 120 * xfs_iozero
121 * 121 *
122 * xfs_iozero clears the specified range of buffer supplied, 122 * xfs_iozero clears the specified range of buffer supplied,
123 * and marks all the affected blocks as valid and modified. If 123 * and marks all the affected blocks as valid and modified. If
124 * an affected block is not allocated, it will be allocated. If 124 * an affected block is not allocated, it will be allocated. If
125 * an affected block is not completely overwritten, and is not 125 * an affected block is not completely overwritten, and is not
126 * valid before the operation, it will be read from disk before 126 * valid before the operation, it will be read from disk before
127 * being partially zeroed. 127 * being partially zeroed.
128 */ 128 */
129 STATIC int 129 STATIC int
130 xfs_iozero( 130 xfs_iozero(
131 struct xfs_inode *ip, /* inode */ 131 struct xfs_inode *ip, /* inode */
132 loff_t pos, /* offset in file */ 132 loff_t pos, /* offset in file */
133 size_t count) /* size of data to zero */ 133 size_t count) /* size of data to zero */
134 { 134 {
135 struct page *page; 135 struct page *page;
136 struct address_space *mapping; 136 struct address_space *mapping;
137 int status; 137 int status;
138 138
139 mapping = VFS_I(ip)->i_mapping; 139 mapping = VFS_I(ip)->i_mapping;
140 do { 140 do {
141 unsigned offset, bytes; 141 unsigned offset, bytes;
142 void *fsdata; 142 void *fsdata;
143 143
144 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 144 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
145 bytes = PAGE_CACHE_SIZE - offset; 145 bytes = PAGE_CACHE_SIZE - offset;
146 if (bytes > count) 146 if (bytes > count)
147 bytes = count; 147 bytes = count;
148 148
149 status = pagecache_write_begin(NULL, mapping, pos, bytes, 149 status = pagecache_write_begin(NULL, mapping, pos, bytes,
150 AOP_FLAG_UNINTERRUPTIBLE, 150 AOP_FLAG_UNINTERRUPTIBLE,
151 &page, &fsdata); 151 &page, &fsdata);
152 if (status) 152 if (status)
153 break; 153 break;
154 154
155 zero_user(page, offset, bytes); 155 zero_user(page, offset, bytes);
156 156
157 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, 157 status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
158 page, fsdata); 158 page, fsdata);
159 WARN_ON(status <= 0); /* can't return less than zero! */ 159 WARN_ON(status <= 0); /* can't return less than zero! */
160 pos += bytes; 160 pos += bytes;
161 count -= bytes; 161 count -= bytes;
162 status = 0; 162 status = 0;
163 } while (count); 163 } while (count);
164 164
165 return (-status); 165 return (-status);
166 } 166 }
167 167
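As a worked example of the xfs_iozero() loop above (assuming PAGE_CACHE_SIZE is 4096): zeroing count = 6000 bytes at pos = 10000 first handles the partial page, offset = 10000 & 4095 = 1808 and bytes = 4096 - 1808 = 2288, then continues page-aligned at pos = 12288 with offset = 0 and zeroes the remaining 3712 bytes. Each pass goes through pagecache_write_begin()/zero_user()/pagecache_write_end(), so the affected blocks end up valid and dirty in the page cache rather than being written out synchronously.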
168 ssize_t /* bytes read, or (-) error */ 168 ssize_t /* bytes read, or (-) error */
169 xfs_read( 169 xfs_read(
170 xfs_inode_t *ip, 170 xfs_inode_t *ip,
171 struct kiocb *iocb, 171 struct kiocb *iocb,
172 const struct iovec *iovp, 172 const struct iovec *iovp,
173 unsigned int segs, 173 unsigned int segs,
174 loff_t *offset, 174 loff_t *offset,
175 int ioflags) 175 int ioflags)
176 { 176 {
177 struct file *file = iocb->ki_filp; 177 struct file *file = iocb->ki_filp;
178 struct inode *inode = file->f_mapping->host; 178 struct inode *inode = file->f_mapping->host;
179 xfs_mount_t *mp = ip->i_mount; 179 xfs_mount_t *mp = ip->i_mount;
180 size_t size = 0; 180 size_t size = 0;
181 ssize_t ret = 0; 181 ssize_t ret = 0;
182 xfs_fsize_t n; 182 xfs_fsize_t n;
183 unsigned long seg; 183 unsigned long seg;
184 184
185 185
186 XFS_STATS_INC(xs_read_calls); 186 XFS_STATS_INC(xs_read_calls);
187 187
188 /* START copy & waste from filemap.c */ 188 /* START copy & waste from filemap.c */
189 for (seg = 0; seg < segs; seg++) { 189 for (seg = 0; seg < segs; seg++) {
190 const struct iovec *iv = &iovp[seg]; 190 const struct iovec *iv = &iovp[seg];
191 191
192 /* 192 /*
193 * If any segment has a negative length, or the cumulative 193 * If any segment has a negative length, or the cumulative
194 * length ever wraps negative then return -EINVAL. 194 * length ever wraps negative then return -EINVAL.
195 */ 195 */
196 size += iv->iov_len; 196 size += iv->iov_len;
197 if (unlikely((ssize_t)(size|iv->iov_len) < 0)) 197 if (unlikely((ssize_t)(size|iv->iov_len) < 0))
198 return XFS_ERROR(-EINVAL); 198 return XFS_ERROR(-EINVAL);
199 } 199 }
200 /* END copy & waste from filemap.c */ 200 /* END copy & waste from filemap.c */
201 201
202 if (unlikely(ioflags & IO_ISDIRECT)) { 202 if (unlikely(ioflags & IO_ISDIRECT)) {
203 xfs_buftarg_t *target = 203 xfs_buftarg_t *target =
204 XFS_IS_REALTIME_INODE(ip) ? 204 XFS_IS_REALTIME_INODE(ip) ?
205 mp->m_rtdev_targp : mp->m_ddev_targp; 205 mp->m_rtdev_targp : mp->m_ddev_targp;
206 if ((*offset & target->bt_smask) || 206 if ((*offset & target->bt_smask) ||
207 (size & target->bt_smask)) { 207 (size & target->bt_smask)) {
208 if (*offset == ip->i_size) { 208 if (*offset == ip->i_size) {
209 return (0); 209 return (0);
210 } 210 }
211 return -XFS_ERROR(EINVAL); 211 return -XFS_ERROR(EINVAL);
212 } 212 }
213 } 213 }
214 214
215 n = XFS_MAXIOFFSET(mp) - *offset; 215 n = XFS_MAXIOFFSET(mp) - *offset;
216 if ((n <= 0) || (size == 0)) 216 if ((n <= 0) || (size == 0))
217 return 0; 217 return 0;
218 218
219 if (n < size) 219 if (n < size)
220 size = n; 220 size = n;
221 221
222 if (XFS_FORCED_SHUTDOWN(mp)) 222 if (XFS_FORCED_SHUTDOWN(mp))
223 return -EIO; 223 return -EIO;
224 224
225 if (unlikely(ioflags & IO_ISDIRECT)) 225 if (unlikely(ioflags & IO_ISDIRECT))
226 mutex_lock(&inode->i_mutex); 226 mutex_lock(&inode->i_mutex);
227 xfs_ilock(ip, XFS_IOLOCK_SHARED); 227 xfs_ilock(ip, XFS_IOLOCK_SHARED);
228 228
229 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { 229 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
230 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); 230 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
231 int iolock = XFS_IOLOCK_SHARED; 231 int iolock = XFS_IOLOCK_SHARED;
232 232
233 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size, 233 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
234 dmflags, &iolock); 234 dmflags, &iolock);
235 if (ret) { 235 if (ret) {
236 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 236 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
237 if (unlikely(ioflags & IO_ISDIRECT)) 237 if (unlikely(ioflags & IO_ISDIRECT))
238 mutex_unlock(&inode->i_mutex); 238 mutex_unlock(&inode->i_mutex);
239 return ret; 239 return ret;
240 } 240 }
241 } 241 }
242 242
243 if (unlikely(ioflags & IO_ISDIRECT)) { 243 if (unlikely(ioflags & IO_ISDIRECT)) {
244 if (inode->i_mapping->nrpages) 244 if (inode->i_mapping->nrpages)
245 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), 245 ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
246 -1, FI_REMAPF_LOCKED); 246 -1, FI_REMAPF_LOCKED);
247 mutex_unlock(&inode->i_mutex); 247 mutex_unlock(&inode->i_mutex);
248 if (ret) { 248 if (ret) {
249 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 249 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
250 return ret; 250 return ret;
251 } 251 }
252 } 252 }
253 253
254 xfs_rw_enter_trace(XFS_READ_ENTER, ip, 254 xfs_rw_enter_trace(XFS_READ_ENTER, ip,
255 (void *)iovp, segs, *offset, ioflags); 255 (void *)iovp, segs, *offset, ioflags);
256 256
257 iocb->ki_pos = *offset; 257 iocb->ki_pos = *offset;
258 ret = generic_file_aio_read(iocb, iovp, segs, *offset); 258 ret = generic_file_aio_read(iocb, iovp, segs, *offset);
259 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) 259 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
260 ret = wait_on_sync_kiocb(iocb); 260 ret = wait_on_sync_kiocb(iocb);
261 if (ret > 0) 261 if (ret > 0)
262 XFS_STATS_ADD(xs_read_bytes, ret); 262 XFS_STATS_ADD(xs_read_bytes, ret);
263 263
264 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 264 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
265 return ret; 265 return ret;
266 } 266 }
267 267
268 ssize_t 268 ssize_t
269 xfs_splice_read( 269 xfs_splice_read(
270 xfs_inode_t *ip, 270 xfs_inode_t *ip,
271 struct file *infilp, 271 struct file *infilp,
272 loff_t *ppos, 272 loff_t *ppos,
273 struct pipe_inode_info *pipe, 273 struct pipe_inode_info *pipe,
274 size_t count, 274 size_t count,
275 int flags, 275 int flags,
276 int ioflags) 276 int ioflags)
277 { 277 {
278 xfs_mount_t *mp = ip->i_mount; 278 xfs_mount_t *mp = ip->i_mount;
279 ssize_t ret; 279 ssize_t ret;
280 280
281 XFS_STATS_INC(xs_read_calls); 281 XFS_STATS_INC(xs_read_calls);
282 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 282 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
283 return -EIO; 283 return -EIO;
284 284
285 xfs_ilock(ip, XFS_IOLOCK_SHARED); 285 xfs_ilock(ip, XFS_IOLOCK_SHARED);
286 286
287 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { 287 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
288 int iolock = XFS_IOLOCK_SHARED; 288 int iolock = XFS_IOLOCK_SHARED;
289 int error; 289 int error;
290 290
291 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count, 291 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
292 FILP_DELAY_FLAG(infilp), &iolock); 292 FILP_DELAY_FLAG(infilp), &iolock);
293 if (error) { 293 if (error) {
294 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 294 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
295 return -error; 295 return -error;
296 } 296 }
297 } 297 }
298 xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip, 298 xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, ip,
299 pipe, count, *ppos, ioflags); 299 pipe, count, *ppos, ioflags);
300 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); 300 ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
301 if (ret > 0) 301 if (ret > 0)
302 XFS_STATS_ADD(xs_read_bytes, ret); 302 XFS_STATS_ADD(xs_read_bytes, ret);
303 303
304 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 304 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
305 return ret; 305 return ret;
306 } 306 }
307 307
308 ssize_t 308 ssize_t
309 xfs_splice_write( 309 xfs_splice_write(
310 xfs_inode_t *ip, 310 xfs_inode_t *ip,
311 struct pipe_inode_info *pipe, 311 struct pipe_inode_info *pipe,
312 struct file *outfilp, 312 struct file *outfilp,
313 loff_t *ppos, 313 loff_t *ppos,
314 size_t count, 314 size_t count,
315 int flags, 315 int flags,
316 int ioflags) 316 int ioflags)
317 { 317 {
318 xfs_mount_t *mp = ip->i_mount; 318 xfs_mount_t *mp = ip->i_mount;
319 ssize_t ret; 319 ssize_t ret;
320 struct inode *inode = outfilp->f_mapping->host; 320 struct inode *inode = outfilp->f_mapping->host;
321 xfs_fsize_t isize, new_size; 321 xfs_fsize_t isize, new_size;
322 322
323 XFS_STATS_INC(xs_write_calls); 323 XFS_STATS_INC(xs_write_calls);
324 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 324 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
325 return -EIO; 325 return -EIO;
326 326
327 xfs_ilock(ip, XFS_IOLOCK_EXCL); 327 xfs_ilock(ip, XFS_IOLOCK_EXCL);
328 328
329 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { 329 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
330 int iolock = XFS_IOLOCK_EXCL; 330 int iolock = XFS_IOLOCK_EXCL;
331 int error; 331 int error;
332 332
333 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count, 333 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
334 FILP_DELAY_FLAG(outfilp), &iolock); 334 FILP_DELAY_FLAG(outfilp), &iolock);
335 if (error) { 335 if (error) {
336 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 336 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
337 return -error; 337 return -error;
338 } 338 }
339 } 339 }
340 340
341 new_size = *ppos + count; 341 new_size = *ppos + count;
342 342
343 xfs_ilock(ip, XFS_ILOCK_EXCL); 343 xfs_ilock(ip, XFS_ILOCK_EXCL);
344 if (new_size > ip->i_size) 344 if (new_size > ip->i_size)
345 ip->i_new_size = new_size; 345 ip->i_new_size = new_size;
346 xfs_iunlock(ip, XFS_ILOCK_EXCL); 346 xfs_iunlock(ip, XFS_ILOCK_EXCL);
347 347
348 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip, 348 xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, ip,
349 pipe, count, *ppos, ioflags); 349 pipe, count, *ppos, ioflags);
350 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); 350 ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
351 if (ret > 0) 351 if (ret > 0)
352 XFS_STATS_ADD(xs_write_bytes, ret); 352 XFS_STATS_ADD(xs_write_bytes, ret);
353 353
354 isize = i_size_read(inode); 354 isize = i_size_read(inode);
355 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize)) 355 if (unlikely(ret < 0 && ret != -EFAULT && *ppos > isize))
356 *ppos = isize; 356 *ppos = isize;
357 357
358 if (*ppos > ip->i_size) { 358 if (*ppos > ip->i_size) {
359 xfs_ilock(ip, XFS_ILOCK_EXCL); 359 xfs_ilock(ip, XFS_ILOCK_EXCL);
360 if (*ppos > ip->i_size) 360 if (*ppos > ip->i_size)
361 ip->i_size = *ppos; 361 ip->i_size = *ppos;
362 xfs_iunlock(ip, XFS_ILOCK_EXCL); 362 xfs_iunlock(ip, XFS_ILOCK_EXCL);
363 } 363 }
364 364
365 if (ip->i_new_size) { 365 if (ip->i_new_size) {
366 xfs_ilock(ip, XFS_ILOCK_EXCL); 366 xfs_ilock(ip, XFS_ILOCK_EXCL);
367 ip->i_new_size = 0; 367 ip->i_new_size = 0;
368 if (ip->i_d.di_size > ip->i_size) 368 if (ip->i_d.di_size > ip->i_size)
369 ip->i_d.di_size = ip->i_size; 369 ip->i_d.di_size = ip->i_size;
370 xfs_iunlock(ip, XFS_ILOCK_EXCL); 370 xfs_iunlock(ip, XFS_ILOCK_EXCL);
371 } 371 }
372 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 372 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
373 return ret; 373 return ret;
374 } 374 }
375 375
376 /* 376 /*
377 * This routine is called to handle zeroing any space in the last 377 * This routine is called to handle zeroing any space in the last
378 * block of the file that is beyond the EOF. We do this since the 378 * block of the file that is beyond the EOF. We do this since the
379 * size is being increased without writing anything to that block 379 * size is being increased without writing anything to that block
380 * and we don't want anyone to read the garbage on the disk. 380 * and we don't want anyone to read the garbage on the disk.
381 */ 381 */
382 STATIC int /* error (positive) */ 382 STATIC int /* error (positive) */
383 xfs_zero_last_block( 383 xfs_zero_last_block(
384 xfs_inode_t *ip, 384 xfs_inode_t *ip,
385 xfs_fsize_t offset, 385 xfs_fsize_t offset,
386 xfs_fsize_t isize) 386 xfs_fsize_t isize)
387 { 387 {
388 xfs_fileoff_t last_fsb; 388 xfs_fileoff_t last_fsb;
389 xfs_mount_t *mp = ip->i_mount; 389 xfs_mount_t *mp = ip->i_mount;
390 int nimaps; 390 int nimaps;
391 int zero_offset; 391 int zero_offset;
392 int zero_len; 392 int zero_len;
393 int error = 0; 393 int error = 0;
394 xfs_bmbt_irec_t imap; 394 xfs_bmbt_irec_t imap;
395 395
396 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); 396 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
397 397
398 zero_offset = XFS_B_FSB_OFFSET(mp, isize); 398 zero_offset = XFS_B_FSB_OFFSET(mp, isize);
399 if (zero_offset == 0) { 399 if (zero_offset == 0) {
400 /* 400 /*
401 * There are no extra bytes in the last block on disk to 401 * There are no extra bytes in the last block on disk to
402 * zero, so return. 402 * zero, so return.
403 */ 403 */
404 return 0; 404 return 0;
405 } 405 }
406 406
407 last_fsb = XFS_B_TO_FSBT(mp, isize); 407 last_fsb = XFS_B_TO_FSBT(mp, isize);
408 nimaps = 1; 408 nimaps = 1;
409 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap, 409 error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
410 &nimaps, NULL, NULL); 410 &nimaps, NULL, NULL);
411 if (error) { 411 if (error) {
412 return error; 412 return error;
413 } 413 }
414 ASSERT(nimaps > 0); 414 ASSERT(nimaps > 0);
415 /* 415 /*
416 * If the block underlying isize is just a hole, then there 416 * If the block underlying isize is just a hole, then there
417 * is nothing to zero. 417 * is nothing to zero.
418 */ 418 */
419 if (imap.br_startblock == HOLESTARTBLOCK) { 419 if (imap.br_startblock == HOLESTARTBLOCK) {
420 return 0; 420 return 0;
421 } 421 }
422 /* 422 /*
423 * Zero the part of the last block beyond the EOF, and write it 423 * Zero the part of the last block beyond the EOF, and write it
424 * out sync. We need to drop the ilock while we do this so we 424 * out sync. We need to drop the ilock while we do this so we
425 * don't deadlock when the buffer cache calls back to us. 425 * don't deadlock when the buffer cache calls back to us.
426 */ 426 */
427 xfs_iunlock(ip, XFS_ILOCK_EXCL); 427 xfs_iunlock(ip, XFS_ILOCK_EXCL);
428 428
429 zero_len = mp->m_sb.sb_blocksize - zero_offset; 429 zero_len = mp->m_sb.sb_blocksize - zero_offset;
430 if (isize + zero_len > offset) 430 if (isize + zero_len > offset)
431 zero_len = offset - isize; 431 zero_len = offset - isize;
432 error = xfs_iozero(ip, isize, zero_len); 432 error = xfs_iozero(ip, isize, zero_len);
433 433
434 xfs_ilock(ip, XFS_ILOCK_EXCL); 434 xfs_ilock(ip, XFS_ILOCK_EXCL);
435 ASSERT(error >= 0); 435 ASSERT(error >= 0);
436 return error; 436 return error;
437 } 437 }
438 438
439 /* 439 /*
440 * Zero any on disk space between the current EOF and the new, 440 * Zero any on disk space between the current EOF and the new,
441 * larger EOF. This handles the normal case of zeroing the remainder 441 * larger EOF. This handles the normal case of zeroing the remainder
442 * of the last block in the file and the unusual case of zeroing blocks 442 * of the last block in the file and the unusual case of zeroing blocks
443 * out beyond the size of the file. This second case only happens 443 * out beyond the size of the file. This second case only happens
444 * with fixed size extents and when the system crashes before the inode 444 * with fixed size extents and when the system crashes before the inode
445 * size was updated but after blocks were allocated. If fill is set, 445 * size was updated but after blocks were allocated. If fill is set,
446 * then any holes in the range are filled and zeroed. If not, the holes 446 * then any holes in the range are filled and zeroed. If not, the holes
447 * are left alone as holes. 447 * are left alone as holes.
448 */ 448 */
449 449
450 int /* error (positive) */ 450 int /* error (positive) */
451 xfs_zero_eof( 451 xfs_zero_eof(
452 xfs_inode_t *ip, 452 xfs_inode_t *ip,
453 xfs_off_t offset, /* starting I/O offset */ 453 xfs_off_t offset, /* starting I/O offset */
454 xfs_fsize_t isize) /* current inode size */ 454 xfs_fsize_t isize) /* current inode size */
455 { 455 {
456 xfs_mount_t *mp = ip->i_mount; 456 xfs_mount_t *mp = ip->i_mount;
457 xfs_fileoff_t start_zero_fsb; 457 xfs_fileoff_t start_zero_fsb;
458 xfs_fileoff_t end_zero_fsb; 458 xfs_fileoff_t end_zero_fsb;
459 xfs_fileoff_t zero_count_fsb; 459 xfs_fileoff_t zero_count_fsb;
460 xfs_fileoff_t last_fsb; 460 xfs_fileoff_t last_fsb;
461 xfs_fileoff_t zero_off; 461 xfs_fileoff_t zero_off;
462 xfs_fsize_t zero_len; 462 xfs_fsize_t zero_len;
463 int nimaps; 463 int nimaps;
464 int error = 0; 464 int error = 0;
465 xfs_bmbt_irec_t imap; 465 xfs_bmbt_irec_t imap;
466 466
467 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 467 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
468 ASSERT(offset > isize); 468 ASSERT(offset > isize);
469 469
470 /* 470 /*
471 * First handle zeroing the block on which isize resides. 471 * First handle zeroing the block on which isize resides.
472 * We only zero a part of that block so it is handled specially. 472 * We only zero a part of that block so it is handled specially.
473 */ 473 */
474 error = xfs_zero_last_block(ip, offset, isize); 474 error = xfs_zero_last_block(ip, offset, isize);
475 if (error) { 475 if (error) {
476 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 476 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
477 return error; 477 return error;
478 } 478 }
479 479
480 /* 480 /*
481 * Calculate the range between the new size and the old 481 * Calculate the range between the new size and the old
482 * where blocks needing to be zeroed may exist. To get the 482 * where blocks needing to be zeroed may exist. To get the
483 * block where the last byte in the file currently resides, 483 * block where the last byte in the file currently resides,
484 * we need to subtract one from the size and truncate back 484 * we need to subtract one from the size and truncate back
485 * to a block boundary. We subtract 1 in case the size is 485 * to a block boundary. We subtract 1 in case the size is
486 * exactly on a block boundary. 486 * exactly on a block boundary.
487 */ 487 */
488 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1; 488 last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
489 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize); 489 start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
490 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1); 490 end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
491 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb); 491 ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
492 if (last_fsb == end_zero_fsb) { 492 if (last_fsb == end_zero_fsb) {
493 /* 493 /*
494 * The size was only incremented on its last block. 494 * The size was only incremented on its last block.
495 * We took care of that above, so just return. 495 * We took care of that above, so just return.
496 */ 496 */
497 return 0; 497 return 0;
498 } 498 }
499 499
500 ASSERT(start_zero_fsb <= end_zero_fsb); 500 ASSERT(start_zero_fsb <= end_zero_fsb);
501 while (start_zero_fsb <= end_zero_fsb) { 501 while (start_zero_fsb <= end_zero_fsb) {
502 nimaps = 1; 502 nimaps = 1;
503 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1; 503 zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
504 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 504 error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
505 0, NULL, 0, &imap, &nimaps, NULL, NULL); 505 0, NULL, 0, &imap, &nimaps, NULL, NULL);
506 if (error) { 506 if (error) {
507 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); 507 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
508 return error; 508 return error;
509 } 509 }
510 ASSERT(nimaps > 0); 510 ASSERT(nimaps > 0);
511 511
512 if (imap.br_state == XFS_EXT_UNWRITTEN || 512 if (imap.br_state == XFS_EXT_UNWRITTEN ||
513 imap.br_startblock == HOLESTARTBLOCK) { 513 imap.br_startblock == HOLESTARTBLOCK) {
514 /* 514 /*
515 * This loop handles initializing pages that were 515 * This loop handles initializing pages that were
516 * partially initialized by the code below this 516 * partially initialized by the code below this
517 * loop. It basically zeroes the part of the page 517 * loop. It basically zeroes the part of the page
518 * that sits on a hole and sets the page as P_HOLE 518 * that sits on a hole and sets the page as P_HOLE
519 * and calls remapf if it is a mapped file. 519 * and calls remapf if it is a mapped file.
520 */ 520 */
521 start_zero_fsb = imap.br_startoff + imap.br_blockcount; 521 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
522 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 522 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
523 continue; 523 continue;
524 } 524 }
525 525
526 /* 526 /*
527 * There are blocks we need to zero. 527 * There are blocks we need to zero.
528 * Drop the inode lock while we're doing the I/O. 528 * Drop the inode lock while we're doing the I/O.
529 * We'll still have the iolock to protect us. 529 * We'll still have the iolock to protect us.
530 */ 530 */
531 xfs_iunlock(ip, XFS_ILOCK_EXCL); 531 xfs_iunlock(ip, XFS_ILOCK_EXCL);
532 532
533 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); 533 zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
534 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); 534 zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
535 535
536 if ((zero_off + zero_len) > offset) 536 if ((zero_off + zero_len) > offset)
537 zero_len = offset - zero_off; 537 zero_len = offset - zero_off;
538 538
539 error = xfs_iozero(ip, zero_off, zero_len); 539 error = xfs_iozero(ip, zero_off, zero_len);
540 if (error) { 540 if (error) {
541 goto out_lock; 541 goto out_lock;
542 } 542 }
543 543
544 start_zero_fsb = imap.br_startoff + imap.br_blockcount; 544 start_zero_fsb = imap.br_startoff + imap.br_blockcount;
545 ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); 545 ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
546 546
547 xfs_ilock(ip, XFS_ILOCK_EXCL); 547 xfs_ilock(ip, XFS_ILOCK_EXCL);
548 } 548 }
549 549
550 return 0; 550 return 0;
551 551
552 out_lock: 552 out_lock:
553 xfs_ilock(ip, XFS_ILOCK_EXCL); 553 xfs_ilock(ip, XFS_ILOCK_EXCL);
554 ASSERT(error >= 0); 554 ASSERT(error >= 0);
555 return error; 555 return error;
556 } 556 }
557 557
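To make the range calculation in xfs_zero_eof() concrete (assuming a 4 KiB filesystem block size): growing a file from isize = 10000 to offset = 50000 first has xfs_zero_last_block() zero the tail of block 2 (zero_offset = 1808, zero_len = 2288), then the loop zeroes whole blocks from start_zero_fsb = XFS_B_TO_FSB(mp, 10000) = 3 through end_zero_fsb = XFS_B_TO_FSBT(mp, 49999) = 12, skipping any mapping that comes back as a hole or an unwritten extent.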
558 ssize_t /* bytes written, or (-) error */ 558 ssize_t /* bytes written, or (-) error */
559 xfs_write( 559 xfs_write(
560 struct xfs_inode *xip, 560 struct xfs_inode *xip,
561 struct kiocb *iocb, 561 struct kiocb *iocb,
562 const struct iovec *iovp, 562 const struct iovec *iovp,
563 unsigned int nsegs, 563 unsigned int nsegs,
564 loff_t *offset, 564 loff_t *offset,
565 int ioflags) 565 int ioflags)
566 { 566 {
567 struct file *file = iocb->ki_filp; 567 struct file *file = iocb->ki_filp;
568 struct address_space *mapping = file->f_mapping; 568 struct address_space *mapping = file->f_mapping;
569 struct inode *inode = mapping->host; 569 struct inode *inode = mapping->host;
570 unsigned long segs = nsegs; 570 unsigned long segs = nsegs;
571 xfs_mount_t *mp; 571 xfs_mount_t *mp;
572 ssize_t ret = 0, error = 0; 572 ssize_t ret = 0, error = 0;
573 xfs_fsize_t isize, new_size; 573 xfs_fsize_t isize, new_size;
574 int iolock; 574 int iolock;
575 int eventsent = 0; 575 int eventsent = 0;
576 size_t ocount = 0, count; 576 size_t ocount = 0, count;
577 loff_t pos; 577 loff_t pos;
578 int need_i_mutex; 578 int need_i_mutex;
579 579
580 XFS_STATS_INC(xs_write_calls); 580 XFS_STATS_INC(xs_write_calls);
581 581
582 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ); 582 error = generic_segment_checks(iovp, &segs, &ocount, VERIFY_READ);
583 if (error) 583 if (error)
584 return error; 584 return error;
585 585
586 count = ocount; 586 count = ocount;
587 pos = *offset; 587 pos = *offset;
588 588
589 if (count == 0) 589 if (count == 0)
590 return 0; 590 return 0;
591 591
592 mp = xip->i_mount; 592 mp = xip->i_mount;
593 593
594 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE); 594 xfs_wait_for_freeze(mp, SB_FREEZE_WRITE);
595 595
596 if (XFS_FORCED_SHUTDOWN(mp)) 596 if (XFS_FORCED_SHUTDOWN(mp))
597 return -EIO; 597 return -EIO;
598 598
599 relock: 599 relock:
600 if (ioflags & IO_ISDIRECT) { 600 if (ioflags & IO_ISDIRECT) {
601 iolock = XFS_IOLOCK_SHARED; 601 iolock = XFS_IOLOCK_SHARED;
602 need_i_mutex = 0; 602 need_i_mutex = 0;
603 } else { 603 } else {
604 iolock = XFS_IOLOCK_EXCL; 604 iolock = XFS_IOLOCK_EXCL;
605 need_i_mutex = 1; 605 need_i_mutex = 1;
606 mutex_lock(&inode->i_mutex); 606 mutex_lock(&inode->i_mutex);
607 } 607 }
608 608
609 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); 609 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
610 610
611 start: 611 start:
612 error = -generic_write_checks(file, &pos, &count, 612 error = -generic_write_checks(file, &pos, &count,
613 S_ISBLK(inode->i_mode)); 613 S_ISBLK(inode->i_mode));
614 if (error) { 614 if (error) {
615 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 615 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
616 goto out_unlock_mutex; 616 goto out_unlock_mutex;
617 } 617 }
618 618
619 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) && 619 if ((DM_EVENT_ENABLED(xip, DM_EVENT_WRITE) &&
620 !(ioflags & IO_INVIS) && !eventsent)) { 620 !(ioflags & IO_INVIS) && !eventsent)) {
621 int dmflags = FILP_DELAY_FLAG(file); 621 int dmflags = FILP_DELAY_FLAG(file);
622 622
623 if (need_i_mutex) 623 if (need_i_mutex)
624 dmflags |= DM_FLAGS_IMUX; 624 dmflags |= DM_FLAGS_IMUX;
625 625
626 xfs_iunlock(xip, XFS_ILOCK_EXCL); 626 xfs_iunlock(xip, XFS_ILOCK_EXCL);
627 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip, 627 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
628 pos, count, dmflags, &iolock); 628 pos, count, dmflags, &iolock);
629 if (error) { 629 if (error) {
630 goto out_unlock_internal; 630 goto out_unlock_internal;
631 } 631 }
632 xfs_ilock(xip, XFS_ILOCK_EXCL); 632 xfs_ilock(xip, XFS_ILOCK_EXCL);
633 eventsent = 1; 633 eventsent = 1;
634 634
635 /* 635 /*
636 * The iolock was dropped and reacquired in XFS_SEND_DATA 636 * The iolock was dropped and reacquired in XFS_SEND_DATA
637 * so we have to recheck the size when appending. 637 * so we have to recheck the size when appending.
638 * We will only "goto start;" once, since having sent the 638 * We will only "goto start;" once, since having sent the
639 * event prevents another call to XFS_SEND_DATA, which is 639 * event prevents another call to XFS_SEND_DATA, which is
640 * what allows the size to change in the first place. 640 * what allows the size to change in the first place.
641 */ 641 */
642 if ((file->f_flags & O_APPEND) && pos != xip->i_size) 642 if ((file->f_flags & O_APPEND) && pos != xip->i_size)
643 goto start; 643 goto start;
644 } 644 }
645 645
646 if (ioflags & IO_ISDIRECT) { 646 if (ioflags & IO_ISDIRECT) {
647 xfs_buftarg_t *target = 647 xfs_buftarg_t *target =
648 XFS_IS_REALTIME_INODE(xip) ? 648 XFS_IS_REALTIME_INODE(xip) ?
649 mp->m_rtdev_targp : mp->m_ddev_targp; 649 mp->m_rtdev_targp : mp->m_ddev_targp;
650 650
651 if ((pos & target->bt_smask) || (count & target->bt_smask)) { 651 if ((pos & target->bt_smask) || (count & target->bt_smask)) {
652 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 652 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
653 return XFS_ERROR(-EINVAL); 653 return XFS_ERROR(-EINVAL);
654 } 654 }
655 655
656 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) { 656 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
657 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 657 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
658 iolock = XFS_IOLOCK_EXCL; 658 iolock = XFS_IOLOCK_EXCL;
659 need_i_mutex = 1; 659 need_i_mutex = 1;
660 mutex_lock(&inode->i_mutex); 660 mutex_lock(&inode->i_mutex);
661 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); 661 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
662 goto start; 662 goto start;
663 } 663 }
664 } 664 }
665 665
666 new_size = pos + count; 666 new_size = pos + count;
667 if (new_size > xip->i_size) 667 if (new_size > xip->i_size)
668 xip->i_new_size = new_size; 668 xip->i_new_size = new_size;
669 669
670 if (likely(!(ioflags & IO_INVIS))) 670 if (likely(!(ioflags & IO_INVIS)))
671 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 671 xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
672 672
673 /* 673 /*
674 * If the offset is beyond the size of the file, we have a couple 674 * If the offset is beyond the size of the file, we have a couple
675 * of things to do. First, if there is already space allocated 675 * of things to do. First, if there is already space allocated
676 * we need to either create holes or zero the disk or ... 676 * we need to either create holes or zero the disk or ...
677 * 677 *
678 * If there is a page where the previous size lands, we need 678 * If there is a page where the previous size lands, we need
679 * to zero it out up to the new size. 679 * to zero it out up to the new size.
680 */ 680 */
681 681
682 if (pos > xip->i_size) { 682 if (pos > xip->i_size) {
683 error = xfs_zero_eof(xip, pos, xip->i_size); 683 error = xfs_zero_eof(xip, pos, xip->i_size);
684 if (error) { 684 if (error) {
685 xfs_iunlock(xip, XFS_ILOCK_EXCL); 685 xfs_iunlock(xip, XFS_ILOCK_EXCL);
686 goto out_unlock_internal; 686 goto out_unlock_internal;
687 } 687 }
688 } 688 }
689 xfs_iunlock(xip, XFS_ILOCK_EXCL); 689 xfs_iunlock(xip, XFS_ILOCK_EXCL);
690 690
691 /* 691 /*
692 * If we're writing the file then make sure to clear the 692 * If we're writing the file then make sure to clear the
693 * setuid and setgid bits if the process is not being run 693 * setuid and setgid bits if the process is not being run
694 * by root. This keeps people from modifying setuid and 694 * by root. This keeps people from modifying setuid and
695 * setgid binaries. 695 * setgid binaries.
696 */ 696 */
697 697
698 if (((xip->i_d.di_mode & S_ISUID) || 698 if (((xip->i_d.di_mode & S_ISUID) ||
699 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == 699 ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) ==
700 (S_ISGID | S_IXGRP))) && 700 (S_ISGID | S_IXGRP))) &&
701 !capable(CAP_FSETID)) { 701 !capable(CAP_FSETID)) {
702 error = xfs_write_clear_setuid(xip); 702 error = xfs_write_clear_setuid(xip);
703 if (likely(!error)) 703 if (likely(!error))
704 error = -file_remove_suid(file); 704 error = -file_remove_suid(file);
705 if (unlikely(error)) { 705 if (unlikely(error)) {
706 goto out_unlock_internal; 706 goto out_unlock_internal;
707 } 707 }
708 } 708 }
709 709
710 retry: 710 retry:
711 /* We can write back this queue in page reclaim */ 711 /* We can write back this queue in page reclaim */
712 current->backing_dev_info = mapping->backing_dev_info; 712 current->backing_dev_info = mapping->backing_dev_info;
713 713
714 if ((ioflags & IO_ISDIRECT)) { 714 if ((ioflags & IO_ISDIRECT)) {
715 if (mapping->nrpages) { 715 if (mapping->nrpages) {
716 WARN_ON(need_i_mutex == 0); 716 WARN_ON(need_i_mutex == 0);
717 xfs_inval_cached_trace(xip, pos, -1, 717 xfs_inval_cached_trace(xip, pos, -1,
718 (pos & PAGE_CACHE_MASK), -1); 718 (pos & PAGE_CACHE_MASK), -1);
719 error = xfs_flushinval_pages(xip, 719 error = xfs_flushinval_pages(xip,
720 (pos & PAGE_CACHE_MASK), 720 (pos & PAGE_CACHE_MASK),
721 -1, FI_REMAPF_LOCKED); 721 -1, FI_REMAPF_LOCKED);
722 if (error) 722 if (error)
723 goto out_unlock_internal; 723 goto out_unlock_internal;
724 } 724 }
725 725
726 if (need_i_mutex) { 726 if (need_i_mutex) {
727 /* demote the lock now the cached pages are gone */ 727 /* demote the lock now the cached pages are gone */
728 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL); 728 xfs_ilock_demote(xip, XFS_IOLOCK_EXCL);
729 mutex_unlock(&inode->i_mutex); 729 mutex_unlock(&inode->i_mutex);
730 730
731 iolock = XFS_IOLOCK_SHARED; 731 iolock = XFS_IOLOCK_SHARED;
732 need_i_mutex = 0; 732 need_i_mutex = 0;
733 } 733 }
734 734
735 xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs, 735 xfs_rw_enter_trace(XFS_DIOWR_ENTER, xip, (void *)iovp, segs,
736 *offset, ioflags); 736 *offset, ioflags);
737 ret = generic_file_direct_write(iocb, iovp, 737 ret = generic_file_direct_write(iocb, iovp,
738 &segs, pos, offset, count, ocount); 738 &segs, pos, offset, count, ocount);
739 739
740 /* 740 /*
741 * direct-io write to a hole: fall through to buffered I/O 741 * direct-io write to a hole: fall through to buffered I/O
742 * for completing the rest of the request. 742 * for completing the rest of the request.
743 */ 743 */
744 if (ret >= 0 && ret != count) { 744 if (ret >= 0 && ret != count) {
745 XFS_STATS_ADD(xs_write_bytes, ret); 745 XFS_STATS_ADD(xs_write_bytes, ret);
746 746
747 pos += ret; 747 pos += ret;
748 count -= ret; 748 count -= ret;
749 749
750 ioflags &= ~IO_ISDIRECT; 750 ioflags &= ~IO_ISDIRECT;
751 xfs_iunlock(xip, iolock); 751 xfs_iunlock(xip, iolock);
752 goto relock; 752 goto relock;
753 } 753 }
754 } else { 754 } else {
755 xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs, 755 xfs_rw_enter_trace(XFS_WRITE_ENTER, xip, (void *)iovp, segs,
756 *offset, ioflags); 756 *offset, ioflags);
757 ret = generic_file_buffered_write(iocb, iovp, segs, 757 ret = generic_file_buffered_write(iocb, iovp, segs,
758 pos, offset, count, ret); 758 pos, offset, count, ret);
759 } 759 }
760 760
761 current->backing_dev_info = NULL; 761 current->backing_dev_info = NULL;
762 762
763 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO)) 763 if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
764 ret = wait_on_sync_kiocb(iocb); 764 ret = wait_on_sync_kiocb(iocb);
765 765
766 if (ret == -ENOSPC && 766 if (ret == -ENOSPC &&
767 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { 767 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
768 xfs_iunlock(xip, iolock); 768 xfs_iunlock(xip, iolock);
769 if (need_i_mutex) 769 if (need_i_mutex)
770 mutex_unlock(&inode->i_mutex); 770 mutex_unlock(&inode->i_mutex);
771 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip, 771 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
772 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL, 772 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
773 0, 0, 0); /* Delay flag intentionally unused */ 773 0, 0, 0); /* Delay flag intentionally unused */
774 if (need_i_mutex) 774 if (need_i_mutex)
775 mutex_lock(&inode->i_mutex); 775 mutex_lock(&inode->i_mutex);
776 xfs_ilock(xip, iolock); 776 xfs_ilock(xip, iolock);
777 if (error) 777 if (error)
778 goto out_unlock_internal; 778 goto out_unlock_internal;
779 pos = xip->i_size; 779 pos = xip->i_size;
780 ret = 0; 780 ret = 0;
781 goto retry; 781 goto retry;
782 } 782 }
783 783
784 isize = i_size_read(inode); 784 isize = i_size_read(inode);
785 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize)) 785 if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
786 *offset = isize; 786 *offset = isize;
787 787
788 if (*offset > xip->i_size) { 788 if (*offset > xip->i_size) {
789 xfs_ilock(xip, XFS_ILOCK_EXCL); 789 xfs_ilock(xip, XFS_ILOCK_EXCL);
790 if (*offset > xip->i_size) 790 if (*offset > xip->i_size)
791 xip->i_size = *offset; 791 xip->i_size = *offset;
792 xfs_iunlock(xip, XFS_ILOCK_EXCL); 792 xfs_iunlock(xip, XFS_ILOCK_EXCL);
793 } 793 }
794 794
795 error = -ret; 795 error = -ret;
796 if (ret <= 0) 796 if (ret <= 0)
797 goto out_unlock_internal; 797 goto out_unlock_internal;
798 798
799 XFS_STATS_ADD(xs_write_bytes, ret); 799 XFS_STATS_ADD(xs_write_bytes, ret);
800 800
801 /* Handle various SYNC-type writes */ 801 /* Handle various SYNC-type writes */
802 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 802 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
803 int error2; 803 int error2;
804 804
805 xfs_iunlock(xip, iolock); 805 xfs_iunlock(xip, iolock);
806 if (need_i_mutex) 806 if (need_i_mutex)
807 mutex_unlock(&inode->i_mutex); 807 mutex_unlock(&inode->i_mutex);
808 error2 = sync_page_range(inode, mapping, pos, ret); 808 error2 = sync_page_range(inode, mapping, pos, ret);
809 if (!error) 809 if (!error)
810 error = error2; 810 error = error2;
811 if (need_i_mutex) 811 if (need_i_mutex)
812 mutex_lock(&inode->i_mutex); 812 mutex_lock(&inode->i_mutex);
813 xfs_ilock(xip, iolock); 813 xfs_ilock(xip, iolock);
814 error2 = xfs_write_sync_logforce(mp, xip); 814 error2 = xfs_write_sync_logforce(mp, xip);
815 if (!error) 815 if (!error)
816 error = error2; 816 error = error2;
817 } 817 }
818 818
819 out_unlock_internal: 819 out_unlock_internal:
820 if (xip->i_new_size) { 820 if (xip->i_new_size) {
821 xfs_ilock(xip, XFS_ILOCK_EXCL); 821 xfs_ilock(xip, XFS_ILOCK_EXCL);
822 xip->i_new_size = 0; 822 xip->i_new_size = 0;
823 /* 823 /*
824 * If this was a direct or synchronous I/O that failed (such 824 * If this was a direct or synchronous I/O that failed (such
825 * as ENOSPC) then part of the I/O may have been written to 825 * as ENOSPC) then part of the I/O may have been written to
826 * disk before the error occurred. In this case the on-disk 826 * disk before the error occurred. In this case the on-disk
827 * file size may have been adjusted beyond the in-memory file 827 * file size may have been adjusted beyond the in-memory file
828 * size and now needs to be truncated back. 828 * size and now needs to be truncated back.
829 */ 829 */
830 if (xip->i_d.di_size > xip->i_size) 830 if (xip->i_d.di_size > xip->i_size)
831 xip->i_d.di_size = xip->i_size; 831 xip->i_d.di_size = xip->i_size;
832 xfs_iunlock(xip, XFS_ILOCK_EXCL); 832 xfs_iunlock(xip, XFS_ILOCK_EXCL);
833 } 833 }
834 xfs_iunlock(xip, iolock); 834 xfs_iunlock(xip, iolock);
835 out_unlock_mutex: 835 out_unlock_mutex:
836 if (need_i_mutex) 836 if (need_i_mutex)
837 mutex_unlock(&inode->i_mutex); 837 mutex_unlock(&inode->i_mutex);
838 return -error; 838 return -error;
839 } 839 }
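
The direct-I/O branch of xfs_write() above falls back to buffered I/O when generic_file_direct_write() stops short, typically because the write ran into a hole. Condensed to its essentials, with the XFS iolock/i_mutex handling stripped out, the control flow looks roughly like this (an illustrative sketch, not additional code from this commit):

	written = generic_file_direct_write(iocb, iovp, &segs, pos,
					offset, count, ocount);
	if (written >= 0 && written != count) {
		/* short direct write: finish the tail through the page cache */
		pos += written;
		count -= written;
		ioflags &= ~IO_ISDIRECT;
		written = generic_file_buffered_write(iocb, iovp, segs,
					pos, offset, count, written);
	}
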
840 840
841 /* 841 /*
842 * All xfs metadata buffers except log state machine buffers 842 * All xfs metadata buffers except log state machine buffers
843 * get this attached as their b_bdstrat callback function. 843 * get this attached as their b_bdstrat callback function.
844 * This is so that we can catch a buffer 844 * This is so that we can catch a buffer
845 * after prematurely unpinning it to forcibly shutdown the filesystem. 845 * after prematurely unpinning it to forcibly shutdown the filesystem.
846 */ 846 */
847 int 847 int
848 xfs_bdstrat_cb(struct xfs_buf *bp) 848 xfs_bdstrat_cb(struct xfs_buf *bp)
849 { 849 {
850 xfs_mount_t *mp; 850 if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
851
852 mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
853 if (!XFS_FORCED_SHUTDOWN(mp)) {
854 xfs_buf_iorequest(bp);
855 return 0;
856 } else {
857 xfs_buftrace("XFS__BDSTRAT IOERROR", bp); 851 xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
858 /* 852 /*
859 * Metadata write that didn't get logged but 853 * Metadata write that didn't get logged but
860 * written delayed anyway. These aren't associated 854 * written delayed anyway. These aren't associated
861 * with a transaction, and can be ignored. 855 * with a transaction, and can be ignored.
862 */ 856 */
863 if (XFS_BUF_IODONE_FUNC(bp) == NULL && 857 if (XFS_BUF_IODONE_FUNC(bp) == NULL &&
864 (XFS_BUF_ISREAD(bp)) == 0) 858 (XFS_BUF_ISREAD(bp)) == 0)
865 return (xfs_bioerror_relse(bp)); 859 return (xfs_bioerror_relse(bp));
866 else 860 else
867 return (xfs_bioerror(bp)); 861 return (xfs_bioerror(bp));
868 } 862 }
863
864 xfs_buf_iorequest(bp);
865 return 0;
869 } 866 }
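
The hunk above is the heart of this change for xfs_bdstrat_cb(): the old code pulled the mount pointer out of the opaque third private slot through XFS_BUF_FSPRIVATE3(), while the new code dereferences the typed bp->b_mount field directly and handles the forced-shutdown case first. A minimal before/after sketch of that access pattern, with the shutdown error handling simplified to a single call (all names are taken from the diff itself):

	/* before: mount pointer hidden behind an untyped accessor */
	xfs_mount_t	*mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
	if (XFS_FORCED_SHUTDOWN(mp))
		return xfs_bioerror(bp);

	/* after: properly typed back-pointer, set up in xfs_buf_item_init() */
	if (XFS_FORCED_SHUTDOWN(bp->b_mount))
		return xfs_bioerror(bp);
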
870 867
871 /* 868 /*
872 * Wrapper around bdstrat so that we can stop data from going to disk in case 869 * Wrapper around bdstrat so that we can stop data from going to disk in case
873 * we are shutting down the filesystem. Typically user data goes thru this 870 * we are shutting down the filesystem. Typically user data goes thru this
874 * path; one of the exceptions is the superblock. 871 * path; one of the exceptions is the superblock.
875 */ 872 */
876 void 873 void
877 xfsbdstrat( 874 xfsbdstrat(
878 struct xfs_mount *mp, 875 struct xfs_mount *mp,
879 struct xfs_buf *bp) 876 struct xfs_buf *bp)
880 { 877 {
881 ASSERT(mp); 878 ASSERT(mp);
882 if (!XFS_FORCED_SHUTDOWN(mp)) { 879 if (!XFS_FORCED_SHUTDOWN(mp)) {
883 xfs_buf_iorequest(bp); 880 xfs_buf_iorequest(bp);
884 return; 881 return;
885 } 882 }
886 883
887 xfs_buftrace("XFSBDSTRAT IOERROR", bp); 884 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
888 xfs_bioerror_relse(bp); 885 xfs_bioerror_relse(bp);
889 } 886 }
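
xfsbdstrat() is the caller-supplied-mount variant of the same shutdown check: callers that already hold an xfs_mount pointer pass it explicitly instead of relying on the buffer's back-pointer. A typical synchronous use looks roughly like the following sketch; the surrounding XFS_BUF_READ()/xfs_iowait() calls reflect how this era's log-recovery code drives it and should be read as an assumption, not as part of this commit:

	XFS_BUF_READ(bp);
	xfsbdstrat(mp, bp);		/* no-op turned into an error on shutdown */
	error = xfs_iowait(bp);		/* wait for the I/O to complete */
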
890 887
891 /* 888 /*
892 * If the underlying (data/log/rt) device is readonly, there are some 889 * If the underlying (data/log/rt) device is readonly, there are some
893 * operations that cannot proceed. 890 * operations that cannot proceed.
894 */ 891 */
895 int 892 int
896 xfs_dev_is_read_only( 893 xfs_dev_is_read_only(
897 xfs_mount_t *mp, 894 xfs_mount_t *mp,
898 char *message) 895 char *message)
899 { 896 {
900 if (xfs_readonly_buftarg(mp->m_ddev_targp) || 897 if (xfs_readonly_buftarg(mp->m_ddev_targp) ||
901 xfs_readonly_buftarg(mp->m_logdev_targp) || 898 xfs_readonly_buftarg(mp->m_logdev_targp) ||
902 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) { 899 (mp->m_rtdev_targp && xfs_readonly_buftarg(mp->m_rtdev_targp))) {
903 cmn_err(CE_NOTE, 900 cmn_err(CE_NOTE,
904 "XFS: %s required on read-only device.", message); 901 "XFS: %s required on read-only device.", message);
905 cmn_err(CE_NOTE, 902 cmn_err(CE_NOTE,
906 "XFS: write access unavailable, cannot proceed."); 903 "XFS: write access unavailable, cannot proceed.");
907 return EROFS; 904 return EROFS;
908 } 905 }
fs/xfs/xfs_buf_item.c
1 /* 1 /*
2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dmapi.h" 27 #include "xfs_dmapi.h"
28 #include "xfs_mount.h" 28 #include "xfs_mount.h"
29 #include "xfs_buf_item.h" 29 #include "xfs_buf_item.h"
30 #include "xfs_trans_priv.h" 30 #include "xfs_trans_priv.h"
31 #include "xfs_error.h" 31 #include "xfs_error.h"
32 32
33 33
34 kmem_zone_t *xfs_buf_item_zone; 34 kmem_zone_t *xfs_buf_item_zone;
35 35
36 #ifdef XFS_TRANS_DEBUG 36 #ifdef XFS_TRANS_DEBUG
37 /* 37 /*
38 * This function uses an alternate strategy for tracking the bytes 38 * This function uses an alternate strategy for tracking the bytes
39 * that the user requests to be logged. This can then be used 39 * that the user requests to be logged. This can then be used
40 * in conjunction with the bli_orig array in the buf log item to 40 * in conjunction with the bli_orig array in the buf log item to
41 * catch bugs in our callers' code. 41 * catch bugs in our callers' code.
42 * 42 *
43 * We also double check the bits set in xfs_buf_item_log using a 43 * We also double check the bits set in xfs_buf_item_log using a
44 * simple algorithm to check that every byte is accounted for. 44 * simple algorithm to check that every byte is accounted for.
45 */ 45 */
46 STATIC void 46 STATIC void
47 xfs_buf_item_log_debug( 47 xfs_buf_item_log_debug(
48 xfs_buf_log_item_t *bip, 48 xfs_buf_log_item_t *bip,
49 uint first, 49 uint first,
50 uint last) 50 uint last)
51 { 51 {
52 uint x; 52 uint x;
53 uint byte; 53 uint byte;
54 uint nbytes; 54 uint nbytes;
55 uint chunk_num; 55 uint chunk_num;
56 uint word_num; 56 uint word_num;
57 uint bit_num; 57 uint bit_num;
58 uint bit_set; 58 uint bit_set;
59 uint *wordp; 59 uint *wordp;
60 60
61 ASSERT(bip->bli_logged != NULL); 61 ASSERT(bip->bli_logged != NULL);
62 byte = first; 62 byte = first;
63 nbytes = last - first + 1; 63 nbytes = last - first + 1;
64 bfset(bip->bli_logged, first, nbytes); 64 bfset(bip->bli_logged, first, nbytes);
65 for (x = 0; x < nbytes; x++) { 65 for (x = 0; x < nbytes; x++) {
66 chunk_num = byte >> XFS_BLI_SHIFT; 66 chunk_num = byte >> XFS_BLI_SHIFT;
67 word_num = chunk_num >> BIT_TO_WORD_SHIFT; 67 word_num = chunk_num >> BIT_TO_WORD_SHIFT;
68 bit_num = chunk_num & (NBWORD - 1); 68 bit_num = chunk_num & (NBWORD - 1);
69 wordp = &(bip->bli_format.blf_data_map[word_num]); 69 wordp = &(bip->bli_format.blf_data_map[word_num]);
70 bit_set = *wordp & (1 << bit_num); 70 bit_set = *wordp & (1 << bit_num);
71 ASSERT(bit_set); 71 ASSERT(bit_set);
72 byte++; 72 byte++;
73 } 73 }
74 } 74 }
75 75
76 /* 76 /*
77 * This function is called when we flush something into a buffer without 77 * This function is called when we flush something into a buffer without
78 * logging it. This happens for things like inodes which are logged 78 * logging it. This happens for things like inodes which are logged
79 * separately from the buffer. 79 * separately from the buffer.
80 */ 80 */
81 void 81 void
82 xfs_buf_item_flush_log_debug( 82 xfs_buf_item_flush_log_debug(
83 xfs_buf_t *bp, 83 xfs_buf_t *bp,
84 uint first, 84 uint first,
85 uint last) 85 uint last)
86 { 86 {
87 xfs_buf_log_item_t *bip; 87 xfs_buf_log_item_t *bip;
88 uint nbytes; 88 uint nbytes;
89 89
90 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); 90 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
91 if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) { 91 if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
92 return; 92 return;
93 } 93 }
94 94
95 ASSERT(bip->bli_logged != NULL); 95 ASSERT(bip->bli_logged != NULL);
96 nbytes = last - first + 1; 96 nbytes = last - first + 1;
97 bfset(bip->bli_logged, first, nbytes); 97 bfset(bip->bli_logged, first, nbytes);
98 } 98 }
99 99
100 /* 100 /*
101 * This function is called to verify that our callers have logged 101 * This function is called to verify that our callers have logged
102 * all the bytes that they changed. 102 * all the bytes that they changed.
103 * 103 *
104 * It does this by comparing the original copy of the buffer stored in 104 * It does this by comparing the original copy of the buffer stored in
105 * the buf log item's bli_orig array to the current copy of the buffer 105 * the buf log item's bli_orig array to the current copy of the buffer
106 * and ensuring that all bytes which mismatch are set in the bli_logged 106 * and ensuring that all bytes which mismatch are set in the bli_logged
107 * array of the buf log item. 107 * array of the buf log item.
108 */ 108 */
109 STATIC void 109 STATIC void
110 xfs_buf_item_log_check( 110 xfs_buf_item_log_check(
111 xfs_buf_log_item_t *bip) 111 xfs_buf_log_item_t *bip)
112 { 112 {
113 char *orig; 113 char *orig;
114 char *buffer; 114 char *buffer;
115 int x; 115 int x;
116 xfs_buf_t *bp; 116 xfs_buf_t *bp;
117 117
118 ASSERT(bip->bli_orig != NULL); 118 ASSERT(bip->bli_orig != NULL);
119 ASSERT(bip->bli_logged != NULL); 119 ASSERT(bip->bli_logged != NULL);
120 120
121 bp = bip->bli_buf; 121 bp = bip->bli_buf;
122 ASSERT(XFS_BUF_COUNT(bp) > 0); 122 ASSERT(XFS_BUF_COUNT(bp) > 0);
123 ASSERT(XFS_BUF_PTR(bp) != NULL); 123 ASSERT(XFS_BUF_PTR(bp) != NULL);
124 orig = bip->bli_orig; 124 orig = bip->bli_orig;
125 buffer = XFS_BUF_PTR(bp); 125 buffer = XFS_BUF_PTR(bp);
126 for (x = 0; x < XFS_BUF_COUNT(bp); x++) { 126 for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
127 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) 127 if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
128 cmn_err(CE_PANIC, 128 cmn_err(CE_PANIC,
129 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d", 129 "xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
130 bip, bp, orig, x); 130 bip, bp, orig, x);
131 } 131 }
132 } 132 }
133 #else 133 #else
134 #define xfs_buf_item_log_debug(x,y,z) 134 #define xfs_buf_item_log_debug(x,y,z)
135 #define xfs_buf_item_log_check(x) 135 #define xfs_buf_item_log_check(x)
136 #endif 136 #endif
137 137
138 STATIC void xfs_buf_error_relse(xfs_buf_t *bp); 138 STATIC void xfs_buf_error_relse(xfs_buf_t *bp);
139 STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); 139 STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
140 140
141 /* 141 /*
142 * This returns the number of log iovecs needed to log the 142 * This returns the number of log iovecs needed to log the
143 * given buf log item. 143 * given buf log item.
144 * 144 *
145 * It calculates this as 1 iovec for the buf log format structure 145 * It calculates this as 1 iovec for the buf log format structure
146 * and 1 for each stretch of non-contiguous chunks to be logged. 146 * and 1 for each stretch of non-contiguous chunks to be logged.
147 * Contiguous chunks are logged in a single iovec. 147 * Contiguous chunks are logged in a single iovec.
148 * 148 *
149 * If the XFS_BLI_STALE flag has been set, then log nothing. 149 * If the XFS_BLI_STALE flag has been set, then log nothing.
150 */ 150 */
151 STATIC uint 151 STATIC uint
152 xfs_buf_item_size( 152 xfs_buf_item_size(
153 xfs_buf_log_item_t *bip) 153 xfs_buf_log_item_t *bip)
154 { 154 {
155 uint nvecs; 155 uint nvecs;
156 int next_bit; 156 int next_bit;
157 int last_bit; 157 int last_bit;
158 xfs_buf_t *bp; 158 xfs_buf_t *bp;
159 159
160 ASSERT(atomic_read(&bip->bli_refcount) > 0); 160 ASSERT(atomic_read(&bip->bli_refcount) > 0);
161 if (bip->bli_flags & XFS_BLI_STALE) { 161 if (bip->bli_flags & XFS_BLI_STALE) {
162 /* 162 /*
163 * The buffer is stale, so all we need to log 163 * The buffer is stale, so all we need to log
164 * is the buf log format structure with the 164 * is the buf log format structure with the
165 * cancel flag in it. 165 * cancel flag in it.
166 */ 166 */
167 xfs_buf_item_trace("SIZE STALE", bip); 167 xfs_buf_item_trace("SIZE STALE", bip);
168 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 168 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
169 return 1; 169 return 1;
170 } 170 }
171 171
172 bp = bip->bli_buf; 172 bp = bip->bli_buf;
173 ASSERT(bip->bli_flags & XFS_BLI_LOGGED); 173 ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
174 nvecs = 1; 174 nvecs = 1;
175 last_bit = xfs_next_bit(bip->bli_format.blf_data_map, 175 last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
176 bip->bli_format.blf_map_size, 0); 176 bip->bli_format.blf_map_size, 0);
177 ASSERT(last_bit != -1); 177 ASSERT(last_bit != -1);
178 nvecs++; 178 nvecs++;
179 while (last_bit != -1) { 179 while (last_bit != -1) {
180 /* 180 /*
181 * This takes the bit number to start looking from and 181 * This takes the bit number to start looking from and
182 * returns the next set bit from there. It returns -1 182 * returns the next set bit from there. It returns -1
183 * if there are no more bits set or the start bit is 183 * if there are no more bits set or the start bit is
184 * beyond the end of the bitmap. 184 * beyond the end of the bitmap.
185 */ 185 */
186 next_bit = xfs_next_bit(bip->bli_format.blf_data_map, 186 next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
187 bip->bli_format.blf_map_size, 187 bip->bli_format.blf_map_size,
188 last_bit + 1); 188 last_bit + 1);
189 /* 189 /*
190 * If we run out of bits, leave the loop, 190 * If we run out of bits, leave the loop,
191 * else if we find a new set of bits bump the number of vecs, 191 * else if we find a new set of bits bump the number of vecs,
192 * else keep scanning the current set of bits. 192 * else keep scanning the current set of bits.
193 */ 193 */
194 if (next_bit == -1) { 194 if (next_bit == -1) {
195 last_bit = -1; 195 last_bit = -1;
196 } else if (next_bit != last_bit + 1) { 196 } else if (next_bit != last_bit + 1) {
197 last_bit = next_bit; 197 last_bit = next_bit;
198 nvecs++; 198 nvecs++;
199 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) != 199 } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
200 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) + 200 (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
201 XFS_BLI_CHUNK)) { 201 XFS_BLI_CHUNK)) {
202 last_bit = next_bit; 202 last_bit = next_bit;
203 nvecs++; 203 nvecs++;
204 } else { 204 } else {
205 last_bit++; 205 last_bit++;
206 } 206 }
207 } 207 }
208 208
209 xfs_buf_item_trace("SIZE NORM", bip); 209 xfs_buf_item_trace("SIZE NORM", bip);
210 return nvecs; 210 return nvecs;
211 } 211 }
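
The loop above counts one iovec per run of contiguous set bits in blf_data_map, plus one for the buf log format header, breaking a run when the logged chunks are not physically contiguous within the buffer. A stand-alone sketch of the same counting rule, with the buffer-offset contiguity check omitted (illustrative only, not kernel code):

	static int
	count_contiguous_runs(const unsigned int *map, int nbits)
	{
		int	i, runs = 0, in_run = 0;

		for (i = 0; i < nbits; i++) {
			int set = (map[i / 32] >> (i % 32)) & 1;

			if (set && !in_run)
				runs++;		/* first bit of a new run -> new iovec */
			in_run = set;
		}
		return runs;		/* xfs_buf_item_size() reports this + 1 */
	}
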
212 212
213 /* 213 /*
214 * This is called to fill in the vector of log iovecs for the 214 * This is called to fill in the vector of log iovecs for the
215 * given log buf item. It fills the first entry with a buf log 215 * given log buf item. It fills the first entry with a buf log
216 * format structure, and the rest point to contiguous chunks 216 * format structure, and the rest point to contiguous chunks
217 * within the buffer. 217 * within the buffer.
218 */ 218 */
219 STATIC void 219 STATIC void
220 xfs_buf_item_format( 220 xfs_buf_item_format(
221 xfs_buf_log_item_t *bip, 221 xfs_buf_log_item_t *bip,
222 xfs_log_iovec_t *log_vector) 222 xfs_log_iovec_t *log_vector)
223 { 223 {
224 uint base_size; 224 uint base_size;
225 uint nvecs; 225 uint nvecs;
226 xfs_log_iovec_t *vecp; 226 xfs_log_iovec_t *vecp;
227 xfs_buf_t *bp; 227 xfs_buf_t *bp;
228 int first_bit; 228 int first_bit;
229 int last_bit; 229 int last_bit;
230 int next_bit; 230 int next_bit;
231 uint nbits; 231 uint nbits;
232 uint buffer_offset; 232 uint buffer_offset;
233 233
234 ASSERT(atomic_read(&bip->bli_refcount) > 0); 234 ASSERT(atomic_read(&bip->bli_refcount) > 0);
235 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 235 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
236 (bip->bli_flags & XFS_BLI_STALE)); 236 (bip->bli_flags & XFS_BLI_STALE));
237 bp = bip->bli_buf; 237 bp = bip->bli_buf;
238 vecp = log_vector; 238 vecp = log_vector;
239 239
240 /* 240 /*
241 * The size of the base structure is the size of the 241 * The size of the base structure is the size of the
242 * declared structure plus the space for the extra words 242 * declared structure plus the space for the extra words
243 * of the bitmap. We subtract one from the map size, because 243 * of the bitmap. We subtract one from the map size, because
244 * the first element of the bitmap is accounted for in the 244 * the first element of the bitmap is accounted for in the
245 * size of the base structure. 245 * size of the base structure.
246 */ 246 */
247 base_size = 247 base_size =
248 (uint)(sizeof(xfs_buf_log_format_t) + 248 (uint)(sizeof(xfs_buf_log_format_t) +
249 ((bip->bli_format.blf_map_size - 1) * sizeof(uint))); 249 ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
250 vecp->i_addr = (xfs_caddr_t)&bip->bli_format; 250 vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
251 vecp->i_len = base_size; 251 vecp->i_len = base_size;
252 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT); 252 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
253 vecp++; 253 vecp++;
254 nvecs = 1; 254 nvecs = 1;
255 255
256 if (bip->bli_flags & XFS_BLI_STALE) { 256 if (bip->bli_flags & XFS_BLI_STALE) {
257 /* 257 /*
258 * The buffer is stale, so all we need to log 258 * The buffer is stale, so all we need to log
259 * is the buf log format structure with the 259 * is the buf log format structure with the
260 * cancel flag in it. 260 * cancel flag in it.
261 */ 261 */
262 xfs_buf_item_trace("FORMAT STALE", bip); 262 xfs_buf_item_trace("FORMAT STALE", bip);
263 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 263 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
264 bip->bli_format.blf_size = nvecs; 264 bip->bli_format.blf_size = nvecs;
265 return; 265 return;
266 } 266 }
267 267
268 /* 268 /*
269 * Fill in an iovec for each set of contiguous chunks. 269 * Fill in an iovec for each set of contiguous chunks.
270 */ 270 */
271 first_bit = xfs_next_bit(bip->bli_format.blf_data_map, 271 first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
272 bip->bli_format.blf_map_size, 0); 272 bip->bli_format.blf_map_size, 0);
273 ASSERT(first_bit != -1); 273 ASSERT(first_bit != -1);
274 last_bit = first_bit; 274 last_bit = first_bit;
275 nbits = 1; 275 nbits = 1;
276 for (;;) { 276 for (;;) {
277 /* 277 /*
278 * This takes the bit number to start looking from and 278 * This takes the bit number to start looking from and
279 * returns the next set bit from there. It returns -1 279 * returns the next set bit from there. It returns -1
280 * if there are no more bits set or the start bit is 280 * if there are no more bits set or the start bit is
281 * beyond the end of the bitmap. 281 * beyond the end of the bitmap.
282 */ 282 */
283 next_bit = xfs_next_bit(bip->bli_format.blf_data_map, 283 next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
284 bip->bli_format.blf_map_size, 284 bip->bli_format.blf_map_size,
285 (uint)last_bit + 1); 285 (uint)last_bit + 1);
286 /* 286 /*
287 * If we run out of bits fill in the last iovec and get 287 * If we run out of bits fill in the last iovec and get
288 * out of the loop. 288 * out of the loop.
289 * Else if we start a new set of bits then fill in the 289 * Else if we start a new set of bits then fill in the
290 * iovec for the series we were looking at and start 290 * iovec for the series we were looking at and start
291 * counting the bits in the new one. 291 * counting the bits in the new one.
292 * Else we're still in the same set of bits so just 292 * Else we're still in the same set of bits so just
293 * keep counting and scanning. 293 * keep counting and scanning.
294 */ 294 */
295 if (next_bit == -1) { 295 if (next_bit == -1) {
296 buffer_offset = first_bit * XFS_BLI_CHUNK; 296 buffer_offset = first_bit * XFS_BLI_CHUNK;
297 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 297 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
298 vecp->i_len = nbits * XFS_BLI_CHUNK; 298 vecp->i_len = nbits * XFS_BLI_CHUNK;
299 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 299 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
300 nvecs++; 300 nvecs++;
301 break; 301 break;
302 } else if (next_bit != last_bit + 1) { 302 } else if (next_bit != last_bit + 1) {
303 buffer_offset = first_bit * XFS_BLI_CHUNK; 303 buffer_offset = first_bit * XFS_BLI_CHUNK;
304 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 304 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
305 vecp->i_len = nbits * XFS_BLI_CHUNK; 305 vecp->i_len = nbits * XFS_BLI_CHUNK;
306 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 306 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
307 nvecs++; 307 nvecs++;
308 vecp++; 308 vecp++;
309 first_bit = next_bit; 309 first_bit = next_bit;
310 last_bit = next_bit; 310 last_bit = next_bit;
311 nbits = 1; 311 nbits = 1;
312 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) != 312 } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
313 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) + 313 (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
314 XFS_BLI_CHUNK)) { 314 XFS_BLI_CHUNK)) {
315 buffer_offset = first_bit * XFS_BLI_CHUNK; 315 buffer_offset = first_bit * XFS_BLI_CHUNK;
316 vecp->i_addr = xfs_buf_offset(bp, buffer_offset); 316 vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
317 vecp->i_len = nbits * XFS_BLI_CHUNK; 317 vecp->i_len = nbits * XFS_BLI_CHUNK;
318 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK); 318 XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
319 /* You would think we need to bump the nvecs here too, but we do not 319 /* You would think we need to bump the nvecs here too, but we do not
320 * this number is used by recovery, and it gets confused by the boundary 320 * this number is used by recovery, and it gets confused by the boundary
321 * split here 321 * split here
322 * nvecs++; 322 * nvecs++;
323 */ 323 */
324 vecp++; 324 vecp++;
325 first_bit = next_bit; 325 first_bit = next_bit;
326 last_bit = next_bit; 326 last_bit = next_bit;
327 nbits = 1; 327 nbits = 1;
328 } else { 328 } else {
329 last_bit++; 329 last_bit++;
330 nbits++; 330 nbits++;
331 } 331 }
332 } 332 }
333 bip->bli_format.blf_size = nvecs; 333 bip->bli_format.blf_size = nvecs;
334 334
335 /* 335 /*
336 * Check to make sure everything is consistent. 336 * Check to make sure everything is consistent.
337 */ 337 */
338 xfs_buf_item_trace("FORMAT NORM", bip); 338 xfs_buf_item_trace("FORMAT NORM", bip);
339 xfs_buf_item_log_check(bip); 339 xfs_buf_item_log_check(bip);
340 } 340 }
341 341
342 /* 342 /*
343 * This is called to pin the buffer associated with the buf log 343 * This is called to pin the buffer associated with the buf log
344 * item in memory so it cannot be written out. Simply call bpin() 344 * item in memory so it cannot be written out. Simply call bpin()
345 * on the buffer to do this. 345 * on the buffer to do this.
346 */ 346 */
347 STATIC void 347 STATIC void
348 xfs_buf_item_pin( 348 xfs_buf_item_pin(
349 xfs_buf_log_item_t *bip) 349 xfs_buf_log_item_t *bip)
350 { 350 {
351 xfs_buf_t *bp; 351 xfs_buf_t *bp;
352 352
353 bp = bip->bli_buf; 353 bp = bip->bli_buf;
354 ASSERT(XFS_BUF_ISBUSY(bp)); 354 ASSERT(XFS_BUF_ISBUSY(bp));
355 ASSERT(atomic_read(&bip->bli_refcount) > 0); 355 ASSERT(atomic_read(&bip->bli_refcount) > 0);
356 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) || 356 ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
357 (bip->bli_flags & XFS_BLI_STALE)); 357 (bip->bli_flags & XFS_BLI_STALE));
358 xfs_buf_item_trace("PIN", bip); 358 xfs_buf_item_trace("PIN", bip);
359 xfs_buftrace("XFS_PIN", bp); 359 xfs_buftrace("XFS_PIN", bp);
360 xfs_bpin(bp); 360 xfs_bpin(bp);
361 } 361 }
362 362
363 363
364 /* 364 /*
365 * This is called to unpin the buffer associated with the buf log 365 * This is called to unpin the buffer associated with the buf log
366 * item which was previously pinned with a call to xfs_buf_item_pin(). 366 * item which was previously pinned with a call to xfs_buf_item_pin().
367 * Just call bunpin() on the buffer to do this. 367 * Just call bunpin() on the buffer to do this.
368 * 368 *
369 * Also drop the reference to the buf item for the current transaction. 369 * Also drop the reference to the buf item for the current transaction.
370 * If the XFS_BLI_STALE flag is set and we are the last reference, 370 * If the XFS_BLI_STALE flag is set and we are the last reference,
371 * then free up the buf log item and unlock the buffer. 371 * then free up the buf log item and unlock the buffer.
372 */ 372 */
373 STATIC void 373 STATIC void
374 xfs_buf_item_unpin( 374 xfs_buf_item_unpin(
375 xfs_buf_log_item_t *bip, 375 xfs_buf_log_item_t *bip,
376 int stale) 376 int stale)
377 { 377 {
378 struct xfs_ail *ailp; 378 struct xfs_ail *ailp;
379 xfs_buf_t *bp; 379 xfs_buf_t *bp;
380 int freed; 380 int freed;
381 381
382 bp = bip->bli_buf; 382 bp = bip->bli_buf;
383 ASSERT(bp != NULL); 383 ASSERT(bp != NULL);
384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip); 384 ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
385 ASSERT(atomic_read(&bip->bli_refcount) > 0); 385 ASSERT(atomic_read(&bip->bli_refcount) > 0);
386 xfs_buf_item_trace("UNPIN", bip); 386 xfs_buf_item_trace("UNPIN", bip);
387 xfs_buftrace("XFS_UNPIN", bp); 387 xfs_buftrace("XFS_UNPIN", bp);
388 388
389 freed = atomic_dec_and_test(&bip->bli_refcount); 389 freed = atomic_dec_and_test(&bip->bli_refcount);
390 ailp = bip->bli_item.li_ailp; 390 ailp = bip->bli_item.li_ailp;
391 xfs_bunpin(bp); 391 xfs_bunpin(bp);
392 if (freed && stale) { 392 if (freed && stale) {
393 ASSERT(bip->bli_flags & XFS_BLI_STALE); 393 ASSERT(bip->bli_flags & XFS_BLI_STALE);
394 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 394 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
395 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 395 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
396 ASSERT(XFS_BUF_ISSTALE(bp)); 396 ASSERT(XFS_BUF_ISSTALE(bp));
397 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 397 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
398 xfs_buf_item_trace("UNPIN STALE", bip); 398 xfs_buf_item_trace("UNPIN STALE", bip);
399 xfs_buftrace("XFS_UNPIN STALE", bp); 399 xfs_buftrace("XFS_UNPIN STALE", bp);
400 /* 400 /*
401 * If we get called here because of an IO error, we may 401 * If we get called here because of an IO error, we may
402 * or may not have the item on the AIL. xfs_trans_ail_delete() 402 * or may not have the item on the AIL. xfs_trans_ail_delete()
403 * will take care of that situation. 403 * will take care of that situation.
404 * xfs_trans_ail_delete() drops the AIL lock. 404 * xfs_trans_ail_delete() drops the AIL lock.
405 */ 405 */
406 if (bip->bli_flags & XFS_BLI_STALE_INODE) { 406 if (bip->bli_flags & XFS_BLI_STALE_INODE) {
407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); 407 xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
408 XFS_BUF_SET_FSPRIVATE(bp, NULL); 408 XFS_BUF_SET_FSPRIVATE(bp, NULL);
409 XFS_BUF_CLR_IODONE_FUNC(bp); 409 XFS_BUF_CLR_IODONE_FUNC(bp);
410 } else { 410 } else {
411 spin_lock(&ailp->xa_lock); 411 spin_lock(&ailp->xa_lock);
412 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 412 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
413 xfs_buf_item_relse(bp); 413 xfs_buf_item_relse(bp);
414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); 414 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
415 } 415 }
416 xfs_buf_relse(bp); 416 xfs_buf_relse(bp);
417 } 417 }
418 } 418 }
419 419
420 /* 420 /*
421 * this is called from uncommit in the forced-shutdown path. 421 * this is called from uncommit in the forced-shutdown path.
422 * we need to check to see if the reference count on the log item 422 * we need to check to see if the reference count on the log item
423 * is going to drop to zero. If so, unpin will free the log item 423 * is going to drop to zero. If so, unpin will free the log item
424 * so we need to free the item's descriptor (that points to the item) 424 * so we need to free the item's descriptor (that points to the item)
425 * in the transaction. 425 * in the transaction.
426 */ 426 */
427 STATIC void 427 STATIC void
428 xfs_buf_item_unpin_remove( 428 xfs_buf_item_unpin_remove(
429 xfs_buf_log_item_t *bip, 429 xfs_buf_log_item_t *bip,
430 xfs_trans_t *tp) 430 xfs_trans_t *tp)
431 { 431 {
432 xfs_buf_t *bp; 432 xfs_buf_t *bp;
433 xfs_log_item_desc_t *lidp; 433 xfs_log_item_desc_t *lidp;
434 int stale = 0; 434 int stale = 0;
435 435
436 bp = bip->bli_buf; 436 bp = bip->bli_buf;
437 /* 437 /*
438 * will xfs_buf_item_unpin() call xfs_buf_item_relse()? 438 * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
439 */ 439 */
440 if ((atomic_read(&bip->bli_refcount) == 1) && 440 if ((atomic_read(&bip->bli_refcount) == 1) &&
441 (bip->bli_flags & XFS_BLI_STALE)) { 441 (bip->bli_flags & XFS_BLI_STALE)) {
442 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0); 442 ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
443 xfs_buf_item_trace("UNPIN REMOVE", bip); 443 xfs_buf_item_trace("UNPIN REMOVE", bip);
444 xfs_buftrace("XFS_UNPIN_REMOVE", bp); 444 xfs_buftrace("XFS_UNPIN_REMOVE", bp);
445 /* 445 /*
446 * yes -- clear the xaction descriptor in-use flag 446 * yes -- clear the xaction descriptor in-use flag
447 * and free the chunk if required. We can safely 447 * and free the chunk if required. We can safely
448 * do some work here and then call buf_item_unpin 448 * do some work here and then call buf_item_unpin
449 * to do the rest because if the if is true, then 449 * to do the rest because if the if is true, then
450 * we are holding the buffer locked so no one else 450 * we are holding the buffer locked so no one else
451 * will be able to bump up the refcount. 451 * will be able to bump up the refcount.
452 */ 452 */
453 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip); 453 lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
454 stale = lidp->lid_flags & XFS_LID_BUF_STALE; 454 stale = lidp->lid_flags & XFS_LID_BUF_STALE;
455 xfs_trans_free_item(tp, lidp); 455 xfs_trans_free_item(tp, lidp);
456 /* 456 /*
457 * Since the transaction no longer refers to the buffer, 457 * Since the transaction no longer refers to the buffer,
458 * the buffer should no longer refer to the transaction. 458 * the buffer should no longer refer to the transaction.
459 */ 459 */
460 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 460 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
461 } 461 }
462 462
463 xfs_buf_item_unpin(bip, stale); 463 xfs_buf_item_unpin(bip, stale);
464 464
465 return; 465 return;
466 } 466 }
467 467
468 /* 468 /*
469 * This is called to attempt to lock the buffer associated with this 469 * This is called to attempt to lock the buffer associated with this
470 * buf log item. Don't sleep on the buffer lock. If we can't get 470 * buf log item. Don't sleep on the buffer lock. If we can't get
471 * the lock right away, return 0. If we can get the lock, pull the 471 * the lock right away, return 0. If we can get the lock, pull the
472 * buffer from the free list, mark it busy, and return 1. 472 * buffer from the free list, mark it busy, and return 1.
473 */ 473 */
474 STATIC uint 474 STATIC uint
475 xfs_buf_item_trylock( 475 xfs_buf_item_trylock(
476 xfs_buf_log_item_t *bip) 476 xfs_buf_log_item_t *bip)
477 { 477 {
478 xfs_buf_t *bp; 478 xfs_buf_t *bp;
479 479
480 bp = bip->bli_buf; 480 bp = bip->bli_buf;
481 481
482 if (XFS_BUF_ISPINNED(bp)) { 482 if (XFS_BUF_ISPINNED(bp)) {
483 return XFS_ITEM_PINNED; 483 return XFS_ITEM_PINNED;
484 } 484 }
485 485
486 if (!XFS_BUF_CPSEMA(bp)) { 486 if (!XFS_BUF_CPSEMA(bp)) {
487 return XFS_ITEM_LOCKED; 487 return XFS_ITEM_LOCKED;
488 } 488 }
489 489
490 /* 490 /*
491 * Remove the buffer from the free list. Only do this 491 * Remove the buffer from the free list. Only do this
492 * if it's on the free list. Private buffers like the 492 * if it's on the free list. Private buffers like the
493 * superblock buffer are not. 493 * superblock buffer are not.
494 */ 494 */
495 XFS_BUF_HOLD(bp); 495 XFS_BUF_HOLD(bp);
496 496
497 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 497 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
498 xfs_buf_item_trace("TRYLOCK SUCCESS", bip); 498 xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
499 return XFS_ITEM_SUCCESS; 499 return XFS_ITEM_SUCCESS;
500 } 500 }
501 501
502 /* 502 /*
503 * Release the buffer associated with the buf log item. 503 * Release the buffer associated with the buf log item.
504 * If there is no dirty logged data associated with the 504 * If there is no dirty logged data associated with the
505 * buffer recorded in the buf log item, then free the 505 * buffer recorded in the buf log item, then free the
506 * buf log item and remove the reference to it in the 506 * buf log item and remove the reference to it in the
507 * buffer. 507 * buffer.
508 * 508 *
509 * This call ignores the recursion count. It is only called 509 * This call ignores the recursion count. It is only called
510 * when the buffer should REALLY be unlocked, regardless 510 * when the buffer should REALLY be unlocked, regardless
511 * of the recursion count. 511 * of the recursion count.
512 * 512 *
513 * If the XFS_BLI_HOLD flag is set in the buf log item, then 513 * If the XFS_BLI_HOLD flag is set in the buf log item, then
514 * free the log item if necessary but do not unlock the buffer. 514 * free the log item if necessary but do not unlock the buffer.
515 * This is for support of xfs_trans_bhold(). Make sure the 515 * This is for support of xfs_trans_bhold(). Make sure the
516 * XFS_BLI_HOLD field is cleared if we don't free the item. 516 * XFS_BLI_HOLD field is cleared if we don't free the item.
517 */ 517 */
518 STATIC void 518 STATIC void
519 xfs_buf_item_unlock( 519 xfs_buf_item_unlock(
520 xfs_buf_log_item_t *bip) 520 xfs_buf_log_item_t *bip)
521 { 521 {
522 int aborted; 522 int aborted;
523 xfs_buf_t *bp; 523 xfs_buf_t *bp;
524 uint hold; 524 uint hold;
525 525
526 bp = bip->bli_buf; 526 bp = bip->bli_buf;
527 xfs_buftrace("XFS_UNLOCK", bp); 527 xfs_buftrace("XFS_UNLOCK", bp);
528 528
529 /* 529 /*
530 * Clear the buffer's association with this transaction. 530 * Clear the buffer's association with this transaction.
531 */ 531 */
532 XFS_BUF_SET_FSPRIVATE2(bp, NULL); 532 XFS_BUF_SET_FSPRIVATE2(bp, NULL);
533 533
534 /* 534 /*
535 * If this is a transaction abort, don't return early. 535 * If this is a transaction abort, don't return early.
536 * Instead, allow the brelse to happen. 536 * Instead, allow the brelse to happen.
537 * Normally it would be done for stale (cancelled) buffers 537 * Normally it would be done for stale (cancelled) buffers
538 * at unpin time, but we'll never go through the pin/unpin 538 * at unpin time, but we'll never go through the pin/unpin
539 * cycle if we abort inside commit. 539 * cycle if we abort inside commit.
540 */ 540 */
541 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0; 541 aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
542 542
543 /* 543 /*
544 * If the buf item is marked stale, then don't do anything. 544 * If the buf item is marked stale, then don't do anything.
545 * We'll unlock the buffer and free the buf item when the 545 * We'll unlock the buffer and free the buf item when the
546 * buffer is unpinned for the last time. 546 * buffer is unpinned for the last time.
547 */ 547 */
548 if (bip->bli_flags & XFS_BLI_STALE) { 548 if (bip->bli_flags & XFS_BLI_STALE) {
549 bip->bli_flags &= ~XFS_BLI_LOGGED; 549 bip->bli_flags &= ~XFS_BLI_LOGGED;
550 xfs_buf_item_trace("UNLOCK STALE", bip); 550 xfs_buf_item_trace("UNLOCK STALE", bip);
551 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); 551 ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
552 if (!aborted) 552 if (!aborted)
553 return; 553 return;
554 } 554 }
555 555
556 /* 556 /*
557 * Drop the transaction's reference to the log item if 557 * Drop the transaction's reference to the log item if
558 * it was not logged as part of the transaction. Otherwise 558 * it was not logged as part of the transaction. Otherwise
559 * we'll drop the reference in xfs_buf_item_unpin() when 559 * we'll drop the reference in xfs_buf_item_unpin() when
560 * the transaction is really through with the buffer. 560 * the transaction is really through with the buffer.
561 */ 561 */
562 if (!(bip->bli_flags & XFS_BLI_LOGGED)) { 562 if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
563 atomic_dec(&bip->bli_refcount); 563 atomic_dec(&bip->bli_refcount);
564 } else { 564 } else {
565 /* 565 /*
566 * Clear the logged flag since this is per 566 * Clear the logged flag since this is per
567 * transaction state. 567 * transaction state.
568 */ 568 */
569 bip->bli_flags &= ~XFS_BLI_LOGGED; 569 bip->bli_flags &= ~XFS_BLI_LOGGED;
570 } 570 }
571 571
572 /* 572 /*
573 * Before possibly freeing the buf item, determine if we should 573 * Before possibly freeing the buf item, determine if we should
574 * release the buffer at the end of this routine. 574 * release the buffer at the end of this routine.
575 */ 575 */
576 hold = bip->bli_flags & XFS_BLI_HOLD; 576 hold = bip->bli_flags & XFS_BLI_HOLD;
577 xfs_buf_item_trace("UNLOCK", bip); 577 xfs_buf_item_trace("UNLOCK", bip);
578 578
579 /* 579 /*
580 * If the buf item isn't tracking any data, free it. 580 * If the buf item isn't tracking any data, free it.
581 * Otherwise, if XFS_BLI_HOLD is set clear it. 581 * Otherwise, if XFS_BLI_HOLD is set clear it.
582 */ 582 */
583 if (xfs_bitmap_empty(bip->bli_format.blf_data_map, 583 if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
584 bip->bli_format.blf_map_size)) { 584 bip->bli_format.blf_map_size)) {
585 xfs_buf_item_relse(bp); 585 xfs_buf_item_relse(bp);
586 } else if (hold) { 586 } else if (hold) {
587 bip->bli_flags &= ~XFS_BLI_HOLD; 587 bip->bli_flags &= ~XFS_BLI_HOLD;
588 } 588 }
589 589
590 /* 590 /*
591 * Release the buffer if XFS_BLI_HOLD was not set. 591 * Release the buffer if XFS_BLI_HOLD was not set.
592 */ 592 */
593 if (!hold) { 593 if (!hold) {
594 xfs_buf_relse(bp); 594 xfs_buf_relse(bp);
595 } 595 }
596 } 596 }
597 597
598 /* 598 /*
599 * This is called to find out where the oldest active copy of the 599 * This is called to find out where the oldest active copy of the
600 * buf log item in the on disk log resides now that the last log 600 * buf log item in the on disk log resides now that the last log
601 * write of it completed at the given lsn. 601 * write of it completed at the given lsn.
602 * We always re-log all the dirty data in a buffer, so usually the 602 * We always re-log all the dirty data in a buffer, so usually the
603 * latest copy in the on disk log is the only one that matters. For 603 * latest copy in the on disk log is the only one that matters. For
604 * those cases we simply return the given lsn. 604 * those cases we simply return the given lsn.
605 * 605 *
606 * The one exception to this is for buffers full of newly allocated 606 * The one exception to this is for buffers full of newly allocated
607 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF 607 * inodes. These buffers are only relogged with the XFS_BLI_INODE_BUF
608 * flag set, indicating that only the di_next_unlinked fields from the 608 * flag set, indicating that only the di_next_unlinked fields from the
609 * inodes in the buffers will be replayed during recovery. If the 609 * inodes in the buffers will be replayed during recovery. If the
610 * original newly allocated inode images have not yet been flushed 610 * original newly allocated inode images have not yet been flushed
611 * when the buffer is so relogged, then we need to make sure that we 611 * when the buffer is so relogged, then we need to make sure that we
612 * keep the old images in the 'active' portion of the log. We do this 612 * keep the old images in the 'active' portion of the log. We do this
613 * by returning the original lsn of that transaction here rather than 613 * by returning the original lsn of that transaction here rather than
614 * the current one. 614 * the current one.
615 */ 615 */
616 STATIC xfs_lsn_t 616 STATIC xfs_lsn_t
617 xfs_buf_item_committed( 617 xfs_buf_item_committed(
618 xfs_buf_log_item_t *bip, 618 xfs_buf_log_item_t *bip,
619 xfs_lsn_t lsn) 619 xfs_lsn_t lsn)
620 { 620 {
621 xfs_buf_item_trace("COMMITTED", bip); 621 xfs_buf_item_trace("COMMITTED", bip);
622 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && 622 if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
623 (bip->bli_item.li_lsn != 0)) { 623 (bip->bli_item.li_lsn != 0)) {
624 return bip->bli_item.li_lsn; 624 return bip->bli_item.li_lsn;
625 } 625 }
626 return (lsn); 626 return (lsn);
627 } 627 }
628 628
629 /* 629 /*
630 * This is called to asynchronously write the buffer associated with this 630 * This is called to asynchronously write the buffer associated with this
631 * buf log item out to disk. The buffer will already have been locked by 631 * buf log item out to disk. The buffer will already have been locked by
632 * a successful call to xfs_buf_item_trylock(). If the buffer still has 632 * a successful call to xfs_buf_item_trylock(). If the buffer still has
633 * B_DELWRI set, then get it going out to disk with a call to bawrite(). 633 * B_DELWRI set, then get it going out to disk with a call to bawrite().
634 * If not, then just release the buffer. 634 * If not, then just release the buffer.
635 */ 635 */
636 STATIC void 636 STATIC void
637 xfs_buf_item_push( 637 xfs_buf_item_push(
638 xfs_buf_log_item_t *bip) 638 xfs_buf_log_item_t *bip)
639 { 639 {
640 xfs_buf_t *bp; 640 xfs_buf_t *bp;
641 641
642 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 642 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
643 xfs_buf_item_trace("PUSH", bip); 643 xfs_buf_item_trace("PUSH", bip);
644 644
645 bp = bip->bli_buf; 645 bp = bip->bli_buf;
646 646
647 if (XFS_BUF_ISDELAYWRITE(bp)) { 647 if (XFS_BUF_ISDELAYWRITE(bp)) {
648 int error; 648 int error;
649 error = xfs_bawrite(bip->bli_item.li_mountp, bp); 649 error = xfs_bawrite(bip->bli_item.li_mountp, bp);
650 if (error) 650 if (error)
651 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, 651 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
652 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", 652 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
653 error, bip, bp); 653 error, bip, bp);
654 } else { 654 } else {
655 xfs_buf_relse(bp); 655 xfs_buf_relse(bp);
656 } 656 }
657 } 657 }
658 658
659 /* ARGSUSED */ 659 /* ARGSUSED */
660 STATIC void 660 STATIC void
661 xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn) 661 xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
662 { 662 {
663 } 663 }
664 664
665 /* 665 /*
666 * This is the ops vector shared by all buf log items. 666 * This is the ops vector shared by all buf log items.
667 */ 667 */
668 static struct xfs_item_ops xfs_buf_item_ops = { 668 static struct xfs_item_ops xfs_buf_item_ops = {
669 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size, 669 .iop_size = (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
670 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*)) 670 .iop_format = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
671 xfs_buf_item_format, 671 xfs_buf_item_format,
672 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin, 672 .iop_pin = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
673 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin, 673 .iop_unpin = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
674 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *)) 674 .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
675 xfs_buf_item_unpin_remove, 675 xfs_buf_item_unpin_remove,
676 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock, 676 .iop_trylock = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
677 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock, 677 .iop_unlock = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
678 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 678 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
679 xfs_buf_item_committed, 679 xfs_buf_item_committed,
680 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, 680 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
681 .iop_pushbuf = NULL, 681 .iop_pushbuf = NULL,
682 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 682 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
683 xfs_buf_item_committing 683 xfs_buf_item_committing
684 }; 684 };
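
This ops vector is how the generic transaction and AIL code drives a buf log item without knowing its concrete type; each entry is one of the typed handlers above cast to the generic xfs_log_item_t signature. A rough sketch of how such a vector is consumed when the AIL pushes an item (the real xfsaild loop handles more return codes and is not part of this diff, so treat the shape below as an assumption):

	static void
	push_one_item(xfs_log_item_t *lip)
	{
		switch (lip->li_ops->iop_trylock(lip)) {
		case XFS_ITEM_SUCCESS:
			lip->li_ops->iop_push(lip);	/* e.g. xfs_buf_item_push() */
			break;
		case XFS_ITEM_PINNED:
		case XFS_ITEM_LOCKED:
			break;				/* revisit on a later push */
		}
	}
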
685 685
686 686
687 /* 687 /*
688 * Allocate a new buf log item to go with the given buffer. 688 * Allocate a new buf log item to go with the given buffer.
689 * Set the buffer's b_fsprivate field to point to the new 689 * Set the buffer's b_fsprivate field to point to the new
690 * buf log item. If there are other items attached to the 690 * buf log item. If there are other items attached to the
691 * buffer (see xfs_buf_attach_iodone() below), then put the 691 * buffer (see xfs_buf_attach_iodone() below), then put the
692 * buf log item at the front. 692 * buf log item at the front.
693 */ 693 */
694 void 694 void
695 xfs_buf_item_init( 695 xfs_buf_item_init(
696 xfs_buf_t *bp, 696 xfs_buf_t *bp,
697 xfs_mount_t *mp) 697 xfs_mount_t *mp)
698 { 698 {
699 xfs_log_item_t *lip; 699 xfs_log_item_t *lip;
700 xfs_buf_log_item_t *bip; 700 xfs_buf_log_item_t *bip;
701 int chunks; 701 int chunks;
702 int map_size; 702 int map_size;
703 703
704 /* 704 /*
705 * Check to see if there is already a buf log item for 705 * Check to see if there is already a buf log item for
706 * this buffer. If there is, it is guaranteed to be 706 * this buffer. If there is, it is guaranteed to be
707 * the first. If we do already have one, there is 707 * the first. If we do already have one, there is
708 * nothing to do here so return. 708 * nothing to do here so return.
709 */ 709 */
710 if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp) 710 if (bp->b_mount != mp)
711 XFS_BUF_SET_FSPRIVATE3(bp, mp); 711 bp->b_mount = mp;
712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 712 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 713 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 714 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
715 if (lip->li_type == XFS_LI_BUF) { 715 if (lip->li_type == XFS_LI_BUF) {
716 return; 716 return;
717 } 717 }
718 } 718 }
719 719
720 /* 720 /*
721 * chunks is the number of XFS_BLI_CHUNK size pieces 721 * chunks is the number of XFS_BLI_CHUNK size pieces
722 * the buffer can be divided into. Make sure not to 722 * the buffer can be divided into. Make sure not to
723 * truncate any pieces. map_size is the size of the 723 * truncate any pieces. map_size is the size of the
724 * bitmap needed to describe the chunks of the buffer. 724 * bitmap needed to describe the chunks of the buffer.
725 */ 725 */
726 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT); 726 chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
727 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT); 727 map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
728 728
729 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone, 729 bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
730 KM_SLEEP); 730 KM_SLEEP);
731 bip->bli_item.li_type = XFS_LI_BUF; 731 bip->bli_item.li_type = XFS_LI_BUF;
732 bip->bli_item.li_ops = &xfs_buf_item_ops; 732 bip->bli_item.li_ops = &xfs_buf_item_ops;
733 bip->bli_item.li_mountp = mp; 733 bip->bli_item.li_mountp = mp;
734 bip->bli_item.li_ailp = mp->m_ail; 734 bip->bli_item.li_ailp = mp->m_ail;
735 bip->bli_buf = bp; 735 bip->bli_buf = bp;
736 xfs_buf_hold(bp); 736 xfs_buf_hold(bp);
737 bip->bli_format.blf_type = XFS_LI_BUF; 737 bip->bli_format.blf_type = XFS_LI_BUF;
738 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); 738 bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
739 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); 739 bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
740 bip->bli_format.blf_map_size = map_size; 740 bip->bli_format.blf_map_size = map_size;
741 #ifdef XFS_BLI_TRACE 741 #ifdef XFS_BLI_TRACE
742 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS); 742 bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
743 #endif 743 #endif
744 744
745 #ifdef XFS_TRANS_DEBUG 745 #ifdef XFS_TRANS_DEBUG
746 /* 746 /*
747 * Allocate the arrays for tracking what needs to be logged 747 * Allocate the arrays for tracking what needs to be logged
748 * and what our callers request to be logged. bli_orig 748 * and what our callers request to be logged. bli_orig
749 * holds a copy of the original, clean buffer for comparison 749 * holds a copy of the original, clean buffer for comparison
750 * against, and bli_logged keeps a 1 bit flag per byte in 750 * against, and bli_logged keeps a 1 bit flag per byte in
751 * the buffer to indicate which bytes the callers have asked 751 * the buffer to indicate which bytes the callers have asked
752 * to have logged. 752 * to have logged.
753 */ 753 */
754 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP); 754 bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
755 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp)); 755 memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
756 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP); 756 bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
757 #endif 757 #endif
758 758
759 /* 759 /*
760 * Put the buf item into the list of items attached to the 760 * Put the buf item into the list of items attached to the
761 * buffer at the front. 761 * buffer at the front.
762 */ 762 */
763 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 763 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
764 bip->bli_item.li_bio_list = 764 bip->bli_item.li_bio_list =
765 XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 765 XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
766 } 766 }
767 XFS_BUF_SET_FSPRIVATE(bp, bip); 767 XFS_BUF_SET_FSPRIVATE(bp, bip);
768 } 768 }
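
The chunks/map_size arithmetic near the top of xfs_buf_item_init() rounds the buffer length up to whole XFS_BLI_CHUNK pieces and then sizes the dirty bitmap in words. A worked example, assuming the usual values XFS_BLI_CHUNK == 128 (so XFS_BLI_SHIFT == 7), NBWORD == 32 and BIT_TO_WORD_SHIFT == 5; these constants are not shown in this hunk, so treat them as assumptions:

	/* 4 KiB buffer, 128-byte chunks */
	chunks   = (4096 + (128 - 1)) >> 7;	/* = 32 chunks          */
	map_size = (32 + 32) >> 5;		/* = 2 bitmap words     */
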
769 769
770 770
771 /* 771 /*
772 * Mark bytes first through last inclusive as dirty in the buf 772 * Mark bytes first through last inclusive as dirty in the buf
773 * item's bitmap. 773 * item's bitmap.
774 */ 774 */
775 void 775 void
776 xfs_buf_item_log( 776 xfs_buf_item_log(
777 xfs_buf_log_item_t *bip, 777 xfs_buf_log_item_t *bip,
778 uint first, 778 uint first,
779 uint last) 779 uint last)
780 { 780 {
781 uint first_bit; 781 uint first_bit;
782 uint last_bit; 782 uint last_bit;
783 uint bits_to_set; 783 uint bits_to_set;
784 uint bits_set; 784 uint bits_set;
785 uint word_num; 785 uint word_num;
786 uint *wordp; 786 uint *wordp;
787 uint bit; 787 uint bit;
788 uint end_bit; 788 uint end_bit;
789 uint mask; 789 uint mask;
790 790
791 /* 791 /*
792 * Mark the item as having some dirty data for 792 * Mark the item as having some dirty data for
793 * quick reference in xfs_buf_item_dirty. 793 * quick reference in xfs_buf_item_dirty.
794 */ 794 */
795 bip->bli_flags |= XFS_BLI_DIRTY; 795 bip->bli_flags |= XFS_BLI_DIRTY;
796 796
797 /* 797 /*
798 * Convert byte offsets to bit numbers. 798 * Convert byte offsets to bit numbers.
799 */ 799 */
800 first_bit = first >> XFS_BLI_SHIFT; 800 first_bit = first >> XFS_BLI_SHIFT;
801 last_bit = last >> XFS_BLI_SHIFT; 801 last_bit = last >> XFS_BLI_SHIFT;
802 802
803 /* 803 /*
804 * Calculate the total number of bits to be set. 804 * Calculate the total number of bits to be set.
805 */ 805 */
806 bits_to_set = last_bit - first_bit + 1; 806 bits_to_set = last_bit - first_bit + 1;
807 807
808 /* 808 /*
809 * Get a pointer to the first word in the bitmap 809 * Get a pointer to the first word in the bitmap
810 * to set a bit in. 810 * to set a bit in.
811 */ 811 */
812 word_num = first_bit >> BIT_TO_WORD_SHIFT; 812 word_num = first_bit >> BIT_TO_WORD_SHIFT;
813 wordp = &(bip->bli_format.blf_data_map[word_num]); 813 wordp = &(bip->bli_format.blf_data_map[word_num]);
814 814
815 /* 815 /*
816 * Calculate the starting bit in the first word. 816 * Calculate the starting bit in the first word.
817 */ 817 */
818 bit = first_bit & (uint)(NBWORD - 1); 818 bit = first_bit & (uint)(NBWORD - 1);
819 819
820 /* 820 /*
821 * First set any bits in the first word of our range. 821 * First set any bits in the first word of our range.
822 * If it starts at bit 0 of the word, it will be 822 * If it starts at bit 0 of the word, it will be
823 * set below rather than here. That is what the variable 823 * set below rather than here. That is what the variable
824 * bit tells us. The variable bits_set tracks the number 824 * bit tells us. The variable bits_set tracks the number
825 * of bits that have been set so far. End_bit is the number 825 * of bits that have been set so far. End_bit is the number
826 * of the last bit to be set in this word plus one. 826 * of the last bit to be set in this word plus one.
827 */ 827 */
828 if (bit) { 828 if (bit) {
829 end_bit = MIN(bit + bits_to_set, (uint)NBWORD); 829 end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
830 mask = ((1 << (end_bit - bit)) - 1) << bit; 830 mask = ((1 << (end_bit - bit)) - 1) << bit;
831 *wordp |= mask; 831 *wordp |= mask;
832 wordp++; 832 wordp++;
833 bits_set = end_bit - bit; 833 bits_set = end_bit - bit;
834 } else { 834 } else {
835 bits_set = 0; 835 bits_set = 0;
836 } 836 }
837 837
838 /* 838 /*
839 * Now set bits a whole word at a time that are between 839 * Now set bits a whole word at a time that are between
840 * first_bit and last_bit. 840 * first_bit and last_bit.
841 */ 841 */
842 while ((bits_to_set - bits_set) >= NBWORD) { 842 while ((bits_to_set - bits_set) >= NBWORD) {
843 *wordp |= 0xffffffff; 843 *wordp |= 0xffffffff;
844 bits_set += NBWORD; 844 bits_set += NBWORD;
845 wordp++; 845 wordp++;
846 } 846 }
847 847
848 /* 848 /*
849 * Finally, set any bits left to be set in one last partial word. 849 * Finally, set any bits left to be set in one last partial word.
850 */ 850 */
851 end_bit = bits_to_set - bits_set; 851 end_bit = bits_to_set - bits_set;
852 if (end_bit) { 852 if (end_bit) {
853 mask = (1 << end_bit) - 1; 853 mask = (1 << end_bit) - 1;
854 *wordp |= mask; 854 *wordp |= mask;
855 } 855 }
856 856
857 xfs_buf_item_log_debug(bip, first, last); 857 xfs_buf_item_log_debug(bip, first, last);
858 } 858 }
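The dirty-range marking above is a three-phase bitmap update: a partial leading word, a run of whole words, then a partial trailing word. The standalone C sketch below (illustrative names, 32-bit words assumed; it is not the kernel code) shows the same technique in isolation. In xfs_buf_item_log() each bit covers a chunk of the buffer rather than a single byte, which is why the byte offsets are shifted down by XFS_BLI_SHIFT before the bit arithmetic.

#include <stdint.h>
#include <stdio.h>

#define NBWORD 32u

static void set_bit_range(uint32_t *map, unsigned first_bit, unsigned last_bit)
{
	unsigned bits_to_set = last_bit - first_bit + 1;
	uint32_t *wordp = &map[first_bit / NBWORD];
	unsigned bit = first_bit & (NBWORD - 1);
	unsigned bits_set = 0;

	if (bit) {
		/* partial leading word: set bits [bit, end) */
		unsigned end = (bit + bits_to_set < NBWORD) ? bit + bits_to_set : NBWORD;

		*wordp |= ((1u << (end - bit)) - 1) << bit;
		wordp++;
		bits_set = end - bit;
	}
	while (bits_to_set - bits_set >= NBWORD) {
		/* whole 32-bit words in the middle of the range */
		*wordp |= 0xffffffffu;
		bits_set += NBWORD;
		wordp++;
	}
	if (bits_to_set - bits_set) {
		/* partial trailing word */
		*wordp |= (1u << (bits_to_set - bits_set)) - 1;
	}
}

int main(void)
{
	uint32_t map[4] = { 0 };
	int i;

	set_bit_range(map, 30, 70);	/* spans three words */
	for (i = 0; i < 4; i++)
		printf("word %d: 0x%08x\n", i, (unsigned)map[i]);
	return 0;
}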
859 859
860 860
861 /* 861 /*
862 * Return 1 if the buffer has some data that has been logged (at any 862 * Return 1 if the buffer has some data that has been logged (at any
863 * point, not just the current transaction) and 0 if not. 863 * point, not just the current transaction) and 0 if not.
864 */ 864 */
865 uint 865 uint
866 xfs_buf_item_dirty( 866 xfs_buf_item_dirty(
867 xfs_buf_log_item_t *bip) 867 xfs_buf_log_item_t *bip)
868 { 868 {
869 return (bip->bli_flags & XFS_BLI_DIRTY); 869 return (bip->bli_flags & XFS_BLI_DIRTY);
870 } 870 }
871 871
872 STATIC void 872 STATIC void
873 xfs_buf_item_free( 873 xfs_buf_item_free(
874 xfs_buf_log_item_t *bip) 874 xfs_buf_log_item_t *bip)
875 { 875 {
876 #ifdef XFS_TRANS_DEBUG 876 #ifdef XFS_TRANS_DEBUG
877 kmem_free(bip->bli_orig); 877 kmem_free(bip->bli_orig);
878 kmem_free(bip->bli_logged); 878 kmem_free(bip->bli_logged);
879 #endif /* XFS_TRANS_DEBUG */ 879 #endif /* XFS_TRANS_DEBUG */
880 880
881 #ifdef XFS_BLI_TRACE 881 #ifdef XFS_BLI_TRACE
882 ktrace_free(bip->bli_trace); 882 ktrace_free(bip->bli_trace);
883 #endif 883 #endif
884 kmem_zone_free(xfs_buf_item_zone, bip); 884 kmem_zone_free(xfs_buf_item_zone, bip);
885 } 885 }
886 886
887 /* 887 /*
888 * This is called when the buf log item is no longer needed. It should 888 * This is called when the buf log item is no longer needed. It should
889 * free the buf log item associated with the given buffer and clear 889 * free the buf log item associated with the given buffer and clear
890 * the buffer's pointer to the buf log item. If there are no more 890 * the buffer's pointer to the buf log item. If there are no more
891 * items in the list, clear the b_iodone field of the buffer (see 891 * items in the list, clear the b_iodone field of the buffer (see
892 * xfs_buf_attach_iodone() below). 892 * xfs_buf_attach_iodone() below).
893 */ 893 */
894 void 894 void
895 xfs_buf_item_relse( 895 xfs_buf_item_relse(
896 xfs_buf_t *bp) 896 xfs_buf_t *bp)
897 { 897 {
898 xfs_buf_log_item_t *bip; 898 xfs_buf_log_item_t *bip;
899 899
900 xfs_buftrace("XFS_RELSE", bp); 900 xfs_buftrace("XFS_RELSE", bp);
901 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*); 901 bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
902 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list); 902 XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
903 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) && 903 if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
904 (XFS_BUF_IODONE_FUNC(bp) != NULL)) { 904 (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
905 XFS_BUF_CLR_IODONE_FUNC(bp); 905 XFS_BUF_CLR_IODONE_FUNC(bp);
906 } 906 }
907 xfs_buf_rele(bp); 907 xfs_buf_rele(bp);
908 xfs_buf_item_free(bip); 908 xfs_buf_item_free(bip);
909 } 909 }
910 910
911 911
912 /* 912 /*
913 * Add the given log item with its callback to the list of callbacks 913 * Add the given log item with its callback to the list of callbacks
914 * to be called when the buffer's I/O completes. If it is not set 914 * to be called when the buffer's I/O completes. If it is not set
915 * already, set the buffer's b_iodone() routine to be 915 * already, set the buffer's b_iodone() routine to be
916 * xfs_buf_iodone_callbacks() and link the log item into the list of 916 * xfs_buf_iodone_callbacks() and link the log item into the list of
917 * items rooted at b_fsprivate. Items are always added as the second 917 * items rooted at b_fsprivate. Items are always added as the second
918 * entry in the list if there is a first, because the buf item code 918 * entry in the list if there is a first, because the buf item code
919 * assumes that the buf log item is first. 919 * assumes that the buf log item is first.
920 */ 920 */
921 void 921 void
922 xfs_buf_attach_iodone( 922 xfs_buf_attach_iodone(
923 xfs_buf_t *bp, 923 xfs_buf_t *bp,
924 void (*cb)(xfs_buf_t *, xfs_log_item_t *), 924 void (*cb)(xfs_buf_t *, xfs_log_item_t *),
925 xfs_log_item_t *lip) 925 xfs_log_item_t *lip)
926 { 926 {
927 xfs_log_item_t *head_lip; 927 xfs_log_item_t *head_lip;
928 928
929 ASSERT(XFS_BUF_ISBUSY(bp)); 929 ASSERT(XFS_BUF_ISBUSY(bp));
930 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); 930 ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
931 931
932 lip->li_cb = cb; 932 lip->li_cb = cb;
933 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) { 933 if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
934 head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 934 head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
935 lip->li_bio_list = head_lip->li_bio_list; 935 lip->li_bio_list = head_lip->li_bio_list;
936 head_lip->li_bio_list = lip; 936 head_lip->li_bio_list = lip;
937 } else { 937 } else {
938 XFS_BUF_SET_FSPRIVATE(bp, lip); 938 XFS_BUF_SET_FSPRIVATE(bp, lip);
939 } 939 }
940 940
941 ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) || 941 ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
942 (XFS_BUF_IODONE_FUNC(bp) == NULL)); 942 (XFS_BUF_IODONE_FUNC(bp) == NULL));
943 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks); 943 XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
944 } 944 }
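The ordering rule described in the comment above — the buf log item stays at the head and later callbacks are spliced in behind it — is easiest to see in isolation. A minimal sketch, with made-up types standing in for xfs_log_item_t:

#include <stddef.h>

struct demo_item {
	struct demo_item *bio_list;	/* next item on the buffer's list */
};

static void demo_attach(struct demo_item **headp, struct demo_item *lip)
{
	if (*headp != NULL) {
		/* keep the existing head first; the new item becomes second */
		lip->bio_list = (*headp)->bio_list;
		(*headp)->bio_list = lip;
	} else {
		/* empty list: the new item becomes the head */
		lip->bio_list = NULL;
		*headp = lip;
	}
}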
945 945
946 STATIC void 946 STATIC void
947 xfs_buf_do_callbacks( 947 xfs_buf_do_callbacks(
948 xfs_buf_t *bp, 948 xfs_buf_t *bp,
949 xfs_log_item_t *lip) 949 xfs_log_item_t *lip)
950 { 950 {
951 xfs_log_item_t *nlip; 951 xfs_log_item_t *nlip;
952 952
953 while (lip != NULL) { 953 while (lip != NULL) {
954 nlip = lip->li_bio_list; 954 nlip = lip->li_bio_list;
955 ASSERT(lip->li_cb != NULL); 955 ASSERT(lip->li_cb != NULL);
956 /* 956 /*
957 * Clear the next pointer so we don't have any 957 * Clear the next pointer so we don't have any
958 * confusion if the item is added to another buf. 958 * confusion if the item is added to another buf.
959 * Don't touch the log item after calling its 959 * Don't touch the log item after calling its
960 * callback, because it could have freed itself. 960 * callback, because it could have freed itself.
961 */ 961 */
962 lip->li_bio_list = NULL; 962 lip->li_bio_list = NULL;
963 lip->li_cb(bp, lip); 963 lip->li_cb(bp, lip);
964 lip = nlip; 964 lip = nlip;
965 } 965 }
966 } 966 }
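xfs_buf_do_callbacks() uses the "save the next pointer before the call" idiom, because a callback may free its own log item. A condensed sketch of that pattern (illustrative types, not the XFS ones):

#include <stddef.h>

struct demo_li {
	struct demo_li *next;
	void (*cb)(struct demo_li *);
};

static void demo_run_callbacks(struct demo_li *lip)
{
	while (lip != NULL) {
		struct demo_li *nlip = lip->next;	/* grab the next link first */

		lip->next = NULL;	/* unlink so a later re-add elsewhere is clean */
		lip->cb(lip);		/* lip may be freed by its callback */
		lip = nlip;		/* safe: the link was saved beforehand */
	}
}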
967 967
968 /* 968 /*
969 * This is the iodone() function for buffers which have had callbacks 969 * This is the iodone() function for buffers which have had callbacks
970 * attached to them by xfs_buf_attach_iodone(). It should remove each 970 * attached to them by xfs_buf_attach_iodone(). It should remove each
971 * log item from the buffer's list and call the callback of each in turn. 971 * log item from the buffer's list and call the callback of each in turn.
972 * When done, the buffer's fsprivate field is set to NULL and the buffer 972 * When done, the buffer's fsprivate field is set to NULL and the buffer
973 * is unlocked with a call to iodone(). 973 * is unlocked with a call to iodone().
974 */ 974 */
975 void 975 void
976 xfs_buf_iodone_callbacks( 976 xfs_buf_iodone_callbacks(
977 xfs_buf_t *bp) 977 xfs_buf_t *bp)
978 { 978 {
979 xfs_log_item_t *lip; 979 xfs_log_item_t *lip;
980 static ulong lasttime; 980 static ulong lasttime;
981 static xfs_buftarg_t *lasttarg; 981 static xfs_buftarg_t *lasttarg;
982 xfs_mount_t *mp; 982 xfs_mount_t *mp;
983 983
984 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); 984 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
985 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 985 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
986 986
987 if (XFS_BUF_GETERROR(bp) != 0) { 987 if (XFS_BUF_GETERROR(bp) != 0) {
988 /* 988 /*
989 * If we've already decided to shutdown the filesystem 989 * If we've already decided to shutdown the filesystem
990 * because of IO errors, there's no point in giving this 990 * because of IO errors, there's no point in giving this
991 * a retry. 991 * a retry.
992 */ 992 */
993 mp = lip->li_mountp; 993 mp = lip->li_mountp;
994 if (XFS_FORCED_SHUTDOWN(mp)) { 994 if (XFS_FORCED_SHUTDOWN(mp)) {
995 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 995 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
996 XFS_BUF_SUPER_STALE(bp); 996 XFS_BUF_SUPER_STALE(bp);
997 xfs_buftrace("BUF_IODONE_CB", bp); 997 xfs_buftrace("BUF_IODONE_CB", bp);
998 xfs_buf_do_callbacks(bp, lip); 998 xfs_buf_do_callbacks(bp, lip);
999 XFS_BUF_SET_FSPRIVATE(bp, NULL); 999 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1000 XFS_BUF_CLR_IODONE_FUNC(bp); 1000 XFS_BUF_CLR_IODONE_FUNC(bp);
1001 1001
1002 /* 1002 /*
1003 * XFS_SHUT flag gets set when we go thru the 1003 * XFS_SHUT flag gets set when we go thru the
1004 * entire buffer cache and deliberately start 1004 * entire buffer cache and deliberately start
1005 * throwing away delayed write buffers. 1005 * throwing away delayed write buffers.
1006 * Since there's no biowait done on those, 1006 * Since there's no biowait done on those,
1007 * we should just brelse them. 1007 * we should just brelse them.
1008 */ 1008 */
1009 if (XFS_BUF_ISSHUT(bp)) { 1009 if (XFS_BUF_ISSHUT(bp)) {
1010 XFS_BUF_UNSHUT(bp); 1010 XFS_BUF_UNSHUT(bp);
1011 xfs_buf_relse(bp); 1011 xfs_buf_relse(bp);
1012 } else { 1012 } else {
1013 xfs_biodone(bp); 1013 xfs_biodone(bp);
1014 } 1014 }
1015 1015
1016 return; 1016 return;
1017 } 1017 }
1018 1018
1019 if ((XFS_BUF_TARGET(bp) != lasttarg) || 1019 if ((XFS_BUF_TARGET(bp) != lasttarg) ||
1020 (time_after(jiffies, (lasttime + 5*HZ)))) { 1020 (time_after(jiffies, (lasttime + 5*HZ)))) {
1021 lasttime = jiffies; 1021 lasttime = jiffies;
1022 cmn_err(CE_ALERT, "Device %s, XFS metadata write error" 1022 cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
1023 " block 0x%llx in %s", 1023 " block 0x%llx in %s",
1024 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 1024 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
1025 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname); 1025 (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
1026 } 1026 }
1027 lasttarg = XFS_BUF_TARGET(bp); 1027 lasttarg = XFS_BUF_TARGET(bp);
1028 1028
1029 if (XFS_BUF_ISASYNC(bp)) { 1029 if (XFS_BUF_ISASYNC(bp)) {
1030 /* 1030 /*
1031 * If the write was asynchronous then no one will be 1031 * If the write was asynchronous then no one will be
1032 * looking for the error. Clear the error state 1032 * looking for the error. Clear the error state
1033 * and write the buffer out again delayed write. 1033 * and write the buffer out again delayed write.
1034 * 1034 *
1035 * XXXsup This is OK, so long as we catch these 1035 * XXXsup This is OK, so long as we catch these
1036 * before we start the umount; we don't want these 1036 * before we start the umount; we don't want these
1037 * DELWRI metadata bufs to be hanging around. 1037 * DELWRI metadata bufs to be hanging around.
1038 */ 1038 */
1039 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */ 1039 XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
1040 1040
1041 if (!(XFS_BUF_ISSTALE(bp))) { 1041 if (!(XFS_BUF_ISSTALE(bp))) {
1042 XFS_BUF_DELAYWRITE(bp); 1042 XFS_BUF_DELAYWRITE(bp);
1043 XFS_BUF_DONE(bp); 1043 XFS_BUF_DONE(bp);
1044 XFS_BUF_SET_START(bp); 1044 XFS_BUF_SET_START(bp);
1045 } 1045 }
1046 ASSERT(XFS_BUF_IODONE_FUNC(bp)); 1046 ASSERT(XFS_BUF_IODONE_FUNC(bp));
1047 xfs_buftrace("BUF_IODONE ASYNC", bp); 1047 xfs_buftrace("BUF_IODONE ASYNC", bp);
1048 xfs_buf_relse(bp); 1048 xfs_buf_relse(bp);
1049 } else { 1049 } else {
1050 /* 1050 /*
1051 * If the write of the buffer was not asynchronous, 1051 * If the write of the buffer was not asynchronous,
1052 * then we want to make sure to return the error 1052 * then we want to make sure to return the error
1053 * to the caller of bwrite(). Because of this we 1053 * to the caller of bwrite(). Because of this we
1054 * cannot clear the B_ERROR state at this point. 1054 * cannot clear the B_ERROR state at this point.
1055 * Instead we install a callback function that 1055 * Instead we install a callback function that
1056 * will be called when the buffer is released, and 1056 * will be called when the buffer is released, and
1057 * that routine will clear the error state and 1057 * that routine will clear the error state and
1058 * set the buffer to be written out again after 1058 * set the buffer to be written out again after
1059 * some delay. 1059 * some delay.
1060 */ 1060 */
1061 /* We actually overwrite the existing b-relse 1061 /* We actually overwrite the existing b-relse
1062 function at times, but we're gonna be shutting down 1062 function at times, but we're gonna be shutting down
1063 anyway. */ 1063 anyway. */
1064 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse); 1064 XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1065 XFS_BUF_DONE(bp); 1065 XFS_BUF_DONE(bp);
1066 XFS_BUF_FINISH_IOWAIT(bp); 1066 XFS_BUF_FINISH_IOWAIT(bp);
1067 } 1067 }
1068 return; 1068 return;
1069 } 1069 }
1070 #ifdef XFSERRORDEBUG 1070 #ifdef XFSERRORDEBUG
1071 xfs_buftrace("XFS BUFCB NOERR", bp); 1071 xfs_buftrace("XFS BUFCB NOERR", bp);
1072 #endif 1072 #endif
1073 xfs_buf_do_callbacks(bp, lip); 1073 xfs_buf_do_callbacks(bp, lip);
1074 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1074 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1075 XFS_BUF_CLR_IODONE_FUNC(bp); 1075 XFS_BUF_CLR_IODONE_FUNC(bp);
1076 xfs_biodone(bp); 1076 xfs_biodone(bp);
1077 } 1077 }
1078 1078
1079 /* 1079 /*
1080 * This is a callback routine attached to a buffer which gets an error 1080 * This is a callback routine attached to a buffer which gets an error
1081 * when being written out synchronously. 1081 * when being written out synchronously.
1082 */ 1082 */
1083 STATIC void 1083 STATIC void
1084 xfs_buf_error_relse( 1084 xfs_buf_error_relse(
1085 xfs_buf_t *bp) 1085 xfs_buf_t *bp)
1086 { 1086 {
1087 xfs_log_item_t *lip; 1087 xfs_log_item_t *lip;
1088 xfs_mount_t *mp; 1088 xfs_mount_t *mp;
1089 1089
1090 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 1090 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1091 mp = (xfs_mount_t *)lip->li_mountp; 1091 mp = (xfs_mount_t *)lip->li_mountp;
1092 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp); 1092 ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1093 1093
1094 XFS_BUF_STALE(bp); 1094 XFS_BUF_STALE(bp);
1095 XFS_BUF_DONE(bp); 1095 XFS_BUF_DONE(bp);
1096 XFS_BUF_UNDELAYWRITE(bp); 1096 XFS_BUF_UNDELAYWRITE(bp);
1097 XFS_BUF_ERROR(bp,0); 1097 XFS_BUF_ERROR(bp,0);
1098 xfs_buftrace("BUF_ERROR_RELSE", bp); 1098 xfs_buftrace("BUF_ERROR_RELSE", bp);
1099 if (! XFS_FORCED_SHUTDOWN(mp)) 1099 if (! XFS_FORCED_SHUTDOWN(mp))
1100 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1100 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1101 /* 1101 /*
1102 * We have to unpin the pinned buffers so do the 1102 * We have to unpin the pinned buffers so do the
1103 * callbacks. 1103 * callbacks.
1104 */ 1104 */
1105 xfs_buf_do_callbacks(bp, lip); 1105 xfs_buf_do_callbacks(bp, lip);
1106 XFS_BUF_SET_FSPRIVATE(bp, NULL); 1106 XFS_BUF_SET_FSPRIVATE(bp, NULL);
1107 XFS_BUF_CLR_IODONE_FUNC(bp); 1107 XFS_BUF_CLR_IODONE_FUNC(bp);
1108 XFS_BUF_SET_BRELSE_FUNC(bp,NULL); 1108 XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
1109 xfs_buf_relse(bp); 1109 xfs_buf_relse(bp);
1110 } 1110 }
1111 1111
1112 1112
1113 /* 1113 /*
1114 * This is the iodone() function for buffers which have been 1114 * This is the iodone() function for buffers which have been
1115 * logged. It is called when they are eventually flushed out. 1115 * logged. It is called when they are eventually flushed out.
1116 * It should remove the buf item from the AIL, and free the buf item. 1116 * It should remove the buf item from the AIL, and free the buf item.
1117 * It is called by xfs_buf_iodone_callbacks() above which will take 1117 * It is called by xfs_buf_iodone_callbacks() above which will take
1118 * care of cleaning up the buffer itself. 1118 * care of cleaning up the buffer itself.
1119 */ 1119 */
1120 /* ARGSUSED */ 1120 /* ARGSUSED */
1121 void 1121 void
1122 xfs_buf_iodone( 1122 xfs_buf_iodone(
1123 xfs_buf_t *bp, 1123 xfs_buf_t *bp,
1124 xfs_buf_log_item_t *bip) 1124 xfs_buf_log_item_t *bip)
1125 { 1125 {
1126 struct xfs_ail *ailp = bip->bli_item.li_ailp; 1126 struct xfs_ail *ailp = bip->bli_item.li_ailp;
1127 1127
1128 ASSERT(bip->bli_buf == bp); 1128 ASSERT(bip->bli_buf == bp);
1129 1129
1130 xfs_buf_rele(bp); 1130 xfs_buf_rele(bp);
1131 1131
1132 /* 1132 /*
1133 * If we are forcibly shutting down, this may well be 1133 * If we are forcibly shutting down, this may well be
1134 * off the AIL already. That's because we simulate the 1134 * off the AIL already. That's because we simulate the
1135 * log-committed callbacks to unpin these buffers. Or we may never 1135 * log-committed callbacks to unpin these buffers. Or we may never
1136 * have put this item on AIL because the transaction was 1136 * have put this item on AIL because the transaction was
1137 * aborted forcibly. xfs_trans_ail_delete() takes care of these. 1137 * aborted forcibly. xfs_trans_ail_delete() takes care of these.
1138 * 1138 *
1139 * Either way, AIL is useless if we're forcing a shutdown. 1139 * Either way, AIL is useless if we're forcing a shutdown.
1140 */ 1140 */
1141 spin_lock(&ailp->xa_lock); 1141 spin_lock(&ailp->xa_lock);
1142 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip); 1142 xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1143 xfs_buf_item_free(bip); 1143 xfs_buf_item_free(bip);
1144 } 1144 }
1145 1145
1146 #if defined(XFS_BLI_TRACE) 1146 #if defined(XFS_BLI_TRACE)
1147 void 1147 void
1148 xfs_buf_item_trace( 1148 xfs_buf_item_trace(
1149 char *id, 1149 char *id,
1150 xfs_buf_log_item_t *bip) 1150 xfs_buf_log_item_t *bip)
1151 { 1151 {
1152 xfs_buf_t *bp; 1152 xfs_buf_t *bp;
1153 ASSERT(bip->bli_trace != NULL); 1153 ASSERT(bip->bli_trace != NULL);
1154 1154
1155 bp = bip->bli_buf; 1155 bp = bip->bli_buf;
1156 ktrace_enter(bip->bli_trace, 1156 ktrace_enter(bip->bli_trace,
1157 (void *)id, 1157 (void *)id,
1158 (void *)bip->bli_buf, 1158 (void *)bip->bli_buf,
1159 (void *)((unsigned long)bip->bli_flags), 1159 (void *)((unsigned long)bip->bli_flags),
1160 (void *)((unsigned long)bip->bli_recur), 1160 (void *)((unsigned long)bip->bli_recur),
1161 (void *)((unsigned long)atomic_read(&bip->bli_refcount)), 1161 (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
1162 (void *)((unsigned long) 1162 (void *)((unsigned long)
1163 (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)), 1163 (0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32)),
1164 (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))), 1164 (void *)((unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp))),
1165 (void *)((unsigned long)XFS_BUF_COUNT(bp)), 1165 (void *)((unsigned long)XFS_BUF_COUNT(bp)),
1166 (void *)((unsigned long)XFS_BUF_BFLAGS(bp)), 1166 (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
1167 XFS_BUF_FSPRIVATE(bp, void *), 1167 XFS_BUF_FSPRIVATE(bp, void *),
1168 XFS_BUF_FSPRIVATE2(bp, void *), 1168 XFS_BUF_FSPRIVATE2(bp, void *),
1169 (void *)(unsigned long)XFS_BUF_ISPINNED(bp), 1169 (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
1170 (void *)XFS_BUF_IODONE_FUNC(bp), 1170 (void *)XFS_BUF_IODONE_FUNC(bp),
1171 (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))), 1171 (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
1172 (void *)bip->bli_item.li_desc, 1172 (void *)bip->bli_item.li_desc,
1173 (void *)((unsigned long)bip->bli_item.li_flags)); 1173 (void *)((unsigned long)bip->bli_item.li_flags));
1174 } 1174 }
1175 #endif /* XFS_BLI_TRACE */ 1175 #endif /* XFS_BLI_TRACE */
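ktrace_enter() above takes void * slots, so the 64-bit block address is split into two 32-bit halves before tracing. A tiny sketch of that split (hypothetical helper name):

#include <stdint.h>

/* Split a 64-bit disk address into the two 32-bit trace slots used above. */
static void split_addr(uint64_t addr, unsigned long *hi, unsigned long *lo)
{
	*hi = (unsigned long)(0xFFFFFFFFu & (addr >> 32));
	*lo = (unsigned long)(0xFFFFFFFFu & addr);
}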
1176 1176
fs/xfs/xfs_log_recover.c
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dir2.h" 27 #include "xfs_dir2.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_mount.h" 29 #include "xfs_mount.h"
30 #include "xfs_error.h" 30 #include "xfs_error.h"
31 #include "xfs_bmap_btree.h" 31 #include "xfs_bmap_btree.h"
32 #include "xfs_alloc_btree.h" 32 #include "xfs_alloc_btree.h"
33 #include "xfs_ialloc_btree.h" 33 #include "xfs_ialloc_btree.h"
34 #include "xfs_dir2_sf.h" 34 #include "xfs_dir2_sf.h"
35 #include "xfs_attr_sf.h" 35 #include "xfs_attr_sf.h"
36 #include "xfs_dinode.h" 36 #include "xfs_dinode.h"
37 #include "xfs_inode.h" 37 #include "xfs_inode.h"
38 #include "xfs_inode_item.h" 38 #include "xfs_inode_item.h"
39 #include "xfs_alloc.h" 39 #include "xfs_alloc.h"
40 #include "xfs_ialloc.h" 40 #include "xfs_ialloc.h"
41 #include "xfs_log_priv.h" 41 #include "xfs_log_priv.h"
42 #include "xfs_buf_item.h" 42 #include "xfs_buf_item.h"
43 #include "xfs_log_recover.h" 43 #include "xfs_log_recover.h"
44 #include "xfs_extfree_item.h" 44 #include "xfs_extfree_item.h"
45 #include "xfs_trans_priv.h" 45 #include "xfs_trans_priv.h"
46 #include "xfs_quota.h" 46 #include "xfs_quota.h"
47 #include "xfs_rw.h" 47 #include "xfs_rw.h"
48 #include "xfs_utils.h" 48 #include "xfs_utils.h"
49 49
50 STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 50 STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
51 STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 51 STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
52 STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, 52 STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
53 xlog_recover_item_t *item); 53 xlog_recover_item_t *item);
54 #if defined(DEBUG) 54 #if defined(DEBUG)
55 STATIC void xlog_recover_check_summary(xlog_t *); 55 STATIC void xlog_recover_check_summary(xlog_t *);
56 #else 56 #else
57 #define xlog_recover_check_summary(log) 57 #define xlog_recover_check_summary(log)
58 #endif 58 #endif
59 59
60 60
61 /* 61 /*
62 * Sector aligned buffer routines for buffer create/read/write/access 62 * Sector aligned buffer routines for buffer create/read/write/access
63 */ 63 */
64 64
65 #define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ 65 #define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \
66 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ 66 ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \
67 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) 67 ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) )
68 #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) 68 #define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask)
69 69
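The two macros above do sector alignment in units of 512-byte basic blocks. As a worked example, assume a log device with 4 KB sectors, i.e. l_sectbb_mask == 7 (the mask value here is an assumption for illustration):

#include <stdio.h>

int main(void)
{
	unsigned mask = 7;	/* sector size in basic blocks, minus one */
	unsigned bbs = 10;	/* block count that straddles a sector boundary */
	unsigned bno = 21;	/* arbitrary block number */

	/* XLOG_SECTOR_ROUNDUP_BBCOUNT: round the count up to a whole sector */
	unsigned up = (mask && (bbs & mask)) ? ((bbs + mask + 1) & ~mask) : bbs;

	/* XLOG_SECTOR_ROUNDDOWN_BLKNO: align the block number downwards */
	unsigned down = bno & ~mask;

	printf("roundup(%u) = %u, rounddown(%u) = %u\n", bbs, up, bno, down);	/* 16 and 16 */
	return 0;
}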
70 xfs_buf_t * 70 xfs_buf_t *
71 xlog_get_bp( 71 xlog_get_bp(
72 xlog_t *log, 72 xlog_t *log,
73 int num_bblks) 73 int num_bblks)
74 { 74 {
75 ASSERT(num_bblks > 0); 75 ASSERT(num_bblks > 0);
76 76
77 if (log->l_sectbb_log) { 77 if (log->l_sectbb_log) {
78 if (num_bblks > 1) 78 if (num_bblks > 1)
79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 79 num_bblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks); 80 num_bblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, num_bblks);
81 } 81 }
82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp); 82 return xfs_buf_get_noaddr(BBTOB(num_bblks), log->l_mp->m_logdev_targp);
83 } 83 }
84 84
85 void 85 void
86 xlog_put_bp( 86 xlog_put_bp(
87 xfs_buf_t *bp) 87 xfs_buf_t *bp)
88 { 88 {
89 xfs_buf_free(bp); 89 xfs_buf_free(bp);
90 } 90 }
91 91
92 92
93 /* 93 /*
94 * nbblks should be uint, but oh well. Just want to catch that 32-bit length. 94 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
95 */ 95 */
96 int 96 int
97 xlog_bread( 97 xlog_bread(
98 xlog_t *log, 98 xlog_t *log,
99 xfs_daddr_t blk_no, 99 xfs_daddr_t blk_no,
100 int nbblks, 100 int nbblks,
101 xfs_buf_t *bp) 101 xfs_buf_t *bp)
102 { 102 {
103 int error; 103 int error;
104 104
105 if (log->l_sectbb_log) { 105 if (log->l_sectbb_log) {
106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 106 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 107 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
108 } 108 }
109 109
110 ASSERT(nbblks > 0); 110 ASSERT(nbblks > 0);
111 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 111 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
112 ASSERT(bp); 112 ASSERT(bp);
113 113
114 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 114 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
115 XFS_BUF_READ(bp); 115 XFS_BUF_READ(bp);
116 XFS_BUF_BUSY(bp); 116 XFS_BUF_BUSY(bp);
117 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 117 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
118 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 118 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
119 119
120 xfsbdstrat(log->l_mp, bp); 120 xfsbdstrat(log->l_mp, bp);
121 error = xfs_iowait(bp); 121 error = xfs_iowait(bp);
122 if (error) 122 if (error)
123 xfs_ioerror_alert("xlog_bread", log->l_mp, 123 xfs_ioerror_alert("xlog_bread", log->l_mp,
124 bp, XFS_BUF_ADDR(bp)); 124 bp, XFS_BUF_ADDR(bp));
125 return error; 125 return error;
126 } 126 }
127 127
128 /* 128 /*
129 * Write out the buffer at the given block for the given number of blocks. 129 * Write out the buffer at the given block for the given number of blocks.
130 * The buffer is kept locked across the write and is returned locked. 130 * The buffer is kept locked across the write and is returned locked.
131 * This can only be used for synchronous log writes. 131 * This can only be used for synchronous log writes.
132 */ 132 */
133 STATIC int 133 STATIC int
134 xlog_bwrite( 134 xlog_bwrite(
135 xlog_t *log, 135 xlog_t *log,
136 xfs_daddr_t blk_no, 136 xfs_daddr_t blk_no,
137 int nbblks, 137 int nbblks,
138 xfs_buf_t *bp) 138 xfs_buf_t *bp)
139 { 139 {
140 int error; 140 int error;
141 141
142 if (log->l_sectbb_log) { 142 if (log->l_sectbb_log) {
143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); 143 blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no);
144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); 144 nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks);
145 } 145 }
146 146
147 ASSERT(nbblks > 0); 147 ASSERT(nbblks > 0);
148 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); 148 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
149 149
150 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); 150 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
151 XFS_BUF_ZEROFLAGS(bp); 151 XFS_BUF_ZEROFLAGS(bp);
152 XFS_BUF_BUSY(bp); 152 XFS_BUF_BUSY(bp);
153 XFS_BUF_HOLD(bp); 153 XFS_BUF_HOLD(bp);
154 XFS_BUF_PSEMA(bp, PRIBIO); 154 XFS_BUF_PSEMA(bp, PRIBIO);
155 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); 155 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
156 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 156 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
157 157
158 if ((error = xfs_bwrite(log->l_mp, bp))) 158 if ((error = xfs_bwrite(log->l_mp, bp)))
159 xfs_ioerror_alert("xlog_bwrite", log->l_mp, 159 xfs_ioerror_alert("xlog_bwrite", log->l_mp,
160 bp, XFS_BUF_ADDR(bp)); 160 bp, XFS_BUF_ADDR(bp));
161 return error; 161 return error;
162 } 162 }
163 163
164 STATIC xfs_caddr_t 164 STATIC xfs_caddr_t
165 xlog_align( 165 xlog_align(
166 xlog_t *log, 166 xlog_t *log,
167 xfs_daddr_t blk_no, 167 xfs_daddr_t blk_no,
168 int nbblks, 168 int nbblks,
169 xfs_buf_t *bp) 169 xfs_buf_t *bp)
170 { 170 {
171 xfs_caddr_t ptr; 171 xfs_caddr_t ptr;
172 172
173 if (!log->l_sectbb_log) 173 if (!log->l_sectbb_log)
174 return XFS_BUF_PTR(bp); 174 return XFS_BUF_PTR(bp);
175 175
176 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); 176 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
177 ASSERT(XFS_BUF_SIZE(bp) >= 177 ASSERT(XFS_BUF_SIZE(bp) >=
178 BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); 178 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
179 return ptr; 179 return ptr;
180 } 180 }
181 181
182 #ifdef DEBUG 182 #ifdef DEBUG
183 /* 183 /*
184 * dump debug superblock and log record information 184 * dump debug superblock and log record information
185 */ 185 */
186 STATIC void 186 STATIC void
187 xlog_header_check_dump( 187 xlog_header_check_dump(
188 xfs_mount_t *mp, 188 xfs_mount_t *mp,
189 xlog_rec_header_t *head) 189 xlog_rec_header_t *head)
190 { 190 {
191 int b; 191 int b;
192 192
193 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__); 193 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
194 for (b = 0; b < 16; b++) 194 for (b = 0; b < 16; b++)
195 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); 195 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
196 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); 196 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
197 cmn_err(CE_DEBUG, " log : uuid = "); 197 cmn_err(CE_DEBUG, " log : uuid = ");
198 for (b = 0; b < 16; b++) 198 for (b = 0; b < 16; b++)
199 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]); 199 cmn_err(CE_DEBUG, "%02x",((uchar_t *)&head->h_fs_uuid)[b]);
200 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt)); 200 cmn_err(CE_DEBUG, ", fmt = %d\n", be32_to_cpu(head->h_fmt));
201 } 201 }
202 #else 202 #else
203 #define xlog_header_check_dump(mp, head) 203 #define xlog_header_check_dump(mp, head)
204 #endif 204 #endif
205 205
206 /* 206 /*
207 * check log record header for recovery 207 * check log record header for recovery
208 */ 208 */
209 STATIC int 209 STATIC int
210 xlog_header_check_recover( 210 xlog_header_check_recover(
211 xfs_mount_t *mp, 211 xfs_mount_t *mp,
212 xlog_rec_header_t *head) 212 xlog_rec_header_t *head)
213 { 213 {
214 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); 214 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
215 215
216 /* 216 /*
217 * IRIX doesn't write the h_fmt field and leaves it zeroed 217 * IRIX doesn't write the h_fmt field and leaves it zeroed
218 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover 218 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
219 * a dirty log created in IRIX. 219 * a dirty log created in IRIX.
220 */ 220 */
221 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { 221 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
222 xlog_warn( 222 xlog_warn(
223 "XFS: dirty log written in incompatible format - can't recover"); 223 "XFS: dirty log written in incompatible format - can't recover");
224 xlog_header_check_dump(mp, head); 224 xlog_header_check_dump(mp, head);
225 XFS_ERROR_REPORT("xlog_header_check_recover(1)", 225 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
226 XFS_ERRLEVEL_HIGH, mp); 226 XFS_ERRLEVEL_HIGH, mp);
227 return XFS_ERROR(EFSCORRUPTED); 227 return XFS_ERROR(EFSCORRUPTED);
228 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 228 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
229 xlog_warn( 229 xlog_warn(
230 "XFS: dirty log entry has mismatched uuid - can't recover"); 230 "XFS: dirty log entry has mismatched uuid - can't recover");
231 xlog_header_check_dump(mp, head); 231 xlog_header_check_dump(mp, head);
232 XFS_ERROR_REPORT("xlog_header_check_recover(2)", 232 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
233 XFS_ERRLEVEL_HIGH, mp); 233 XFS_ERRLEVEL_HIGH, mp);
234 return XFS_ERROR(EFSCORRUPTED); 234 return XFS_ERROR(EFSCORRUPTED);
235 } 235 }
236 return 0; 236 return 0;
237 } 237 }
238 238
239 /* 239 /*
240 * read the head block of the log and check the header 240 * read the head block of the log and check the header
241 */ 241 */
242 STATIC int 242 STATIC int
243 xlog_header_check_mount( 243 xlog_header_check_mount(
244 xfs_mount_t *mp, 244 xfs_mount_t *mp,
245 xlog_rec_header_t *head) 245 xlog_rec_header_t *head)
246 { 246 {
247 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); 247 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
248 248
249 if (uuid_is_nil(&head->h_fs_uuid)) { 249 if (uuid_is_nil(&head->h_fs_uuid)) {
250 /* 250 /*
251 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If 251 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
252 * h_fs_uuid is nil, we assume this log was last mounted 252 * h_fs_uuid is nil, we assume this log was last mounted
253 * by IRIX and continue. 253 * by IRIX and continue.
254 */ 254 */
255 xlog_warn("XFS: nil uuid in log - IRIX style log"); 255 xlog_warn("XFS: nil uuid in log - IRIX style log");
256 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { 256 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
257 xlog_warn("XFS: log has mismatched uuid - can't recover"); 257 xlog_warn("XFS: log has mismatched uuid - can't recover");
258 xlog_header_check_dump(mp, head); 258 xlog_header_check_dump(mp, head);
259 XFS_ERROR_REPORT("xlog_header_check_mount", 259 XFS_ERROR_REPORT("xlog_header_check_mount",
260 XFS_ERRLEVEL_HIGH, mp); 260 XFS_ERRLEVEL_HIGH, mp);
261 return XFS_ERROR(EFSCORRUPTED); 261 return XFS_ERROR(EFSCORRUPTED);
262 } 262 }
263 return 0; 263 return 0;
264 } 264 }
265 265
266 STATIC void 266 STATIC void
267 xlog_recover_iodone( 267 xlog_recover_iodone(
268 struct xfs_buf *bp) 268 struct xfs_buf *bp)
269 { 269 {
270 xfs_mount_t *mp;
271
272 ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
273
274 if (XFS_BUF_GETERROR(bp)) { 270 if (XFS_BUF_GETERROR(bp)) {
275 /* 271 /*
276 * We're not going to bother about retrying 272 * We're not going to bother about retrying
277 * this during recovery. One strike! 273 * this during recovery. One strike!
278 */ 274 */
279 mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
280 xfs_ioerror_alert("xlog_recover_iodone", 275 xfs_ioerror_alert("xlog_recover_iodone",
281 mp, bp, XFS_BUF_ADDR(bp)); 276 bp->b_mount, bp, XFS_BUF_ADDR(bp));
282 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 277 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
283 } 278 }
284 XFS_BUF_SET_FSPRIVATE(bp, NULL); 279 bp->b_mount = NULL;
285 XFS_BUF_CLR_IODONE_FUNC(bp); 280 XFS_BUF_CLR_IODONE_FUNC(bp);
286 xfs_biodone(bp); 281 xfs_biodone(bp);
287 } 282 }
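The replacement of the b_fspriv stash with bp->b_mount is visible on the right-hand side of the hunk above: the handler now reads the mount straight from the buffer and clears it when done. The submit side lives in a hunk outside this excerpt; the sketch below only assumes its general shape (hypothetical helper, using the XFS_BUF_SET_IODONE_FUNC accessor seen earlier in this diff):

/*
 * Hypothetical illustration of the pairing: the recovery write path stores
 * the mount in b_mount before I/O so xlog_recover_iodone() can use it.
 */
static void demo_recover_submit(xfs_mount_t *mp, xfs_buf_t *bp)
{
	bp->b_mount = mp;				/* new-style mount stash */
	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
	/* ... the buffer is then written out by the recovery code ... */
}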
288 283
289 /* 284 /*
290 * This routine finds (to an approximation) the first block in the physical 285 * This routine finds (to an approximation) the first block in the physical
291 * log which contains the given cycle. It uses a binary search algorithm. 286 * log which contains the given cycle. It uses a binary search algorithm.
292 * Note that the algorithm can not be perfect because the disk will not 287 * Note that the algorithm can not be perfect because the disk will not
293 * necessarily be perfect. 288 * necessarily be perfect.
294 */ 289 */
295 STATIC int 290 STATIC int
296 xlog_find_cycle_start( 291 xlog_find_cycle_start(
297 xlog_t *log, 292 xlog_t *log,
298 xfs_buf_t *bp, 293 xfs_buf_t *bp,
299 xfs_daddr_t first_blk, 294 xfs_daddr_t first_blk,
300 xfs_daddr_t *last_blk, 295 xfs_daddr_t *last_blk,
301 uint cycle) 296 uint cycle)
302 { 297 {
303 xfs_caddr_t offset; 298 xfs_caddr_t offset;
304 xfs_daddr_t mid_blk; 299 xfs_daddr_t mid_blk;
305 uint mid_cycle; 300 uint mid_cycle;
306 int error; 301 int error;
307 302
308 mid_blk = BLK_AVG(first_blk, *last_blk); 303 mid_blk = BLK_AVG(first_blk, *last_blk);
309 while (mid_blk != first_blk && mid_blk != *last_blk) { 304 while (mid_blk != first_blk && mid_blk != *last_blk) {
310 if ((error = xlog_bread(log, mid_blk, 1, bp))) 305 if ((error = xlog_bread(log, mid_blk, 1, bp)))
311 return error; 306 return error;
312 offset = xlog_align(log, mid_blk, 1, bp); 307 offset = xlog_align(log, mid_blk, 1, bp);
313 mid_cycle = xlog_get_cycle(offset); 308 mid_cycle = xlog_get_cycle(offset);
314 if (mid_cycle == cycle) { 309 if (mid_cycle == cycle) {
315 *last_blk = mid_blk; 310 *last_blk = mid_blk;
316 /* last_half_cycle == mid_cycle */ 311 /* last_half_cycle == mid_cycle */
317 } else { 312 } else {
318 first_blk = mid_blk; 313 first_blk = mid_blk;
319 /* first_half_cycle == mid_cycle */ 314 /* first_half_cycle == mid_cycle */
320 } 315 }
321 mid_blk = BLK_AVG(first_blk, *last_blk); 316 mid_blk = BLK_AVG(first_blk, *last_blk);
322 } 317 }
323 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || 318 ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) ||
324 (mid_blk == *last_blk && mid_blk-1 == first_blk)); 319 (mid_blk == *last_blk && mid_blk-1 == first_blk));
325 320
326 return 0; 321 return 0;
327 } 322 }
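The loop above is a binary search over log blocks whose cycle numbers change exactly once from an old value to the target value. The same narrowing step on an in-memory array (purely illustrative data and names):

#include <stdio.h>

static unsigned find_cycle_start(const unsigned *cycles, unsigned first,
				 unsigned last, unsigned cycle)
{
	unsigned mid = (first + last) / 2;

	while (mid != first && mid != last) {
		if (cycles[mid] == cycle)
			last = mid;	/* target cycle: answer is at or left of mid */
		else
			first = mid;	/* older cycle: answer is right of mid */
		mid = (first + last) / 2;
	}
	return last;			/* first block stamped with 'cycle' */
}

int main(void)
{
	/* cycle numbers as they might appear across log blocks: newer cycle
	 * at the front of the physical log, older cycle behind it */
	unsigned cycles[] = { 4, 4, 4, 4, 3, 3, 3, 3 };

	printf("cycle 3 starts at block %u\n",
	       find_cycle_start(cycles, 0, 7, 3));	/* prints 4 */
	return 0;
}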
328 323
329 /* 324 /*
330 * Check that the range of blocks does not contain the cycle number 325 * Check that the range of blocks does not contain the cycle number
331 * given. The scan needs to occur from front to back and the ptr into the 326 * given. The scan needs to occur from front to back and the ptr into the
332 * region must be updated since a later routine will need to perform another 327 * region must be updated since a later routine will need to perform another
333 * test. If the region is completely good, we end up returning the same 328 * test. If the region is completely good, we end up returning the same
334 * last block number. 329 * last block number.
335 * 330 *
336 * Set blkno to -1 if we encounter no errors. This is an invalid block number 331 * Set blkno to -1 if we encounter no errors. This is an invalid block number
337 * since we don't ever expect logs to get this large. 332 * since we don't ever expect logs to get this large.
338 */ 333 */
339 STATIC int 334 STATIC int
340 xlog_find_verify_cycle( 335 xlog_find_verify_cycle(
341 xlog_t *log, 336 xlog_t *log,
342 xfs_daddr_t start_blk, 337 xfs_daddr_t start_blk,
343 int nbblks, 338 int nbblks,
344 uint stop_on_cycle_no, 339 uint stop_on_cycle_no,
345 xfs_daddr_t *new_blk) 340 xfs_daddr_t *new_blk)
346 { 341 {
347 xfs_daddr_t i, j; 342 xfs_daddr_t i, j;
348 uint cycle; 343 uint cycle;
349 xfs_buf_t *bp; 344 xfs_buf_t *bp;
350 xfs_daddr_t bufblks; 345 xfs_daddr_t bufblks;
351 xfs_caddr_t buf = NULL; 346 xfs_caddr_t buf = NULL;
352 int error = 0; 347 int error = 0;
353 348
354 bufblks = 1 << ffs(nbblks); 349 bufblks = 1 << ffs(nbblks);
355 350
356 while (!(bp = xlog_get_bp(log, bufblks))) { 351 while (!(bp = xlog_get_bp(log, bufblks))) {
357 /* can't get enough memory to do everything in one big buffer */ 352 /* can't get enough memory to do everything in one big buffer */
358 bufblks >>= 1; 353 bufblks >>= 1;
359 if (bufblks <= log->l_sectbb_log) 354 if (bufblks <= log->l_sectbb_log)
360 return ENOMEM; 355 return ENOMEM;
361 } 356 }
362 357
363 for (i = start_blk; i < start_blk + nbblks; i += bufblks) { 358 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
364 int bcount; 359 int bcount;
365 360
366 bcount = min(bufblks, (start_blk + nbblks - i)); 361 bcount = min(bufblks, (start_blk + nbblks - i));
367 362
368 if ((error = xlog_bread(log, i, bcount, bp))) 363 if ((error = xlog_bread(log, i, bcount, bp)))
369 goto out; 364 goto out;
370 365
371 buf = xlog_align(log, i, bcount, bp); 366 buf = xlog_align(log, i, bcount, bp);
372 for (j = 0; j < bcount; j++) { 367 for (j = 0; j < bcount; j++) {
373 cycle = xlog_get_cycle(buf); 368 cycle = xlog_get_cycle(buf);
374 if (cycle == stop_on_cycle_no) { 369 if (cycle == stop_on_cycle_no) {
375 *new_blk = i+j; 370 *new_blk = i+j;
376 goto out; 371 goto out;
377 } 372 }
378 373
379 buf += BBSIZE; 374 buf += BBSIZE;
380 } 375 }
381 } 376 }
382 377
383 *new_blk = -1; 378 *new_blk = -1;
384 379
385 out: 380 out:
386 xlog_put_bp(bp); 381 xlog_put_bp(bp);
387 return error; 382 return error;
388 } 383 }
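One detail worth calling out in xlog_find_verify_cycle() is the allocation fallback: it asks for a large scratch buffer and keeps halving the request until xlog_get_bp() succeeds, giving up only when the size becomes too small to be useful. The same pattern in plain C, with malloc() standing in (illustrative only):

#include <stdlib.h>

/* Try to allocate 'want' bytes, halving on failure; returns NULL once the
 * halved request reaches 'min' or below. */
static void *alloc_scratch(size_t want, size_t min, size_t *got)
{
	void *buf;

	while ((buf = malloc(want)) == NULL) {
		want >>= 1;
		if (want <= min)
			return NULL;
	}
	*got = want;
	return buf;
}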
389 384
390 /* 385 /*
391 * Potentially backup over partial log record write. 386 * Potentially backup over partial log record write.
392 * 387 *
393 * In the typical case, last_blk is the number of the block directly after 388 * In the typical case, last_blk is the number of the block directly after
394 * a good log record. Therefore, we subtract one to get the block number 389 * a good log record. Therefore, we subtract one to get the block number
395 * of the last block in the given buffer. extra_bblks contains the number 390 * of the last block in the given buffer. extra_bblks contains the number
396 * of blocks we would have read on a previous read. This happens when the 391 * of blocks we would have read on a previous read. This happens when the
397 * last log record is split over the end of the physical log. 392 * last log record is split over the end of the physical log.
398 * 393 *
399 * extra_bblks is the number of blocks potentially verified on a previous 394 * extra_bblks is the number of blocks potentially verified on a previous
400 * call to this routine. 395 * call to this routine.
401 */ 396 */
402 STATIC int 397 STATIC int
403 xlog_find_verify_log_record( 398 xlog_find_verify_log_record(
404 xlog_t *log, 399 xlog_t *log,
405 xfs_daddr_t start_blk, 400 xfs_daddr_t start_blk,
406 xfs_daddr_t *last_blk, 401 xfs_daddr_t *last_blk,
407 int extra_bblks) 402 int extra_bblks)
408 { 403 {
409 xfs_daddr_t i; 404 xfs_daddr_t i;
410 xfs_buf_t *bp; 405 xfs_buf_t *bp;
411 xfs_caddr_t offset = NULL; 406 xfs_caddr_t offset = NULL;
412 xlog_rec_header_t *head = NULL; 407 xlog_rec_header_t *head = NULL;
413 int error = 0; 408 int error = 0;
414 int smallmem = 0; 409 int smallmem = 0;
415 int num_blks = *last_blk - start_blk; 410 int num_blks = *last_blk - start_blk;
416 int xhdrs; 411 int xhdrs;
417 412
418 ASSERT(start_blk != 0 || *last_blk != start_blk); 413 ASSERT(start_blk != 0 || *last_blk != start_blk);
419 414
420 if (!(bp = xlog_get_bp(log, num_blks))) { 415 if (!(bp = xlog_get_bp(log, num_blks))) {
421 if (!(bp = xlog_get_bp(log, 1))) 416 if (!(bp = xlog_get_bp(log, 1)))
422 return ENOMEM; 417 return ENOMEM;
423 smallmem = 1; 418 smallmem = 1;
424 } else { 419 } else {
425 if ((error = xlog_bread(log, start_blk, num_blks, bp))) 420 if ((error = xlog_bread(log, start_blk, num_blks, bp)))
426 goto out; 421 goto out;
427 offset = xlog_align(log, start_blk, num_blks, bp); 422 offset = xlog_align(log, start_blk, num_blks, bp);
428 offset += ((num_blks - 1) << BBSHIFT); 423 offset += ((num_blks - 1) << BBSHIFT);
429 } 424 }
430 425
431 for (i = (*last_blk) - 1; i >= 0; i--) { 426 for (i = (*last_blk) - 1; i >= 0; i--) {
432 if (i < start_blk) { 427 if (i < start_blk) {
433 /* valid log record not found */ 428 /* valid log record not found */
434 xlog_warn( 429 xlog_warn(
435 "XFS: Log inconsistent (didn't find previous header)"); 430 "XFS: Log inconsistent (didn't find previous header)");
436 ASSERT(0); 431 ASSERT(0);
437 error = XFS_ERROR(EIO); 432 error = XFS_ERROR(EIO);
438 goto out; 433 goto out;
439 } 434 }
440 435
441 if (smallmem) { 436 if (smallmem) {
442 if ((error = xlog_bread(log, i, 1, bp))) 437 if ((error = xlog_bread(log, i, 1, bp)))
443 goto out; 438 goto out;
444 offset = xlog_align(log, i, 1, bp); 439 offset = xlog_align(log, i, 1, bp);
445 } 440 }
446 441
447 head = (xlog_rec_header_t *)offset; 442 head = (xlog_rec_header_t *)offset;
448 443
449 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) 444 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
450 break; 445 break;
451 446
452 if (!smallmem) 447 if (!smallmem)
453 offset -= BBSIZE; 448 offset -= BBSIZE;
454 } 449 }
455 450
456 /* 451 /*
457 * We hit the beginning of the physical log & still no header. Return 452 * We hit the beginning of the physical log & still no header. Return
458 * to caller. If caller can handle a return of -1, then this routine 453 * to caller. If caller can handle a return of -1, then this routine
459 * will be called again for the end of the physical log. 454 * will be called again for the end of the physical log.
460 */ 455 */
461 if (i == -1) { 456 if (i == -1) {
462 error = -1; 457 error = -1;
463 goto out; 458 goto out;
464 } 459 }
465 460
466 /* 461 /*
467 * We have the final block of the good log (the first block 462 * We have the final block of the good log (the first block
468 * of the log record _before_ the head). So we check the uuid. 463 * of the log record _before_ the head). So we check the uuid.
469 */ 464 */
470 if ((error = xlog_header_check_mount(log->l_mp, head))) 465 if ((error = xlog_header_check_mount(log->l_mp, head)))
471 goto out; 466 goto out;
472 467
473 /* 468 /*
474 * We may have found a log record header before we expected one. 469 * We may have found a log record header before we expected one.
475 * last_blk will be the 1st block # with a given cycle #. We may end 470 * last_blk will be the 1st block # with a given cycle #. We may end
476 * up reading an entire log record. In this case, we don't want to 471 * up reading an entire log record. In this case, we don't want to
477 * reset last_blk. Only when last_blk points in the middle of a log 472 * reset last_blk. Only when last_blk points in the middle of a log
478 * record do we update last_blk. 473 * record do we update last_blk.
479 */ 474 */
480 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 475 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
481 uint h_size = be32_to_cpu(head->h_size); 476 uint h_size = be32_to_cpu(head->h_size);
482 477
483 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; 478 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
484 if (h_size % XLOG_HEADER_CYCLE_SIZE) 479 if (h_size % XLOG_HEADER_CYCLE_SIZE)
485 xhdrs++; 480 xhdrs++;
486 } else { 481 } else {
487 xhdrs = 1; 482 xhdrs = 1;
488 } 483 }
489 484
490 if (*last_blk - i + extra_bblks != 485 if (*last_blk - i + extra_bblks !=
491 BTOBB(be32_to_cpu(head->h_len)) + xhdrs) 486 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
492 *last_blk = i; 487 *last_blk = i;
493 488
494 out: 489 out:
495 xlog_put_bp(bp); 490 xlog_put_bp(bp);
496 return error; 491 return error;
497 } 492 }
498 493
499 /* 494 /*
500 * Head is defined to be the point of the log where the next log write 495 * Head is defined to be the point of the log where the next log write
502 * could go. This means that incomplete LR writes at the end are 496 * could go. This means that incomplete LR writes at the end are
502 * eliminated when calculating the head. We aren't guaranteed that previous 497 * eliminated when calculating the head. We aren't guaranteed that previous
503 * LR have complete transactions. We only know that a cycle number of 498 * LR have complete transactions. We only know that a cycle number of
504 * current cycle number -1 won't be present in the log if we start writing 499 * current cycle number -1 won't be present in the log if we start writing
505 * from our current block number. 500 * from our current block number.
506 * 501 *
507 * last_blk contains the block number of the first block with a given 502 * last_blk contains the block number of the first block with a given
508 * cycle number. 503 * cycle number.
509 * 504 *
510 * Return: zero if normal, non-zero if error. 505 * Return: zero if normal, non-zero if error.
511 */ 506 */
512 STATIC int 507 STATIC int
513 xlog_find_head( 508 xlog_find_head(
514 xlog_t *log, 509 xlog_t *log,
515 xfs_daddr_t *return_head_blk) 510 xfs_daddr_t *return_head_blk)
516 { 511 {
517 xfs_buf_t *bp; 512 xfs_buf_t *bp;
518 xfs_caddr_t offset; 513 xfs_caddr_t offset;
519 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; 514 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
520 int num_scan_bblks; 515 int num_scan_bblks;
521 uint first_half_cycle, last_half_cycle; 516 uint first_half_cycle, last_half_cycle;
522 uint stop_on_cycle; 517 uint stop_on_cycle;
523 int error, log_bbnum = log->l_logBBsize; 518 int error, log_bbnum = log->l_logBBsize;
524 519
525 /* Is the end of the log device zeroed? */ 520 /* Is the end of the log device zeroed? */
526 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) { 521 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
527 *return_head_blk = first_blk; 522 *return_head_blk = first_blk;
528 523
529 /* Is the whole lot zeroed? */ 524 /* Is the whole lot zeroed? */
530 if (!first_blk) { 525 if (!first_blk) {
531 /* Linux XFS shouldn't generate totally zeroed logs - 526 /* Linux XFS shouldn't generate totally zeroed logs -
532 * mkfs etc write a dummy unmount record to a fresh 527 * mkfs etc write a dummy unmount record to a fresh
533 * log so we can store the uuid in there 528 * log so we can store the uuid in there
534 */ 529 */
535 xlog_warn("XFS: totally zeroed log"); 530 xlog_warn("XFS: totally zeroed log");
536 } 531 }
537 532
538 return 0; 533 return 0;
539 } else if (error) { 534 } else if (error) {
540 xlog_warn("XFS: empty log check failed"); 535 xlog_warn("XFS: empty log check failed");
541 return error; 536 return error;
542 } 537 }
543 538
544 first_blk = 0; /* get cycle # of 1st block */ 539 first_blk = 0; /* get cycle # of 1st block */
545 bp = xlog_get_bp(log, 1); 540 bp = xlog_get_bp(log, 1);
546 if (!bp) 541 if (!bp)
547 return ENOMEM; 542 return ENOMEM;
548 if ((error = xlog_bread(log, 0, 1, bp))) 543 if ((error = xlog_bread(log, 0, 1, bp)))
549 goto bp_err; 544 goto bp_err;
550 offset = xlog_align(log, 0, 1, bp); 545 offset = xlog_align(log, 0, 1, bp);
551 first_half_cycle = xlog_get_cycle(offset); 546 first_half_cycle = xlog_get_cycle(offset);
552 547
553 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */ 548 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
554 if ((error = xlog_bread(log, last_blk, 1, bp))) 549 if ((error = xlog_bread(log, last_blk, 1, bp)))
555 goto bp_err; 550 goto bp_err;
556 offset = xlog_align(log, last_blk, 1, bp); 551 offset = xlog_align(log, last_blk, 1, bp);
557 last_half_cycle = xlog_get_cycle(offset); 552 last_half_cycle = xlog_get_cycle(offset);
558 ASSERT(last_half_cycle != 0); 553 ASSERT(last_half_cycle != 0);
559 554
560 /* 555 /*
561 * If the 1st half cycle number is equal to the last half cycle number, 556 * If the 1st half cycle number is equal to the last half cycle number,
562 * then the entire log is stamped with the same cycle number. In this 557 * then the entire log is stamped with the same cycle number. In this
563 * case, head_blk can't be set to zero (which makes sense). The below 558 * case, head_blk can't be set to zero (which makes sense). The below
564 * math doesn't work out properly with head_blk equal to zero. Instead, 559 * math doesn't work out properly with head_blk equal to zero. Instead,
565 * we set it to log_bbnum which is an invalid block number, but this 560 * we set it to log_bbnum which is an invalid block number, but this
566 * value makes the math correct. If head_blk doesn't change through 561 * value makes the math correct. If head_blk doesn't change through
567 * all the tests below, *head_blk is set to zero at the very end rather 562 * all the tests below, *head_blk is set to zero at the very end rather
568 * than log_bbnum. In a sense, log_bbnum and zero are the same block 563 * than log_bbnum. In a sense, log_bbnum and zero are the same block
569 * in a circular file. 564 * in a circular file.
570 */ 565 */
571 if (first_half_cycle == last_half_cycle) { 566 if (first_half_cycle == last_half_cycle) {
572 /* 567 /*
573 * In this case we believe that the entire log should have 568 * In this case we believe that the entire log should have
574 * cycle number last_half_cycle. We need to scan backwards 569 * cycle number last_half_cycle. We need to scan backwards
575 * from the end verifying that there are no holes still 570 * from the end verifying that there are no holes still
576 * containing last_half_cycle - 1. If we find such a hole, 571 * containing last_half_cycle - 1. If we find such a hole,
577 * then the start of that hole will be the new head. The 572 * then the start of that hole will be the new head. The
578 * simple case looks like 573 * simple case looks like
579 * x | x ... | x - 1 | x 574 * x | x ... | x - 1 | x
580 * Another case that fits this picture would be 575 * Another case that fits this picture would be
581 * x | x + 1 | x ... | x 576 * x | x + 1 | x ... | x
582 * In this case the head really is somewhere at the end of the 577 * In this case the head really is somewhere at the end of the
583 * log, as one of the latest writes at the beginning was 578 * log, as one of the latest writes at the beginning was
584 * incomplete. 579 * incomplete.
585 * One more case is 580 * One more case is
586 * x | x + 1 | x ... | x - 1 | x 581 * x | x + 1 | x ... | x - 1 | x
587 * This is really the combination of the above two cases, and 582 * This is really the combination of the above two cases, and
588 * the head has to end up at the start of the x-1 hole at the 583 * the head has to end up at the start of the x-1 hole at the
589 * end of the log. 584 * end of the log.
590 * 585 *
591 * In the 256k log case, we will read from the beginning to the 586 * In the 256k log case, we will read from the beginning to the
592 * end of the log and search for cycle numbers equal to x-1. 587 * end of the log and search for cycle numbers equal to x-1.
593 * We don't worry about the x+1 blocks that we encounter, 588 * We don't worry about the x+1 blocks that we encounter,
594 * because we know that they cannot be the head since the log 589 * because we know that they cannot be the head since the log
595 * started with x. 590 * started with x.
596 */ 591 */
597 head_blk = log_bbnum; 592 head_blk = log_bbnum;
598 stop_on_cycle = last_half_cycle - 1; 593 stop_on_cycle = last_half_cycle - 1;
599 } else { 594 } else {
600 /* 595 /*
601 * In this case we want to find the first block with cycle 596 * In this case we want to find the first block with cycle
602 * number matching last_half_cycle. We expect the log to be 597 * number matching last_half_cycle. We expect the log to be
603 * some variation on 598 * some variation on
604 * x + 1 ... | x ... 599 * x + 1 ... | x ...
605 * The first block with cycle number x (last_half_cycle) will 600 * The first block with cycle number x (last_half_cycle) will
606 * be where the new head belongs. First we do a binary search 601 * be where the new head belongs. First we do a binary search
607 * for the first occurrence of last_half_cycle. The binary 602 * for the first occurrence of last_half_cycle. The binary
608 * search may not be totally accurate, so then we scan back 603 * search may not be totally accurate, so then we scan back
609 * from there looking for occurrences of last_half_cycle before 604 * from there looking for occurrences of last_half_cycle before
610 * us. If that backwards scan wraps around the beginning of 605 * us. If that backwards scan wraps around the beginning of
611 * the log, then we look for occurrences of last_half_cycle - 1 606 * the log, then we look for occurrences of last_half_cycle - 1
612 * at the end of the log. The cases we're looking for look 607 * at the end of the log. The cases we're looking for look
613 * like 608 * like
614 * x + 1 ... | x | x + 1 | x ... 609 * x + 1 ... | x | x + 1 | x ...
615 * ^ binary search stopped here 610 * ^ binary search stopped here
616 * or 611 * or
617 * x + 1 ... | x ... | x - 1 | x 612 * x + 1 ... | x ... | x - 1 | x
618 * <---------> less than scan distance 613 * <---------> less than scan distance
619 */ 614 */
620 stop_on_cycle = last_half_cycle; 615 stop_on_cycle = last_half_cycle;
621 if ((error = xlog_find_cycle_start(log, bp, first_blk, 616 if ((error = xlog_find_cycle_start(log, bp, first_blk,
622 &head_blk, last_half_cycle))) 617 &head_blk, last_half_cycle)))
623 goto bp_err; 618 goto bp_err;
624 } 619 }
625 620
626 /* 621 /*
627 * Now validate the answer. Scan back some number of maximum possible 622 * Now validate the answer. Scan back some number of maximum possible
628 * blocks and make sure each one has the expected cycle number. The 623 * blocks and make sure each one has the expected cycle number. The
629 * maximum is determined by the total possible amount of buffering 624 * maximum is determined by the total possible amount of buffering
630 * in the in-core log. The following number can be made tighter if 625 * in the in-core log. The following number can be made tighter if
631 * we actually look at the block size of the filesystem. 626 * we actually look at the block size of the filesystem.
632 */ 627 */
633 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 628 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
634 if (head_blk >= num_scan_bblks) { 629 if (head_blk >= num_scan_bblks) {
635 /* 630 /*
636 * We are guaranteed that the entire check can be performed 631 * We are guaranteed that the entire check can be performed
637 * in one buffer. 632 * in one buffer.
638 */ 633 */
639 start_blk = head_blk - num_scan_bblks; 634 start_blk = head_blk - num_scan_bblks;
640 if ((error = xlog_find_verify_cycle(log, 635 if ((error = xlog_find_verify_cycle(log,
641 start_blk, num_scan_bblks, 636 start_blk, num_scan_bblks,
642 stop_on_cycle, &new_blk))) 637 stop_on_cycle, &new_blk)))
643 goto bp_err; 638 goto bp_err;
644 if (new_blk != -1) 639 if (new_blk != -1)
645 head_blk = new_blk; 640 head_blk = new_blk;
646 } else { /* need to read 2 parts of log */ 641 } else { /* need to read 2 parts of log */
647 /* 642 /*
648 * We are going to scan backwards in the log in two parts. 643 * We are going to scan backwards in the log in two parts.
649 * First we scan the physical end of the log. In this part 644 * First we scan the physical end of the log. In this part
650 * of the log, we are looking for blocks with cycle number 645 * of the log, we are looking for blocks with cycle number
651 * last_half_cycle - 1. 646 * last_half_cycle - 1.
652 * If we find one, then we know that the log starts there, as 647 * If we find one, then we know that the log starts there, as
653 * we've found a hole that didn't get written in going around 648 * we've found a hole that didn't get written in going around
654 * the end of the physical log. The simple case for this is 649 * the end of the physical log. The simple case for this is
655 * x + 1 ... | x ... | x - 1 | x 650 * x + 1 ... | x ... | x - 1 | x
656 * <---------> less than scan distance 651 * <---------> less than scan distance
657 * If all of the blocks at the end of the log have cycle number 652 * If all of the blocks at the end of the log have cycle number
658 * last_half_cycle, then we check the blocks at the start of 653 * last_half_cycle, then we check the blocks at the start of
659 * the log looking for occurrences of last_half_cycle. If we 654 * the log looking for occurrences of last_half_cycle. If we
660 * find one, then our current estimate for the location of the 655 * find one, then our current estimate for the location of the
661 * first occurrence of last_half_cycle is wrong and we move 656 * first occurrence of last_half_cycle is wrong and we move
662 * back to the hole we've found. This case looks like 657 * back to the hole we've found. This case looks like
663 * x + 1 ... | x | x + 1 | x ... 658 * x + 1 ... | x | x + 1 | x ...
664 * ^ binary search stopped here 659 * ^ binary search stopped here
665 * Another case we need to handle that only occurs in 256k 660 * Another case we need to handle that only occurs in 256k
666 * logs is 661 * logs is
667 * x + 1 ... | x ... | x+1 | x ... 662 * x + 1 ... | x ... | x+1 | x ...
668 * ^ binary search stops here 663 * ^ binary search stops here
669 * In a 256k log, the scan at the end of the log will see the 664 * In a 256k log, the scan at the end of the log will see the
670 * x + 1 blocks. We need to skip past those since that is 665 * x + 1 blocks. We need to skip past those since that is
671 * certainly not the head of the log. By searching for 666 * certainly not the head of the log. By searching for
672 * last_half_cycle-1 we accomplish that. 667 * last_half_cycle-1 we accomplish that.
673 */ 668 */
674 start_blk = log_bbnum - num_scan_bblks + head_blk; 669 start_blk = log_bbnum - num_scan_bblks + head_blk;
675 ASSERT(head_blk <= INT_MAX && 670 ASSERT(head_blk <= INT_MAX &&
676 (xfs_daddr_t) num_scan_bblks - head_blk >= 0); 671 (xfs_daddr_t) num_scan_bblks - head_blk >= 0);
677 if ((error = xlog_find_verify_cycle(log, start_blk, 672 if ((error = xlog_find_verify_cycle(log, start_blk,
678 num_scan_bblks - (int)head_blk, 673 num_scan_bblks - (int)head_blk,
679 (stop_on_cycle - 1), &new_blk))) 674 (stop_on_cycle - 1), &new_blk)))
680 goto bp_err; 675 goto bp_err;
681 if (new_blk != -1) { 676 if (new_blk != -1) {
682 head_blk = new_blk; 677 head_blk = new_blk;
683 goto bad_blk; 678 goto bad_blk;
684 } 679 }
685 680
686 /* 681 /*
687 * Scan beginning of log now. The last part of the physical 682 * Scan beginning of log now. The last part of the physical
688 * log is good. This scan needs to verify that it doesn't find 683 * log is good. This scan needs to verify that it doesn't find
689 * the last_half_cycle. 684 * the last_half_cycle.
690 */ 685 */
691 start_blk = 0; 686 start_blk = 0;
692 ASSERT(head_blk <= INT_MAX); 687 ASSERT(head_blk <= INT_MAX);
693 if ((error = xlog_find_verify_cycle(log, 688 if ((error = xlog_find_verify_cycle(log,
694 start_blk, (int)head_blk, 689 start_blk, (int)head_blk,
695 stop_on_cycle, &new_blk))) 690 stop_on_cycle, &new_blk)))
696 goto bp_err; 691 goto bp_err;
697 if (new_blk != -1) 692 if (new_blk != -1)
698 head_blk = new_blk; 693 head_blk = new_blk;
699 } 694 }
700 695
701 bad_blk: 696 bad_blk:
702 /* 697 /*
703 * Now we need to make sure head_blk is not pointing to a block in 698 * Now we need to make sure head_blk is not pointing to a block in
704 * the middle of a log record. 699 * the middle of a log record.
705 */ 700 */
706 num_scan_bblks = XLOG_REC_SHIFT(log); 701 num_scan_bblks = XLOG_REC_SHIFT(log);
707 if (head_blk >= num_scan_bblks) { 702 if (head_blk >= num_scan_bblks) {
708 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */ 703 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
709 704
710 /* start ptr at last block ptr before head_blk */ 705 /* start ptr at last block ptr before head_blk */
711 if ((error = xlog_find_verify_log_record(log, start_blk, 706 if ((error = xlog_find_verify_log_record(log, start_blk,
712 &head_blk, 0)) == -1) { 707 &head_blk, 0)) == -1) {
713 error = XFS_ERROR(EIO); 708 error = XFS_ERROR(EIO);
714 goto bp_err; 709 goto bp_err;
715 } else if (error) 710 } else if (error)
716 goto bp_err; 711 goto bp_err;
717 } else { 712 } else {
718 start_blk = 0; 713 start_blk = 0;
719 ASSERT(head_blk <= INT_MAX); 714 ASSERT(head_blk <= INT_MAX);
720 if ((error = xlog_find_verify_log_record(log, start_blk, 715 if ((error = xlog_find_verify_log_record(log, start_blk,
721 &head_blk, 0)) == -1) { 716 &head_blk, 0)) == -1) {
722 /* We hit the beginning of the log during our search */ 717 /* We hit the beginning of the log during our search */
723 start_blk = log_bbnum - num_scan_bblks + head_blk; 718 start_blk = log_bbnum - num_scan_bblks + head_blk;
724 new_blk = log_bbnum; 719 new_blk = log_bbnum;
725 ASSERT(start_blk <= INT_MAX && 720 ASSERT(start_blk <= INT_MAX &&
726 (xfs_daddr_t) log_bbnum-start_blk >= 0); 721 (xfs_daddr_t) log_bbnum-start_blk >= 0);
727 ASSERT(head_blk <= INT_MAX); 722 ASSERT(head_blk <= INT_MAX);
728 if ((error = xlog_find_verify_log_record(log, 723 if ((error = xlog_find_verify_log_record(log,
729 start_blk, &new_blk, 724 start_blk, &new_blk,
730 (int)head_blk)) == -1) { 725 (int)head_blk)) == -1) {
731 error = XFS_ERROR(EIO); 726 error = XFS_ERROR(EIO);
732 goto bp_err; 727 goto bp_err;
733 } else if (error) 728 } else if (error)
734 goto bp_err; 729 goto bp_err;
735 if (new_blk != log_bbnum) 730 if (new_blk != log_bbnum)
736 head_blk = new_blk; 731 head_blk = new_blk;
737 } else if (error) 732 } else if (error)
738 goto bp_err; 733 goto bp_err;
739 } 734 }
740 735
741 xlog_put_bp(bp); 736 xlog_put_bp(bp);
742 if (head_blk == log_bbnum) 737 if (head_blk == log_bbnum)
743 *return_head_blk = 0; 738 *return_head_blk = 0;
744 else 739 else
745 *return_head_blk = head_blk; 740 *return_head_blk = head_blk;
746 /* 741 /*
747 * When returning here, we have a good block number. Bad block 742 * When returning here, we have a good block number. Bad block
748 * means that during a previous crash, we didn't have a clean break 743 * means that during a previous crash, we didn't have a clean break
749 * from cycle number N to cycle number N-1. In this case, we need 744 * from cycle number N to cycle number N-1. In this case, we need
750 * to find the first block with cycle number N-1. 745 * to find the first block with cycle number N-1.
751 */ 746 */
752 return 0; 747 return 0;
753 748
754 bp_err: 749 bp_err:
755 xlog_put_bp(bp); 750 xlog_put_bp(bp);
756 751
757 if (error) 752 if (error)
758 xlog_warn("XFS: failed to find log head"); 753 xlog_warn("XFS: failed to find log head");
759 return error; 754 return error;
760 } 755 }
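
To make the head-search reasoning above easier to follow, here is a minimal user-space sketch of the same idea on a toy in-memory model of the log: every block carries a cycle number, and the head is the first block still stamped with the previous pass. The array, the toy_* name and the purely linear scans are illustrative assumptions; the kernel code above uses a binary search plus the bounded verification scans instead.

    #include <stdio.h>

    static int toy_find_head(const unsigned *cycle, int nblocks)
    {
            unsigned first = cycle[0];
            unsigned last = cycle[nblocks - 1];
            int i;

            if (first == last) {
                    /* Whole log stamped with one cycle: any block still holding
                     * (last - 1) is a hole, and its first block is the head. */
                    for (i = 0; i < nblocks; i++)
                            if (cycle[i] == last - 1)
                                    return i;
                    return 0;       /* no hole: head wraps to block 0 */
            }
            /* Otherwise the head is the first block still carrying the older
             * cycle number (the kernel binary-searches for this instead). */
            for (i = 0; i < nblocks; i++)
                    if (cycle[i] == last)
                            return i;
            return 0;
    }

    int main(void)
    {
            /* "x + 1 ... | x ..." case from the comments: cycle 5 wrapped part
             * way around, so blocks 0-2 are cycle 5 and the rest still cycle 4. */
            unsigned cycle[] = { 5, 5, 5, 4, 4, 4, 4, 4 };

            printf("head block = %d\n", toy_find_head(cycle, 8));  /* 3 */
            return 0;
    }
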
761 756
762 /* 757 /*
763 * Find the sync block number or the tail of the log. 758 * Find the sync block number or the tail of the log.
764 * 759 *
765 * This will be the block number of the last record to have its 760 * This will be the block number of the last record to have its
766 * associated buffers synced to disk. Every log record header has 761 * associated buffers synced to disk. Every log record header has
767 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy 762 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
768 * to get a sync block number. The only concern is to figure out which 763 * to get a sync block number. The only concern is to figure out which
769 * log record header to believe. 764 * log record header to believe.
770 * 765 *
771 * The following algorithm uses the log record header with the largest 766 * The following algorithm uses the log record header with the largest
772 * lsn. The entire log record does not need to be valid. We only care 767 * lsn. The entire log record does not need to be valid. We only care
773 * that the header is valid. 768 * that the header is valid.
774 * 769 *
775 * We could speed up the search by using the current head_blk buffer, but it is not 770 * We could speed up the search by using the current head_blk buffer, but it is not
776 * available. 771 * available.
777 */ 772 */
778 int 773 int
779 xlog_find_tail( 774 xlog_find_tail(
780 xlog_t *log, 775 xlog_t *log,
781 xfs_daddr_t *head_blk, 776 xfs_daddr_t *head_blk,
782 xfs_daddr_t *tail_blk) 777 xfs_daddr_t *tail_blk)
783 { 778 {
784 xlog_rec_header_t *rhead; 779 xlog_rec_header_t *rhead;
785 xlog_op_header_t *op_head; 780 xlog_op_header_t *op_head;
786 xfs_caddr_t offset = NULL; 781 xfs_caddr_t offset = NULL;
787 xfs_buf_t *bp; 782 xfs_buf_t *bp;
788 int error, i, found; 783 int error, i, found;
789 xfs_daddr_t umount_data_blk; 784 xfs_daddr_t umount_data_blk;
790 xfs_daddr_t after_umount_blk; 785 xfs_daddr_t after_umount_blk;
791 xfs_lsn_t tail_lsn; 786 xfs_lsn_t tail_lsn;
792 int hblks; 787 int hblks;
793 788
794 found = 0; 789 found = 0;
795 790
796 /* 791 /*
797 * Find previous log record 792 * Find previous log record
798 */ 793 */
799 if ((error = xlog_find_head(log, head_blk))) 794 if ((error = xlog_find_head(log, head_blk)))
800 return error; 795 return error;
801 796
802 bp = xlog_get_bp(log, 1); 797 bp = xlog_get_bp(log, 1);
803 if (!bp) 798 if (!bp)
804 return ENOMEM; 799 return ENOMEM;
805 if (*head_blk == 0) { /* special case */ 800 if (*head_blk == 0) { /* special case */
806 if ((error = xlog_bread(log, 0, 1, bp))) 801 if ((error = xlog_bread(log, 0, 1, bp)))
807 goto bread_err; 802 goto bread_err;
808 offset = xlog_align(log, 0, 1, bp); 803 offset = xlog_align(log, 0, 1, bp);
809 if (xlog_get_cycle(offset) == 0) { 804 if (xlog_get_cycle(offset) == 0) {
810 *tail_blk = 0; 805 *tail_blk = 0;
811 /* leave all other log inited values alone */ 806 /* leave all other log inited values alone */
812 goto exit; 807 goto exit;
813 } 808 }
814 } 809 }
815 810
816 /* 811 /*
817 * Search backwards looking for log record header block 812 * Search backwards looking for log record header block
818 */ 813 */
819 ASSERT(*head_blk < INT_MAX); 814 ASSERT(*head_blk < INT_MAX);
820 for (i = (int)(*head_blk) - 1; i >= 0; i--) { 815 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
821 if ((error = xlog_bread(log, i, 1, bp))) 816 if ((error = xlog_bread(log, i, 1, bp)))
822 goto bread_err; 817 goto bread_err;
823 offset = xlog_align(log, i, 1, bp); 818 offset = xlog_align(log, i, 1, bp);
824 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { 819 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
825 found = 1; 820 found = 1;
826 break; 821 break;
827 } 822 }
828 } 823 }
829 /* 824 /*
830 * If we haven't found the log record header block, start looking 825 * If we haven't found the log record header block, start looking
831 * again from the end of the physical log. XXXmiken: There should be 826 * again from the end of the physical log. XXXmiken: There should be
832 * a check here to make sure we didn't search more than N blocks in 827 * a check here to make sure we didn't search more than N blocks in
833 * the previous code. 828 * the previous code.
834 */ 829 */
835 if (!found) { 830 if (!found) {
836 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { 831 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
837 if ((error = xlog_bread(log, i, 1, bp))) 832 if ((error = xlog_bread(log, i, 1, bp)))
838 goto bread_err; 833 goto bread_err;
839 offset = xlog_align(log, i, 1, bp); 834 offset = xlog_align(log, i, 1, bp);
840 if (XLOG_HEADER_MAGIC_NUM == 835 if (XLOG_HEADER_MAGIC_NUM ==
841 be32_to_cpu(*(__be32 *)offset)) { 836 be32_to_cpu(*(__be32 *)offset)) {
842 found = 2; 837 found = 2;
843 break; 838 break;
844 } 839 }
845 } 840 }
846 } 841 }
847 if (!found) { 842 if (!found) {
848 xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); 843 xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
849 ASSERT(0); 844 ASSERT(0);
850 return XFS_ERROR(EIO); 845 return XFS_ERROR(EIO);
851 } 846 }
852 847
853 /* find blk_no of tail of log */ 848 /* find blk_no of tail of log */
854 rhead = (xlog_rec_header_t *)offset; 849 rhead = (xlog_rec_header_t *)offset;
855 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); 850 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
856 851
857 /* 852 /*
858 * Reset log values according to the state of the log when we 853 * Reset log values according to the state of the log when we
859 * crashed. In the case where head_blk == 0, we bump curr_cycle 854 * crashed. In the case where head_blk == 0, we bump curr_cycle
860 * by one because the next write starts a new cycle rather than 855 * by one because the next write starts a new cycle rather than
861 * continuing the cycle of the last good log record. At this 856 * continuing the cycle of the last good log record. At this
862 * point we have guaranteed that all partial log records have been 857 * point we have guaranteed that all partial log records have been
863 * accounted for. Therefore, we know that the last good log record 858 * accounted for. Therefore, we know that the last good log record
864 * written was complete and ended exactly on the end boundary 859 * written was complete and ended exactly on the end boundary
865 * of the physical log. 860 * of the physical log.
866 */ 861 */
867 log->l_prev_block = i; 862 log->l_prev_block = i;
868 log->l_curr_block = (int)*head_blk; 863 log->l_curr_block = (int)*head_blk;
869 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); 864 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
870 if (found == 2) 865 if (found == 2)
871 log->l_curr_cycle++; 866 log->l_curr_cycle++;
872 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); 867 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
873 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); 868 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
874 log->l_grant_reserve_cycle = log->l_curr_cycle; 869 log->l_grant_reserve_cycle = log->l_curr_cycle;
875 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); 870 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
876 log->l_grant_write_cycle = log->l_curr_cycle; 871 log->l_grant_write_cycle = log->l_curr_cycle;
877 log->l_grant_write_bytes = BBTOB(log->l_curr_block); 872 log->l_grant_write_bytes = BBTOB(log->l_curr_block);
878 873
879 /* 874 /*
880 * Look for unmount record. If we find it, then we know there 875 * Look for unmount record. If we find it, then we know there
881 * was a clean unmount. Since 'i' could be the last block in 876 * was a clean unmount. Since 'i' could be the last block in
882 * the physical log, we convert to a log block before comparing 877 * the physical log, we convert to a log block before comparing
883 * to the head_blk. 878 * to the head_blk.
884 * 879 *
885 * Save the current tail lsn to pass to 880 * Save the current tail lsn to pass to
886 * xlog_clear_stale_blocks() below. We won't want to clear the 881 * xlog_clear_stale_blocks() below. We won't want to clear the
887 * unmount record if there is one, so we pass the lsn of the 882 * unmount record if there is one, so we pass the lsn of the
888 * unmount record rather than the block after it. 883 * unmount record rather than the block after it.
889 */ 884 */
890 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 885 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
891 int h_size = be32_to_cpu(rhead->h_size); 886 int h_size = be32_to_cpu(rhead->h_size);
892 int h_version = be32_to_cpu(rhead->h_version); 887 int h_version = be32_to_cpu(rhead->h_version);
893 888
894 if ((h_version & XLOG_VERSION_2) && 889 if ((h_version & XLOG_VERSION_2) &&
895 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 890 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
896 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 891 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
897 if (h_size % XLOG_HEADER_CYCLE_SIZE) 892 if (h_size % XLOG_HEADER_CYCLE_SIZE)
898 hblks++; 893 hblks++;
899 } else { 894 } else {
900 hblks = 1; 895 hblks = 1;
901 } 896 }
902 } else { 897 } else {
903 hblks = 1; 898 hblks = 1;
904 } 899 }
905 after_umount_blk = (i + hblks + (int) 900 after_umount_blk = (i + hblks + (int)
906 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; 901 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
907 tail_lsn = log->l_tail_lsn; 902 tail_lsn = log->l_tail_lsn;
908 if (*head_blk == after_umount_blk && 903 if (*head_blk == after_umount_blk &&
909 be32_to_cpu(rhead->h_num_logops) == 1) { 904 be32_to_cpu(rhead->h_num_logops) == 1) {
910 umount_data_blk = (i + hblks) % log->l_logBBsize; 905 umount_data_blk = (i + hblks) % log->l_logBBsize;
911 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) { 906 if ((error = xlog_bread(log, umount_data_blk, 1, bp))) {
912 goto bread_err; 907 goto bread_err;
913 } 908 }
914 offset = xlog_align(log, umount_data_blk, 1, bp); 909 offset = xlog_align(log, umount_data_blk, 1, bp);
915 op_head = (xlog_op_header_t *)offset; 910 op_head = (xlog_op_header_t *)offset;
916 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { 911 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
917 /* 912 /*
918 * Set tail and last sync so that newly written 913 * Set tail and last sync so that newly written
919 * log records will point recovery to after the 914 * log records will point recovery to after the
920 * current unmount record. 915 * current unmount record.
921 */ 916 */
922 log->l_tail_lsn = 917 log->l_tail_lsn =
923 xlog_assign_lsn(log->l_curr_cycle, 918 xlog_assign_lsn(log->l_curr_cycle,
924 after_umount_blk); 919 after_umount_blk);
925 log->l_last_sync_lsn = 920 log->l_last_sync_lsn =
926 xlog_assign_lsn(log->l_curr_cycle, 921 xlog_assign_lsn(log->l_curr_cycle,
927 after_umount_blk); 922 after_umount_blk);
928 *tail_blk = after_umount_blk; 923 *tail_blk = after_umount_blk;
929 924
930 /* 925 /*
931 * Note that the unmount was clean. If the unmount 926 * Note that the unmount was clean. If the unmount
932 * was not clean, we need to know this to rebuild the 927 * was not clean, we need to know this to rebuild the
933 * superblock counters from the perag headers if we 928 * superblock counters from the perag headers if we
934 * have a filesystem using non-persistent counters. 929 * have a filesystem using non-persistent counters.
935 */ 930 */
936 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; 931 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
937 } 932 }
938 } 933 }
939 934
940 /* 935 /*
941 * Make sure that there are no blocks in front of the head 936 * Make sure that there are no blocks in front of the head
942 * with the same cycle number as the head. This can happen 937 * with the same cycle number as the head. This can happen
943 * because we allow multiple outstanding log writes concurrently, 938 * because we allow multiple outstanding log writes concurrently,
944 * and the later writes might make it out before earlier ones. 939 * and the later writes might make it out before earlier ones.
945 * 940 *
946 * We use the lsn from before modifying it so that we'll never 941 * We use the lsn from before modifying it so that we'll never
947 * overwrite the unmount record after a clean unmount. 942 * overwrite the unmount record after a clean unmount.
948 * 943 *
949 * Do this only if we are going to recover the filesystem 944 * Do this only if we are going to recover the filesystem
950 * 945 *
951 * NOTE: This used to say "if (!readonly)" 946 * NOTE: This used to say "if (!readonly)"
952 * However on Linux, we can & do recover a read-only filesystem. 947 * However on Linux, we can & do recover a read-only filesystem.
953 * We only skip recovery if NORECOVERY is specified on mount, 948 * We only skip recovery if NORECOVERY is specified on mount,
954 * in which case we would not be here. 949 * in which case we would not be here.
955 * 950 *
956 * But... if the -device- itself is readonly, just skip this. 951 * But... if the -device- itself is readonly, just skip this.
957 * We can't recover this device anyway, so it won't matter. 952 * We can't recover this device anyway, so it won't matter.
958 */ 953 */
959 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { 954 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
960 error = xlog_clear_stale_blocks(log, tail_lsn); 955 error = xlog_clear_stale_blocks(log, tail_lsn);
961 } 956 }
962 957
963 bread_err: 958 bread_err:
964 exit: 959 exit:
965 xlog_put_bp(bp); 960 xlog_put_bp(bp);
966 961
967 if (error) 962 if (error)
968 xlog_warn("XFS: failed to locate log tail"); 963 xlog_warn("XFS: failed to locate log tail");
969 return error; 964 return error;
970 } 965 }
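
xlog_find_tail() above leans on two bits of arithmetic: LSNs that pack a (cycle, block) pair, and block numbers that wrap modulo the physical log size when locating the unmount record. A small stand-alone sketch of both, assuming the conventional layout implied by xlog_assign_lsn() (cycle in the high 32 bits, basic block number in the low 32 bits); the toy_* names are made up:

    #include <stdint.h>
    #include <stdio.h>

    typedef int64_t toy_lsn_t;

    static toy_lsn_t toy_assign_lsn(uint32_t cycle, uint32_t block)
    {
            return ((toy_lsn_t)cycle << 32) | block;
    }

    static uint32_t toy_cycle_lsn(toy_lsn_t lsn) { return (uint32_t)(lsn >> 32); }
    static uint32_t toy_block_lsn(toy_lsn_t lsn) { return (uint32_t)lsn; }

    int main(void)
    {
            toy_lsn_t lsn = toy_assign_lsn(7, 123);

            printf("cycle=%u block=%u\n", toy_cycle_lsn(lsn), toy_block_lsn(lsn));

            /* Block numbers wrap around the physical end of the log, as in
             * the after_umount_blk calculation above (toy sizes). */
            int log_bbsize = 1000, i = 995, hblks = 1, len_bblocks = 8;

            printf("after_umount_blk=%d\n",
                   (i + hblks + len_bblocks) % log_bbsize);   /* 4 */
            return 0;
    }
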
971 966
972 /* 967 /*
973 * Is the log zeroed at all? 968 * Is the log zeroed at all?
974 * 969 *
975 * The last binary search should be changed to perform an X block read 970 * The last binary search should be changed to perform an X block read
976 * once X becomes small enough. You can then search linearly through 971 * once X becomes small enough. You can then search linearly through
977 * the X blocks. This will cut down on the number of reads we need to do. 972 * the X blocks. This will cut down on the number of reads we need to do.
978 * 973 *
979 * If the log is partially zeroed, this routine will pass back the blkno 974 * If the log is partially zeroed, this routine will pass back the blkno
980 * of the first block with cycle number 0. It won't have a complete LR 975 * of the first block with cycle number 0. It won't have a complete LR
981 * preceding it. 976 * preceding it.
982 * 977 *
983 * Return: 978 * Return:
984 * 0 => the log is completely written to 979 * 0 => the log is completely written to
985 * -1 => use *blk_no as the first block of the log 980 * -1 => use *blk_no as the first block of the log
986 * >0 => error has occurred 981 * >0 => error has occurred
987 */ 982 */
988 STATIC int 983 STATIC int
989 xlog_find_zeroed( 984 xlog_find_zeroed(
990 xlog_t *log, 985 xlog_t *log,
991 xfs_daddr_t *blk_no) 986 xfs_daddr_t *blk_no)
992 { 987 {
993 xfs_buf_t *bp; 988 xfs_buf_t *bp;
994 xfs_caddr_t offset; 989 xfs_caddr_t offset;
995 uint first_cycle, last_cycle; 990 uint first_cycle, last_cycle;
996 xfs_daddr_t new_blk, last_blk, start_blk; 991 xfs_daddr_t new_blk, last_blk, start_blk;
997 xfs_daddr_t num_scan_bblks; 992 xfs_daddr_t num_scan_bblks;
998 int error, log_bbnum = log->l_logBBsize; 993 int error, log_bbnum = log->l_logBBsize;
999 994
1000 *blk_no = 0; 995 *blk_no = 0;
1001 996
1002 /* check totally zeroed log */ 997 /* check totally zeroed log */
1003 bp = xlog_get_bp(log, 1); 998 bp = xlog_get_bp(log, 1);
1004 if (!bp) 999 if (!bp)
1005 return ENOMEM; 1000 return ENOMEM;
1006 if ((error = xlog_bread(log, 0, 1, bp))) 1001 if ((error = xlog_bread(log, 0, 1, bp)))
1007 goto bp_err; 1002 goto bp_err;
1008 offset = xlog_align(log, 0, 1, bp); 1003 offset = xlog_align(log, 0, 1, bp);
1009 first_cycle = xlog_get_cycle(offset); 1004 first_cycle = xlog_get_cycle(offset);
1010 if (first_cycle == 0) { /* completely zeroed log */ 1005 if (first_cycle == 0) { /* completely zeroed log */
1011 *blk_no = 0; 1006 *blk_no = 0;
1012 xlog_put_bp(bp); 1007 xlog_put_bp(bp);
1013 return -1; 1008 return -1;
1014 } 1009 }
1015 1010
1016 /* check partially zeroed log */ 1011 /* check partially zeroed log */
1017 if ((error = xlog_bread(log, log_bbnum-1, 1, bp))) 1012 if ((error = xlog_bread(log, log_bbnum-1, 1, bp)))
1018 goto bp_err; 1013 goto bp_err;
1019 offset = xlog_align(log, log_bbnum-1, 1, bp); 1014 offset = xlog_align(log, log_bbnum-1, 1, bp);
1020 last_cycle = xlog_get_cycle(offset); 1015 last_cycle = xlog_get_cycle(offset);
1021 if (last_cycle != 0) { /* log completely written to */ 1016 if (last_cycle != 0) { /* log completely written to */
1022 xlog_put_bp(bp); 1017 xlog_put_bp(bp);
1023 return 0; 1018 return 0;
1024 } else if (first_cycle != 1) { 1019 } else if (first_cycle != 1) {
1025 /* 1020 /*
1026 * If the cycle of the last block is zero, the cycle of 1021 * If the cycle of the last block is zero, the cycle of
1027 * the first block must be 1. If it's not, maybe we're 1022 * the first block must be 1. If it's not, maybe we're
1028 * not looking at a log... Bail out. 1023 * not looking at a log... Bail out.
1029 */ 1024 */
1030 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); 1025 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
1031 return XFS_ERROR(EINVAL); 1026 return XFS_ERROR(EINVAL);
1032 } 1027 }
1033 1028
1034 /* we have a partially zeroed log */ 1029 /* we have a partially zeroed log */
1035 last_blk = log_bbnum-1; 1030 last_blk = log_bbnum-1;
1036 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0))) 1031 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1037 goto bp_err; 1032 goto bp_err;
1038 1033
1039 /* 1034 /*
1040 * Validate the answer. Because there is no way to guarantee that 1035 * Validate the answer. Because there is no way to guarantee that
1041 * the entire log is made up of log records which are the same size, 1036 * the entire log is made up of log records which are the same size,
1042 * we scan over the defined maximum blocks. At this point, the maximum 1037 * we scan over the defined maximum blocks. At this point, the maximum
1043 * is not chosen to mean anything special. XXXmiken 1038 * is not chosen to mean anything special. XXXmiken
1044 */ 1039 */
1045 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log); 1040 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1046 ASSERT(num_scan_bblks <= INT_MAX); 1041 ASSERT(num_scan_bblks <= INT_MAX);
1047 1042
1048 if (last_blk < num_scan_bblks) 1043 if (last_blk < num_scan_bblks)
1049 num_scan_bblks = last_blk; 1044 num_scan_bblks = last_blk;
1050 start_blk = last_blk - num_scan_bblks; 1045 start_blk = last_blk - num_scan_bblks;
1051 1046
1052 /* 1047 /*
1053 * We search for any instances of cycle number 0 that occur before 1048 * We search for any instances of cycle number 0 that occur before
1054 * our current estimate of the head. What we're trying to detect is 1049 * our current estimate of the head. What we're trying to detect is
1055 * 1 ... | 0 | 1 | 0... 1050 * 1 ... | 0 | 1 | 0...
1056 * ^ binary search ends here 1051 * ^ binary search ends here
1057 */ 1052 */
1058 if ((error = xlog_find_verify_cycle(log, start_blk, 1053 if ((error = xlog_find_verify_cycle(log, start_blk,
1059 (int)num_scan_bblks, 0, &new_blk))) 1054 (int)num_scan_bblks, 0, &new_blk)))
1060 goto bp_err; 1055 goto bp_err;
1061 if (new_blk != -1) 1056 if (new_blk != -1)
1062 last_blk = new_blk; 1057 last_blk = new_blk;
1063 1058
1064 /* 1059 /*
1065 * Potentially back up over a partial log record write. We don't need 1060 * Potentially back up over a partial log record write. We don't need
1066 * to search the end of the log because we know it is zero. 1061 * to search the end of the log because we know it is zero.
1067 */ 1062 */
1068 if ((error = xlog_find_verify_log_record(log, start_blk, 1063 if ((error = xlog_find_verify_log_record(log, start_blk,
1069 &last_blk, 0)) == -1) { 1064 &last_blk, 0)) == -1) {
1070 error = XFS_ERROR(EIO); 1065 error = XFS_ERROR(EIO);
1071 goto bp_err; 1066 goto bp_err;
1072 } else if (error) 1067 } else if (error)
1073 goto bp_err; 1068 goto bp_err;
1074 1069
1075 *blk_no = last_blk; 1070 *blk_no = last_blk;
1076 bp_err: 1071 bp_err:
1077 xlog_put_bp(bp); 1072 xlog_put_bp(bp);
1078 if (error) 1073 if (error)
1079 return error; 1074 return error;
1080 return -1; 1075 return -1;
1081 } 1076 }
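
As the comment before xlog_find_zeroed() notes, the function has a three-way return convention: 0, -1, or a positive error. A hedged sketch of how a caller is meant to interpret it, with a stub standing in for the real function:

    #include <stdio.h>

    typedef long long toy_daddr_t;

    /* Stand-in for xlog_find_zeroed(); it just pretends the log is
     * partially zeroed starting at block 42. */
    static int toy_find_zeroed(toy_daddr_t *blk_no)
    {
            *blk_no = 42;
            return -1;
    }

    int main(void)
    {
            toy_daddr_t head = 0;
            int ret = toy_find_zeroed(&head);

            if (ret == 0)
                    printf("log fully written; find the head normally\n");
            else if (ret == -1)
                    printf("log (partially) zeroed; head at block %lld\n", head);
            else
                    printf("error %d reading the log\n", ret);
            return 0;
    }
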
1082 1077
1083 /* 1078 /*
1084 * These are simple subroutines used by xlog_clear_stale_blocks() below 1079 * These are simple subroutines used by xlog_clear_stale_blocks() below
1085 * to initialize a buffer full of empty log record headers and write 1080 * to initialize a buffer full of empty log record headers and write
1086 * them into the log. 1081 * them into the log.
1087 */ 1082 */
1088 STATIC void 1083 STATIC void
1089 xlog_add_record( 1084 xlog_add_record(
1090 xlog_t *log, 1085 xlog_t *log,
1091 xfs_caddr_t buf, 1086 xfs_caddr_t buf,
1092 int cycle, 1087 int cycle,
1093 int block, 1088 int block,
1094 int tail_cycle, 1089 int tail_cycle,
1095 int tail_block) 1090 int tail_block)
1096 { 1091 {
1097 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf; 1092 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1098 1093
1099 memset(buf, 0, BBSIZE); 1094 memset(buf, 0, BBSIZE);
1100 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1095 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1101 recp->h_cycle = cpu_to_be32(cycle); 1096 recp->h_cycle = cpu_to_be32(cycle);
1102 recp->h_version = cpu_to_be32( 1097 recp->h_version = cpu_to_be32(
1103 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); 1098 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1104 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); 1099 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1105 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); 1100 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1106 recp->h_fmt = cpu_to_be32(XLOG_FMT); 1101 recp->h_fmt = cpu_to_be32(XLOG_FMT);
1107 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t)); 1102 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1108 } 1103 }
1109 1104
1110 STATIC int 1105 STATIC int
1111 xlog_write_log_records( 1106 xlog_write_log_records(
1112 xlog_t *log, 1107 xlog_t *log,
1113 int cycle, 1108 int cycle,
1114 int start_block, 1109 int start_block,
1115 int blocks, 1110 int blocks,
1116 int tail_cycle, 1111 int tail_cycle,
1117 int tail_block) 1112 int tail_block)
1118 { 1113 {
1119 xfs_caddr_t offset; 1114 xfs_caddr_t offset;
1120 xfs_buf_t *bp; 1115 xfs_buf_t *bp;
1121 int balign, ealign; 1116 int balign, ealign;
1122 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); 1117 int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1);
1123 int end_block = start_block + blocks; 1118 int end_block = start_block + blocks;
1124 int bufblks; 1119 int bufblks;
1125 int error = 0; 1120 int error = 0;
1126 int i, j = 0; 1121 int i, j = 0;
1127 1122
1128 bufblks = 1 << ffs(blocks); 1123 bufblks = 1 << ffs(blocks);
1129 while (!(bp = xlog_get_bp(log, bufblks))) { 1124 while (!(bp = xlog_get_bp(log, bufblks))) {
1130 bufblks >>= 1; 1125 bufblks >>= 1;
1131 if (bufblks <= log->l_sectbb_log) 1126 if (bufblks <= log->l_sectbb_log)
1132 return ENOMEM; 1127 return ENOMEM;
1133 } 1128 }
1134 1129
1135 /* We may need to do a read at the start to fill in part of 1130 /* We may need to do a read at the start to fill in part of
1136 * the buffer in the starting sector not covered by the first 1131 * the buffer in the starting sector not covered by the first
1137 * write below. 1132 * write below.
1138 */ 1133 */
1139 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); 1134 balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block);
1140 if (balign != start_block) { 1135 if (balign != start_block) {
1141 if ((error = xlog_bread(log, start_block, 1, bp))) { 1136 if ((error = xlog_bread(log, start_block, 1, bp))) {
1142 xlog_put_bp(bp); 1137 xlog_put_bp(bp);
1143 return error; 1138 return error;
1144 } 1139 }
1145 j = start_block - balign; 1140 j = start_block - balign;
1146 } 1141 }
1147 1142
1148 for (i = start_block; i < end_block; i += bufblks) { 1143 for (i = start_block; i < end_block; i += bufblks) {
1149 int bcount, endcount; 1144 int bcount, endcount;
1150 1145
1151 bcount = min(bufblks, end_block - start_block); 1146 bcount = min(bufblks, end_block - start_block);
1152 endcount = bcount - j; 1147 endcount = bcount - j;
1153 1148
1154 /* We may need to do a read at the end to fill in part of 1149 /* We may need to do a read at the end to fill in part of
1155 * the buffer in the final sector not covered by the write. 1150 * the buffer in the final sector not covered by the write.
1156 * If this is the same sector as the above read, skip it. 1151 * If this is the same sector as the above read, skip it.
1157 */ 1152 */
1158 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); 1153 ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block);
1159 if (j == 0 && (start_block + endcount > ealign)) { 1154 if (j == 0 && (start_block + endcount > ealign)) {
1160 offset = XFS_BUF_PTR(bp); 1155 offset = XFS_BUF_PTR(bp);
1161 balign = BBTOB(ealign - start_block); 1156 balign = BBTOB(ealign - start_block);
1162 error = XFS_BUF_SET_PTR(bp, offset + balign, 1157 error = XFS_BUF_SET_PTR(bp, offset + balign,
1163 BBTOB(sectbb)); 1158 BBTOB(sectbb));
1164 if (!error) 1159 if (!error)
1165 error = xlog_bread(log, ealign, sectbb, bp); 1160 error = xlog_bread(log, ealign, sectbb, bp);
1166 if (!error) 1161 if (!error)
1167 error = XFS_BUF_SET_PTR(bp, offset, bufblks); 1162 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1168 if (error) 1163 if (error)
1169 break; 1164 break;
1170 } 1165 }
1171 1166
1172 offset = xlog_align(log, start_block, endcount, bp); 1167 offset = xlog_align(log, start_block, endcount, bp);
1173 for (; j < endcount; j++) { 1168 for (; j < endcount; j++) {
1174 xlog_add_record(log, offset, cycle, i+j, 1169 xlog_add_record(log, offset, cycle, i+j,
1175 tail_cycle, tail_block); 1170 tail_cycle, tail_block);
1176 offset += BBSIZE; 1171 offset += BBSIZE;
1177 } 1172 }
1178 error = xlog_bwrite(log, start_block, endcount, bp); 1173 error = xlog_bwrite(log, start_block, endcount, bp);
1179 if (error) 1174 if (error)
1180 break; 1175 break;
1181 start_block += endcount; 1176 start_block += endcount;
1182 j = 0; 1177 j = 0;
1183 } 1178 }
1184 xlog_put_bp(bp); 1179 xlog_put_bp(bp);
1185 return error; 1180 return error;
1186 } 1181 }
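
xlog_write_log_records() above has to cope with a log device whose sector size is larger than one basic block: a write that does not start on a sector boundary must first read the enclosing sector so the blocks it is not rewriting are preserved. A simplified sketch of just that alignment bookkeeping (hypothetical helper; the real code uses XLOG_SECTOR_ROUNDDOWN_BLKNO and friends):

    #include <stdio.h>

    /* Round a basic-block number down to its containing hardware sector. */
    static int round_down_to_sector(int block, int sectbb)
    {
            return block - (block % sectbb);
    }

    int main(void)
    {
            int sectbb = 8;                 /* 4k sectors, 512-byte basic blocks */
            int start_block = 21;
            int balign = round_down_to_sector(start_block, sectbb);
            int j = start_block - balign;   /* leading blocks to pre-read/skip */

            printf("aligned start %d, skip %d blocks of the first buffer\n",
                   balign, j);              /* aligned start 16, skip 5 */
            return 0;
    }
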
1187 1182
1188 /* 1183 /*
1189 * This routine is called to blow away any incomplete log writes out 1184 * This routine is called to blow away any incomplete log writes out
1190 * in front of the log head. We do this so that we won't become confused 1185 * in front of the log head. We do this so that we won't become confused
1191 * if we come up, write only a little bit more, and then crash again. 1186 * if we come up, write only a little bit more, and then crash again.
1192 * If we leave the partial log records out there, this situation could 1187 * If we leave the partial log records out there, this situation could
1193 * cause us to think those partial writes are valid blocks since they 1188 * cause us to think those partial writes are valid blocks since they
1194 * have the current cycle number. We get rid of them by overwriting them 1189 * have the current cycle number. We get rid of them by overwriting them
1195 * with empty log records with the old cycle number rather than the 1190 * with empty log records with the old cycle number rather than the
1196 * current one. 1191 * current one.
1197 * 1192 *
1198 * The tail lsn is passed in rather than taken from 1193 * The tail lsn is passed in rather than taken from
1199 * the log so that we will not write over the unmount record after a 1194 * the log so that we will not write over the unmount record after a
1200 * clean unmount in a 512 block log. Doing so would leave the log without 1195 * clean unmount in a 512 block log. Doing so would leave the log without
1201 * any valid log records in it until a new one was written. If we crashed 1196 * any valid log records in it until a new one was written. If we crashed
1202 * during that time we would not be able to recover. 1197 * during that time we would not be able to recover.
1203 */ 1198 */
1204 STATIC int 1199 STATIC int
1205 xlog_clear_stale_blocks( 1200 xlog_clear_stale_blocks(
1206 xlog_t *log, 1201 xlog_t *log,
1207 xfs_lsn_t tail_lsn) 1202 xfs_lsn_t tail_lsn)
1208 { 1203 {
1209 int tail_cycle, head_cycle; 1204 int tail_cycle, head_cycle;
1210 int tail_block, head_block; 1205 int tail_block, head_block;
1211 int tail_distance, max_distance; 1206 int tail_distance, max_distance;
1212 int distance; 1207 int distance;
1213 int error; 1208 int error;
1214 1209
1215 tail_cycle = CYCLE_LSN(tail_lsn); 1210 tail_cycle = CYCLE_LSN(tail_lsn);
1216 tail_block = BLOCK_LSN(tail_lsn); 1211 tail_block = BLOCK_LSN(tail_lsn);
1217 head_cycle = log->l_curr_cycle; 1212 head_cycle = log->l_curr_cycle;
1218 head_block = log->l_curr_block; 1213 head_block = log->l_curr_block;
1219 1214
1220 /* 1215 /*
1221 * Figure out the distance between the new head of the log 1216 * Figure out the distance between the new head of the log
1222 * and the tail. We want to write over any blocks beyond the 1217 * and the tail. We want to write over any blocks beyond the
1223 * head that we may have written just before the crash, but 1218 * head that we may have written just before the crash, but
1224 * we don't want to overwrite the tail of the log. 1219 * we don't want to overwrite the tail of the log.
1225 */ 1220 */
1226 if (head_cycle == tail_cycle) { 1221 if (head_cycle == tail_cycle) {
1227 /* 1222 /*
1228 * The tail is behind the head in the physical log, 1223 * The tail is behind the head in the physical log,
1229 * so the distance from the head to the tail is the 1224 * so the distance from the head to the tail is the
1230 * distance from the head to the end of the log plus 1225 * distance from the head to the end of the log plus
1231 * the distance from the beginning of the log to the 1226 * the distance from the beginning of the log to the
1232 * tail. 1227 * tail.
1233 */ 1228 */
1234 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) { 1229 if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1235 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)", 1230 XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1236 XFS_ERRLEVEL_LOW, log->l_mp); 1231 XFS_ERRLEVEL_LOW, log->l_mp);
1237 return XFS_ERROR(EFSCORRUPTED); 1232 return XFS_ERROR(EFSCORRUPTED);
1238 } 1233 }
1239 tail_distance = tail_block + (log->l_logBBsize - head_block); 1234 tail_distance = tail_block + (log->l_logBBsize - head_block);
1240 } else { 1235 } else {
1241 /* 1236 /*
1242 * The head is behind the tail in the physical log, 1237 * The head is behind the tail in the physical log,
1243 * so the distance from the head to the tail is just 1238 * so the distance from the head to the tail is just
1244 * the tail block minus the head block. 1239 * the tail block minus the head block.
1245 */ 1240 */
1246 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){ 1241 if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1247 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)", 1242 XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1248 XFS_ERRLEVEL_LOW, log->l_mp); 1243 XFS_ERRLEVEL_LOW, log->l_mp);
1249 return XFS_ERROR(EFSCORRUPTED); 1244 return XFS_ERROR(EFSCORRUPTED);
1250 } 1245 }
1251 tail_distance = tail_block - head_block; 1246 tail_distance = tail_block - head_block;
1252 } 1247 }
1253 1248
1254 /* 1249 /*
1255 * If the head is right up against the tail, we can't clear 1250 * If the head is right up against the tail, we can't clear
1256 * anything. 1251 * anything.
1257 */ 1252 */
1258 if (tail_distance <= 0) { 1253 if (tail_distance <= 0) {
1259 ASSERT(tail_distance == 0); 1254 ASSERT(tail_distance == 0);
1260 return 0; 1255 return 0;
1261 } 1256 }
1262 1257
1263 max_distance = XLOG_TOTAL_REC_SHIFT(log); 1258 max_distance = XLOG_TOTAL_REC_SHIFT(log);
1264 /* 1259 /*
1265 * Take the smaller of the maximum amount of outstanding I/O 1260 * Take the smaller of the maximum amount of outstanding I/O
1266 * we could have and the distance to the tail to clear out. 1261 * we could have and the distance to the tail to clear out.
1267 * We take the smaller so that we don't overwrite the tail and 1262 * We take the smaller so that we don't overwrite the tail and
1268 * we don't waste all day writing from the head to the tail 1263 * we don't waste all day writing from the head to the tail
1269 * for no reason. 1264 * for no reason.
1270 */ 1265 */
1271 max_distance = MIN(max_distance, tail_distance); 1266 max_distance = MIN(max_distance, tail_distance);
1272 1267
1273 if ((head_block + max_distance) <= log->l_logBBsize) { 1268 if ((head_block + max_distance) <= log->l_logBBsize) {
1274 /* 1269 /*
1275 * We can stomp all the blocks we need to without 1270 * We can stomp all the blocks we need to without
1276 * wrapping around the end of the log. Just do it 1271 * wrapping around the end of the log. Just do it
1277 * in a single write. Use the cycle number of the 1272 * in a single write. Use the cycle number of the
1278 * current cycle minus one so that the log will look like: 1273 * current cycle minus one so that the log will look like:
1279 * n ... | n - 1 ... 1274 * n ... | n - 1 ...
1280 */ 1275 */
1281 error = xlog_write_log_records(log, (head_cycle - 1), 1276 error = xlog_write_log_records(log, (head_cycle - 1),
1282 head_block, max_distance, tail_cycle, 1277 head_block, max_distance, tail_cycle,
1283 tail_block); 1278 tail_block);
1284 if (error) 1279 if (error)
1285 return error; 1280 return error;
1286 } else { 1281 } else {
1287 /* 1282 /*
1288 * We need to wrap around the end of the physical log in 1283 * We need to wrap around the end of the physical log in
1289 * order to clear all the blocks. Do it in two separate 1284 * order to clear all the blocks. Do it in two separate
1290 * I/Os. The first write should be from the head to the 1285 * I/Os. The first write should be from the head to the
1291 * end of the physical log, and it should use the current 1286 * end of the physical log, and it should use the current
1292 * cycle number minus one just like above. 1287 * cycle number minus one just like above.
1293 */ 1288 */
1294 distance = log->l_logBBsize - head_block; 1289 distance = log->l_logBBsize - head_block;
1295 error = xlog_write_log_records(log, (head_cycle - 1), 1290 error = xlog_write_log_records(log, (head_cycle - 1),
1296 head_block, distance, tail_cycle, 1291 head_block, distance, tail_cycle,
1297 tail_block); 1292 tail_block);
1298 1293
1299 if (error) 1294 if (error)
1300 return error; 1295 return error;
1301 1296
1302 /* 1297 /*
1303 * Now write the blocks at the start of the physical log. 1298 * Now write the blocks at the start of the physical log.
1304 * This writes the remainder of the blocks we want to clear. 1299 * This writes the remainder of the blocks we want to clear.
1305 * It uses the current cycle number since we're now on the 1300 * It uses the current cycle number since we're now on the
1306 * same cycle as the head so that we get: 1301 * same cycle as the head so that we get:
1307 * n ... n ... | n - 1 ... 1302 * n ... n ... | n - 1 ...
1308 * ^^^^^ blocks we're writing 1303 * ^^^^^ blocks we're writing
1309 */ 1304 */
1310 distance = max_distance - (log->l_logBBsize - head_block); 1305 distance = max_distance - (log->l_logBBsize - head_block);
1311 error = xlog_write_log_records(log, head_cycle, 0, distance, 1306 error = xlog_write_log_records(log, head_cycle, 0, distance,
1312 tail_cycle, tail_block); 1307 tail_cycle, tail_block);
1313 if (error) 1308 if (error)
1314 return error; 1309 return error;
1315 } 1310 }
1316 1311
1317 return 0; 1312 return 0;
1318 } 1313 }
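
The clearing logic above issues either one write or two, depending on whether the range starting at the head wraps past the physical end of the log. A tiny sketch of that split on a toy circular log; the names and printf-based "writes" are illustrative only:

    #include <stdio.h>

    /* Report the one or two contiguous extents needed to clear
     * [head_block, head_block + max_distance) on a circular log. */
    static void toy_clear_range(int head_block, int max_distance, int log_size)
    {
            if (head_block + max_distance <= log_size) {
                    printf("one write: blocks %d..%d\n",
                           head_block, head_block + max_distance - 1);
                    return;
            }
            /* First write runs to the physical end of the log ... */
            printf("write 1: blocks %d..%d\n", head_block, log_size - 1);
            /* ... the second wraps to the start for the remainder. */
            printf("write 2: blocks 0..%d\n",
                   max_distance - (log_size - head_block) - 1);
    }

    int main(void)
    {
            toy_clear_range(900, 50, 1000); /* fits: one write            */
            toy_clear_range(980, 50, 1000); /* wraps: 980..999 then 0..29 */
            return 0;
    }
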
1319 1314
1320 /****************************************************************************** 1315 /******************************************************************************
1321 * 1316 *
1322 * Log recover routines 1317 * Log recover routines
1323 * 1318 *
1324 ****************************************************************************** 1319 ******************************************************************************
1325 */ 1320 */
1326 1321
1327 STATIC xlog_recover_t * 1322 STATIC xlog_recover_t *
1328 xlog_recover_find_tid( 1323 xlog_recover_find_tid(
1329 xlog_recover_t *q, 1324 xlog_recover_t *q,
1330 xlog_tid_t tid) 1325 xlog_tid_t tid)
1331 { 1326 {
1332 xlog_recover_t *p = q; 1327 xlog_recover_t *p = q;
1333 1328
1334 while (p != NULL) { 1329 while (p != NULL) {
1335 if (p->r_log_tid == tid) 1330 if (p->r_log_tid == tid)
1336 break; 1331 break;
1337 p = p->r_next; 1332 p = p->r_next;
1338 } 1333 }
1339 return p; 1334 return p;
1340 } 1335 }
1341 1336
1342 STATIC void 1337 STATIC void
1343 xlog_recover_put_hashq( 1338 xlog_recover_put_hashq(
1344 xlog_recover_t **q, 1339 xlog_recover_t **q,
1345 xlog_recover_t *trans) 1340 xlog_recover_t *trans)
1346 { 1341 {
1347 trans->r_next = *q; 1342 trans->r_next = *q;
1348 *q = trans; 1343 *q = trans;
1349 } 1344 }
1350 1345
1351 STATIC void 1346 STATIC void
1352 xlog_recover_add_item( 1347 xlog_recover_add_item(
1353 xlog_recover_item_t **itemq) 1348 xlog_recover_item_t **itemq)
1354 { 1349 {
1355 xlog_recover_item_t *item; 1350 xlog_recover_item_t *item;
1356 1351
1357 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); 1352 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1358 xlog_recover_insert_item_backq(itemq, item); 1353 xlog_recover_insert_item_backq(itemq, item);
1359 } 1354 }
1360 1355
1361 STATIC int 1356 STATIC int
1362 xlog_recover_add_to_cont_trans( 1357 xlog_recover_add_to_cont_trans(
1363 xlog_recover_t *trans, 1358 xlog_recover_t *trans,
1364 xfs_caddr_t dp, 1359 xfs_caddr_t dp,
1365 int len) 1360 int len)
1366 { 1361 {
1367 xlog_recover_item_t *item; 1362 xlog_recover_item_t *item;
1368 xfs_caddr_t ptr, old_ptr; 1363 xfs_caddr_t ptr, old_ptr;
1369 int old_len; 1364 int old_len;
1370 1365
1371 item = trans->r_itemq; 1366 item = trans->r_itemq;
1372 if (item == NULL) { 1367 if (item == NULL) {
1373 /* finish copying rest of trans header */ 1368 /* finish copying rest of trans header */
1374 xlog_recover_add_item(&trans->r_itemq); 1369 xlog_recover_add_item(&trans->r_itemq);
1375 ptr = (xfs_caddr_t) &trans->r_theader + 1370 ptr = (xfs_caddr_t) &trans->r_theader +
1376 sizeof(xfs_trans_header_t) - len; 1371 sizeof(xfs_trans_header_t) - len;
1377 memcpy(ptr, dp, len); /* d, s, l */ 1372 memcpy(ptr, dp, len); /* d, s, l */
1378 return 0; 1373 return 0;
1379 } 1374 }
1380 item = item->ri_prev; 1375 item = item->ri_prev;
1381 1376
1382 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; 1377 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1383 old_len = item->ri_buf[item->ri_cnt-1].i_len; 1378 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1384 1379
1385 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); 1380 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
1386 memcpy(&ptr[old_len], dp, len); /* d, s, l */ 1381 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1387 item->ri_buf[item->ri_cnt-1].i_len += len; 1382 item->ri_buf[item->ri_cnt-1].i_len += len;
1388 item->ri_buf[item->ri_cnt-1].i_addr = ptr; 1383 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1389 return 0; 1384 return 0;
1390 } 1385 }
1391 1386
1392 /* 1387 /*
1393 * The next region to add is the start of a new region. It could be 1388 * The next region to add is the start of a new region. It could be
1394 * a whole region or it could be the first part of a new region. Because 1389 * a whole region or it could be the first part of a new region. Because
1395 * of this, the assumption here is that the type and size fields of all 1390 * of this, the assumption here is that the type and size fields of all
1396 * format structures fit into the first 32 bits of the structure. 1391 * format structures fit into the first 32 bits of the structure.
1397 * 1392 *
1398 * This works because all regions must be 32 bit aligned. Therefore, we 1393 * This works because all regions must be 32 bit aligned. Therefore, we
1399 * either have both fields or we have neither field. In the case we have 1394 * either have both fields or we have neither field. In the case we have
1400 * neither field, the data part of the region is zero length. We only have 1395 * neither field, the data part of the region is zero length. We only have
1401 * a log_op_header and can throw away the header since a new one will appear 1396 * a log_op_header and can throw away the header since a new one will appear
1402 * later. If we have at least 4 bytes, then we can determine how many regions 1397 * later. If we have at least 4 bytes, then we can determine how many regions
1403 * will appear in the current log item. 1398 * will appear in the current log item.
1404 */ 1399 */
1405 STATIC int 1400 STATIC int
1406 xlog_recover_add_to_trans( 1401 xlog_recover_add_to_trans(
1407 xlog_recover_t *trans, 1402 xlog_recover_t *trans,
1408 xfs_caddr_t dp, 1403 xfs_caddr_t dp,
1409 int len) 1404 int len)
1410 { 1405 {
1411 xfs_inode_log_format_t *in_f; /* any will do */ 1406 xfs_inode_log_format_t *in_f; /* any will do */
1412 xlog_recover_item_t *item; 1407 xlog_recover_item_t *item;
1413 xfs_caddr_t ptr; 1408 xfs_caddr_t ptr;
1414 1409
1415 if (!len) 1410 if (!len)
1416 return 0; 1411 return 0;
1417 item = trans->r_itemq; 1412 item = trans->r_itemq;
1418 if (item == NULL) { 1413 if (item == NULL) {
1419 /* we need to catch log corruptions here */ 1414 /* we need to catch log corruptions here */
1420 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { 1415 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1421 xlog_warn("XFS: xlog_recover_add_to_trans: " 1416 xlog_warn("XFS: xlog_recover_add_to_trans: "
1422 "bad header magic number"); 1417 "bad header magic number");
1423 ASSERT(0); 1418 ASSERT(0);
1424 return XFS_ERROR(EIO); 1419 return XFS_ERROR(EIO);
1425 } 1420 }
1426 if (len == sizeof(xfs_trans_header_t)) 1421 if (len == sizeof(xfs_trans_header_t))
1427 xlog_recover_add_item(&trans->r_itemq); 1422 xlog_recover_add_item(&trans->r_itemq);
1428 memcpy(&trans->r_theader, dp, len); /* d, s, l */ 1423 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1429 return 0; 1424 return 0;
1430 } 1425 }
1431 1426
1432 ptr = kmem_alloc(len, KM_SLEEP); 1427 ptr = kmem_alloc(len, KM_SLEEP);
1433 memcpy(ptr, dp, len); 1428 memcpy(ptr, dp, len);
1434 in_f = (xfs_inode_log_format_t *)ptr; 1429 in_f = (xfs_inode_log_format_t *)ptr;
1435 1430
1436 if (item->ri_prev->ri_total != 0 && 1431 if (item->ri_prev->ri_total != 0 &&
1437 item->ri_prev->ri_total == item->ri_prev->ri_cnt) { 1432 item->ri_prev->ri_total == item->ri_prev->ri_cnt) {
1438 xlog_recover_add_item(&trans->r_itemq); 1433 xlog_recover_add_item(&trans->r_itemq);
1439 } 1434 }
1440 item = trans->r_itemq; 1435 item = trans->r_itemq;
1441 item = item->ri_prev; 1436 item = item->ri_prev;
1442 1437
1443 if (item->ri_total == 0) { /* first region to be added */ 1438 if (item->ri_total == 0) { /* first region to be added */
1444 item->ri_total = in_f->ilf_size; 1439 item->ri_total = in_f->ilf_size;
1445 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); 1440 ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM);
1446 item->ri_buf = kmem_zalloc((item->ri_total * 1441 item->ri_buf = kmem_zalloc((item->ri_total *
1447 sizeof(xfs_log_iovec_t)), KM_SLEEP); 1442 sizeof(xfs_log_iovec_t)), KM_SLEEP);
1448 } 1443 }
1449 ASSERT(item->ri_total > item->ri_cnt); 1444 ASSERT(item->ri_total > item->ri_cnt);
1450 /* Description region is ri_buf[0] */ 1445 /* Description region is ri_buf[0] */
1451 item->ri_buf[item->ri_cnt].i_addr = ptr; 1446 item->ri_buf[item->ri_cnt].i_addr = ptr;
1452 item->ri_buf[item->ri_cnt].i_len = len; 1447 item->ri_buf[item->ri_cnt].i_len = len;
1453 item->ri_cnt++; 1448 item->ri_cnt++;
1454 return 0; 1449 return 0;
1455 } 1450 }
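
The comment ahead of xlog_recover_add_to_trans() relies on every log item format structure starting with a 16-bit type and a 16-bit region count, so the first four bytes of a copied region are enough to learn how many regions the item will carry. A small sketch of that peek, using a made-up structure in place of xfs_inode_log_format_t and its siblings:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Illustrative only: models the shared 16-bit type + 16-bit region-count
     * prefix that the recovery code depends on. */
    struct toy_log_format {
            uint16_t lf_type;       /* log item type */
            uint16_t lf_size;       /* number of regions in this item */
            /* ... item-specific fields follow in the real structures ... */
    };

    int main(void)
    {
            unsigned char region[64];
            struct toy_log_format f = { .lf_type = 0x1234, .lf_size = 3 };
            struct toy_log_format peek;

            memcpy(region, &f, sizeof(f));          /* as copied out of the log */
            memcpy(&peek, region, sizeof(peek));    /* the 4-byte peek */
            printf("item type 0x%x, expecting %u regions\n",
                   (unsigned)peek.lf_type, (unsigned)peek.lf_size);
            return 0;
    }
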
1456 1451
1457 STATIC void 1452 STATIC void
1458 xlog_recover_new_tid( 1453 xlog_recover_new_tid(
1459 xlog_recover_t **q, 1454 xlog_recover_t **q,
1460 xlog_tid_t tid, 1455 xlog_tid_t tid,
1461 xfs_lsn_t lsn) 1456 xfs_lsn_t lsn)
1462 { 1457 {
1463 xlog_recover_t *trans; 1458 xlog_recover_t *trans;
1464 1459
1465 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); 1460 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1466 trans->r_log_tid = tid; 1461 trans->r_log_tid = tid;
1467 trans->r_lsn = lsn; 1462 trans->r_lsn = lsn;
1468 xlog_recover_put_hashq(q, trans); 1463 xlog_recover_put_hashq(q, trans);
1469 } 1464 }
1470 1465
1471 STATIC int 1466 STATIC int
1472 xlog_recover_unlink_tid( 1467 xlog_recover_unlink_tid(
1473 xlog_recover_t **q, 1468 xlog_recover_t **q,
1474 xlog_recover_t *trans) 1469 xlog_recover_t *trans)
1475 { 1470 {
1476 xlog_recover_t *tp; 1471 xlog_recover_t *tp;
1477 int found = 0; 1472 int found = 0;
1478 1473
1479 ASSERT(trans != NULL); 1474 ASSERT(trans != NULL);
1480 if (trans == *q) { 1475 if (trans == *q) {
1481 *q = (*q)->r_next; 1476 *q = (*q)->r_next;
1482 } else { 1477 } else {
1483 tp = *q; 1478 tp = *q;
1484 while (tp) { 1479 while (tp) {
1485 if (tp->r_next == trans) { 1480 if (tp->r_next == trans) {
1486 found = 1; 1481 found = 1;
1487 break; 1482 break;
1488 } 1483 }
1489 tp = tp->r_next; 1484 tp = tp->r_next;
1490 } 1485 }
1491 if (!found) { 1486 if (!found) {
1492 xlog_warn( 1487 xlog_warn(
1493 "XFS: xlog_recover_unlink_tid: trans not found"); 1488 "XFS: xlog_recover_unlink_tid: trans not found");
1494 ASSERT(0); 1489 ASSERT(0);
1495 return XFS_ERROR(EIO); 1490 return XFS_ERROR(EIO);
1496 } 1491 }
1497 tp->r_next = tp->r_next->r_next; 1492 tp->r_next = tp->r_next->r_next;
1498 } 1493 }
1499 return 0; 1494 return 0;
1500 } 1495 }
1501 1496
1502 STATIC void 1497 STATIC void
1503 xlog_recover_insert_item_backq( 1498 xlog_recover_insert_item_backq(
1504 xlog_recover_item_t **q, 1499 xlog_recover_item_t **q,
1505 xlog_recover_item_t *item) 1500 xlog_recover_item_t *item)
1506 { 1501 {
1507 if (*q == NULL) { 1502 if (*q == NULL) {
1508 item->ri_prev = item->ri_next = item; 1503 item->ri_prev = item->ri_next = item;
1509 *q = item; 1504 *q = item;
1510 } else { 1505 } else {
1511 item->ri_next = *q; 1506 item->ri_next = *q;
1512 item->ri_prev = (*q)->ri_prev; 1507 item->ri_prev = (*q)->ri_prev;
1513 (*q)->ri_prev = item; 1508 (*q)->ri_prev = item;
1514 item->ri_prev->ri_next = item; 1509 item->ri_prev->ri_next = item;
1515 } 1510 }
1516 } 1511 }
1517 1512
1518 STATIC void 1513 STATIC void
1519 xlog_recover_insert_item_frontq( 1514 xlog_recover_insert_item_frontq(
1520 xlog_recover_item_t **q, 1515 xlog_recover_item_t **q,
1521 xlog_recover_item_t *item) 1516 xlog_recover_item_t *item)
1522 { 1517 {
1523 xlog_recover_insert_item_backq(q, item); 1518 xlog_recover_insert_item_backq(q, item);
1524 *q = item; 1519 *q = item;
1525 } 1520 }
1526 1521
1527 STATIC int 1522 STATIC int
1528 xlog_recover_reorder_trans( 1523 xlog_recover_reorder_trans(
1529 xlog_recover_t *trans) 1524 xlog_recover_t *trans)
1530 { 1525 {
1531 xlog_recover_item_t *first_item, *itemq, *itemq_next; 1526 xlog_recover_item_t *first_item, *itemq, *itemq_next;
1532 xfs_buf_log_format_t *buf_f; 1527 xfs_buf_log_format_t *buf_f;
1533 ushort flags = 0; 1528 ushort flags = 0;
1534 1529
1535 first_item = itemq = trans->r_itemq; 1530 first_item = itemq = trans->r_itemq;
1536 trans->r_itemq = NULL; 1531 trans->r_itemq = NULL;
1537 do { 1532 do {
1538 itemq_next = itemq->ri_next; 1533 itemq_next = itemq->ri_next;
1539 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; 1534 buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr;
1540 1535
1541 switch (ITEM_TYPE(itemq)) { 1536 switch (ITEM_TYPE(itemq)) {
1542 case XFS_LI_BUF: 1537 case XFS_LI_BUF:
1543 flags = buf_f->blf_flags; 1538 flags = buf_f->blf_flags;
1544 if (!(flags & XFS_BLI_CANCEL)) { 1539 if (!(flags & XFS_BLI_CANCEL)) {
1545 xlog_recover_insert_item_frontq(&trans->r_itemq, 1540 xlog_recover_insert_item_frontq(&trans->r_itemq,
1546 itemq); 1541 itemq);
1547 break; 1542 break;
1548 } 1543 }
1549 case XFS_LI_INODE: 1544 case XFS_LI_INODE:
1550 case XFS_LI_DQUOT: 1545 case XFS_LI_DQUOT:
1551 case XFS_LI_QUOTAOFF: 1546 case XFS_LI_QUOTAOFF:
1552 case XFS_LI_EFD: 1547 case XFS_LI_EFD:
1553 case XFS_LI_EFI: 1548 case XFS_LI_EFI:
1554 xlog_recover_insert_item_backq(&trans->r_itemq, itemq); 1549 xlog_recover_insert_item_backq(&trans->r_itemq, itemq);
1555 break; 1550 break;
1556 default: 1551 default:
1557 xlog_warn( 1552 xlog_warn(
1558 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); 1553 "XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
1559 ASSERT(0); 1554 ASSERT(0);
1560 return XFS_ERROR(EIO); 1555 return XFS_ERROR(EIO);
1561 } 1556 }
1562 itemq = itemq_next; 1557 itemq = itemq_next;
1563 } while (first_item != itemq); 1558 } while (first_item != itemq);
1564 return 0; 1559 return 0;
1565 } 1560 }
1566 1561
1567 /* 1562 /*
1568 * Build up the table of buf cancel records so that we don't replay 1563 * Build up the table of buf cancel records so that we don't replay
1569 * cancelled data in the second pass. For buffer records that are 1564 * cancelled data in the second pass. For buffer records that are
1570 * not cancel records, there is nothing to do here so we just return. 1565 * not cancel records, there is nothing to do here so we just return.
1571 * 1566 *
1572 * If we get a cancel record which is already in the table, this indicates 1567 * If we get a cancel record which is already in the table, this indicates
1573 * that the buffer was cancelled multiple times. In order to ensure 1568 * that the buffer was cancelled multiple times. In order to ensure
1574 * that during pass 2 we keep the record in the table until we reach its 1569 * that during pass 2 we keep the record in the table until we reach its
1575 * last occurrence in the log, we keep a reference count in the cancel 1570 * last occurrence in the log, we keep a reference count in the cancel
1576 * record in the table to tell us how many times we expect to see this 1571 * record in the table to tell us how many times we expect to see this
1577 * record during the second pass. 1572 * record during the second pass.
1578 */ 1573 */
1579 STATIC void 1574 STATIC void
1580 xlog_recover_do_buffer_pass1( 1575 xlog_recover_do_buffer_pass1(
1581 xlog_t *log, 1576 xlog_t *log,
1582 xfs_buf_log_format_t *buf_f) 1577 xfs_buf_log_format_t *buf_f)
1583 { 1578 {
1584 xfs_buf_cancel_t *bcp; 1579 xfs_buf_cancel_t *bcp;
1585 xfs_buf_cancel_t *nextp; 1580 xfs_buf_cancel_t *nextp;
1586 xfs_buf_cancel_t *prevp; 1581 xfs_buf_cancel_t *prevp;
1587 xfs_buf_cancel_t **bucket; 1582 xfs_buf_cancel_t **bucket;
1588 xfs_daddr_t blkno = 0; 1583 xfs_daddr_t blkno = 0;
1589 uint len = 0; 1584 uint len = 0;
1590 ushort flags = 0; 1585 ushort flags = 0;
1591 1586
1592 switch (buf_f->blf_type) { 1587 switch (buf_f->blf_type) {
1593 case XFS_LI_BUF: 1588 case XFS_LI_BUF:
1594 blkno = buf_f->blf_blkno; 1589 blkno = buf_f->blf_blkno;
1595 len = buf_f->blf_len; 1590 len = buf_f->blf_len;
1596 flags = buf_f->blf_flags; 1591 flags = buf_f->blf_flags;
1597 break; 1592 break;
1598 } 1593 }
1599 1594
1600 /* 1595 /*
1601 * If this isn't a cancel buffer item, then just return. 1596 * If this isn't a cancel buffer item, then just return.
1602 */ 1597 */
1603 if (!(flags & XFS_BLI_CANCEL)) 1598 if (!(flags & XFS_BLI_CANCEL))
1604 return; 1599 return;
1605 1600
1606 /* 1601 /*
1607 * Insert an xfs_buf_cancel record into the hash table of 1602 * Insert an xfs_buf_cancel record into the hash table of
1608 * cancel records. If there is already an identical record, bump 1603 * cancel records. If there is already an identical record, bump
1609 * its reference count. 1604 * its reference count.
1610 */ 1605 */
1611 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % 1606 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1612 XLOG_BC_TABLE_SIZE]; 1607 XLOG_BC_TABLE_SIZE];
1613 /* 1608 /*
1614 * If the hash bucket is empty then just insert a new record into 1609 * If the hash bucket is empty then just insert a new record into
1615 * the bucket. 1610 * the bucket.
1616 */ 1611 */
1617 if (*bucket == NULL) { 1612 if (*bucket == NULL) {
1618 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1613 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1619 KM_SLEEP); 1614 KM_SLEEP);
1620 bcp->bc_blkno = blkno; 1615 bcp->bc_blkno = blkno;
1621 bcp->bc_len = len; 1616 bcp->bc_len = len;
1622 bcp->bc_refcount = 1; 1617 bcp->bc_refcount = 1;
1623 bcp->bc_next = NULL; 1618 bcp->bc_next = NULL;
1624 *bucket = bcp; 1619 *bucket = bcp;
1625 return; 1620 return;
1626 } 1621 }
1627 1622
1628 /* 1623 /*
1629 * The hash bucket is not empty, so search for duplicates of our 1624 * The hash bucket is not empty, so search for duplicates of our
1630 * record. If we find one then just bump its refcount. If not 1625 * record. If we find one then just bump its refcount. If not
1631 * then add us at the end of the list. 1626 * then add us at the end of the list.
1632 */ 1627 */
1633 prevp = NULL; 1628 prevp = NULL;
1634 nextp = *bucket; 1629 nextp = *bucket;
1635 while (nextp != NULL) { 1630 while (nextp != NULL) {
1636 if (nextp->bc_blkno == blkno && nextp->bc_len == len) { 1631 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1637 nextp->bc_refcount++; 1632 nextp->bc_refcount++;
1638 return; 1633 return;
1639 } 1634 }
1640 prevp = nextp; 1635 prevp = nextp;
1641 nextp = nextp->bc_next; 1636 nextp = nextp->bc_next;
1642 } 1637 }
1643 ASSERT(prevp != NULL); 1638 ASSERT(prevp != NULL);
1644 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), 1639 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1645 KM_SLEEP); 1640 KM_SLEEP);
1646 bcp->bc_blkno = blkno; 1641 bcp->bc_blkno = blkno;
1647 bcp->bc_len = len; 1642 bcp->bc_len = len;
1648 bcp->bc_refcount = 1; 1643 bcp->bc_refcount = 1;
1649 bcp->bc_next = NULL; 1644 bcp->bc_next = NULL;
1650 prevp->bc_next = bcp; 1645 prevp->bc_next = bcp;
1651 } 1646 }
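
The pass-1 bookkeeping above boils down to: hash the cancelled buffer's start block into a bucket of the cancel table, then either append a new xfs_buf_cancel record or bump bc_refcount when the same (blkno, len) pair is cancelled again. A stand-alone sketch of the bucket arithmetic, assuming the table size of 64 that xfs_log_recover.h defines for XLOG_BC_TABLE_SIZE (the constant's value is an assumption here, not something shown in this diff):

	/* Illustrative sketch; ASSUMED_XLOG_BC_TABLE_SIZE is an assumed value. */
	#include <stdint.h>
	#include <stdio.h>

	#define ASSUMED_XLOG_BC_TABLE_SIZE	64

	int main(void)
	{
		uint64_t	blkno = 1000;

		/* mirrors (__uint64_t)blkno % XLOG_BC_TABLE_SIZE in the function above */
		printf("cancel record for block %llu lands in bucket %llu\n",
		       (unsigned long long)blkno,
		       (unsigned long long)(blkno % ASSUMED_XLOG_BC_TABLE_SIZE));	/* bucket 40 */
		return 0;
	}

A second XFS_BLI_CANCEL record for the same block and length hashes to the same bucket and only increments bc_refcount, which is what lets pass 2 keep the entry alive until its last occurrence in the log.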
1652 1647
1653 /* 1648 /*
1654 * Check to see whether the buffer being recovered has a corresponding 1649 * Check to see whether the buffer being recovered has a corresponding
1655 * entry in the buffer cancel record table. If it does then return 1 1650 * entry in the buffer cancel record table. If it does then return 1
1656 * so that it will be cancelled, otherwise return 0. If the buffer is 1651 * so that it will be cancelled, otherwise return 0. If the buffer is
1657 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement 1652 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1658 * the refcount on the entry in the table and remove it from the table 1653 * the refcount on the entry in the table and remove it from the table
1659 * if this is the last reference. 1654 * if this is the last reference.
1660 * 1655 *
1661 * We remove the cancel record from the table when we encounter its 1656 * We remove the cancel record from the table when we encounter its
1662 * last occurrence in the log so that if the same buffer is re-used 1657 * last occurrence in the log so that if the same buffer is re-used
1663 * again after its last cancellation we actually replay the changes 1658 * again after its last cancellation we actually replay the changes
1664 * made at that point. 1659 * made at that point.
1665 */ 1660 */
1666 STATIC int 1661 STATIC int
1667 xlog_check_buffer_cancelled( 1662 xlog_check_buffer_cancelled(
1668 xlog_t *log, 1663 xlog_t *log,
1669 xfs_daddr_t blkno, 1664 xfs_daddr_t blkno,
1670 uint len, 1665 uint len,
1671 ushort flags) 1666 ushort flags)
1672 { 1667 {
1673 xfs_buf_cancel_t *bcp; 1668 xfs_buf_cancel_t *bcp;
1674 xfs_buf_cancel_t *prevp; 1669 xfs_buf_cancel_t *prevp;
1675 xfs_buf_cancel_t **bucket; 1670 xfs_buf_cancel_t **bucket;
1676 1671
1677 if (log->l_buf_cancel_table == NULL) { 1672 if (log->l_buf_cancel_table == NULL) {
1678 /* 1673 /*
1679 * There is nothing in the table built in pass one, 1674 * There is nothing in the table built in pass one,
1680 * so this buffer must not be cancelled. 1675 * so this buffer must not be cancelled.
1681 */ 1676 */
1682 ASSERT(!(flags & XFS_BLI_CANCEL)); 1677 ASSERT(!(flags & XFS_BLI_CANCEL));
1683 return 0; 1678 return 0;
1684 } 1679 }
1685 1680
1686 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % 1681 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1687 XLOG_BC_TABLE_SIZE]; 1682 XLOG_BC_TABLE_SIZE];
1688 bcp = *bucket; 1683 bcp = *bucket;
1689 if (bcp == NULL) { 1684 if (bcp == NULL) {
1690 /* 1685 /*
1691 * There is no corresponding entry in the table built 1686 * There is no corresponding entry in the table built
1692 * in pass one, so this buffer has not been cancelled. 1687 * in pass one, so this buffer has not been cancelled.
1693 */ 1688 */
1694 ASSERT(!(flags & XFS_BLI_CANCEL)); 1689 ASSERT(!(flags & XFS_BLI_CANCEL));
1695 return 0; 1690 return 0;
1696 } 1691 }
1697 1692
1698 /* 1693 /*
1699 * Search for an entry in the buffer cancel table that 1694 * Search for an entry in the buffer cancel table that
1700 * matches our buffer. 1695 * matches our buffer.
1701 */ 1696 */
1702 prevp = NULL; 1697 prevp = NULL;
1703 while (bcp != NULL) { 1698 while (bcp != NULL) {
1704 if (bcp->bc_blkno == blkno && bcp->bc_len == len) { 1699 if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1705 /* 1700 /*
1706 * We've got a match, so return 1 so that the 1701 * We've got a match, so return 1 so that the
1707 * recovery of this buffer is cancelled. 1702 * recovery of this buffer is cancelled.
1708 * If this buffer is actually a buffer cancel 1703 * If this buffer is actually a buffer cancel
1709 * log item, then decrement the refcount on the 1704 * log item, then decrement the refcount on the
1710 * one in the table and remove it if this is the 1705 * one in the table and remove it if this is the
1711 * last reference. 1706 * last reference.
1712 */ 1707 */
1713 if (flags & XFS_BLI_CANCEL) { 1708 if (flags & XFS_BLI_CANCEL) {
1714 bcp->bc_refcount--; 1709 bcp->bc_refcount--;
1715 if (bcp->bc_refcount == 0) { 1710 if (bcp->bc_refcount == 0) {
1716 if (prevp == NULL) { 1711 if (prevp == NULL) {
1717 *bucket = bcp->bc_next; 1712 *bucket = bcp->bc_next;
1718 } else { 1713 } else {
1719 prevp->bc_next = bcp->bc_next; 1714 prevp->bc_next = bcp->bc_next;
1720 } 1715 }
1721 kmem_free(bcp); 1716 kmem_free(bcp);
1722 } 1717 }
1723 } 1718 }
1724 return 1; 1719 return 1;
1725 } 1720 }
1726 prevp = bcp; 1721 prevp = bcp;
1727 bcp = bcp->bc_next; 1722 bcp = bcp->bc_next;
1728 } 1723 }
1729 /* 1724 /*
1730 * We didn't find a corresponding entry in the table, so 1725 * We didn't find a corresponding entry in the table, so
1731 * return 0 so that the buffer is NOT cancelled. 1726 * return 0 so that the buffer is NOT cancelled.
1732 */ 1727 */
1733 ASSERT(!(flags & XFS_BLI_CANCEL)); 1728 ASSERT(!(flags & XFS_BLI_CANCEL));
1734 return 0; 1729 return 0;
1735 } 1730 }
1736 1731
1737 STATIC int 1732 STATIC int
1738 xlog_recover_do_buffer_pass2( 1733 xlog_recover_do_buffer_pass2(
1739 xlog_t *log, 1734 xlog_t *log,
1740 xfs_buf_log_format_t *buf_f) 1735 xfs_buf_log_format_t *buf_f)
1741 { 1736 {
1742 xfs_daddr_t blkno = 0; 1737 xfs_daddr_t blkno = 0;
1743 ushort flags = 0; 1738 ushort flags = 0;
1744 uint len = 0; 1739 uint len = 0;
1745 1740
1746 switch (buf_f->blf_type) { 1741 switch (buf_f->blf_type) {
1747 case XFS_LI_BUF: 1742 case XFS_LI_BUF:
1748 blkno = buf_f->blf_blkno; 1743 blkno = buf_f->blf_blkno;
1749 flags = buf_f->blf_flags; 1744 flags = buf_f->blf_flags;
1750 len = buf_f->blf_len; 1745 len = buf_f->blf_len;
1751 break; 1746 break;
1752 } 1747 }
1753 1748
1754 return xlog_check_buffer_cancelled(log, blkno, len, flags); 1749 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1755 } 1750 }
1756 1751
1757 /* 1752 /*
1758 * Perform recovery for a buffer full of inodes. In these buffers, 1753 * Perform recovery for a buffer full of inodes. In these buffers,
1759 * the only data which should be recovered is that which corresponds 1754 * the only data which should be recovered is that which corresponds
1760 * to the di_next_unlinked pointers in the on disk inode structures. 1755 * to the di_next_unlinked pointers in the on disk inode structures.
1761 * The rest of the data for the inodes is always logged through the 1756 * The rest of the data for the inodes is always logged through the
1762 * inodes themselves rather than the inode buffer and is recovered 1757 * inodes themselves rather than the inode buffer and is recovered
1763 * in xlog_recover_do_inode_trans(). 1758 * in xlog_recover_do_inode_trans().
1764 * 1759 *
1765 * The only time when buffers full of inodes are fully recovered is 1760 * The only time when buffers full of inodes are fully recovered is
1766 * when the buffer is full of newly allocated inodes. In this case 1761 * when the buffer is full of newly allocated inodes. In this case
1767 * the buffer will not be marked as an inode buffer and so will be 1762 * the buffer will not be marked as an inode buffer and so will be
1768 * sent to xlog_recover_do_reg_buffer() below during recovery. 1763 * sent to xlog_recover_do_reg_buffer() below during recovery.
1769 */ 1764 */
1770 STATIC int 1765 STATIC int
1771 xlog_recover_do_inode_buffer( 1766 xlog_recover_do_inode_buffer(
1772 xfs_mount_t *mp, 1767 xfs_mount_t *mp,
1773 xlog_recover_item_t *item, 1768 xlog_recover_item_t *item,
1774 xfs_buf_t *bp, 1769 xfs_buf_t *bp,
1775 xfs_buf_log_format_t *buf_f) 1770 xfs_buf_log_format_t *buf_f)
1776 { 1771 {
1777 int i; 1772 int i;
1778 int item_index; 1773 int item_index;
1779 int bit; 1774 int bit;
1780 int nbits; 1775 int nbits;
1781 int reg_buf_offset; 1776 int reg_buf_offset;
1782 int reg_buf_bytes; 1777 int reg_buf_bytes;
1783 int next_unlinked_offset; 1778 int next_unlinked_offset;
1784 int inodes_per_buf; 1779 int inodes_per_buf;
1785 xfs_agino_t *logged_nextp; 1780 xfs_agino_t *logged_nextp;
1786 xfs_agino_t *buffer_nextp; 1781 xfs_agino_t *buffer_nextp;
1787 unsigned int *data_map = NULL; 1782 unsigned int *data_map = NULL;
1788 unsigned int map_size = 0; 1783 unsigned int map_size = 0;
1789 1784
1790 switch (buf_f->blf_type) { 1785 switch (buf_f->blf_type) {
1791 case XFS_LI_BUF: 1786 case XFS_LI_BUF:
1792 data_map = buf_f->blf_data_map; 1787 data_map = buf_f->blf_data_map;
1793 map_size = buf_f->blf_map_size; 1788 map_size = buf_f->blf_map_size;
1794 break; 1789 break;
1795 } 1790 }
1796 /* 1791 /*
1797 * Set the variables corresponding to the current region to 1792 * Set the variables corresponding to the current region to
1798 * 0 so that we'll initialize them on the first pass through 1793 * 0 so that we'll initialize them on the first pass through
1799 * the loop. 1794 * the loop.
1800 */ 1795 */
1801 reg_buf_offset = 0; 1796 reg_buf_offset = 0;
1802 reg_buf_bytes = 0; 1797 reg_buf_bytes = 0;
1803 bit = 0; 1798 bit = 0;
1804 nbits = 0; 1799 nbits = 0;
1805 item_index = 0; 1800 item_index = 0;
1806 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; 1801 inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1807 for (i = 0; i < inodes_per_buf; i++) { 1802 for (i = 0; i < inodes_per_buf; i++) {
1808 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + 1803 next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1809 offsetof(xfs_dinode_t, di_next_unlinked); 1804 offsetof(xfs_dinode_t, di_next_unlinked);
1810 1805
1811 while (next_unlinked_offset >= 1806 while (next_unlinked_offset >=
1812 (reg_buf_offset + reg_buf_bytes)) { 1807 (reg_buf_offset + reg_buf_bytes)) {
1813 /* 1808 /*
1814 * The next di_next_unlinked field is beyond 1809 * The next di_next_unlinked field is beyond
1815 * the current logged region. Find the next 1810 * the current logged region. Find the next
1816 * logged region that contains or is beyond 1811 * logged region that contains or is beyond
1817 * the current di_next_unlinked field. 1812 * the current di_next_unlinked field.
1818 */ 1813 */
1819 bit += nbits; 1814 bit += nbits;
1820 bit = xfs_next_bit(data_map, map_size, bit); 1815 bit = xfs_next_bit(data_map, map_size, bit);
1821 1816
1822 /* 1817 /*
1823 * If there are no more logged regions in the 1818 * If there are no more logged regions in the
1824 * buffer, then we're done. 1819 * buffer, then we're done.
1825 */ 1820 */
1826 if (bit == -1) { 1821 if (bit == -1) {
1827 return 0; 1822 return 0;
1828 } 1823 }
1829 1824
1830 nbits = xfs_contig_bits(data_map, map_size, 1825 nbits = xfs_contig_bits(data_map, map_size,
1831 bit); 1826 bit);
1832 ASSERT(nbits > 0); 1827 ASSERT(nbits > 0);
1833 reg_buf_offset = bit << XFS_BLI_SHIFT; 1828 reg_buf_offset = bit << XFS_BLI_SHIFT;
1834 reg_buf_bytes = nbits << XFS_BLI_SHIFT; 1829 reg_buf_bytes = nbits << XFS_BLI_SHIFT;
1835 item_index++; 1830 item_index++;
1836 } 1831 }
1837 1832
1838 /* 1833 /*
1839 * If the current logged region starts after the current 1834 * If the current logged region starts after the current
1840 * di_next_unlinked field, then move on to the next 1835 * di_next_unlinked field, then move on to the next
1841 * di_next_unlinked field. 1836 * di_next_unlinked field.
1842 */ 1837 */
1843 if (next_unlinked_offset < reg_buf_offset) { 1838 if (next_unlinked_offset < reg_buf_offset) {
1844 continue; 1839 continue;
1845 } 1840 }
1846 1841
1847 ASSERT(item->ri_buf[item_index].i_addr != NULL); 1842 ASSERT(item->ri_buf[item_index].i_addr != NULL);
1848 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); 1843 ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
1849 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); 1844 ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1850 1845
1851 /* 1846 /*
1852 * The current logged region contains a copy of the 1847 * The current logged region contains a copy of the
1853 * current di_next_unlinked field. Extract its value 1848 * current di_next_unlinked field. Extract its value
1854 * and copy it to the buffer copy. 1849 * and copy it to the buffer copy.
1855 */ 1850 */
1856 logged_nextp = (xfs_agino_t *) 1851 logged_nextp = (xfs_agino_t *)
1857 ((char *)(item->ri_buf[item_index].i_addr) + 1852 ((char *)(item->ri_buf[item_index].i_addr) +
1858 (next_unlinked_offset - reg_buf_offset)); 1853 (next_unlinked_offset - reg_buf_offset));
1859 if (unlikely(*logged_nextp == 0)) { 1854 if (unlikely(*logged_nextp == 0)) {
1860 xfs_fs_cmn_err(CE_ALERT, mp, 1855 xfs_fs_cmn_err(CE_ALERT, mp,
1861 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", 1856 "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
1862 item, bp); 1857 item, bp);
1863 XFS_ERROR_REPORT("xlog_recover_do_inode_buf", 1858 XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1864 XFS_ERRLEVEL_LOW, mp); 1859 XFS_ERRLEVEL_LOW, mp);
1865 return XFS_ERROR(EFSCORRUPTED); 1860 return XFS_ERROR(EFSCORRUPTED);
1866 } 1861 }
1867 1862
1868 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, 1863 buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1869 next_unlinked_offset); 1864 next_unlinked_offset);
1870 *buffer_nextp = *logged_nextp; 1865 *buffer_nextp = *logged_nextp;
1871 } 1866 }
1872 1867
1873 return 0; 1868 return 0;
1874 } 1869 }
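
The per-inode walk above only needs one field out of each on-disk inode: for inode i in the buffer, the bytes of interest start at i * sb_inodesize plus offsetof(xfs_dinode_t, di_next_unlinked), and the inner while loop advances bit/nbits until a logged region covers that offset. A small sketch of the offset arithmetic, with a 256-byte inode size picked purely for illustration (sb_inodesize is per-filesystem):

	/* Illustrative sketch; the inode size is an assumed example value. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int	inodesize = 256;	/* assumed sb_inodesize */
		int		i;

		for (i = 0; i < 4; i++)
			printf("inode %d starts at buffer offset %u; the real code adds "
			       "offsetof(xfs_dinode_t, di_next_unlinked) on top\n",
			       i, i * inodesize);
		return 0;
	}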
1875 1870
1876 /* 1871 /*
1877 * Perform a 'normal' buffer recovery. Each logged region of the 1872 * Perform a 'normal' buffer recovery. Each logged region of the
1878 * buffer should be copied over the corresponding region in the 1873 * buffer should be copied over the corresponding region in the
1879 * given buffer. The bitmap in the buf log format structure indicates 1874 * given buffer. The bitmap in the buf log format structure indicates
1880 * where to place the logged data. 1875 * where to place the logged data.
1881 */ 1876 */
1882 /*ARGSUSED*/ 1877 /*ARGSUSED*/
1883 STATIC void 1878 STATIC void
1884 xlog_recover_do_reg_buffer( 1879 xlog_recover_do_reg_buffer(
1885 xlog_recover_item_t *item, 1880 xlog_recover_item_t *item,
1886 xfs_buf_t *bp, 1881 xfs_buf_t *bp,
1887 xfs_buf_log_format_t *buf_f) 1882 xfs_buf_log_format_t *buf_f)
1888 { 1883 {
1889 int i; 1884 int i;
1890 int bit; 1885 int bit;
1891 int nbits; 1886 int nbits;
1892 unsigned int *data_map = NULL; 1887 unsigned int *data_map = NULL;
1893 unsigned int map_size = 0; 1888 unsigned int map_size = 0;
1894 int error; 1889 int error;
1895 1890
1896 switch (buf_f->blf_type) { 1891 switch (buf_f->blf_type) {
1897 case XFS_LI_BUF: 1892 case XFS_LI_BUF:
1898 data_map = buf_f->blf_data_map; 1893 data_map = buf_f->blf_data_map;
1899 map_size = buf_f->blf_map_size; 1894 map_size = buf_f->blf_map_size;
1900 break; 1895 break;
1901 } 1896 }
1902 bit = 0; 1897 bit = 0;
1903 i = 1; /* 0 is the buf format structure */ 1898 i = 1; /* 0 is the buf format structure */
1904 while (1) { 1899 while (1) {
1905 bit = xfs_next_bit(data_map, map_size, bit); 1900 bit = xfs_next_bit(data_map, map_size, bit);
1906 if (bit == -1) 1901 if (bit == -1)
1907 break; 1902 break;
1908 nbits = xfs_contig_bits(data_map, map_size, bit); 1903 nbits = xfs_contig_bits(data_map, map_size, bit);
1909 ASSERT(nbits > 0); 1904 ASSERT(nbits > 0);
1910 ASSERT(item->ri_buf[i].i_addr != NULL); 1905 ASSERT(item->ri_buf[i].i_addr != NULL);
1911 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); 1906 ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
1912 ASSERT(XFS_BUF_COUNT(bp) >= 1907 ASSERT(XFS_BUF_COUNT(bp) >=
1913 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); 1908 ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
1914 1909
1915 /* 1910 /*
1916 * Do a sanity check if this is a dquot buffer. Just checking 1911 * Do a sanity check if this is a dquot buffer. Just checking
1917 * the first dquot in the buffer should do. XXX This is 1912 * the first dquot in the buffer should do. XXX This is
1918 * probably a good thing to do for other buf types also. 1913 * probably a good thing to do for other buf types also.
1919 */ 1914 */
1920 error = 0; 1915 error = 0;
1921 if (buf_f->blf_flags & 1916 if (buf_f->blf_flags &
1922 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 1917 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
1923 error = xfs_qm_dqcheck((xfs_disk_dquot_t *) 1918 error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
1924 item->ri_buf[i].i_addr, 1919 item->ri_buf[i].i_addr,
1925 -1, 0, XFS_QMOPT_DOWARN, 1920 -1, 0, XFS_QMOPT_DOWARN,
1926 "dquot_buf_recover"); 1921 "dquot_buf_recover");
1927 } 1922 }
1928 if (!error) 1923 if (!error)
1929 memcpy(xfs_buf_offset(bp, 1924 memcpy(xfs_buf_offset(bp,
1930 (uint)bit << XFS_BLI_SHIFT), /* dest */ 1925 (uint)bit << XFS_BLI_SHIFT), /* dest */
1931 item->ri_buf[i].i_addr, /* source */ 1926 item->ri_buf[i].i_addr, /* source */
1932 nbits<<XFS_BLI_SHIFT); /* length */ 1927 nbits<<XFS_BLI_SHIFT); /* length */
1933 i++; 1928 i++;
1934 bit += nbits; 1929 bit += nbits;
1935 } 1930 }
1936 1931
1937 /* Shouldn't be any more regions */ 1932 /* Shouldn't be any more regions */
1938 ASSERT(i == item->ri_total); 1933 ASSERT(i == item->ri_total);
1939 } 1934 }
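
The bit positions that xfs_next_bit() and xfs_contig_bits() hand back are in buffer-log chunks, not bytes; the shifts above turn a run of set bits into a byte offset and length inside the buffer. Assuming the 128-byte chunk size of this era (XFS_BLI_CHUNK 128, XFS_BLI_SHIFT 7 in xfs_buf_item.h; values assumed, not quoted from this diff), a run of three bits starting at bit 2 copies 384 bytes to offset 256:

	/* Illustrative sketch; the shift value is assumed, see above. */
	#include <stdio.h>

	#define ASSUMED_XFS_BLI_SHIFT	7	/* 128-byte chunks */

	int main(void)
	{
		unsigned int	bit = 2, nbits = 3;

		printf("copy %u bytes to buffer offset %u\n",
		       nbits << ASSUMED_XFS_BLI_SHIFT,	/* 384 */
		       bit << ASSUMED_XFS_BLI_SHIFT);	/* 256 */
		return 0;
	}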
1940 1935
1941 /* 1936 /*
1942 * Do some primitive error checking on ondisk dquot data structures. 1937 * Do some primitive error checking on ondisk dquot data structures.
1943 */ 1938 */
1944 int 1939 int
1945 xfs_qm_dqcheck( 1940 xfs_qm_dqcheck(
1946 xfs_disk_dquot_t *ddq, 1941 xfs_disk_dquot_t *ddq,
1947 xfs_dqid_t id, 1942 xfs_dqid_t id,
1948 uint type, /* used only when IO_dorepair is true */ 1943 uint type, /* used only when IO_dorepair is true */
1949 uint flags, 1944 uint flags,
1950 char *str) 1945 char *str)
1951 { 1946 {
1952 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; 1947 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
1953 int errs = 0; 1948 int errs = 0;
1954 1949
1955 /* 1950 /*
1956 * We can encounter an uninitialized dquot buffer for 2 reasons: 1951 * We can encounter an uninitialized dquot buffer for 2 reasons:
1957 * 1. If we crash while deleting the quotainode(s), and those blks got 1952 * 1. If we crash while deleting the quotainode(s), and those blks got
1958 * used for user data. This is because we take the path of regular 1953 * used for user data. This is because we take the path of regular
1959 * file deletion; however, the size field of quotainodes is never 1954 * file deletion; however, the size field of quotainodes is never
1960 * updated, so all the tricks that we play in itruncate_finish 1955 * updated, so all the tricks that we play in itruncate_finish
1961 * don't quite matter. 1956 * don't quite matter.
1962 * 1957 *
1963 * 2. We don't play the quota buffers when there's a quotaoff logitem. 1958 * 2. We don't play the quota buffers when there's a quotaoff logitem.
1964 * But the allocation will be replayed so we'll end up with an 1959 * But the allocation will be replayed so we'll end up with an
1965 * uninitialized quota block. 1960 * uninitialized quota block.
1966 * 1961 *
1967 * This is all fine; things are still consistent, and we haven't lost 1962 * This is all fine; things are still consistent, and we haven't lost
1968 * any quota information. Just don't complain about bad dquot blks. 1963 * any quota information. Just don't complain about bad dquot blks.
1969 */ 1964 */
1970 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { 1965 if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
1971 if (flags & XFS_QMOPT_DOWARN) 1966 if (flags & XFS_QMOPT_DOWARN)
1972 cmn_err(CE_ALERT, 1967 cmn_err(CE_ALERT,
1973 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", 1968 "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1974 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); 1969 str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1975 errs++; 1970 errs++;
1976 } 1971 }
1977 if (ddq->d_version != XFS_DQUOT_VERSION) { 1972 if (ddq->d_version != XFS_DQUOT_VERSION) {
1978 if (flags & XFS_QMOPT_DOWARN) 1973 if (flags & XFS_QMOPT_DOWARN)
1979 cmn_err(CE_ALERT, 1974 cmn_err(CE_ALERT,
1980 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", 1975 "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1981 str, id, ddq->d_version, XFS_DQUOT_VERSION); 1976 str, id, ddq->d_version, XFS_DQUOT_VERSION);
1982 errs++; 1977 errs++;
1983 } 1978 }
1984 1979
1985 if (ddq->d_flags != XFS_DQ_USER && 1980 if (ddq->d_flags != XFS_DQ_USER &&
1986 ddq->d_flags != XFS_DQ_PROJ && 1981 ddq->d_flags != XFS_DQ_PROJ &&
1987 ddq->d_flags != XFS_DQ_GROUP) { 1982 ddq->d_flags != XFS_DQ_GROUP) {
1988 if (flags & XFS_QMOPT_DOWARN) 1983 if (flags & XFS_QMOPT_DOWARN)
1989 cmn_err(CE_ALERT, 1984 cmn_err(CE_ALERT,
1990 "%s : XFS dquot ID 0x%x, unknown flags 0x%x", 1985 "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1991 str, id, ddq->d_flags); 1986 str, id, ddq->d_flags);
1992 errs++; 1987 errs++;
1993 } 1988 }
1994 1989
1995 if (id != -1 && id != be32_to_cpu(ddq->d_id)) { 1990 if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1996 if (flags & XFS_QMOPT_DOWARN) 1991 if (flags & XFS_QMOPT_DOWARN)
1997 cmn_err(CE_ALERT, 1992 cmn_err(CE_ALERT,
1998 "%s : ondisk-dquot 0x%p, ID mismatch: " 1993 "%s : ondisk-dquot 0x%p, ID mismatch: "
1999 "0x%x expected, found id 0x%x", 1994 "0x%x expected, found id 0x%x",
2000 str, ddq, id, be32_to_cpu(ddq->d_id)); 1995 str, ddq, id, be32_to_cpu(ddq->d_id));
2001 errs++; 1996 errs++;
2002 } 1997 }
2003 1998
2004 if (!errs && ddq->d_id) { 1999 if (!errs && ddq->d_id) {
2005 if (ddq->d_blk_softlimit && 2000 if (ddq->d_blk_softlimit &&
2006 be64_to_cpu(ddq->d_bcount) >= 2001 be64_to_cpu(ddq->d_bcount) >=
2007 be64_to_cpu(ddq->d_blk_softlimit)) { 2002 be64_to_cpu(ddq->d_blk_softlimit)) {
2008 if (!ddq->d_btimer) { 2003 if (!ddq->d_btimer) {
2009 if (flags & XFS_QMOPT_DOWARN) 2004 if (flags & XFS_QMOPT_DOWARN)
2010 cmn_err(CE_ALERT, 2005 cmn_err(CE_ALERT,
2011 "%s : Dquot ID 0x%x (0x%p) " 2006 "%s : Dquot ID 0x%x (0x%p) "
2012 "BLK TIMER NOT STARTED", 2007 "BLK TIMER NOT STARTED",
2013 str, (int)be32_to_cpu(ddq->d_id), ddq); 2008 str, (int)be32_to_cpu(ddq->d_id), ddq);
2014 errs++; 2009 errs++;
2015 } 2010 }
2016 } 2011 }
2017 if (ddq->d_ino_softlimit && 2012 if (ddq->d_ino_softlimit &&
2018 be64_to_cpu(ddq->d_icount) >= 2013 be64_to_cpu(ddq->d_icount) >=
2019 be64_to_cpu(ddq->d_ino_softlimit)) { 2014 be64_to_cpu(ddq->d_ino_softlimit)) {
2020 if (!ddq->d_itimer) { 2015 if (!ddq->d_itimer) {
2021 if (flags & XFS_QMOPT_DOWARN) 2016 if (flags & XFS_QMOPT_DOWARN)
2022 cmn_err(CE_ALERT, 2017 cmn_err(CE_ALERT,
2023 "%s : Dquot ID 0x%x (0x%p) " 2018 "%s : Dquot ID 0x%x (0x%p) "
2024 "INODE TIMER NOT STARTED", 2019 "INODE TIMER NOT STARTED",
2025 str, (int)be32_to_cpu(ddq->d_id), ddq); 2020 str, (int)be32_to_cpu(ddq->d_id), ddq);
2026 errs++; 2021 errs++;
2027 } 2022 }
2028 } 2023 }
2029 if (ddq->d_rtb_softlimit && 2024 if (ddq->d_rtb_softlimit &&
2030 be64_to_cpu(ddq->d_rtbcount) >= 2025 be64_to_cpu(ddq->d_rtbcount) >=
2031 be64_to_cpu(ddq->d_rtb_softlimit)) { 2026 be64_to_cpu(ddq->d_rtb_softlimit)) {
2032 if (!ddq->d_rtbtimer) { 2027 if (!ddq->d_rtbtimer) {
2033 if (flags & XFS_QMOPT_DOWARN) 2028 if (flags & XFS_QMOPT_DOWARN)
2034 cmn_err(CE_ALERT, 2029 cmn_err(CE_ALERT,
2035 "%s : Dquot ID 0x%x (0x%p) " 2030 "%s : Dquot ID 0x%x (0x%p) "
2036 "RTBLK TIMER NOT STARTED", 2031 "RTBLK TIMER NOT STARTED",
2037 str, (int)be32_to_cpu(ddq->d_id), ddq); 2032 str, (int)be32_to_cpu(ddq->d_id), ddq);
2038 errs++; 2033 errs++;
2039 } 2034 }
2040 } 2035 }
2041 } 2036 }
2042 2037
2043 if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) 2038 if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2044 return errs; 2039 return errs;
2045 2040
2046 if (flags & XFS_QMOPT_DOWARN) 2041 if (flags & XFS_QMOPT_DOWARN)
2047 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); 2042 cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
2048 2043
2049 /* 2044 /*
2050 * Typically, a repair is only requested by quotacheck. 2045 * Typically, a repair is only requested by quotacheck.
2051 */ 2046 */
2052 ASSERT(id != -1); 2047 ASSERT(id != -1);
2053 ASSERT(flags & XFS_QMOPT_DQREPAIR); 2048 ASSERT(flags & XFS_QMOPT_DQREPAIR);
2054 memset(d, 0, sizeof(xfs_dqblk_t)); 2049 memset(d, 0, sizeof(xfs_dqblk_t));
2055 2050
2056 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); 2051 d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2057 d->dd_diskdq.d_version = XFS_DQUOT_VERSION; 2052 d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2058 d->dd_diskdq.d_flags = type; 2053 d->dd_diskdq.d_flags = type;
2059 d->dd_diskdq.d_id = cpu_to_be32(id); 2054 d->dd_diskdq.d_id = cpu_to_be32(id);
2060 2055
2061 return errs; 2056 return errs;
2062 } 2057 }
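
In the recovery path this routine is used purely as a sanity check: xlog_recover_do_reg_buffer() above calls it with id == -1 (so the ID-match test is skipped), type == 0, and only XFS_QMOPT_DOWARN set, so it warns and counts problems but never reaches the XFS_QMOPT_DQREPAIR re-initialisation at the end. The call as it appears in that function, reflowed here for reference:

	error = xfs_qm_dqcheck((xfs_disk_dquot_t *)item->ri_buf[i].i_addr,
			       -1, 0, XFS_QMOPT_DOWARN,
			       "dquot_buf_recover");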
2063 2058
2064 /* 2059 /*
2065 * Perform a dquot buffer recovery. 2060 * Perform a dquot buffer recovery.
2066 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type 2061 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2067 * (ie. USR or GRP), then just toss this buffer away; don't recover it. 2062 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2068 * Else, treat it as a regular buffer and do recovery. 2063 * Else, treat it as a regular buffer and do recovery.
2069 */ 2064 */
2070 STATIC void 2065 STATIC void
2071 xlog_recover_do_dquot_buffer( 2066 xlog_recover_do_dquot_buffer(
2072 xfs_mount_t *mp, 2067 xfs_mount_t *mp,
2073 xlog_t *log, 2068 xlog_t *log,
2074 xlog_recover_item_t *item, 2069 xlog_recover_item_t *item,
2075 xfs_buf_t *bp, 2070 xfs_buf_t *bp,
2076 xfs_buf_log_format_t *buf_f) 2071 xfs_buf_log_format_t *buf_f)
2077 { 2072 {
2078 uint type; 2073 uint type;
2079 2074
2080 /* 2075 /*
2081 * Filesystems are required to send in quota flags at mount time. 2076 * Filesystems are required to send in quota flags at mount time.
2082 */ 2077 */
2083 if (mp->m_qflags == 0) { 2078 if (mp->m_qflags == 0) {
2084 return; 2079 return;
2085 } 2080 }
2086 2081
2087 type = 0; 2082 type = 0;
2088 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) 2083 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2089 type |= XFS_DQ_USER; 2084 type |= XFS_DQ_USER;
2090 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) 2085 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2091 type |= XFS_DQ_PROJ; 2086 type |= XFS_DQ_PROJ;
2092 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) 2087 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2093 type |= XFS_DQ_GROUP; 2088 type |= XFS_DQ_GROUP;
2094 /* 2089 /*
2095 * This type of quota was turned off, so ignore this buffer 2090 * This type of quota was turned off, so ignore this buffer
2096 */ 2091 */
2097 if (log->l_quotaoffs_flag & type) 2092 if (log->l_quotaoffs_flag & type)
2098 return; 2093 return;
2099 2094
2100 xlog_recover_do_reg_buffer(item, bp, buf_f); 2095 xlog_recover_do_reg_buffer(item, bp, buf_f);
2101 } 2096 }
2102 2097
2103 /* 2098 /*
2104 * This routine replays a modification made to a buffer at runtime. 2099 * This routine replays a modification made to a buffer at runtime.
2105 * There are actually two types of buffer, regular and inode, which 2100 * There are actually two types of buffer, regular and inode, which
2106 * are handled differently. Inode buffers are handled differently 2101 * are handled differently. Inode buffers are handled differently
2107 * in that we only recover a specific set of data from them, namely 2102 * in that we only recover a specific set of data from them, namely
2108 * the inode di_next_unlinked fields. This is because all other inode 2103 * the inode di_next_unlinked fields. This is because all other inode
2109 * data is actually logged via inode records and any data we replay 2104 * data is actually logged via inode records and any data we replay
2110 * here which overlaps that may be stale. 2105 * here which overlaps that may be stale.
2111 * 2106 *
2112 * When meta-data buffers are freed at run time we log a buffer item 2107 * When meta-data buffers are freed at run time we log a buffer item
2113 * with the XFS_BLI_CANCEL bit set to indicate that previous copies 2108 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2114 * of the buffer in the log should not be replayed at recovery time. 2109 * of the buffer in the log should not be replayed at recovery time.
2115 * This is so that if the blocks covered by the buffer are reused for 2110 * This is so that if the blocks covered by the buffer are reused for
2116 * file data before we crash we don't end up replaying old, freed 2111 * file data before we crash we don't end up replaying old, freed
2117 * meta-data into a user's file. 2112 * meta-data into a user's file.
2118 * 2113 *
2119 * To handle the cancellation of buffer log items, we make two passes 2114 * To handle the cancellation of buffer log items, we make two passes
2120 * over the log during recovery. During the first we build a table of 2115 * over the log during recovery. During the first we build a table of
2121 * those buffers which have been cancelled, and during the second we 2116 * those buffers which have been cancelled, and during the second we
2122 * only replay those buffers which do not have corresponding cancel 2117 * only replay those buffers which do not have corresponding cancel
2123 * records in the table. See xlog_recover_do_buffer_pass[1,2] above 2118 * records in the table. See xlog_recover_do_buffer_pass[1,2] above
2124 * for more details on the implementation of the table of cancel records. 2119 * for more details on the implementation of the table of cancel records.
2125 */ 2120 */
2126 STATIC int 2121 STATIC int
2127 xlog_recover_do_buffer_trans( 2122 xlog_recover_do_buffer_trans(
2128 xlog_t *log, 2123 xlog_t *log,
2129 xlog_recover_item_t *item, 2124 xlog_recover_item_t *item,
2130 int pass) 2125 int pass)
2131 { 2126 {
2132 xfs_buf_log_format_t *buf_f; 2127 xfs_buf_log_format_t *buf_f;
2133 xfs_mount_t *mp; 2128 xfs_mount_t *mp;
2134 xfs_buf_t *bp; 2129 xfs_buf_t *bp;
2135 int error; 2130 int error;
2136 int cancel; 2131 int cancel;
2137 xfs_daddr_t blkno; 2132 xfs_daddr_t blkno;
2138 int len; 2133 int len;
2139 ushort flags; 2134 ushort flags;
2140 2135
2141 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; 2136 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2142 2137
2143 if (pass == XLOG_RECOVER_PASS1) { 2138 if (pass == XLOG_RECOVER_PASS1) {
2144 /* 2139 /*
2145 * In this pass we're only looking for buf items 2140 * In this pass we're only looking for buf items
2146 * with the XFS_BLI_CANCEL bit set. 2141 * with the XFS_BLI_CANCEL bit set.
2147 */ 2142 */
2148 xlog_recover_do_buffer_pass1(log, buf_f); 2143 xlog_recover_do_buffer_pass1(log, buf_f);
2149 return 0; 2144 return 0;
2150 } else { 2145 } else {
2151 /* 2146 /*
2152 * In this pass we want to recover all the buffers 2147 * In this pass we want to recover all the buffers
2153 * which have not been cancelled and are not 2148 * which have not been cancelled and are not
2154 * cancellation buffers themselves. The routine 2149 * cancellation buffers themselves. The routine
2155 * we call here will tell us whether or not to 2150 * we call here will tell us whether or not to
2156 * continue with the replay of this buffer. 2151 * continue with the replay of this buffer.
2157 */ 2152 */
2158 cancel = xlog_recover_do_buffer_pass2(log, buf_f); 2153 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2159 if (cancel) { 2154 if (cancel) {
2160 return 0; 2155 return 0;
2161 } 2156 }
2162 } 2157 }
2163 switch (buf_f->blf_type) { 2158 switch (buf_f->blf_type) {
2164 case XFS_LI_BUF: 2159 case XFS_LI_BUF:
2165 blkno = buf_f->blf_blkno; 2160 blkno = buf_f->blf_blkno;
2166 len = buf_f->blf_len; 2161 len = buf_f->blf_len;
2167 flags = buf_f->blf_flags; 2162 flags = buf_f->blf_flags;
2168 break; 2163 break;
2169 default: 2164 default:
2170 xfs_fs_cmn_err(CE_ALERT, log->l_mp, 2165 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
2171 "xfs_log_recover: unknown buffer type 0x%x, logdev %s", 2166 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2172 buf_f->blf_type, log->l_mp->m_logname ? 2167 buf_f->blf_type, log->l_mp->m_logname ?
2173 log->l_mp->m_logname : "internal"); 2168 log->l_mp->m_logname : "internal");
2174 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", 2169 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2175 XFS_ERRLEVEL_LOW, log->l_mp); 2170 XFS_ERRLEVEL_LOW, log->l_mp);
2176 return XFS_ERROR(EFSCORRUPTED); 2171 return XFS_ERROR(EFSCORRUPTED);
2177 } 2172 }
2178 2173
2179 mp = log->l_mp; 2174 mp = log->l_mp;
2180 if (flags & XFS_BLI_INODE_BUF) { 2175 if (flags & XFS_BLI_INODE_BUF) {
2181 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len, 2176 bp = xfs_buf_read_flags(mp->m_ddev_targp, blkno, len,
2182 XFS_BUF_LOCK); 2177 XFS_BUF_LOCK);
2183 } else { 2178 } else {
2184 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0); 2179 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, 0);
2185 } 2180 }
2186 if (XFS_BUF_ISERROR(bp)) { 2181 if (XFS_BUF_ISERROR(bp)) {
2187 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, 2182 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2188 bp, blkno); 2183 bp, blkno);
2189 error = XFS_BUF_GETERROR(bp); 2184 error = XFS_BUF_GETERROR(bp);
2190 xfs_buf_relse(bp); 2185 xfs_buf_relse(bp);
2191 return error; 2186 return error;
2192 } 2187 }
2193 2188
2194 error = 0; 2189 error = 0;
2195 if (flags & XFS_BLI_INODE_BUF) { 2190 if (flags & XFS_BLI_INODE_BUF) {
2196 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); 2191 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2197 } else if (flags & 2192 } else if (flags &
2198 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { 2193 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
2199 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); 2194 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2200 } else { 2195 } else {
2201 xlog_recover_do_reg_buffer(item, bp, buf_f); 2196 xlog_recover_do_reg_buffer(item, bp, buf_f);
2202 } 2197 }
2203 if (error) 2198 if (error)
2204 return XFS_ERROR(error); 2199 return XFS_ERROR(error);
2205 2200
2206 /* 2201 /*
2207 * Perform delayed write on the buffer. Asynchronous writes will be 2202 * Perform delayed write on the buffer. Asynchronous writes will be
2208 * slower when taking into account all the buffers to be flushed. 2203 * slower when taking into account all the buffers to be flushed.
2209 * 2204 *
2210 * Also make sure that only inode buffers with good sizes stay in 2205 * Also make sure that only inode buffers with good sizes stay in
2211 * the buffer cache. The kernel moves inodes in buffers of 1 block 2206 * the buffer cache. The kernel moves inodes in buffers of 1 block
2212 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode 2207 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2213 * buffers in the log can be a different size if the log was generated 2208 * buffers in the log can be a different size if the log was generated
2214 * by an older kernel using unclustered inode buffers or a newer kernel 2209 * by an older kernel using unclustered inode buffers or a newer kernel
2215 * running with a different inode cluster size. Regardless, if the 2210 * running with a different inode cluster size. Regardless, if the
2216 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) 2211 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2217 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep 2212 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2218 * the buffer out of the buffer cache so that the buffer won't 2213 * the buffer out of the buffer cache so that the buffer won't
2219 * overlap with future reads of those inodes. 2214 * overlap with future reads of those inodes.
2220 */ 2215 */
2221 if (XFS_DINODE_MAGIC == 2216 if (XFS_DINODE_MAGIC ==
2222 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && 2217 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2223 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, 2218 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2224 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { 2219 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2225 XFS_BUF_STALE(bp); 2220 XFS_BUF_STALE(bp);
2226 error = xfs_bwrite(mp, bp); 2221 error = xfs_bwrite(mp, bp);
2227 } else { 2222 } else {
2228 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2223 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2229 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2224 bp->b_mount = mp;
2230 XFS_BUF_SET_FSPRIVATE(bp, mp);
2231 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2225 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2232 xfs_bdwrite(mp, bp); 2226 xfs_bdwrite(mp, bp);
2233 } 2227 }
2234 2228
2235 return (error); 2229 return (error);
2236 } 2230 }
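
The else branch in the hunk above is where this file picks up the buffer's new typed field: instead of going through the untyped XFS_BUF_SET_FSPRIVATE() accessor, the mount is stored directly in bp->b_mount. Pulled out of the diff columns, the new delayed-write path reads:

	ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
	bp->b_mount = mp;
	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
	xfs_bdwrite(mp, bp);

where the old code asserted on XFS_BUF_FSPRIVATE(bp, void *) and then called XFS_BUF_SET_FSPRIVATE(bp, mp) to stash the same pointer behind a cast.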
2237 2231
2238 STATIC int 2232 STATIC int
2239 xlog_recover_do_inode_trans( 2233 xlog_recover_do_inode_trans(
2240 xlog_t *log, 2234 xlog_t *log,
2241 xlog_recover_item_t *item, 2235 xlog_recover_item_t *item,
2242 int pass) 2236 int pass)
2243 { 2237 {
2244 xfs_inode_log_format_t *in_f; 2238 xfs_inode_log_format_t *in_f;
2245 xfs_mount_t *mp; 2239 xfs_mount_t *mp;
2246 xfs_buf_t *bp; 2240 xfs_buf_t *bp;
2247 xfs_dinode_t *dip; 2241 xfs_dinode_t *dip;
2248 xfs_ino_t ino; 2242 xfs_ino_t ino;
2249 int len; 2243 int len;
2250 xfs_caddr_t src; 2244 xfs_caddr_t src;
2251 xfs_caddr_t dest; 2245 xfs_caddr_t dest;
2252 int error; 2246 int error;
2253 int attr_index; 2247 int attr_index;
2254 uint fields; 2248 uint fields;
2255 xfs_icdinode_t *dicp; 2249 xfs_icdinode_t *dicp;
2256 int need_free = 0; 2250 int need_free = 0;
2257 2251
2258 if (pass == XLOG_RECOVER_PASS1) { 2252 if (pass == XLOG_RECOVER_PASS1) {
2259 return 0; 2253 return 0;
2260 } 2254 }
2261 2255
2262 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { 2256 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2263 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; 2257 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2264 } else { 2258 } else {
2265 in_f = (xfs_inode_log_format_t *)kmem_alloc( 2259 in_f = (xfs_inode_log_format_t *)kmem_alloc(
2266 sizeof(xfs_inode_log_format_t), KM_SLEEP); 2260 sizeof(xfs_inode_log_format_t), KM_SLEEP);
2267 need_free = 1; 2261 need_free = 1;
2268 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 2262 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2269 if (error) 2263 if (error)
2270 goto error; 2264 goto error;
2271 } 2265 }
2272 ino = in_f->ilf_ino; 2266 ino = in_f->ilf_ino;
2273 mp = log->l_mp; 2267 mp = log->l_mp;
2274 2268
2275 /* 2269 /*
2276 * Inode buffers can be freed, look out for it, 2270 * Inode buffers can be freed, look out for it,
2277 * and do not replay the inode. 2271 * and do not replay the inode.
2278 */ 2272 */
2279 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, 2273 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2280 in_f->ilf_len, 0)) { 2274 in_f->ilf_len, 0)) {
2281 error = 0; 2275 error = 0;
2282 goto error; 2276 goto error;
2283 } 2277 }
2284 2278
2285 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno, 2279 bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
2286 in_f->ilf_len, XFS_BUF_LOCK); 2280 in_f->ilf_len, XFS_BUF_LOCK);
2287 if (XFS_BUF_ISERROR(bp)) { 2281 if (XFS_BUF_ISERROR(bp)) {
2288 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, 2282 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
2289 bp, in_f->ilf_blkno); 2283 bp, in_f->ilf_blkno);
2290 error = XFS_BUF_GETERROR(bp); 2284 error = XFS_BUF_GETERROR(bp);
2291 xfs_buf_relse(bp); 2285 xfs_buf_relse(bp);
2292 goto error; 2286 goto error;
2293 } 2287 }
2294 error = 0; 2288 error = 0;
2295 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 2289 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2296 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); 2290 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2297 2291
2298 /* 2292 /*
2299 * Make sure the place we're flushing out to really looks 2293 * Make sure the place we're flushing out to really looks
2300 * like an inode! 2294 * like an inode!
2301 */ 2295 */
2302 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { 2296 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
2303 xfs_buf_relse(bp); 2297 xfs_buf_relse(bp);
2304 xfs_fs_cmn_err(CE_ALERT, mp, 2298 xfs_fs_cmn_err(CE_ALERT, mp,
2305 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", 2299 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2306 dip, bp, ino); 2300 dip, bp, ino);
2307 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", 2301 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2308 XFS_ERRLEVEL_LOW, mp); 2302 XFS_ERRLEVEL_LOW, mp);
2309 error = EFSCORRUPTED; 2303 error = EFSCORRUPTED;
2310 goto error; 2304 goto error;
2311 } 2305 }
2312 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); 2306 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
2313 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { 2307 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2314 xfs_buf_relse(bp); 2308 xfs_buf_relse(bp);
2315 xfs_fs_cmn_err(CE_ALERT, mp, 2309 xfs_fs_cmn_err(CE_ALERT, mp,
2316 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", 2310 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2317 item, ino); 2311 item, ino);
2318 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", 2312 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2319 XFS_ERRLEVEL_LOW, mp); 2313 XFS_ERRLEVEL_LOW, mp);
2320 error = EFSCORRUPTED; 2314 error = EFSCORRUPTED;
2321 goto error; 2315 goto error;
2322 } 2316 }
2323 2317
2324 /* Skip replay when the on disk inode is newer than the log one */ 2318 /* Skip replay when the on disk inode is newer than the log one */
2325 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 2319 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2326 /* 2320 /*
2327 * Deal with the wrap case, DI_MAX_FLUSH is less 2321 * Deal with the wrap case, DI_MAX_FLUSH is less
2328 * than smaller numbers 2322 * than smaller numbers
2329 */ 2323 */
2330 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && 2324 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2331 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { 2325 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2332 /* do nothing */ 2326 /* do nothing */
2333 } else { 2327 } else {
2334 xfs_buf_relse(bp); 2328 xfs_buf_relse(bp);
2335 error = 0; 2329 error = 0;
2336 goto error; 2330 goto error;
2337 } 2331 }
2338 } 2332 }
2339 /* Take the opportunity to reset the flush iteration count */ 2333 /* Take the opportunity to reset the flush iteration count */
2340 dicp->di_flushiter = 0; 2334 dicp->di_flushiter = 0;
2341 2335
2342 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { 2336 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2343 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2337 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2344 (dicp->di_format != XFS_DINODE_FMT_BTREE)) { 2338 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2345 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", 2339 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2346 XFS_ERRLEVEL_LOW, mp, dicp); 2340 XFS_ERRLEVEL_LOW, mp, dicp);
2347 xfs_buf_relse(bp); 2341 xfs_buf_relse(bp);
2348 xfs_fs_cmn_err(CE_ALERT, mp, 2342 xfs_fs_cmn_err(CE_ALERT, mp,
2349 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2343 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2350 item, dip, bp, ino); 2344 item, dip, bp, ino);
2351 error = EFSCORRUPTED; 2345 error = EFSCORRUPTED;
2352 goto error; 2346 goto error;
2353 } 2347 }
2354 } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { 2348 } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2355 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && 2349 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2356 (dicp->di_format != XFS_DINODE_FMT_BTREE) && 2350 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2357 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { 2351 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2358 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", 2352 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2359 XFS_ERRLEVEL_LOW, mp, dicp); 2353 XFS_ERRLEVEL_LOW, mp, dicp);
2360 xfs_buf_relse(bp); 2354 xfs_buf_relse(bp);
2361 xfs_fs_cmn_err(CE_ALERT, mp, 2355 xfs_fs_cmn_err(CE_ALERT, mp,
2362 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", 2356 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2363 item, dip, bp, ino); 2357 item, dip, bp, ino);
2364 error = EFSCORRUPTED; 2358 error = EFSCORRUPTED;
2365 goto error; 2359 goto error;
2366 } 2360 }
2367 } 2361 }
2368 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ 2362 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2369 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", 2363 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2370 XFS_ERRLEVEL_LOW, mp, dicp); 2364 XFS_ERRLEVEL_LOW, mp, dicp);
2371 xfs_buf_relse(bp); 2365 xfs_buf_relse(bp);
2372 xfs_fs_cmn_err(CE_ALERT, mp, 2366 xfs_fs_cmn_err(CE_ALERT, mp,
2373 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", 2367 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2374 item, dip, bp, ino, 2368 item, dip, bp, ino,
2375 dicp->di_nextents + dicp->di_anextents, 2369 dicp->di_nextents + dicp->di_anextents,
2376 dicp->di_nblocks); 2370 dicp->di_nblocks);
2377 error = EFSCORRUPTED; 2371 error = EFSCORRUPTED;
2378 goto error; 2372 goto error;
2379 } 2373 }
2380 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { 2374 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2381 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", 2375 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2382 XFS_ERRLEVEL_LOW, mp, dicp); 2376 XFS_ERRLEVEL_LOW, mp, dicp);
2383 xfs_buf_relse(bp); 2377 xfs_buf_relse(bp);
2384 xfs_fs_cmn_err(CE_ALERT, mp, 2378 xfs_fs_cmn_err(CE_ALERT, mp,
2385 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", 2379 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2386 item, dip, bp, ino, dicp->di_forkoff); 2380 item, dip, bp, ino, dicp->di_forkoff);
2387 error = EFSCORRUPTED; 2381 error = EFSCORRUPTED;
2388 goto error; 2382 goto error;
2389 } 2383 }
2390 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { 2384 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2391 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", 2385 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2392 XFS_ERRLEVEL_LOW, mp, dicp); 2386 XFS_ERRLEVEL_LOW, mp, dicp);
2393 xfs_buf_relse(bp); 2387 xfs_buf_relse(bp);
2394 xfs_fs_cmn_err(CE_ALERT, mp, 2388 xfs_fs_cmn_err(CE_ALERT, mp,
2395 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", 2389 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2396 item->ri_buf[1].i_len, item); 2390 item->ri_buf[1].i_len, item);
2397 error = EFSCORRUPTED; 2391 error = EFSCORRUPTED;
2398 goto error; 2392 goto error;
2399 } 2393 }
2400 2394
2401 /* The core is in in-core format */ 2395 /* The core is in in-core format */
2402 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); 2396 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
2403 2397
2404 /* the rest is in on-disk format */ 2398 /* the rest is in on-disk format */
2405 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { 2399 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2406 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), 2400 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2407 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), 2401 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2408 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); 2402 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
2409 } 2403 }
2410 2404
2411 fields = in_f->ilf_fields; 2405 fields = in_f->ilf_fields;
2412 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) { 2406 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2413 case XFS_ILOG_DEV: 2407 case XFS_ILOG_DEV:
2414 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); 2408 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2415 break; 2409 break;
2416 case XFS_ILOG_UUID: 2410 case XFS_ILOG_UUID:
2417 memcpy(XFS_DFORK_DPTR(dip), 2411 memcpy(XFS_DFORK_DPTR(dip),
2418 &in_f->ilf_u.ilfu_uuid, 2412 &in_f->ilf_u.ilfu_uuid,
2419 sizeof(uuid_t)); 2413 sizeof(uuid_t));
2420 break; 2414 break;
2421 } 2415 }
2422 2416
2423 if (in_f->ilf_size == 2) 2417 if (in_f->ilf_size == 2)
2424 goto write_inode_buffer; 2418 goto write_inode_buffer;
2425 len = item->ri_buf[2].i_len; 2419 len = item->ri_buf[2].i_len;
2426 src = item->ri_buf[2].i_addr; 2420 src = item->ri_buf[2].i_addr;
2427 ASSERT(in_f->ilf_size <= 4); 2421 ASSERT(in_f->ilf_size <= 4);
2428 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); 2422 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2429 ASSERT(!(fields & XFS_ILOG_DFORK) || 2423 ASSERT(!(fields & XFS_ILOG_DFORK) ||
2430 (len == in_f->ilf_dsize)); 2424 (len == in_f->ilf_dsize));
2431 2425
2432 switch (fields & XFS_ILOG_DFORK) { 2426 switch (fields & XFS_ILOG_DFORK) {
2433 case XFS_ILOG_DDATA: 2427 case XFS_ILOG_DDATA:
2434 case XFS_ILOG_DEXT: 2428 case XFS_ILOG_DEXT:
2435 memcpy(XFS_DFORK_DPTR(dip), src, len); 2429 memcpy(XFS_DFORK_DPTR(dip), src, len);
2436 break; 2430 break;
2437 2431
2438 case XFS_ILOG_DBROOT: 2432 case XFS_ILOG_DBROOT:
2439 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 2433 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2440 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip), 2434 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2441 XFS_DFORK_DSIZE(dip, mp)); 2435 XFS_DFORK_DSIZE(dip, mp));
2442 break; 2436 break;
2443 2437
2444 default: 2438 default:
2445 /* 2439 /*
2446 * There are no data fork flags set. 2440 * There are no data fork flags set.
2447 */ 2441 */
2448 ASSERT((fields & XFS_ILOG_DFORK) == 0); 2442 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2449 break; 2443 break;
2450 } 2444 }
2451 2445
2452 /* 2446 /*
2453 * If we logged any attribute data, recover it. There may or 2447 * If we logged any attribute data, recover it. There may or
2454 * may not have been any other non-core data logged in this 2448 * may not have been any other non-core data logged in this
2455 * transaction. 2449 * transaction.
2456 */ 2450 */
2457 if (in_f->ilf_fields & XFS_ILOG_AFORK) { 2451 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2458 if (in_f->ilf_fields & XFS_ILOG_DFORK) { 2452 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2459 attr_index = 3; 2453 attr_index = 3;
2460 } else { 2454 } else {
2461 attr_index = 2; 2455 attr_index = 2;
2462 } 2456 }
2463 len = item->ri_buf[attr_index].i_len; 2457 len = item->ri_buf[attr_index].i_len;
2464 src = item->ri_buf[attr_index].i_addr; 2458 src = item->ri_buf[attr_index].i_addr;
2465 ASSERT(len == in_f->ilf_asize); 2459 ASSERT(len == in_f->ilf_asize);
2466 2460
2467 switch (in_f->ilf_fields & XFS_ILOG_AFORK) { 2461 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2468 case XFS_ILOG_ADATA: 2462 case XFS_ILOG_ADATA:
2469 case XFS_ILOG_AEXT: 2463 case XFS_ILOG_AEXT:
2470 dest = XFS_DFORK_APTR(dip); 2464 dest = XFS_DFORK_APTR(dip);
2471 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); 2465 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2472 memcpy(dest, src, len); 2466 memcpy(dest, src, len);
2473 break; 2467 break;
2474 2468
2475 case XFS_ILOG_ABROOT: 2469 case XFS_ILOG_ABROOT:
2476 dest = XFS_DFORK_APTR(dip); 2470 dest = XFS_DFORK_APTR(dip);
2477 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, 2471 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2478 len, (xfs_bmdr_block_t*)dest, 2472 len, (xfs_bmdr_block_t*)dest,
2479 XFS_DFORK_ASIZE(dip, mp)); 2473 XFS_DFORK_ASIZE(dip, mp));
2480 break; 2474 break;
2481 2475
2482 default: 2476 default:
2483 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); 2477 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2484 ASSERT(0); 2478 ASSERT(0);
2485 xfs_buf_relse(bp); 2479 xfs_buf_relse(bp);
2486 error = EIO; 2480 error = EIO;
2487 goto error; 2481 goto error;
2488 } 2482 }
2489 } 2483 }
2490 2484
2491 write_inode_buffer: 2485 write_inode_buffer:
2492 if (ITEM_TYPE(item) == XFS_LI_INODE) { 2486 if (ITEM_TYPE(item) == XFS_LI_INODE) {
2493 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2487 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2494 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2488 bp->b_mount = mp;
2495 XFS_BUF_SET_FSPRIVATE(bp, mp);
2496 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2489 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2497 xfs_bdwrite(mp, bp); 2490 xfs_bdwrite(mp, bp);
2498 } else { 2491 } else {
2499 XFS_BUF_STALE(bp); 2492 XFS_BUF_STALE(bp);
2500 error = xfs_bwrite(mp, bp); 2493 error = xfs_bwrite(mp, bp);
2501 } 2494 }
2502 2495
2503 error: 2496 error:
2504 if (need_free) 2497 if (need_free)
2505 kmem_free(in_f); 2498 kmem_free(in_f);
2506 return XFS_ERROR(error); 2499 return XFS_ERROR(error);
2507 } 2500 }
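
The write_inode_buffer hunk above shows the change this commit makes at each recovery write-out site: the casting XFS_BUF_FSPRIVATE()/XFS_BUF_SET_FSPRIVATE() accessors give way to the buffer's typed b_mount pointer. Restated vertically for readability (this only re-lays-out the hunk above; it is not extra code in the commit):

    /* before: mount pointer stashed behind untyped b_fspriv accessors */
    ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
           XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
    XFS_BUF_SET_FSPRIVATE(bp, mp);

    /* after: the same assignment through the typed b_mount field */
    ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
    bp->b_mount = mp;

The same substitution appears again below in xlog_recover_do_dquot_trans().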
2508 2501
2509 /* 2502 /*
2510 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t 2503 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2511 * structure, so that we know not to do any dquot item or dquot buffer recovery 2504 * structure, so that we know not to do any dquot item or dquot buffer recovery
2512 * of that type. 2505 * of that type.
2513 */ 2506 */
2514 STATIC int 2507 STATIC int
2515 xlog_recover_do_quotaoff_trans( 2508 xlog_recover_do_quotaoff_trans(
2516 xlog_t *log, 2509 xlog_t *log,
2517 xlog_recover_item_t *item, 2510 xlog_recover_item_t *item,
2518 int pass) 2511 int pass)
2519 { 2512 {
2520 xfs_qoff_logformat_t *qoff_f; 2513 xfs_qoff_logformat_t *qoff_f;
2521 2514
2522 if (pass == XLOG_RECOVER_PASS2) { 2515 if (pass == XLOG_RECOVER_PASS2) {
2523 return (0); 2516 return (0);
2524 } 2517 }
2525 2518
2526 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; 2519 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2527 ASSERT(qoff_f); 2520 ASSERT(qoff_f);
2528 2521
2529 /* 2522 /*
2530 * The logitem format's flag tells us if this was user quotaoff, 2523 * The logitem format's flag tells us if this was user quotaoff,
2531 * group/project quotaoff or both. 2524 * group/project quotaoff or both.
2532 */ 2525 */
2533 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT) 2526 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2534 log->l_quotaoffs_flag |= XFS_DQ_USER; 2527 log->l_quotaoffs_flag |= XFS_DQ_USER;
2535 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT) 2528 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2536 log->l_quotaoffs_flag |= XFS_DQ_PROJ; 2529 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2537 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT) 2530 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2538 log->l_quotaoffs_flag |= XFS_DQ_GROUP; 2531 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2539 2532
2540 return (0); 2533 return (0);
2541 } 2534 }
2542 2535
2543 /* 2536 /*
2544 * Recover a dquot record 2537 * Recover a dquot record
2545 */ 2538 */
2546 STATIC int 2539 STATIC int
2547 xlog_recover_do_dquot_trans( 2540 xlog_recover_do_dquot_trans(
2548 xlog_t *log, 2541 xlog_t *log,
2549 xlog_recover_item_t *item, 2542 xlog_recover_item_t *item,
2550 int pass) 2543 int pass)
2551 { 2544 {
2552 xfs_mount_t *mp; 2545 xfs_mount_t *mp;
2553 xfs_buf_t *bp; 2546 xfs_buf_t *bp;
2554 struct xfs_disk_dquot *ddq, *recddq; 2547 struct xfs_disk_dquot *ddq, *recddq;
2555 int error; 2548 int error;
2556 xfs_dq_logformat_t *dq_f; 2549 xfs_dq_logformat_t *dq_f;
2557 uint type; 2550 uint type;
2558 2551
2559 if (pass == XLOG_RECOVER_PASS1) { 2552 if (pass == XLOG_RECOVER_PASS1) {
2560 return 0; 2553 return 0;
2561 } 2554 }
2562 mp = log->l_mp; 2555 mp = log->l_mp;
2563 2556
2564 /* 2557 /*
2565 * Filesystems are required to send in quota flags at mount time. 2558 * Filesystems are required to send in quota flags at mount time.
2566 */ 2559 */
2567 if (mp->m_qflags == 0) 2560 if (mp->m_qflags == 0)
2568 return (0); 2561 return (0);
2569 2562
2570 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; 2563 recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;
2571 ASSERT(recddq); 2564 ASSERT(recddq);
2572 /* 2565 /*
2573 * This type of quota was turned off, so ignore this record. 2566 * This type of quota was turned off, so ignore this record.
2574 */ 2567 */
2575 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); 2568 type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2576 ASSERT(type); 2569 ASSERT(type);
2577 if (log->l_quotaoffs_flag & type) 2570 if (log->l_quotaoffs_flag & type)
2578 return (0); 2571 return (0);
2579 2572
2580 /* 2573 /*
2581 * At this point we know that quota was _not_ turned off. 2574 * At this point we know that quota was _not_ turned off.
2582 * Since the mount flags are not indicating to us otherwise, this 2575 * Since the mount flags are not indicating to us otherwise, this
2583 * must mean that quota is on, and the dquot needs to be replayed. 2576 * must mean that quota is on, and the dquot needs to be replayed.
2584 * Remember that we may not have fully recovered the superblock yet, 2577 * Remember that we may not have fully recovered the superblock yet,
2585 * so we can't do the usual trick of looking at the SB quota bits. 2578 * so we can't do the usual trick of looking at the SB quota bits.
2586 * 2579 *
2587 * The other possibility, of course, is that the quota subsystem was 2580 * The other possibility, of course, is that the quota subsystem was
2588 * removed since the last mount - ENOSYS. 2581 * removed since the last mount - ENOSYS.
2589 */ 2582 */
2590 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; 2583 dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
2591 ASSERT(dq_f); 2584 ASSERT(dq_f);
2592 if ((error = xfs_qm_dqcheck(recddq, 2585 if ((error = xfs_qm_dqcheck(recddq,
2593 dq_f->qlf_id, 2586 dq_f->qlf_id,
2594 0, XFS_QMOPT_DOWARN, 2587 0, XFS_QMOPT_DOWARN,
2595 "xlog_recover_do_dquot_trans (log copy)"))) { 2588 "xlog_recover_do_dquot_trans (log copy)"))) {
2596 return XFS_ERROR(EIO); 2589 return XFS_ERROR(EIO);
2597 } 2590 }
2598 ASSERT(dq_f->qlf_len == 1); 2591 ASSERT(dq_f->qlf_len == 1);
2599 2592
2600 error = xfs_read_buf(mp, mp->m_ddev_targp, 2593 error = xfs_read_buf(mp, mp->m_ddev_targp,
2601 dq_f->qlf_blkno, 2594 dq_f->qlf_blkno,
2602 XFS_FSB_TO_BB(mp, dq_f->qlf_len), 2595 XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2603 0, &bp); 2596 0, &bp);
2604 if (error) { 2597 if (error) {
2605 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, 2598 xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
2606 bp, dq_f->qlf_blkno); 2599 bp, dq_f->qlf_blkno);
2607 return error; 2600 return error;
2608 } 2601 }
2609 ASSERT(bp); 2602 ASSERT(bp);
2610 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); 2603 ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2611 2604
2612 /* 2605 /*
2613 * At least the magic num portion should be on disk because this 2606 * At least the magic num portion should be on disk because this
2614 * was among a chunk of dquots created earlier, and we did some 2607 * was among a chunk of dquots created earlier, and we did some
2615 * minimal initialization then. 2608 * minimal initialization then.
2616 */ 2609 */
2617 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, 2610 if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2618 "xlog_recover_do_dquot_trans")) { 2611 "xlog_recover_do_dquot_trans")) {
2619 xfs_buf_relse(bp); 2612 xfs_buf_relse(bp);
2620 return XFS_ERROR(EIO); 2613 return XFS_ERROR(EIO);
2621 } 2614 }
2622 2615
2623 memcpy(ddq, recddq, item->ri_buf[1].i_len); 2616 memcpy(ddq, recddq, item->ri_buf[1].i_len);
2624 2617
2625 ASSERT(dq_f->qlf_size == 2); 2618 ASSERT(dq_f->qlf_size == 2);
2626 ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL || 2619 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2627 XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp); 2620 bp->b_mount = mp;
2628 XFS_BUF_SET_FSPRIVATE(bp, mp);
2629 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); 2621 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2630 xfs_bdwrite(mp, bp); 2622 xfs_bdwrite(mp, bp);
2631 2623
2632 return (0); 2624 return (0);
2633 } 2625 }
2634 2626
2635 /* 2627 /*
2636 * This routine is called to create an in-core extent free intent 2628 * This routine is called to create an in-core extent free intent
2637 * item from the efi format structure which was logged on disk. 2629 * item from the efi format structure which was logged on disk.
2638 * It allocates an in-core efi, copies the extents from the format 2630 * It allocates an in-core efi, copies the extents from the format
2639 * structure into it, and adds the efi to the AIL with the given 2631 * structure into it, and adds the efi to the AIL with the given
2640 * LSN. 2632 * LSN.
2641 */ 2633 */
2642 STATIC int 2634 STATIC int
2643 xlog_recover_do_efi_trans( 2635 xlog_recover_do_efi_trans(
2644 xlog_t *log, 2636 xlog_t *log,
2645 xlog_recover_item_t *item, 2637 xlog_recover_item_t *item,
2646 xfs_lsn_t lsn, 2638 xfs_lsn_t lsn,
2647 int pass) 2639 int pass)
2648 { 2640 {
2649 int error; 2641 int error;
2650 xfs_mount_t *mp; 2642 xfs_mount_t *mp;
2651 xfs_efi_log_item_t *efip; 2643 xfs_efi_log_item_t *efip;
2652 xfs_efi_log_format_t *efi_formatp; 2644 xfs_efi_log_format_t *efi_formatp;
2653 2645
2654 if (pass == XLOG_RECOVER_PASS1) { 2646 if (pass == XLOG_RECOVER_PASS1) {
2655 return 0; 2647 return 0;
2656 } 2648 }
2657 2649
2658 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; 2650 efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;
2659 2651
2660 mp = log->l_mp; 2652 mp = log->l_mp;
2661 efip = xfs_efi_init(mp, efi_formatp->efi_nextents); 2653 efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2662 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), 2654 if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2663 &(efip->efi_format)))) { 2655 &(efip->efi_format)))) {
2664 xfs_efi_item_free(efip); 2656 xfs_efi_item_free(efip);
2665 return error; 2657 return error;
2666 } 2658 }
2667 efip->efi_next_extent = efi_formatp->efi_nextents; 2659 efip->efi_next_extent = efi_formatp->efi_nextents;
2668 efip->efi_flags |= XFS_EFI_COMMITTED; 2660 efip->efi_flags |= XFS_EFI_COMMITTED;
2669 2661
2670 spin_lock(&log->l_ailp->xa_lock); 2662 spin_lock(&log->l_ailp->xa_lock);
2671 /* 2663 /*
2672 * xfs_trans_ail_update() drops the AIL lock. 2664 * xfs_trans_ail_update() drops the AIL lock.
2673 */ 2665 */
2674 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); 2666 xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
2675 return 0; 2667 return 0;
2676 } 2668 }
2677 2669
2678 2670
2679 /* 2671 /*
2680 * This routine is called when an efd format structure is found in 2672 * This routine is called when an efd format structure is found in
2681 * a committed transaction in the log. Its purpose is to cancel 2673 * a committed transaction in the log. Its purpose is to cancel
2682 * the corresponding efi if it was still in the log. To do this 2674 * the corresponding efi if it was still in the log. To do this
2683 * it searches the AIL for the efi with an id equal to that in the 2675 * it searches the AIL for the efi with an id equal to that in the
2684 * efd format structure. If we find it, we remove the efi from the 2676 * efd format structure. If we find it, we remove the efi from the
2685 * AIL and free it. 2677 * AIL and free it.
2686 */ 2678 */
2687 STATIC void 2679 STATIC void
2688 xlog_recover_do_efd_trans( 2680 xlog_recover_do_efd_trans(
2689 xlog_t *log, 2681 xlog_t *log,
2690 xlog_recover_item_t *item, 2682 xlog_recover_item_t *item,
2691 int pass) 2683 int pass)
2692 { 2684 {
2693 xfs_efd_log_format_t *efd_formatp; 2685 xfs_efd_log_format_t *efd_formatp;
2694 xfs_efi_log_item_t *efip = NULL; 2686 xfs_efi_log_item_t *efip = NULL;
2695 xfs_log_item_t *lip; 2687 xfs_log_item_t *lip;
2696 __uint64_t efi_id; 2688 __uint64_t efi_id;
2697 struct xfs_ail_cursor cur; 2689 struct xfs_ail_cursor cur;
2698 struct xfs_ail *ailp = log->l_ailp; 2690 struct xfs_ail *ailp = log->l_ailp;
2699 2691
2700 if (pass == XLOG_RECOVER_PASS1) { 2692 if (pass == XLOG_RECOVER_PASS1) {
2701 return; 2693 return;
2702 } 2694 }
2703 2695
2704 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; 2696 efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
2705 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + 2697 ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2706 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || 2698 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2707 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + 2699 (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2708 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t))))); 2700 ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2709 efi_id = efd_formatp->efd_efi_id; 2701 efi_id = efd_formatp->efd_efi_id;
2710 2702
2711 /* 2703 /*
2712 * Search for the efi with the id in the efd format structure 2704 * Search for the efi with the id in the efd format structure
2713 * in the AIL. 2705 * in the AIL.
2714 */ 2706 */
2715 spin_lock(&ailp->xa_lock); 2707 spin_lock(&ailp->xa_lock);
2716 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 2708 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2717 while (lip != NULL) { 2709 while (lip != NULL) {
2718 if (lip->li_type == XFS_LI_EFI) { 2710 if (lip->li_type == XFS_LI_EFI) {
2719 efip = (xfs_efi_log_item_t *)lip; 2711 efip = (xfs_efi_log_item_t *)lip;
2720 if (efip->efi_format.efi_id == efi_id) { 2712 if (efip->efi_format.efi_id == efi_id) {
2721 /* 2713 /*
2722 * xfs_trans_ail_delete() drops the 2714 * xfs_trans_ail_delete() drops the
2723 * AIL lock. 2715 * AIL lock.
2724 */ 2716 */
2725 xfs_trans_ail_delete(ailp, lip); 2717 xfs_trans_ail_delete(ailp, lip);
2726 xfs_efi_item_free(efip); 2718 xfs_efi_item_free(efip);
2727 spin_lock(&ailp->xa_lock); 2719 spin_lock(&ailp->xa_lock);
2728 break; 2720 break;
2729 } 2721 }
2730 } 2722 }
2731 lip = xfs_trans_ail_cursor_next(ailp, &cur); 2723 lip = xfs_trans_ail_cursor_next(ailp, &cur);
2732 } 2724 }
2733 xfs_trans_ail_cursor_done(ailp, &cur); 2725 xfs_trans_ail_cursor_done(ailp, &cur);
2734 spin_unlock(&ailp->xa_lock); 2726 spin_unlock(&ailp->xa_lock);
2735 } 2727 }
2736 2728
2737 /* 2729 /*
2738 * Perform the transaction 2730 * Perform the transaction
2739 * 2731 *
2740 * If the transaction modifies a buffer or inode, do it now. Otherwise, 2732 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2741 * EFIs and EFDs get queued up by adding entries into the AIL for them. 2733 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2742 */ 2734 */
2743 STATIC int 2735 STATIC int
2744 xlog_recover_do_trans( 2736 xlog_recover_do_trans(
2745 xlog_t *log, 2737 xlog_t *log,
2746 xlog_recover_t *trans, 2738 xlog_recover_t *trans,
2747 int pass) 2739 int pass)
2748 { 2740 {
2749 int error = 0; 2741 int error = 0;
2750 xlog_recover_item_t *item, *first_item; 2742 xlog_recover_item_t *item, *first_item;
2751 2743
2752 if ((error = xlog_recover_reorder_trans(trans))) 2744 if ((error = xlog_recover_reorder_trans(trans)))
2753 return error; 2745 return error;
2754 first_item = item = trans->r_itemq; 2746 first_item = item = trans->r_itemq;
2755 do { 2747 do {
2756 /* 2748 /*
2757 * we don't need to worry about the block number being 2749 * we don't need to worry about the block number being
2758 * truncated in > 1 TB buffers because in user-land, 2750 * truncated in > 1 TB buffers because in user-land,
2759 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and 2751 * we're now n32 or 64-bit, so xfs_daddr_t is 64 bits and
2760 * the blknos will get through the user-mode buffer 2752 * the blknos will get through the user-mode buffer
2761 * cache properly. The only bad case is o32 kernels 2753 * cache properly. The only bad case is o32 kernels
2762 * where xfs_daddr_t is 32-bits but mount will warn us 2754 * where xfs_daddr_t is 32-bits but mount will warn us
2763 * off a > 1 TB filesystem before we get here. 2755 * off a > 1 TB filesystem before we get here.
2764 */ 2756 */
2765 if ((ITEM_TYPE(item) == XFS_LI_BUF)) { 2757 if ((ITEM_TYPE(item) == XFS_LI_BUF)) {
2766 if ((error = xlog_recover_do_buffer_trans(log, item, 2758 if ((error = xlog_recover_do_buffer_trans(log, item,
2767 pass))) 2759 pass)))
2768 break; 2760 break;
2769 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) { 2761 } else if ((ITEM_TYPE(item) == XFS_LI_INODE)) {
2770 if ((error = xlog_recover_do_inode_trans(log, item, 2762 if ((error = xlog_recover_do_inode_trans(log, item,
2771 pass))) 2763 pass)))
2772 break; 2764 break;
2773 } else if (ITEM_TYPE(item) == XFS_LI_EFI) { 2765 } else if (ITEM_TYPE(item) == XFS_LI_EFI) {
2774 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn, 2766 if ((error = xlog_recover_do_efi_trans(log, item, trans->r_lsn,
2775 pass))) 2767 pass)))
2776 break; 2768 break;
2777 } else if (ITEM_TYPE(item) == XFS_LI_EFD) { 2769 } else if (ITEM_TYPE(item) == XFS_LI_EFD) {
2778 xlog_recover_do_efd_trans(log, item, pass); 2770 xlog_recover_do_efd_trans(log, item, pass);
2779 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) { 2771 } else if (ITEM_TYPE(item) == XFS_LI_DQUOT) {
2780 if ((error = xlog_recover_do_dquot_trans(log, item, 2772 if ((error = xlog_recover_do_dquot_trans(log, item,
2781 pass))) 2773 pass)))
2782 break; 2774 break;
2783 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) { 2775 } else if ((ITEM_TYPE(item) == XFS_LI_QUOTAOFF)) {
2784 if ((error = xlog_recover_do_quotaoff_trans(log, item, 2776 if ((error = xlog_recover_do_quotaoff_trans(log, item,
2785 pass))) 2777 pass)))
2786 break; 2778 break;
2787 } else { 2779 } else {
2788 xlog_warn("XFS: xlog_recover_do_trans"); 2780 xlog_warn("XFS: xlog_recover_do_trans");
2789 ASSERT(0); 2781 ASSERT(0);
2790 error = XFS_ERROR(EIO); 2782 error = XFS_ERROR(EIO);
2791 break; 2783 break;
2792 } 2784 }
2793 item = item->ri_next; 2785 item = item->ri_next;
2794 } while (first_item != item); 2786 } while (first_item != item);
2795 2787
2796 return error; 2788 return error;
2797 } 2789 }
2798 2790
2799 /* 2791 /*
2800 * Free up any resources allocated by the transaction 2792 * Free up any resources allocated by the transaction
2801 * 2793 *
2802 * Remember that EFIs, EFDs, and IUNLINKs are handled later. 2794 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2803 */ 2795 */
2804 STATIC void 2796 STATIC void
2805 xlog_recover_free_trans( 2797 xlog_recover_free_trans(
2806 xlog_recover_t *trans) 2798 xlog_recover_t *trans)
2807 { 2799 {
2808 xlog_recover_item_t *first_item, *item, *free_item; 2800 xlog_recover_item_t *first_item, *item, *free_item;
2809 int i; 2801 int i;
2810 2802
2811 item = first_item = trans->r_itemq; 2803 item = first_item = trans->r_itemq;
2812 do { 2804 do {
2813 free_item = item; 2805 free_item = item;
2814 item = item->ri_next; 2806 item = item->ri_next;
2815 /* Free the regions in the item. */ 2807 /* Free the regions in the item. */
2816 for (i = 0; i < free_item->ri_cnt; i++) { 2808 for (i = 0; i < free_item->ri_cnt; i++) {
2817 kmem_free(free_item->ri_buf[i].i_addr); 2809 kmem_free(free_item->ri_buf[i].i_addr);
2818 } 2810 }
2819 /* Free the item itself */ 2811 /* Free the item itself */
2820 kmem_free(free_item->ri_buf); 2812 kmem_free(free_item->ri_buf);
2821 kmem_free(free_item); 2813 kmem_free(free_item);
2822 } while (first_item != item); 2814 } while (first_item != item);
2823 /* Free the transaction recover structure */ 2815 /* Free the transaction recover structure */
2824 kmem_free(trans); 2816 kmem_free(trans);
2825 } 2817 }
2826 2818
2827 STATIC int 2819 STATIC int
2828 xlog_recover_commit_trans( 2820 xlog_recover_commit_trans(
2829 xlog_t *log, 2821 xlog_t *log,
2830 xlog_recover_t **q, 2822 xlog_recover_t **q,
2831 xlog_recover_t *trans, 2823 xlog_recover_t *trans,
2832 int pass) 2824 int pass)
2833 { 2825 {
2834 int error; 2826 int error;
2835 2827
2836 if ((error = xlog_recover_unlink_tid(q, trans))) 2828 if ((error = xlog_recover_unlink_tid(q, trans)))
2837 return error; 2829 return error;
2838 if ((error = xlog_recover_do_trans(log, trans, pass))) 2830 if ((error = xlog_recover_do_trans(log, trans, pass)))
2839 return error; 2831 return error;
2840 xlog_recover_free_trans(trans); /* no error */ 2832 xlog_recover_free_trans(trans); /* no error */
2841 return 0; 2833 return 0;
2842 } 2834 }
2843 2835
2844 STATIC int 2836 STATIC int
2845 xlog_recover_unmount_trans( 2837 xlog_recover_unmount_trans(
2846 xlog_recover_t *trans) 2838 xlog_recover_t *trans)
2847 { 2839 {
2848 /* Do nothing now */ 2840 /* Do nothing now */
2849 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); 2841 xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
2850 return 0; 2842 return 0;
2851 } 2843 }
2852 2844
2853 /* 2845 /*
2854 * There are two valid states of the r_state field. 0 indicates that the 2846 * There are two valid states of the r_state field. 0 indicates that the
2855 * transaction structure is in a normal state. We have either seen the 2847 * transaction structure is in a normal state. We have either seen the
2856 * start of the transaction or the last operation we added was not a partial 2848 * start of the transaction or the last operation we added was not a partial
2857 * operation. If the last operation we added to the transaction was a 2849 * operation. If the last operation we added to the transaction was a
2858 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS. 2850 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2859 * 2851 *
2860 * NOTE: skip LRs with 0 data length. 2852 * NOTE: skip LRs with 0 data length.
2861 */ 2853 */
2862 STATIC int 2854 STATIC int
2863 xlog_recover_process_data( 2855 xlog_recover_process_data(
2864 xlog_t *log, 2856 xlog_t *log,
2865 xlog_recover_t *rhash[], 2857 xlog_recover_t *rhash[],
2866 xlog_rec_header_t *rhead, 2858 xlog_rec_header_t *rhead,
2867 xfs_caddr_t dp, 2859 xfs_caddr_t dp,
2868 int pass) 2860 int pass)
2869 { 2861 {
2870 xfs_caddr_t lp; 2862 xfs_caddr_t lp;
2871 int num_logops; 2863 int num_logops;
2872 xlog_op_header_t *ohead; 2864 xlog_op_header_t *ohead;
2873 xlog_recover_t *trans; 2865 xlog_recover_t *trans;
2874 xlog_tid_t tid; 2866 xlog_tid_t tid;
2875 int error; 2867 int error;
2876 unsigned long hash; 2868 unsigned long hash;
2877 uint flags; 2869 uint flags;
2878 2870
2879 lp = dp + be32_to_cpu(rhead->h_len); 2871 lp = dp + be32_to_cpu(rhead->h_len);
2880 num_logops = be32_to_cpu(rhead->h_num_logops); 2872 num_logops = be32_to_cpu(rhead->h_num_logops);
2881 2873
2882 /* check the log format matches our own - else we can't recover */ 2874 /* check the log format matches our own - else we can't recover */
2883 if (xlog_header_check_recover(log->l_mp, rhead)) 2875 if (xlog_header_check_recover(log->l_mp, rhead))
2884 return (XFS_ERROR(EIO)); 2876 return (XFS_ERROR(EIO));
2885 2877
2886 while ((dp < lp) && num_logops) { 2878 while ((dp < lp) && num_logops) {
2887 ASSERT(dp + sizeof(xlog_op_header_t) <= lp); 2879 ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2888 ohead = (xlog_op_header_t *)dp; 2880 ohead = (xlog_op_header_t *)dp;
2889 dp += sizeof(xlog_op_header_t); 2881 dp += sizeof(xlog_op_header_t);
2890 if (ohead->oh_clientid != XFS_TRANSACTION && 2882 if (ohead->oh_clientid != XFS_TRANSACTION &&
2891 ohead->oh_clientid != XFS_LOG) { 2883 ohead->oh_clientid != XFS_LOG) {
2892 xlog_warn( 2884 xlog_warn(
2893 "XFS: xlog_recover_process_data: bad clientid"); 2885 "XFS: xlog_recover_process_data: bad clientid");
2894 ASSERT(0); 2886 ASSERT(0);
2895 return (XFS_ERROR(EIO)); 2887 return (XFS_ERROR(EIO));
2896 } 2888 }
2897 tid = be32_to_cpu(ohead->oh_tid); 2889 tid = be32_to_cpu(ohead->oh_tid);
2898 hash = XLOG_RHASH(tid); 2890 hash = XLOG_RHASH(tid);
2899 trans = xlog_recover_find_tid(rhash[hash], tid); 2891 trans = xlog_recover_find_tid(rhash[hash], tid);
2900 if (trans == NULL) { /* not found; add new tid */ 2892 if (trans == NULL) { /* not found; add new tid */
2901 if (ohead->oh_flags & XLOG_START_TRANS) 2893 if (ohead->oh_flags & XLOG_START_TRANS)
2902 xlog_recover_new_tid(&rhash[hash], tid, 2894 xlog_recover_new_tid(&rhash[hash], tid,
2903 be64_to_cpu(rhead->h_lsn)); 2895 be64_to_cpu(rhead->h_lsn));
2904 } else { 2896 } else {
2905 if (dp + be32_to_cpu(ohead->oh_len) > lp) { 2897 if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2906 xlog_warn( 2898 xlog_warn(
2907 "XFS: xlog_recover_process_data: bad length"); 2899 "XFS: xlog_recover_process_data: bad length");
2908 WARN_ON(1); 2900 WARN_ON(1);
2909 return (XFS_ERROR(EIO)); 2901 return (XFS_ERROR(EIO));
2910 } 2902 }
2911 flags = ohead->oh_flags & ~XLOG_END_TRANS; 2903 flags = ohead->oh_flags & ~XLOG_END_TRANS;
2912 if (flags & XLOG_WAS_CONT_TRANS) 2904 if (flags & XLOG_WAS_CONT_TRANS)
2913 flags &= ~XLOG_CONTINUE_TRANS; 2905 flags &= ~XLOG_CONTINUE_TRANS;
2914 switch (flags) { 2906 switch (flags) {
2915 case XLOG_COMMIT_TRANS: 2907 case XLOG_COMMIT_TRANS:
2916 error = xlog_recover_commit_trans(log, 2908 error = xlog_recover_commit_trans(log,
2917 &rhash[hash], trans, pass); 2909 &rhash[hash], trans, pass);
2918 break; 2910 break;
2919 case XLOG_UNMOUNT_TRANS: 2911 case XLOG_UNMOUNT_TRANS:
2920 error = xlog_recover_unmount_trans(trans); 2912 error = xlog_recover_unmount_trans(trans);
2921 break; 2913 break;
2922 case XLOG_WAS_CONT_TRANS: 2914 case XLOG_WAS_CONT_TRANS:
2923 error = xlog_recover_add_to_cont_trans(trans, 2915 error = xlog_recover_add_to_cont_trans(trans,
2924 dp, be32_to_cpu(ohead->oh_len)); 2916 dp, be32_to_cpu(ohead->oh_len));
2925 break; 2917 break;
2926 case XLOG_START_TRANS: 2918 case XLOG_START_TRANS:
2927 xlog_warn( 2919 xlog_warn(
2928 "XFS: xlog_recover_process_data: bad transaction"); 2920 "XFS: xlog_recover_process_data: bad transaction");
2929 ASSERT(0); 2921 ASSERT(0);
2930 error = XFS_ERROR(EIO); 2922 error = XFS_ERROR(EIO);
2931 break; 2923 break;
2932 case 0: 2924 case 0:
2933 case XLOG_CONTINUE_TRANS: 2925 case XLOG_CONTINUE_TRANS:
2934 error = xlog_recover_add_to_trans(trans, 2926 error = xlog_recover_add_to_trans(trans,
2935 dp, be32_to_cpu(ohead->oh_len)); 2927 dp, be32_to_cpu(ohead->oh_len));
2936 break; 2928 break;
2937 default: 2929 default:
2938 xlog_warn( 2930 xlog_warn(
2939 "XFS: xlog_recover_process_data: bad flag"); 2931 "XFS: xlog_recover_process_data: bad flag");
2940 ASSERT(0); 2932 ASSERT(0);
2941 error = XFS_ERROR(EIO); 2933 error = XFS_ERROR(EIO);
2942 break; 2934 break;
2943 } 2935 }
2944 if (error) 2936 if (error)
2945 return error; 2937 return error;
2946 } 2938 }
2947 dp += be32_to_cpu(ohead->oh_len); 2939 dp += be32_to_cpu(ohead->oh_len);
2948 num_logops--; 2940 num_logops--;
2949 } 2941 }
2950 return 0; 2942 return 0;
2951 } 2943 }
2952 2944
2953 /* 2945 /*
2954 * Process an extent free intent item that was recovered from 2946 * Process an extent free intent item that was recovered from
2955 * the log. We need to free the extents that it describes. 2947 * the log. We need to free the extents that it describes.
2956 */ 2948 */
2957 STATIC int 2949 STATIC int
2958 xlog_recover_process_efi( 2950 xlog_recover_process_efi(
2959 xfs_mount_t *mp, 2951 xfs_mount_t *mp,
2960 xfs_efi_log_item_t *efip) 2952 xfs_efi_log_item_t *efip)
2961 { 2953 {
2962 xfs_efd_log_item_t *efdp; 2954 xfs_efd_log_item_t *efdp;
2963 xfs_trans_t *tp; 2955 xfs_trans_t *tp;
2964 int i; 2956 int i;
2965 int error = 0; 2957 int error = 0;
2966 xfs_extent_t *extp; 2958 xfs_extent_t *extp;
2967 xfs_fsblock_t startblock_fsb; 2959 xfs_fsblock_t startblock_fsb;
2968 2960
2969 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); 2961 ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
2970 2962
2971 /* 2963 /*
2972 * First check the validity of the extents described by the 2964 * First check the validity of the extents described by the
2973 * EFI. If any are bad, then assume that all are bad and 2965 * EFI. If any are bad, then assume that all are bad and
2974 * just toss the EFI. 2966 * just toss the EFI.
2975 */ 2967 */
2976 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 2968 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2977 extp = &(efip->efi_format.efi_extents[i]); 2969 extp = &(efip->efi_format.efi_extents[i]);
2978 startblock_fsb = XFS_BB_TO_FSB(mp, 2970 startblock_fsb = XFS_BB_TO_FSB(mp,
2979 XFS_FSB_TO_DADDR(mp, extp->ext_start)); 2971 XFS_FSB_TO_DADDR(mp, extp->ext_start));
2980 if ((startblock_fsb == 0) || 2972 if ((startblock_fsb == 0) ||
2981 (extp->ext_len == 0) || 2973 (extp->ext_len == 0) ||
2982 (startblock_fsb >= mp->m_sb.sb_dblocks) || 2974 (startblock_fsb >= mp->m_sb.sb_dblocks) ||
2983 (extp->ext_len >= mp->m_sb.sb_agblocks)) { 2975 (extp->ext_len >= mp->m_sb.sb_agblocks)) {
2984 /* 2976 /*
2985 * This will pull the EFI from the AIL and 2977 * This will pull the EFI from the AIL and
2986 * free the memory associated with it. 2978 * free the memory associated with it.
2987 */ 2979 */
2988 xfs_efi_release(efip, efip->efi_format.efi_nextents); 2980 xfs_efi_release(efip, efip->efi_format.efi_nextents);
2989 return XFS_ERROR(EIO); 2981 return XFS_ERROR(EIO);
2990 } 2982 }
2991 } 2983 }
2992 2984
2993 tp = xfs_trans_alloc(mp, 0); 2985 tp = xfs_trans_alloc(mp, 0);
2994 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); 2986 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
2995 if (error) 2987 if (error)
2996 goto abort_error; 2988 goto abort_error;
2997 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 2989 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
2998 2990
2999 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 2991 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3000 extp = &(efip->efi_format.efi_extents[i]); 2992 extp = &(efip->efi_format.efi_extents[i]);
3001 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len); 2993 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3002 if (error) 2994 if (error)
3003 goto abort_error; 2995 goto abort_error;
3004 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, 2996 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3005 extp->ext_len); 2997 extp->ext_len);
3006 } 2998 }
3007 2999
3008 efip->efi_flags |= XFS_EFI_RECOVERED; 3000 efip->efi_flags |= XFS_EFI_RECOVERED;
3009 error = xfs_trans_commit(tp, 0); 3001 error = xfs_trans_commit(tp, 0);
3010 return error; 3002 return error;
3011 3003
3012 abort_error: 3004 abort_error:
3013 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3005 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3014 return error; 3006 return error;
3015 } 3007 }
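
xlog_recover_process_efi() above refuses to replay an EFI whose extents look implausible: a zero start block or length, a start beyond the end of the filesystem, or a length of at least a whole AG causes the entire EFI to be released untouched. A standalone sketch of that per-extent test (demo_geom and demo_efi_extent_ok are illustrative names, not XFS API):

    #include <stdbool.h>
    #include <stdint.h>

    struct demo_geom {
        uint64_t dblocks;   /* filesystem size in blocks, like sb_dblocks */
        uint32_t agblocks;  /* blocks per AG, like sb_agblocks            */
    };

    /* mirrors the checks in the validation loop above */
    static bool demo_efi_extent_ok(const struct demo_geom *g,
                                   uint64_t startblock_fsb, uint32_t len)
    {
        return startblock_fsb != 0 && len != 0 &&
               startblock_fsb < g->dblocks && len < g->agblocks;
    }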
3016 3008
3017 /* 3009 /*
3018 * When this is called, all of the EFIs which did not have 3010 * When this is called, all of the EFIs which did not have
3019 * corresponding EFDs should be in the AIL. What we do now 3011 * corresponding EFDs should be in the AIL. What we do now
3020 * is free the extents associated with each one. 3012 * is free the extents associated with each one.
3021 * 3013 *
3022 * Since we process the EFIs in normal transactions, they 3014 * Since we process the EFIs in normal transactions, they
3023 * will be removed at some point after the commit. This prevents 3015 * will be removed at some point after the commit. This prevents
3024 * us from just walking down the list processing each one. 3016 * us from just walking down the list processing each one.
3025 * We'll use a flag in the EFI to skip those that we've already 3017 * We'll use a flag in the EFI to skip those that we've already
3026 * processed and use the AIL iteration mechanism's generation 3018 * processed and use the AIL iteration mechanism's generation
3027 * count to try to speed this up at least a bit. 3019 * count to try to speed this up at least a bit.
3028 * 3020 *
3029 * When we start, we know that the EFIs are the only things in 3021 * When we start, we know that the EFIs are the only things in
3030 * the AIL. As we process them, however, other items are added 3022 * the AIL. As we process them, however, other items are added
3031 * to the AIL. Since everything added to the AIL must come after 3023 * to the AIL. Since everything added to the AIL must come after
3032 * everything already in the AIL, we stop processing as soon as 3024 * everything already in the AIL, we stop processing as soon as
3033 * we see something other than an EFI in the AIL. 3025 * we see something other than an EFI in the AIL.
3034 */ 3026 */
3035 STATIC int 3027 STATIC int
3036 xlog_recover_process_efis( 3028 xlog_recover_process_efis(
3037 xlog_t *log) 3029 xlog_t *log)
3038 { 3030 {
3039 xfs_log_item_t *lip; 3031 xfs_log_item_t *lip;
3040 xfs_efi_log_item_t *efip; 3032 xfs_efi_log_item_t *efip;
3041 int error = 0; 3033 int error = 0;
3042 struct xfs_ail_cursor cur; 3034 struct xfs_ail_cursor cur;
3043 struct xfs_ail *ailp; 3035 struct xfs_ail *ailp;
3044 3036
3045 ailp = log->l_ailp; 3037 ailp = log->l_ailp;
3046 spin_lock(&ailp->xa_lock); 3038 spin_lock(&ailp->xa_lock);
3047 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); 3039 lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3048 while (lip != NULL) { 3040 while (lip != NULL) {
3049 /* 3041 /*
3050 * We're done when we see something other than an EFI. 3042 * We're done when we see something other than an EFI.
3051 * There should be no EFIs left in the AIL now. 3043 * There should be no EFIs left in the AIL now.
3052 */ 3044 */
3053 if (lip->li_type != XFS_LI_EFI) { 3045 if (lip->li_type != XFS_LI_EFI) {
3054 #ifdef DEBUG 3046 #ifdef DEBUG
3055 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur)) 3047 for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
3056 ASSERT(lip->li_type != XFS_LI_EFI); 3048 ASSERT(lip->li_type != XFS_LI_EFI);
3057 #endif 3049 #endif
3058 break; 3050 break;
3059 } 3051 }
3060 3052
3061 /* 3053 /*
3062 * Skip EFIs that we've already processed. 3054 * Skip EFIs that we've already processed.
3063 */ 3055 */
3064 efip = (xfs_efi_log_item_t *)lip; 3056 efip = (xfs_efi_log_item_t *)lip;
3065 if (efip->efi_flags & XFS_EFI_RECOVERED) { 3057 if (efip->efi_flags & XFS_EFI_RECOVERED) {
3066 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3058 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3067 continue; 3059 continue;
3068 } 3060 }
3069 3061
3070 spin_unlock(&ailp->xa_lock); 3062 spin_unlock(&ailp->xa_lock);
3071 error = xlog_recover_process_efi(log->l_mp, efip); 3063 error = xlog_recover_process_efi(log->l_mp, efip);
3072 spin_lock(&ailp->xa_lock); 3064 spin_lock(&ailp->xa_lock);
3073 if (error) 3065 if (error)
3074 goto out; 3066 goto out;
3075 lip = xfs_trans_ail_cursor_next(ailp, &cur); 3067 lip = xfs_trans_ail_cursor_next(ailp, &cur);
3076 } 3068 }
3077 out: 3069 out:
3078 xfs_trans_ail_cursor_done(ailp, &cur); 3070 xfs_trans_ail_cursor_done(ailp, &cur);
3079 spin_unlock(&ailp->xa_lock); 3071 spin_unlock(&ailp->xa_lock);
3080 return error; 3072 return error;
3081 } 3073 }
3082 3074
3083 /* 3075 /*
3084 * This routine performs a transaction to null out a bad inode pointer 3076 * This routine performs a transaction to null out a bad inode pointer
3085 * in an agi unlinked inode hash bucket. 3077 * in an agi unlinked inode hash bucket.
3086 */ 3078 */
3087 STATIC void 3079 STATIC void
3088 xlog_recover_clear_agi_bucket( 3080 xlog_recover_clear_agi_bucket(
3089 xfs_mount_t *mp, 3081 xfs_mount_t *mp,
3090 xfs_agnumber_t agno, 3082 xfs_agnumber_t agno,
3091 int bucket) 3083 int bucket)
3092 { 3084 {
3093 xfs_trans_t *tp; 3085 xfs_trans_t *tp;
3094 xfs_agi_t *agi; 3086 xfs_agi_t *agi;
3095 xfs_buf_t *agibp; 3087 xfs_buf_t *agibp;
3096 int offset; 3088 int offset;
3097 int error; 3089 int error;
3098 3090
3099 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3091 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3100 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 3092 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3101 0, 0, 0); 3093 0, 0, 0);
3102 if (error) 3094 if (error)
3103 goto out_abort; 3095 goto out_abort;
3104 3096
3105 error = xfs_read_agi(mp, tp, agno, &agibp); 3097 error = xfs_read_agi(mp, tp, agno, &agibp);
3106 if (error) 3098 if (error)
3107 goto out_abort; 3099 goto out_abort;
3108 3100
3109 agi = XFS_BUF_TO_AGI(agibp); 3101 agi = XFS_BUF_TO_AGI(agibp);
3110 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3102 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3111 offset = offsetof(xfs_agi_t, agi_unlinked) + 3103 offset = offsetof(xfs_agi_t, agi_unlinked) +
3112 (sizeof(xfs_agino_t) * bucket); 3104 (sizeof(xfs_agino_t) * bucket);
3113 xfs_trans_log_buf(tp, agibp, offset, 3105 xfs_trans_log_buf(tp, agibp, offset,
3114 (offset + sizeof(xfs_agino_t) - 1)); 3106 (offset + sizeof(xfs_agino_t) - 1));
3115 3107
3116 error = xfs_trans_commit(tp, 0); 3108 error = xfs_trans_commit(tp, 0);
3117 if (error) 3109 if (error)
3118 goto out_error; 3110 goto out_error;
3119 return; 3111 return;
3120 3112
3121 out_abort: 3113 out_abort:
3122 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3114 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3123 out_error: 3115 out_error:
3124 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " 3116 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
3125 "failed to clear agi %d. Continuing.", agno); 3117 "failed to clear agi %d. Continuing.", agno);
3126 return; 3118 return;
3127 } 3119 }
3128 3120
3129 STATIC xfs_agino_t 3121 STATIC xfs_agino_t
3130 xlog_recover_process_one_iunlink( 3122 xlog_recover_process_one_iunlink(
3131 struct xfs_mount *mp, 3123 struct xfs_mount *mp,
3132 xfs_agnumber_t agno, 3124 xfs_agnumber_t agno,
3133 xfs_agino_t agino, 3125 xfs_agino_t agino,
3134 int bucket) 3126 int bucket)
3135 { 3127 {
3136 struct xfs_buf *ibp; 3128 struct xfs_buf *ibp;
3137 struct xfs_dinode *dip; 3129 struct xfs_dinode *dip;
3138 struct xfs_inode *ip; 3130 struct xfs_inode *ip;
3139 xfs_ino_t ino; 3131 xfs_ino_t ino;
3140 int error; 3132 int error;
3141 3133
3142 ino = XFS_AGINO_TO_INO(mp, agno, agino); 3134 ino = XFS_AGINO_TO_INO(mp, agno, agino);
3143 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); 3135 error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
3144 if (error) 3136 if (error)
3145 goto fail; 3137 goto fail;
3146 3138
3147 /* 3139 /*
3148 * Get the on disk inode to find the next inode in the bucket. 3140 * Get the on disk inode to find the next inode in the bucket.
3149 */ 3141 */
3150 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK); 3142 error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
3151 if (error) 3143 if (error)
3152 goto fail_iput; 3144 goto fail_iput;
3153 3145
3154 ASSERT(ip->i_d.di_nlink == 0); 3146 ASSERT(ip->i_d.di_nlink == 0);
3155 ASSERT(ip->i_d.di_mode != 0); 3147 ASSERT(ip->i_d.di_mode != 0);
3156 3148
3157 /* setup for the next pass */ 3149 /* setup for the next pass */
3158 agino = be32_to_cpu(dip->di_next_unlinked); 3150 agino = be32_to_cpu(dip->di_next_unlinked);
3159 xfs_buf_relse(ibp); 3151 xfs_buf_relse(ibp);
3160 3152
3161 /* 3153 /*
3162 * Prevent any DMAPI event from being sent when the reference on 3154 * Prevent any DMAPI event from being sent when the reference on
3163 * the inode is dropped. 3155 * the inode is dropped.
3164 */ 3156 */
3165 ip->i_d.di_dmevmask = 0; 3157 ip->i_d.di_dmevmask = 0;
3166 3158
3167 IRELE(ip); 3159 IRELE(ip);
3168 return agino; 3160 return agino;
3169 3161
3170 fail_iput: 3162 fail_iput:
3171 IRELE(ip); 3163 IRELE(ip);
3172 fail: 3164 fail:
3173 /* 3165 /*
3174 * We can't read in the inode this bucket points to, or this inode 3166 * We can't read in the inode this bucket points to, or this inode
3175 * is messed up. Just ditch this bucket of inodes. We will lose 3167 * is messed up. Just ditch this bucket of inodes. We will lose
3176 * some inodes and space, but at least we won't hang. 3168 * some inodes and space, but at least we won't hang.
3177 * 3169 *
3178 * Call xlog_recover_clear_agi_bucket() to perform a transaction to 3170 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3179 * clear the inode pointer in the bucket. 3171 * clear the inode pointer in the bucket.
3180 */ 3172 */
3181 xlog_recover_clear_agi_bucket(mp, agno, bucket); 3173 xlog_recover_clear_agi_bucket(mp, agno, bucket);
3182 return NULLAGINO; 3174 return NULLAGINO;
3183 } 3175 }
3184 3176
3185 /* 3177 /*
3186 * xlog_iunlink_recover 3178 * xlog_iunlink_recover
3187 * 3179 *
3188 * This is called during recovery to process any inodes which 3180 * This is called during recovery to process any inodes which
3189 * we unlinked but did not free when the system crashed. These 3181 * we unlinked but did not free when the system crashed. These
3190 * inodes will be on the lists in the AGI blocks. What we do 3182 * inodes will be on the lists in the AGI blocks. What we do
3191 * here is scan all the AGIs and fully truncate and free any 3183 * here is scan all the AGIs and fully truncate and free any
3192 * inodes found on the lists. Each inode is removed from the 3184 * inodes found on the lists. Each inode is removed from the
3193 * lists when it has been fully truncated and is freed. The 3185 * lists when it has been fully truncated and is freed. The
3194 * freeing of the inode and its removal from the list must be 3186 * freeing of the inode and its removal from the list must be
3195 * atomic. 3187 * atomic.
3196 */ 3188 */
3197 void 3189 void
3198 xlog_recover_process_iunlinks( 3190 xlog_recover_process_iunlinks(
3199 xlog_t *log) 3191 xlog_t *log)
3200 { 3192 {
3201 xfs_mount_t *mp; 3193 xfs_mount_t *mp;
3202 xfs_agnumber_t agno; 3194 xfs_agnumber_t agno;
3203 xfs_agi_t *agi; 3195 xfs_agi_t *agi;
3204 xfs_buf_t *agibp; 3196 xfs_buf_t *agibp;
3205 xfs_agino_t agino; 3197 xfs_agino_t agino;
3206 int bucket; 3198 int bucket;
3207 int error; 3199 int error;
3208 uint mp_dmevmask; 3200 uint mp_dmevmask;
3209 3201
3210 mp = log->l_mp; 3202 mp = log->l_mp;
3211 3203
3212 /* 3204 /*
3213 * Prevent any DMAPI event from being sent while in this function. 3205 * Prevent any DMAPI event from being sent while in this function.
3214 */ 3206 */
3215 mp_dmevmask = mp->m_dmevmask; 3207 mp_dmevmask = mp->m_dmevmask;
3216 mp->m_dmevmask = 0; 3208 mp->m_dmevmask = 0;
3217 3209
3218 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3210 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3219 /* 3211 /*
3220 * Find the agi for this ag. 3212 * Find the agi for this ag.
3221 */ 3213 */
3222 error = xfs_read_agi(mp, NULL, agno, &agibp); 3214 error = xfs_read_agi(mp, NULL, agno, &agibp);
3223 if (error) { 3215 if (error) {
3224 /* 3216 /*
3225 * AGI is b0rked. Don't process it. 3217 * AGI is b0rked. Don't process it.
3226 * 3218 *
3227 * We should probably mark the filesystem as corrupt 3219 * We should probably mark the filesystem as corrupt
3228 * after we've recovered all the ag's we can.... 3220 * after we've recovered all the ag's we can....
3229 */ 3221 */
3230 continue; 3222 continue;
3231 } 3223 }
3232 agi = XFS_BUF_TO_AGI(agibp); 3224 agi = XFS_BUF_TO_AGI(agibp);
3233 3225
3234 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { 3226 for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3235 agino = be32_to_cpu(agi->agi_unlinked[bucket]); 3227 agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3236 while (agino != NULLAGINO) { 3228 while (agino != NULLAGINO) {
3237 /* 3229 /*
3238 * Release the agi buffer so that it can 3230 * Release the agi buffer so that it can
3239 * be acquired in the normal course of the 3231 * be acquired in the normal course of the
3240 * transaction to truncate and free the inode. 3232 * transaction to truncate and free the inode.
3241 */ 3233 */
3242 xfs_buf_relse(agibp); 3234 xfs_buf_relse(agibp);
3243 3235
3244 agino = xlog_recover_process_one_iunlink(mp, 3236 agino = xlog_recover_process_one_iunlink(mp,
3245 agno, agino, bucket); 3237 agno, agino, bucket);
3246 3238
3247 /* 3239 /*
3248 * Reacquire the agibuffer and continue around 3240 * Reacquire the agibuffer and continue around
3249 * the loop. This should never fail as we know 3241 * the loop. This should never fail as we know
3250 * the buffer was good earlier on. 3242 * the buffer was good earlier on.
3251 */ 3243 */
3252 error = xfs_read_agi(mp, NULL, agno, &agibp); 3244 error = xfs_read_agi(mp, NULL, agno, &agibp);
3253 ASSERT(error == 0); 3245 ASSERT(error == 0);
3254 agi = XFS_BUF_TO_AGI(agibp); 3246 agi = XFS_BUF_TO_AGI(agibp);
3255 } 3247 }
3256 } 3248 }
3257 3249
3258 /* 3250 /*
3259 * Release the buffer for the current agi so we can 3251 * Release the buffer for the current agi so we can
3260 * go on to the next one. 3252 * go on to the next one.
3261 */ 3253 */
3262 xfs_buf_relse(agibp); 3254 xfs_buf_relse(agibp);
3263 } 3255 }
3264 3256
3265 mp->m_dmevmask = mp_dmevmask; 3257 mp->m_dmevmask = mp_dmevmask;
3266 } 3258 }
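
xlog_recover_process_iunlinks() above is, structurally, a nested walk: for every AG, for each of the AGI's unlinked-hash buckets, follow the on-disk di_next_unlinked chain until NULLAGINO, freeing each inode along the way. A self-contained toy model of that walk (everything here — the demo_* names, the array-backed "disk", the bucket and inode counts — is invented for illustration and is not XFS code):

    #include <stdint.h>
    #include <stdio.h>

    #define DEMO_AGS        2
    #define DEMO_BUCKETS    4            /* stands in for XFS_AGI_UNLINKED_BUCKETS */
    #define DEMO_INODES     8
    #define DEMO_NULL       0xffffffffu  /* stands in for NULLAGINO */

    /* toy "disk": per-AG bucket heads plus each inode's successor link */
    static uint32_t bucket_head[DEMO_AGS][DEMO_BUCKETS];
    static uint32_t di_next_unlinked[DEMO_AGS][DEMO_INODES];

    static void demo_process_iunlinks(void)
    {
        for (int agno = 0; agno < DEMO_AGS; agno++) {
            for (int bucket = 0; bucket < DEMO_BUCKETS; bucket++) {
                uint32_t agino = bucket_head[agno][bucket];

                while (agino != DEMO_NULL) {
                    /* "truncate and free" the inode, then follow its link */
                    printf("ag %d: freeing unlinked inode %u\n", agno, agino);
                    agino = di_next_unlinked[agno][agino];
                }
            }
        }
    }

    int main(void)
    {
        /* empty buckets and unlinked lists everywhere ... */
        for (int a = 0; a < DEMO_AGS; a++) {
            for (int b = 0; b < DEMO_BUCKETS; b++)
                bucket_head[a][b] = DEMO_NULL;
            for (int i = 0; i < DEMO_INODES; i++)
                di_next_unlinked[a][i] = DEMO_NULL;
        }

        /* ... except one short chain in AG 0, bucket 1: inode 3 -> 5 */
        bucket_head[0][1] = 3;
        di_next_unlinked[0][3] = 5;

        demo_process_iunlinks();
        return 0;
    }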
3267 3259
3268 3260
3269 #ifdef DEBUG 3261 #ifdef DEBUG
3270 STATIC void 3262 STATIC void
3271 xlog_pack_data_checksum( 3263 xlog_pack_data_checksum(
3272 xlog_t *log, 3264 xlog_t *log,
3273 xlog_in_core_t *iclog, 3265 xlog_in_core_t *iclog,
3274 int size) 3266 int size)
3275 { 3267 {
3276 int i; 3268 int i;
3277 __be32 *up; 3269 __be32 *up;
3278 uint chksum = 0; 3270 uint chksum = 0;
3279 3271
3280 up = (__be32 *)iclog->ic_datap; 3272 up = (__be32 *)iclog->ic_datap;
3281 /* divide length by 4 to get # words */ 3273 /* divide length by 4 to get # words */
3282 for (i = 0; i < (size >> 2); i++) { 3274 for (i = 0; i < (size >> 2); i++) {
3283 chksum ^= be32_to_cpu(*up); 3275 chksum ^= be32_to_cpu(*up);
3284 up++; 3276 up++;
3285 } 3277 }
3286 iclog->ic_header.h_chksum = cpu_to_be32(chksum); 3278 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3287 } 3279 }
3288 #else 3280 #else
3289 #define xlog_pack_data_checksum(log, iclog, size) 3281 #define xlog_pack_data_checksum(log, iclog, size)
3290 #endif 3282 #endif
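
When DEBUG is set, the record checksum computed above is nothing more than a running XOR over the 32-bit words of the record payload; its counterpart, xlog_unpack_data_checksum() further down, recomputes the same value and only warns on a mismatch. A standalone sketch (demo_log_checksum is an illustrative name; the kernel code additionally byte-swaps each word via be32_to_cpu(), omitted here):

    #include <stdint.h>

    static uint32_t demo_log_checksum(const uint32_t *words, int len_bytes)
    {
        uint32_t chksum = 0;

        /* divide the byte length by 4 to get the word count */
        for (int i = 0; i < (len_bytes >> 2); i++)
            chksum ^= words[i];
        return chksum;
    }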
3291 3283
3292 /* 3284 /*
3293 * Stamp cycle number in every block 3285 * Stamp cycle number in every block
3294 */ 3286 */
3295 void 3287 void
3296 xlog_pack_data( 3288 xlog_pack_data(
3297 xlog_t *log, 3289 xlog_t *log,
3298 xlog_in_core_t *iclog, 3290 xlog_in_core_t *iclog,
3299 int roundoff) 3291 int roundoff)
3300 { 3292 {
3301 int i, j, k; 3293 int i, j, k;
3302 int size = iclog->ic_offset + roundoff; 3294 int size = iclog->ic_offset + roundoff;
3303 __be32 cycle_lsn; 3295 __be32 cycle_lsn;
3304 xfs_caddr_t dp; 3296 xfs_caddr_t dp;
3305 3297
3306 xlog_pack_data_checksum(log, iclog, size); 3298 xlog_pack_data_checksum(log, iclog, size);
3307 3299
3308 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); 3300 cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3309 3301
3310 dp = iclog->ic_datap; 3302 dp = iclog->ic_datap;
3311 for (i = 0; i < BTOBB(size) && 3303 for (i = 0; i < BTOBB(size) &&
3312 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3304 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3313 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; 3305 iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3314 *(__be32 *)dp = cycle_lsn; 3306 *(__be32 *)dp = cycle_lsn;
3315 dp += BBSIZE; 3307 dp += BBSIZE;
3316 } 3308 }
3317 3309
3318 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3310 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3319 xlog_in_core_2_t *xhdr = iclog->ic_data; 3311 xlog_in_core_2_t *xhdr = iclog->ic_data;
3320 3312
3321 for ( ; i < BTOBB(size); i++) { 3313 for ( ; i < BTOBB(size); i++) {
3322 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3314 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3323 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3315 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3324 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; 3316 xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3325 *(__be32 *)dp = cycle_lsn; 3317 *(__be32 *)dp = cycle_lsn;
3326 dp += BBSIZE; 3318 dp += BBSIZE;
3327 } 3319 }
3328 3320
3329 for (i = 1; i < log->l_iclog_heads; i++) { 3321 for (i = 1; i < log->l_iclog_heads; i++) {
3330 xhdr[i].hic_xheader.xh_cycle = cycle_lsn; 3322 xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3331 } 3323 }
3332 } 3324 }
3333 } 3325 }
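
xlog_pack_data() above is the log's torn-write protection: before an in-core log buffer goes to disk, the first 32-bit word of every 512-byte basic block is saved into the record header's h_cycle_data[] (or the extended v2 headers) and replaced with the current cycle number, so recovery can tell how far a record actually made it to disk; xlog_unpack_data() below restores the saved words. A minimal standalone sketch of that round trip (the demo_* names and the 64-entry header array are stand-ins; the v2 extended headers and the checksum are omitted):

    #include <stdint.h>
    #include <string.h>

    #define DEMO_BBSIZE      512u  /* basic block size, like BBSIZE */
    #define DEMO_HDR_BLOCKS   64u  /* like XLOG_HEADER_CYCLE_SIZE / BBSIZE */

    /* saved first words, mirroring h_cycle_data in the record header */
    struct demo_rec_header {
        uint32_t h_cycle_data[DEMO_HDR_BLOCKS];
    };

    /* pack: stash each block's first word, then stamp the cycle number */
    static void demo_pack(struct demo_rec_header *hdr, unsigned char *data,
                          size_t len, uint32_t cycle_be)
    {
        size_t nblocks = (len + DEMO_BBSIZE - 1) / DEMO_BBSIZE;

        for (size_t i = 0; i < nblocks && i < DEMO_HDR_BLOCKS; i++) {
            memcpy(&hdr->h_cycle_data[i], data + i * DEMO_BBSIZE, 4);
            memcpy(data + i * DEMO_BBSIZE, &cycle_be, 4);
        }
    }

    /* unpack: restore the saved words once the record has been read back */
    static void demo_unpack(const struct demo_rec_header *hdr,
                            unsigned char *data, size_t len)
    {
        size_t nblocks = (len + DEMO_BBSIZE - 1) / DEMO_BBSIZE;

        for (size_t i = 0; i < nblocks && i < DEMO_HDR_BLOCKS; i++)
            memcpy(data + i * DEMO_BBSIZE, &hdr->h_cycle_data[i], 4);
    }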
3334 3326
3335 #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) 3327 #if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
3336 STATIC void 3328 STATIC void
3337 xlog_unpack_data_checksum( 3329 xlog_unpack_data_checksum(
3338 xlog_rec_header_t *rhead, 3330 xlog_rec_header_t *rhead,
3339 xfs_caddr_t dp, 3331 xfs_caddr_t dp,
3340 xlog_t *log) 3332 xlog_t *log)
3341 { 3333 {
3342 __be32 *up = (__be32 *)dp; 3334 __be32 *up = (__be32 *)dp;
3343 uint chksum = 0; 3335 uint chksum = 0;
3344 int i; 3336 int i;
3345 3337
3346 /* divide length by 4 to get # words */ 3338 /* divide length by 4 to get # words */
3347 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) { 3339 for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) {
3348 chksum ^= be32_to_cpu(*up); 3340 chksum ^= be32_to_cpu(*up);
3349 up++; 3341 up++;
3350 } 3342 }
3351 if (chksum != be32_to_cpu(rhead->h_chksum)) { 3343 if (chksum != be32_to_cpu(rhead->h_chksum)) {
3352 if (rhead->h_chksum || 3344 if (rhead->h_chksum ||
3353 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { 3345 ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) {
3354 cmn_err(CE_DEBUG, 3346 cmn_err(CE_DEBUG,
3355 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", 3347 "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n",
3356 be32_to_cpu(rhead->h_chksum), chksum); 3348 be32_to_cpu(rhead->h_chksum), chksum);
3357 cmn_err(CE_DEBUG, 3349 cmn_err(CE_DEBUG,
3358 "XFS: Disregard message if filesystem was created with non-DEBUG kernel"); 3350 "XFS: Disregard message if filesystem was created with non-DEBUG kernel");
3359 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3351 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3360 cmn_err(CE_DEBUG, 3352 cmn_err(CE_DEBUG,
3361 "XFS: LogR this is a LogV2 filesystem\n"); 3353 "XFS: LogR this is a LogV2 filesystem\n");
3362 } 3354 }
3363 log->l_flags |= XLOG_CHKSUM_MISMATCH; 3355 log->l_flags |= XLOG_CHKSUM_MISMATCH;
3364 } 3356 }
3365 } 3357 }
3366 } 3358 }
3367 #else 3359 #else
3368 #define xlog_unpack_data_checksum(rhead, dp, log) 3360 #define xlog_unpack_data_checksum(rhead, dp, log)
3369 #endif 3361 #endif
3370 3362
3371 STATIC void 3363 STATIC void
3372 xlog_unpack_data( 3364 xlog_unpack_data(
3373 xlog_rec_header_t *rhead, 3365 xlog_rec_header_t *rhead,
3374 xfs_caddr_t dp, 3366 xfs_caddr_t dp,
3375 xlog_t *log) 3367 xlog_t *log)
3376 { 3368 {
3377 int i, j, k; 3369 int i, j, k;
3378 3370
3379 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && 3371 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3380 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { 3372 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3381 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; 3373 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3382 dp += BBSIZE; 3374 dp += BBSIZE;
3383 } 3375 }
3384 3376
3385 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3377 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3386 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; 3378 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3387 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { 3379 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3388 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3380 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3389 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); 3381 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3390 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; 3382 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3391 dp += BBSIZE; 3383 dp += BBSIZE;
3392 } 3384 }
3393 } 3385 }
3394 3386
3395 xlog_unpack_data_checksum(rhead, dp, log); 3387 xlog_unpack_data_checksum(rhead, dp, log);
3396 } 3388 }
3397 3389
3398 STATIC int 3390 STATIC int
3399 xlog_valid_rec_header( 3391 xlog_valid_rec_header(
3400 xlog_t *log, 3392 xlog_t *log,
3401 xlog_rec_header_t *rhead, 3393 xlog_rec_header_t *rhead,
3402 xfs_daddr_t blkno) 3394 xfs_daddr_t blkno)
3403 { 3395 {
3404 int hlen; 3396 int hlen;
3405 3397
3406 if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { 3398 if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
3407 XFS_ERROR_REPORT("xlog_valid_rec_header(1)", 3399 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3408 XFS_ERRLEVEL_LOW, log->l_mp); 3400 XFS_ERRLEVEL_LOW, log->l_mp);
3409 return XFS_ERROR(EFSCORRUPTED); 3401 return XFS_ERROR(EFSCORRUPTED);
3410 } 3402 }
3411 if (unlikely( 3403 if (unlikely(
3412 (!rhead->h_version || 3404 (!rhead->h_version ||
3413 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3405 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3414 xlog_warn("XFS: %s: unrecognised log version (%d).", 3406 xlog_warn("XFS: %s: unrecognised log version (%d).",
3415 __func__, be32_to_cpu(rhead->h_version)); 3407 __func__, be32_to_cpu(rhead->h_version));
3416 return XFS_ERROR(EIO); 3408 return XFS_ERROR(EIO);
3417 } 3409 }
3418 3410
3419 /* LR body must have data or it wouldn't have been written */ 3411 /* LR body must have data or it wouldn't have been written */
3420 hlen = be32_to_cpu(rhead->h_len); 3412 hlen = be32_to_cpu(rhead->h_len);
3421 if (unlikely( hlen <= 0 || hlen > INT_MAX )) { 3413 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3422 XFS_ERROR_REPORT("xlog_valid_rec_header(2)", 3414 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3423 XFS_ERRLEVEL_LOW, log->l_mp); 3415 XFS_ERRLEVEL_LOW, log->l_mp);
3424 return XFS_ERROR(EFSCORRUPTED); 3416 return XFS_ERROR(EFSCORRUPTED);
3425 } 3417 }
3426 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) { 3418 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3427 XFS_ERROR_REPORT("xlog_valid_rec_header(3)", 3419 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3428 XFS_ERRLEVEL_LOW, log->l_mp); 3420 XFS_ERRLEVEL_LOW, log->l_mp);
3429 return XFS_ERROR(EFSCORRUPTED); 3421 return XFS_ERROR(EFSCORRUPTED);
3430 } 3422 }
3431 return 0; 3423 return 0;
3432 } 3424 }
3433 3425
3434 /* 3426 /*
3435 * Read the log from tail to head and process the log records found. 3427 * Read the log from tail to head and process the log records found.
3436 * Handle the two cases where the tail and head are in the same cycle 3428 * Handle the two cases where the tail and head are in the same cycle
3437 * and where the active portion of the log wraps around the end of 3429 * and where the active portion of the log wraps around the end of
3438 * the physical log separately. The pass parameter is passed through 3430 * the physical log separately. The pass parameter is passed through
3439 * to the routines called to process the data and is not looked at 3431 * to the routines called to process the data and is not looked at
3440 * here. 3432 * here.
3441 */ 3433 */
3442 STATIC int 3434 STATIC int
3443 xlog_do_recovery_pass( 3435 xlog_do_recovery_pass(
3444 xlog_t *log, 3436 xlog_t *log,
3445 xfs_daddr_t head_blk, 3437 xfs_daddr_t head_blk,
3446 xfs_daddr_t tail_blk, 3438 xfs_daddr_t tail_blk,
3447 int pass) 3439 int pass)
3448 { 3440 {
3449 xlog_rec_header_t *rhead; 3441 xlog_rec_header_t *rhead;
3450 xfs_daddr_t blk_no; 3442 xfs_daddr_t blk_no;
3451 xfs_caddr_t bufaddr, offset; 3443 xfs_caddr_t bufaddr, offset;
3452 xfs_buf_t *hbp, *dbp; 3444 xfs_buf_t *hbp, *dbp;
3453 int error = 0, h_size; 3445 int error = 0, h_size;
3454 int bblks, split_bblks; 3446 int bblks, split_bblks;
3455 int hblks, split_hblks, wrapped_hblks; 3447 int hblks, split_hblks, wrapped_hblks;
3456 xlog_recover_t *rhash[XLOG_RHASH_SIZE]; 3448 xlog_recover_t *rhash[XLOG_RHASH_SIZE];
3457 3449
3458 ASSERT(head_blk != tail_blk); 3450 ASSERT(head_blk != tail_blk);
3459 3451
3460 /* 3452 /*
3461 * Read the header of the tail block and get the iclog buffer size from 3453 * Read the header of the tail block and get the iclog buffer size from
3462 * h_size. Use this to tell how many sectors make up the log header. 3454 * h_size. Use this to tell how many sectors make up the log header.
3463 */ 3455 */
3464 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { 3456 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3465 /* 3457 /*
3466 * When using variable length iclogs, read first sector of 3458 * When using variable length iclogs, read first sector of
3467 * iclog header and extract the header size from it. Get a 3459 * iclog header and extract the header size from it. Get a
3468 * new hbp that is the correct size. 3460 * new hbp that is the correct size.
3469 */ 3461 */
3470 hbp = xlog_get_bp(log, 1); 3462 hbp = xlog_get_bp(log, 1);
3471 if (!hbp) 3463 if (!hbp)
3472 return ENOMEM; 3464 return ENOMEM;
3473 if ((error = xlog_bread(log, tail_blk, 1, hbp))) 3465 if ((error = xlog_bread(log, tail_blk, 1, hbp)))
3474 goto bread_err1; 3466 goto bread_err1;
3475 offset = xlog_align(log, tail_blk, 1, hbp); 3467 offset = xlog_align(log, tail_blk, 1, hbp);
3476 rhead = (xlog_rec_header_t *)offset; 3468 rhead = (xlog_rec_header_t *)offset;
3477 error = xlog_valid_rec_header(log, rhead, tail_blk); 3469 error = xlog_valid_rec_header(log, rhead, tail_blk);
3478 if (error) 3470 if (error)
3479 goto bread_err1; 3471 goto bread_err1;
3480 h_size = be32_to_cpu(rhead->h_size); 3472 h_size = be32_to_cpu(rhead->h_size);
3481 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && 3473 if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3482 (h_size > XLOG_HEADER_CYCLE_SIZE)) { 3474 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3483 hblks = h_size / XLOG_HEADER_CYCLE_SIZE; 3475 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3484 if (h_size % XLOG_HEADER_CYCLE_SIZE) 3476 if (h_size % XLOG_HEADER_CYCLE_SIZE)
3485 hblks++; 3477 hblks++;
3486 xlog_put_bp(hbp); 3478 xlog_put_bp(hbp);
3487 hbp = xlog_get_bp(log, hblks); 3479 hbp = xlog_get_bp(log, hblks);
3488 } else { 3480 } else {
3489 hblks = 1; 3481 hblks = 1;
3490 } 3482 }
3491 } else { 3483 } else {
3492 ASSERT(log->l_sectbb_log == 0); 3484 ASSERT(log->l_sectbb_log == 0);
3493 hblks = 1; 3485 hblks = 1;
3494 hbp = xlog_get_bp(log, 1); 3486 hbp = xlog_get_bp(log, 1);
3495 h_size = XLOG_BIG_RECORD_BSIZE; 3487 h_size = XLOG_BIG_RECORD_BSIZE;
3496 } 3488 }
3497 3489
3498 if (!hbp) 3490 if (!hbp)
3499 return ENOMEM; 3491 return ENOMEM;
3500 dbp = xlog_get_bp(log, BTOBB(h_size)); 3492 dbp = xlog_get_bp(log, BTOBB(h_size));
3501 if (!dbp) { 3493 if (!dbp) {
3502 xlog_put_bp(hbp); 3494 xlog_put_bp(hbp);
3503 return ENOMEM; 3495 return ENOMEM;
3504 } 3496 }
3505 3497
3506 memset(rhash, 0, sizeof(rhash)); 3498 memset(rhash, 0, sizeof(rhash));
3507 if (tail_blk <= head_blk) { 3499 if (tail_blk <= head_blk) {
3508 for (blk_no = tail_blk; blk_no < head_blk; ) { 3500 for (blk_no = tail_blk; blk_no < head_blk; ) {
3509 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3501 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3510 goto bread_err2; 3502 goto bread_err2;
3511 offset = xlog_align(log, blk_no, hblks, hbp); 3503 offset = xlog_align(log, blk_no, hblks, hbp);
3512 rhead = (xlog_rec_header_t *)offset; 3504 rhead = (xlog_rec_header_t *)offset;
3513 error = xlog_valid_rec_header(log, rhead, blk_no); 3505 error = xlog_valid_rec_header(log, rhead, blk_no);
3514 if (error) 3506 if (error)
3515 goto bread_err2; 3507 goto bread_err2;
3516 3508
3517 /* blocks in data section */ 3509 /* blocks in data section */
3518 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3510 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3519 error = xlog_bread(log, blk_no + hblks, bblks, dbp); 3511 error = xlog_bread(log, blk_no + hblks, bblks, dbp);
3520 if (error) 3512 if (error)
3521 goto bread_err2; 3513 goto bread_err2;
3522 offset = xlog_align(log, blk_no + hblks, bblks, dbp); 3514 offset = xlog_align(log, blk_no + hblks, bblks, dbp);
3523 xlog_unpack_data(rhead, offset, log); 3515 xlog_unpack_data(rhead, offset, log);
3524 if ((error = xlog_recover_process_data(log, 3516 if ((error = xlog_recover_process_data(log,
3525 rhash, rhead, offset, pass))) 3517 rhash, rhead, offset, pass)))
3526 goto bread_err2; 3518 goto bread_err2;
3527 blk_no += bblks + hblks; 3519 blk_no += bblks + hblks;
3528 } 3520 }
3529 } else { 3521 } else {
3530 /* 3522 /*
3531 * Perform recovery around the end of the physical log. 3523 * Perform recovery around the end of the physical log.
3532 * When the head is not on the same cycle number as the tail, 3524 * When the head is not on the same cycle number as the tail,
3533 * we can't do a sequential recovery as above. 3525 * we can't do a sequential recovery as above.
3534 */ 3526 */
3535 blk_no = tail_blk; 3527 blk_no = tail_blk;
3536 while (blk_no < log->l_logBBsize) { 3528 while (blk_no < log->l_logBBsize) {
3537 /* 3529 /*
3538 * Check for header wrapping around physical end-of-log 3530 * Check for header wrapping around physical end-of-log
3539 */ 3531 */
3540 offset = NULL; 3532 offset = NULL;
3541 split_hblks = 0; 3533 split_hblks = 0;
3542 wrapped_hblks = 0; 3534 wrapped_hblks = 0;
3543 if (blk_no + hblks <= log->l_logBBsize) { 3535 if (blk_no + hblks <= log->l_logBBsize) {
3544 /* Read header in one read */ 3536 /* Read header in one read */
3545 error = xlog_bread(log, blk_no, hblks, hbp); 3537 error = xlog_bread(log, blk_no, hblks, hbp);
3546 if (error) 3538 if (error)
3547 goto bread_err2; 3539 goto bread_err2;
3548 offset = xlog_align(log, blk_no, hblks, hbp); 3540 offset = xlog_align(log, blk_no, hblks, hbp);
3549 } else { 3541 } else {
3550 /* This LR is split across physical log end */ 3542 /* This LR is split across physical log end */
3551 if (blk_no != log->l_logBBsize) { 3543 if (blk_no != log->l_logBBsize) {
3552 /* some data before physical log end */ 3544 /* some data before physical log end */
3553 ASSERT(blk_no <= INT_MAX); 3545 ASSERT(blk_no <= INT_MAX);
3554 split_hblks = log->l_logBBsize - (int)blk_no; 3546 split_hblks = log->l_logBBsize - (int)blk_no;
3555 ASSERT(split_hblks > 0); 3547 ASSERT(split_hblks > 0);
3556 if ((error = xlog_bread(log, blk_no, 3548 if ((error = xlog_bread(log, blk_no,
3557 split_hblks, hbp))) 3549 split_hblks, hbp)))
3558 goto bread_err2; 3550 goto bread_err2;
3559 offset = xlog_align(log, blk_no, 3551 offset = xlog_align(log, blk_no,
3560 split_hblks, hbp); 3552 split_hblks, hbp);
3561 } 3553 }
3562 /* 3554 /*
3563 * Note: this black magic still works with 3555 * Note: this black magic still works with
3564 * large sector sizes (non-512) only because: 3556 * large sector sizes (non-512) only because:
3565 * - we increased the buffer size originally 3557 * - we increased the buffer size originally
3566 * by 1 sector giving us enough extra space 3558 * by 1 sector giving us enough extra space
3567 * for the second read; 3559 * for the second read;
3568 * - the log start is guaranteed to be sector 3560 * - the log start is guaranteed to be sector
3569 * aligned; 3561 * aligned;
3570 * - we read the log end (LR header start) 3562 * - we read the log end (LR header start)
3571 * _first_, then the log start (LR header end) 3563 * _first_, then the log start (LR header end)
3572 * - order is important. 3564 * - order is important.
3573 */ 3565 */
3574 wrapped_hblks = hblks - split_hblks; 3566 wrapped_hblks = hblks - split_hblks;
3575 bufaddr = XFS_BUF_PTR(hbp); 3567 bufaddr = XFS_BUF_PTR(hbp);
3576 error = XFS_BUF_SET_PTR(hbp, 3568 error = XFS_BUF_SET_PTR(hbp,
3577 bufaddr + BBTOB(split_hblks), 3569 bufaddr + BBTOB(split_hblks),
3578 BBTOB(hblks - split_hblks)); 3570 BBTOB(hblks - split_hblks));
3579 if (!error) 3571 if (!error)
3580 error = xlog_bread(log, 0, 3572 error = xlog_bread(log, 0,
3581 wrapped_hblks, hbp); 3573 wrapped_hblks, hbp);
3582 if (!error) 3574 if (!error)
3583 error = XFS_BUF_SET_PTR(hbp, bufaddr, 3575 error = XFS_BUF_SET_PTR(hbp, bufaddr,
3584 BBTOB(hblks)); 3576 BBTOB(hblks));
3585 if (error) 3577 if (error)
3586 goto bread_err2; 3578 goto bread_err2;
3587 if (!offset) 3579 if (!offset)
3588 offset = xlog_align(log, 0, 3580 offset = xlog_align(log, 0,
3589 wrapped_hblks, hbp); 3581 wrapped_hblks, hbp);
3590 } 3582 }
3591 rhead = (xlog_rec_header_t *)offset; 3583 rhead = (xlog_rec_header_t *)offset;
3592 error = xlog_valid_rec_header(log, rhead, 3584 error = xlog_valid_rec_header(log, rhead,
3593 split_hblks ? blk_no : 0); 3585 split_hblks ? blk_no : 0);
3594 if (error) 3586 if (error)
3595 goto bread_err2; 3587 goto bread_err2;
3596 3588
3597 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3589 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3598 blk_no += hblks; 3590 blk_no += hblks;
3599 3591
3600 /* Read in data for log record */ 3592 /* Read in data for log record */
3601 if (blk_no + bblks <= log->l_logBBsize) { 3593 if (blk_no + bblks <= log->l_logBBsize) {
3602 error = xlog_bread(log, blk_no, bblks, dbp); 3594 error = xlog_bread(log, blk_no, bblks, dbp);
3603 if (error) 3595 if (error)
3604 goto bread_err2; 3596 goto bread_err2;
3605 offset = xlog_align(log, blk_no, bblks, dbp); 3597 offset = xlog_align(log, blk_no, bblks, dbp);
3606 } else { 3598 } else {
3607 /* This log record is split across the 3599 /* This log record is split across the
3608 * physical end of log */ 3600 * physical end of log */
3609 offset = NULL; 3601 offset = NULL;
3610 split_bblks = 0; 3602 split_bblks = 0;
3611 if (blk_no != log->l_logBBsize) { 3603 if (blk_no != log->l_logBBsize) {
3612 /* some data is before the physical 3604 /* some data is before the physical
3613 * end of log */ 3605 * end of log */
3614 ASSERT(!wrapped_hblks); 3606 ASSERT(!wrapped_hblks);
3615 ASSERT(blk_no <= INT_MAX); 3607 ASSERT(blk_no <= INT_MAX);
3616 split_bblks = 3608 split_bblks =
3617 log->l_logBBsize - (int)blk_no; 3609 log->l_logBBsize - (int)blk_no;
3618 ASSERT(split_bblks > 0); 3610 ASSERT(split_bblks > 0);
3619 if ((error = xlog_bread(log, blk_no, 3611 if ((error = xlog_bread(log, blk_no,
3620 split_bblks, dbp))) 3612 split_bblks, dbp)))
3621 goto bread_err2; 3613 goto bread_err2;
3622 offset = xlog_align(log, blk_no, 3614 offset = xlog_align(log, blk_no,
3623 split_bblks, dbp); 3615 split_bblks, dbp);
3624 } 3616 }
3625 /* 3617 /*
3626 * Note: this black magic still works with 3618 * Note: this black magic still works with
3627 * large sector sizes (non-512) only because: 3619 * large sector sizes (non-512) only because:
3628 * - we increased the buffer size originally 3620 * - we increased the buffer size originally
3629 * by 1 sector giving us enough extra space 3621 * by 1 sector giving us enough extra space
3630 * for the second read; 3622 * for the second read;
3631 * - the log start is guaranteed to be sector 3623 * - the log start is guaranteed to be sector
3632 * aligned; 3624 * aligned;
3633 * - we read the log end (LR header start) 3625 * - we read the log end (LR header start)
3634 * _first_, then the log start (LR header end) 3626 * _first_, then the log start (LR header end)
3635 * - order is important. 3627 * - order is important.
3636 */ 3628 */
3637 bufaddr = XFS_BUF_PTR(dbp); 3629 bufaddr = XFS_BUF_PTR(dbp);
3638 error = XFS_BUF_SET_PTR(dbp, 3630 error = XFS_BUF_SET_PTR(dbp,
3639 bufaddr + BBTOB(split_bblks), 3631 bufaddr + BBTOB(split_bblks),
3640 BBTOB(bblks - split_bblks)); 3632 BBTOB(bblks - split_bblks));
3641 if (!error) 3633 if (!error)
3642 error = xlog_bread(log, wrapped_hblks, 3634 error = xlog_bread(log, wrapped_hblks,
3643 bblks - split_bblks, 3635 bblks - split_bblks,
3644 dbp); 3636 dbp);
3645 if (!error) 3637 if (!error)
3646 error = XFS_BUF_SET_PTR(dbp, bufaddr, 3638 error = XFS_BUF_SET_PTR(dbp, bufaddr,
3647 h_size); 3639 h_size);
3648 if (error) 3640 if (error)
3649 goto bread_err2; 3641 goto bread_err2;
3650 if (!offset) 3642 if (!offset)
3651 offset = xlog_align(log, wrapped_hblks, 3643 offset = xlog_align(log, wrapped_hblks,
3652 bblks - split_bblks, dbp); 3644 bblks - split_bblks, dbp);
3653 } 3645 }
3654 xlog_unpack_data(rhead, offset, log); 3646 xlog_unpack_data(rhead, offset, log);
3655 if ((error = xlog_recover_process_data(log, rhash, 3647 if ((error = xlog_recover_process_data(log, rhash,
3656 rhead, offset, pass))) 3648 rhead, offset, pass)))
3657 goto bread_err2; 3649 goto bread_err2;
3658 blk_no += bblks; 3650 blk_no += bblks;
3659 } 3651 }
3660 3652
3661 ASSERT(blk_no >= log->l_logBBsize); 3653 ASSERT(blk_no >= log->l_logBBsize);
3662 blk_no -= log->l_logBBsize; 3654 blk_no -= log->l_logBBsize;
3663 3655
3664 /* read first part of physical log */ 3656 /* read first part of physical log */
3665 while (blk_no < head_blk) { 3657 while (blk_no < head_blk) {
3666 if ((error = xlog_bread(log, blk_no, hblks, hbp))) 3658 if ((error = xlog_bread(log, blk_no, hblks, hbp)))
3667 goto bread_err2; 3659 goto bread_err2;
3668 offset = xlog_align(log, blk_no, hblks, hbp); 3660 offset = xlog_align(log, blk_no, hblks, hbp);
3669 rhead = (xlog_rec_header_t *)offset; 3661 rhead = (xlog_rec_header_t *)offset;
3670 error = xlog_valid_rec_header(log, rhead, blk_no); 3662 error = xlog_valid_rec_header(log, rhead, blk_no);
3671 if (error) 3663 if (error)
3672 goto bread_err2; 3664 goto bread_err2;
3673 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); 3665 bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3674 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp))) 3666 if ((error = xlog_bread(log, blk_no+hblks, bblks, dbp)))
3675 goto bread_err2; 3667 goto bread_err2;
3676 offset = xlog_align(log, blk_no+hblks, bblks, dbp); 3668 offset = xlog_align(log, blk_no+hblks, bblks, dbp);
3677 xlog_unpack_data(rhead, offset, log); 3669 xlog_unpack_data(rhead, offset, log);
3678 if ((error = xlog_recover_process_data(log, rhash, 3670 if ((error = xlog_recover_process_data(log, rhash,
3679 rhead, offset, pass))) 3671 rhead, offset, pass)))
3680 goto bread_err2; 3672 goto bread_err2;
3681 blk_no += bblks + hblks; 3673 blk_no += bblks + hblks;
3682 } 3674 }
3683 } 3675 }
3684 3676
3685 bread_err2: 3677 bread_err2:
3686 xlog_put_bp(dbp); 3678 xlog_put_bp(dbp);
3687 bread_err1: 3679 bread_err1:
3688 xlog_put_bp(hbp); 3680 xlog_put_bp(hbp);
3689 return error; 3681 return error;
3690 } 3682 }
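The "black magic" the comments in xlog_do_recovery_pass() warn about is just a split read: when a header or body of N blocks starts at blk_no but the physical log ends before blk_no + N, the first log_size - blk_no blocks come from the end of the log and the remaining blocks come from block 0, placed into the tail of the same buffer. A rough user-space sketch of the split, operating on an in-memory log image instead of xlog_bread():

#include <string.h>
#include <stddef.h>

#define BBSIZE 512

/*
 * Read 'nblocks' log blocks starting at 'blk_no' from an in-memory
 * log image that is 'log_size' blocks long, wrapping at the physical
 * end of the log the same way xlog_do_recovery_pass() does: the piece
 * before the end is read first, then the piece from block 0 is read
 * into the tail of the same buffer.
 */
static void read_wrapped(unsigned char *dst, const unsigned char *log_img,
                         size_t log_size, size_t blk_no, size_t nblocks)
{
    if (blk_no + nblocks <= log_size) {            /* no wrap needed */
        memcpy(dst, log_img + blk_no * BBSIZE, nblocks * BBSIZE);
        return;
    }

    /* the record straddles the physical end of the log */
    size_t split = log_size - blk_no;              /* blocks before the end */
    size_t wrapped = nblocks - split;              /* blocks from the start */

    memcpy(dst, log_img + blk_no * BBSIZE, split * BBSIZE);
    memcpy(dst + split * BBSIZE, log_img, wrapped * BBSIZE);
}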
3691 3683
3692 /* 3684 /*
3693 * Do the recovery of the log. We actually do this in two phases. 3685 * Do the recovery of the log. We actually do this in two phases.
3694 * The two passes are necessary in order to implement the function 3686 * The two passes are necessary in order to implement the function
3695 * of cancelling a record written into the log. The first pass 3687 * of cancelling a record written into the log. The first pass
3696 * determines those things which have been cancelled, and the 3688 * determines those things which have been cancelled, and the
3697 * second pass replays log items normally except for those which 3689 * second pass replays log items normally except for those which
3698 * have been cancelled. The handling of the replay and cancellations 3690 * have been cancelled. The handling of the replay and cancellations
3699 * takes place in the log item type specific routines. 3691 * takes place in the log item type specific routines.
3700 * 3692 *
3701 * The table of items which have cancel records in the log is allocated 3693 * The table of items which have cancel records in the log is allocated
3702 * and freed at this level, since only here do we know when all of 3694 * and freed at this level, since only here do we know when all of
3703 * the log recovery has been completed. 3695 * the log recovery has been completed.
3704 */ 3696 */
3705 STATIC int 3697 STATIC int
3706 xlog_do_log_recovery( 3698 xlog_do_log_recovery(
3707 xlog_t *log, 3699 xlog_t *log,
3708 xfs_daddr_t head_blk, 3700 xfs_daddr_t head_blk,
3709 xfs_daddr_t tail_blk) 3701 xfs_daddr_t tail_blk)
3710 { 3702 {
3711 int error; 3703 int error;
3712 3704
3713 ASSERT(head_blk != tail_blk); 3705 ASSERT(head_blk != tail_blk);
3714 3706
3715 /* 3707 /*
3716 * First do a pass to find all of the cancelled buf log items. 3708 * First do a pass to find all of the cancelled buf log items.
3717 * Store them in the buf_cancel_table for use in the second pass. 3709 * Store them in the buf_cancel_table for use in the second pass.
3718 */ 3710 */
3719 log->l_buf_cancel_table = 3711 log->l_buf_cancel_table =
3720 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * 3712 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3721 sizeof(xfs_buf_cancel_t*), 3713 sizeof(xfs_buf_cancel_t*),
3722 KM_SLEEP); 3714 KM_SLEEP);
3723 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3715 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3724 XLOG_RECOVER_PASS1); 3716 XLOG_RECOVER_PASS1);
3725 if (error != 0) { 3717 if (error != 0) {
3726 kmem_free(log->l_buf_cancel_table); 3718 kmem_free(log->l_buf_cancel_table);
3727 log->l_buf_cancel_table = NULL; 3719 log->l_buf_cancel_table = NULL;
3728 return error; 3720 return error;
3729 } 3721 }
3730 /* 3722 /*
3731 * Then do a second pass to actually recover the items in the log. 3723 * Then do a second pass to actually recover the items in the log.
3732 * When it is complete free the table of buf cancel items. 3724 * When it is complete free the table of buf cancel items.
3733 */ 3725 */
3734 error = xlog_do_recovery_pass(log, head_blk, tail_blk, 3726 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3735 XLOG_RECOVER_PASS2); 3727 XLOG_RECOVER_PASS2);
3736 #ifdef DEBUG 3728 #ifdef DEBUG
3737 if (!error) { 3729 if (!error) {
3738 int i; 3730 int i;
3739 3731
3740 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) 3732 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3741 ASSERT(log->l_buf_cancel_table[i] == NULL); 3733 ASSERT(log->l_buf_cancel_table[i] == NULL);
3742 } 3734 }
3743 #endif /* DEBUG */ 3735 #endif /* DEBUG */
3744 3736
3745 kmem_free(log->l_buf_cancel_table); 3737 kmem_free(log->l_buf_cancel_table);
3746 log->l_buf_cancel_table = NULL; 3738 log->l_buf_cancel_table = NULL;
3747 3739
3748 return error; 3740 return error;
3749 } 3741 }
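The comment above xlog_do_log_recovery() describes the two-pass scheme; the essential idea is that pass one only records which buffers have cancel records, and pass two replays every item except those. A deliberately simplified sketch of that idea, with a flat array standing in for the real l_buf_cancel_table hash table:

#include <stdbool.h>
#include <stddef.h>

/* A log item as seen by this toy replay loop. */
struct item {
    unsigned long long blkno;      /* buffer the item modifies */
    bool               is_cancel;  /* cancellation record for that buffer */
};

#define MAX_CANCELLED 64

/* Pass 1: remember which buffers have cancel records. */
static size_t collect_cancelled(const struct item *items, size_t n,
                                unsigned long long *cancelled)
{
    size_t i, ncancel = 0;

    for (i = 0; i < n && ncancel < MAX_CANCELLED; i++)
        if (items[i].is_cancel)
            cancelled[ncancel++] = items[i].blkno;
    return ncancel;
}

static bool is_cancelled(unsigned long long blkno,
                         const unsigned long long *cancelled, size_t ncancel)
{
    size_t i;

    for (i = 0; i < ncancel; i++)
        if (cancelled[i] == blkno)
            return true;
    return false;
}

/* Pass 2: replay every item whose target buffer was not cancelled. */
static size_t replay_items(const struct item *items, size_t n,
                           const unsigned long long *cancelled, size_t ncancel)
{
    size_t i, replayed = 0;

    for (i = 0; i < n; i++)
        if (!items[i].is_cancel &&
            !is_cancelled(items[i].blkno, cancelled, ncancel))
            replayed++;                /* apply the update here */
    return replayed;
}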
3750 3742
3751 /* 3743 /*
3752 * Do the actual recovery 3744 * Do the actual recovery
3753 */ 3745 */
3754 STATIC int 3746 STATIC int
3755 xlog_do_recover( 3747 xlog_do_recover(
3756 xlog_t *log, 3748 xlog_t *log,
3757 xfs_daddr_t head_blk, 3749 xfs_daddr_t head_blk,
3758 xfs_daddr_t tail_blk) 3750 xfs_daddr_t tail_blk)
3759 { 3751 {
3760 int error; 3752 int error;
3761 xfs_buf_t *bp; 3753 xfs_buf_t *bp;
3762 xfs_sb_t *sbp; 3754 xfs_sb_t *sbp;
3763 3755
3764 /* 3756 /*
3765 * First replay the images in the log. 3757 * First replay the images in the log.
3766 */ 3758 */
3767 error = xlog_do_log_recovery(log, head_blk, tail_blk); 3759 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3768 if (error) { 3760 if (error) {
3769 return error; 3761 return error;
3770 } 3762 }
3771 3763
3772 XFS_bflush(log->l_mp->m_ddev_targp); 3764 XFS_bflush(log->l_mp->m_ddev_targp);
3773 3765
3774 /* 3766 /*
3775 * If IO errors happened during recovery, bail out. 3767 * If IO errors happened during recovery, bail out.
3776 */ 3768 */
3777 if (XFS_FORCED_SHUTDOWN(log->l_mp)) { 3769 if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3778 return (EIO); 3770 return (EIO);
3779 } 3771 }
3780 3772
3781 /* 3773 /*
3782 * We now update the tail_lsn since much of the recovery has completed 3774 * We now update the tail_lsn since much of the recovery has completed
3783 * and there may be space available to use. If there were no extent 3775 * and there may be space available to use. If there were no extent
3784 * or iunlinks, we can free up the entire log and set the tail_lsn to 3776 * or iunlinks, we can free up the entire log and set the tail_lsn to
3785 * be the last_sync_lsn. This was set in xlog_find_tail to be the 3777 * be the last_sync_lsn. This was set in xlog_find_tail to be the
3786 * lsn of the last known good LR on disk. If there are extent frees 3778 * lsn of the last known good LR on disk. If there are extent frees
3787 * or iunlinks they will have some entries in the AIL; so we look at 3779 * or iunlinks they will have some entries in the AIL; so we look at
3788 * the AIL to determine how to set the tail_lsn. 3780 * the AIL to determine how to set the tail_lsn.
3789 */ 3781 */
3790 xlog_assign_tail_lsn(log->l_mp); 3782 xlog_assign_tail_lsn(log->l_mp);
3791 3783
3792 /* 3784 /*
3793 * Now that we've finished replaying all buffer and inode 3785 * Now that we've finished replaying all buffer and inode
3794 * updates, re-read in the superblock. 3786 * updates, re-read in the superblock.
3795 */ 3787 */
3796 bp = xfs_getsb(log->l_mp, 0); 3788 bp = xfs_getsb(log->l_mp, 0);
3797 XFS_BUF_UNDONE(bp); 3789 XFS_BUF_UNDONE(bp);
3798 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3790 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3799 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); 3791 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3800 XFS_BUF_READ(bp); 3792 XFS_BUF_READ(bp);
3801 XFS_BUF_UNASYNC(bp); 3793 XFS_BUF_UNASYNC(bp);
3802 xfsbdstrat(log->l_mp, bp); 3794 xfsbdstrat(log->l_mp, bp);
3803 error = xfs_iowait(bp); 3795 error = xfs_iowait(bp);
3804 if (error) { 3796 if (error) {
3805 xfs_ioerror_alert("xlog_do_recover", 3797 xfs_ioerror_alert("xlog_do_recover",
3806 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3798 log->l_mp, bp, XFS_BUF_ADDR(bp));
3807 ASSERT(0); 3799 ASSERT(0);
3808 xfs_buf_relse(bp); 3800 xfs_buf_relse(bp);
3809 return error; 3801 return error;
3810 } 3802 }
3811 3803
3812 /* Convert superblock from on-disk format */ 3804 /* Convert superblock from on-disk format */
3813 sbp = &log->l_mp->m_sb; 3805 sbp = &log->l_mp->m_sb;
3814 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); 3806 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
3815 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); 3807 ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3816 ASSERT(xfs_sb_good_version(sbp)); 3808 ASSERT(xfs_sb_good_version(sbp));
3817 xfs_buf_relse(bp); 3809 xfs_buf_relse(bp);
3818 3810
3819 /* We've re-read the superblock so re-initialize per-cpu counters */ 3811 /* We've re-read the superblock so re-initialize per-cpu counters */
3820 xfs_icsb_reinit_counters(log->l_mp); 3812 xfs_icsb_reinit_counters(log->l_mp);
3821 3813
3822 xlog_recover_check_summary(log); 3814 xlog_recover_check_summary(log);
3823 3815
3824 /* Normal transactions can now occur */ 3816 /* Normal transactions can now occur */
3825 log->l_flags &= ~XLOG_ACTIVE_RECOVERY; 3817 log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3826 return 0; 3818 return 0;
3827 } 3819 }
3828 3820
3829 /* 3821 /*
3830 * Perform recovery and re-initialize some log variables in xlog_find_tail. 3822 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3831 * 3823 *
3832 * Return error or zero. 3824 * Return error or zero.
3833 */ 3825 */
3834 int 3826 int
3835 xlog_recover( 3827 xlog_recover(
3836 xlog_t *log) 3828 xlog_t *log)
3837 { 3829 {
3838 xfs_daddr_t head_blk, tail_blk; 3830 xfs_daddr_t head_blk, tail_blk;
3839 int error; 3831 int error;
3840 3832
3841 /* find the tail of the log */ 3833 /* find the tail of the log */
3842 if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) 3834 if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3843 return error; 3835 return error;
3844 3836
3845 if (tail_blk != head_blk) { 3837 if (tail_blk != head_blk) {
3846 /* There used to be a comment here: 3838 /* There used to be a comment here:
3847 * 3839 *
3848 * disallow recovery on read-only mounts. note -- mount 3840 * disallow recovery on read-only mounts. note -- mount
3849 * checks for ENOSPC and turns it into an intelligent 3841 * checks for ENOSPC and turns it into an intelligent
3850 * error message. 3842 * error message.
3851 * ...but this is no longer true. Now, unless you specify 3843 * ...but this is no longer true. Now, unless you specify
3852 * NORECOVERY (in which case this function would never be 3844 * NORECOVERY (in which case this function would never be
3853 * called), we just go ahead and recover. We do this all 3845 * called), we just go ahead and recover. We do this all
3854 * under the vfs layer, so we can get away with it unless 3846 * under the vfs layer, so we can get away with it unless
3855 * the device itself is read-only, in which case we fail. 3847 * the device itself is read-only, in which case we fail.
3856 */ 3848 */
3857 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) { 3849 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3858 return error; 3850 return error;
3859 } 3851 }
3860 3852
3861 cmn_err(CE_NOTE, 3853 cmn_err(CE_NOTE,
3862 "Starting XFS recovery on filesystem: %s (logdev: %s)", 3854 "Starting XFS recovery on filesystem: %s (logdev: %s)",
3863 log->l_mp->m_fsname, log->l_mp->m_logname ? 3855 log->l_mp->m_fsname, log->l_mp->m_logname ?
3864 log->l_mp->m_logname : "internal"); 3856 log->l_mp->m_logname : "internal");
3865 3857
3866 error = xlog_do_recover(log, head_blk, tail_blk); 3858 error = xlog_do_recover(log, head_blk, tail_blk);
3867 log->l_flags |= XLOG_RECOVERY_NEEDED; 3859 log->l_flags |= XLOG_RECOVERY_NEEDED;
3868 } 3860 }
3869 return error; 3861 return error;
3870 } 3862 }
3871 3863
3872 /* 3864 /*
3873 * In the first part of recovery we replay inodes and buffers and build 3865 * In the first part of recovery we replay inodes and buffers and build
3874 * up the list of extent free items which need to be processed. Here 3866 * up the list of extent free items which need to be processed. Here
3875 * we process the extent free items and clean up the on disk unlinked 3867 * we process the extent free items and clean up the on disk unlinked
3876 * inode lists. This is separated from the first part of recovery so 3868 * inode lists. This is separated from the first part of recovery so
3877 * that the root and real-time bitmap inodes can be read in from disk in 3869 * that the root and real-time bitmap inodes can be read in from disk in
3878 * between the two stages. This is necessary so that we can free space 3870 * between the two stages. This is necessary so that we can free space
3879 * in the real-time portion of the file system. 3871 * in the real-time portion of the file system.
3880 */ 3872 */
3881 int 3873 int
3882 xlog_recover_finish( 3874 xlog_recover_finish(
3883 xlog_t *log) 3875 xlog_t *log)
3884 { 3876 {
3885 /* 3877 /*
3886 * Now we're ready to do the transactions needed for the 3878 * Now we're ready to do the transactions needed for the
3887 * rest of recovery. Start with completing all the extent 3879 * rest of recovery. Start with completing all the extent
3888 * free intent records and then process the unlinked inode 3880 * free intent records and then process the unlinked inode
3889 * lists. At this point, we essentially run in normal mode 3881 * lists. At this point, we essentially run in normal mode
3890 * except that we're still performing recovery actions 3882 * except that we're still performing recovery actions
3891 * rather than accepting new requests. 3883 * rather than accepting new requests.
3892 */ 3884 */
3893 if (log->l_flags & XLOG_RECOVERY_NEEDED) { 3885 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3894 int error; 3886 int error;
3895 error = xlog_recover_process_efis(log); 3887 error = xlog_recover_process_efis(log);
3896 if (error) { 3888 if (error) {
3897 cmn_err(CE_ALERT, 3889 cmn_err(CE_ALERT,
3898 "Failed to recover EFIs on filesystem: %s", 3890 "Failed to recover EFIs on filesystem: %s",
3899 log->l_mp->m_fsname); 3891 log->l_mp->m_fsname);
3900 return error; 3892 return error;
3901 } 3893 }
3902 /* 3894 /*
3903 * Sync the log to get all the EFIs out of the AIL. 3895 * Sync the log to get all the EFIs out of the AIL.
3904 * This isn't absolutely necessary, but it helps in 3896 * This isn't absolutely necessary, but it helps in
3905 * case the unlink transactions would have problems 3897 * case the unlink transactions would have problems
3906 * pushing the EFIs out of the way. 3898 * pushing the EFIs out of the way.
3907 */ 3899 */
3908 xfs_log_force(log->l_mp, (xfs_lsn_t)0, 3900 xfs_log_force(log->l_mp, (xfs_lsn_t)0,
3909 (XFS_LOG_FORCE | XFS_LOG_SYNC)); 3901 (XFS_LOG_FORCE | XFS_LOG_SYNC));
3910 3902
3911 xlog_recover_process_iunlinks(log); 3903 xlog_recover_process_iunlinks(log);
3912 3904
3913 xlog_recover_check_summary(log); 3905 xlog_recover_check_summary(log);
3914 3906
3915 cmn_err(CE_NOTE, 3907 cmn_err(CE_NOTE,
3916 "Ending XFS recovery on filesystem: %s (logdev: %s)", 3908 "Ending XFS recovery on filesystem: %s (logdev: %s)",
3917 log->l_mp->m_fsname, log->l_mp->m_logname ? 3909 log->l_mp->m_fsname, log->l_mp->m_logname ?
3918 log->l_mp->m_logname : "internal"); 3910 log->l_mp->m_logname : "internal");
3919 log->l_flags &= ~XLOG_RECOVERY_NEEDED; 3911 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3920 } else { 3912 } else {
3921 cmn_err(CE_DEBUG, 3913 cmn_err(CE_DEBUG,
3922 "!Ending clean XFS mount for filesystem: %s\n", 3914 "!Ending clean XFS mount for filesystem: %s\n",
3923 log->l_mp->m_fsname); 3915 log->l_mp->m_fsname);
3924 } 3916 }
3925 return 0; 3917 return 0;
3926 } 3918 }
3927 3919
3928 3920
3929 #if defined(DEBUG) 3921 #if defined(DEBUG)
3930 /* 3922 /*
3931 * Read all of the agf and agi counters and check that they 3923 * Read all of the agf and agi counters and check that they
3932 * are consistent with the superblock counters. 3924 * are consistent with the superblock counters.
3933 */ 3925 */
3934 void 3926 void
3935 xlog_recover_check_summary( 3927 xlog_recover_check_summary(
3936 xlog_t *log) 3928 xlog_t *log)
3937 { 3929 {
3938 xfs_mount_t *mp; 3930 xfs_mount_t *mp;
3939 xfs_agf_t *agfp; 3931 xfs_agf_t *agfp;
3940 xfs_buf_t *agfbp; 3932 xfs_buf_t *agfbp;
3941 xfs_buf_t *agibp; 3933 xfs_buf_t *agibp;
3942 xfs_buf_t *sbbp; 3934 xfs_buf_t *sbbp;
3943 #ifdef XFS_LOUD_RECOVERY 3935 #ifdef XFS_LOUD_RECOVERY
3944 xfs_sb_t *sbp; 3936 xfs_sb_t *sbp;
3945 #endif 3937 #endif
3946 xfs_agnumber_t agno; 3938 xfs_agnumber_t agno;
3947 __uint64_t freeblks; 3939 __uint64_t freeblks;
3948 __uint64_t itotal; 3940 __uint64_t itotal;
3949 __uint64_t ifree; 3941 __uint64_t ifree;
3950 int error; 3942 int error;
3951 3943
3952 mp = log->l_mp; 3944 mp = log->l_mp;
3953 3945
3954 freeblks = 0LL; 3946 freeblks = 0LL;
3955 itotal = 0LL; 3947 itotal = 0LL;
3956 ifree = 0LL; 3948 ifree = 0LL;
3957 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { 3949 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3958 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); 3950 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3959 if (error) { 3951 if (error) {
3960 xfs_fs_cmn_err(CE_ALERT, mp, 3952 xfs_fs_cmn_err(CE_ALERT, mp,
3961 "xlog_recover_check_summary(agf)" 3953 "xlog_recover_check_summary(agf)"
3962 "agf read failed agno %d error %d", 3954 "agf read failed agno %d error %d",
3963 agno, error); 3955 agno, error);
3964 } else { 3956 } else {
3965 agfp = XFS_BUF_TO_AGF(agfbp); 3957 agfp = XFS_BUF_TO_AGF(agfbp);
3966 freeblks += be32_to_cpu(agfp->agf_freeblks) + 3958 freeblks += be32_to_cpu(agfp->agf_freeblks) +
3967 be32_to_cpu(agfp->agf_flcount); 3959 be32_to_cpu(agfp->agf_flcount);
3968 xfs_buf_relse(agfbp); 3960 xfs_buf_relse(agfbp);
3969 } 3961 }
3970 3962
3971 error = xfs_read_agi(mp, NULL, agno, &agibp); 3963 error = xfs_read_agi(mp, NULL, agno, &agibp);
3972 if (!error) { 3964 if (!error) {
3973 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); 3965 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
3974 3966
3975 itotal += be32_to_cpu(agi->agi_count); 3967 itotal += be32_to_cpu(agi->agi_count);
3976 ifree += be32_to_cpu(agi->agi_freecount); 3968 ifree += be32_to_cpu(agi->agi_freecount);
3977 xfs_buf_relse(agibp); 3969 xfs_buf_relse(agibp);
3978 } 3970 }
3979 } 3971 }
3980 3972
3981 sbbp = xfs_getsb(mp, 0); 3973 sbbp = xfs_getsb(mp, 0);
3982 #ifdef XFS_LOUD_RECOVERY 3974 #ifdef XFS_LOUD_RECOVERY
3983 sbp = &mp->m_sb; 3975 sbp = &mp->m_sb;
3984 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); 3976 xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp));
3985 cmn_err(CE_NOTE, 3977 cmn_err(CE_NOTE,
3986 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", 3978 "xlog_recover_check_summary: sb_icount %Lu itotal %Lu",
3987 sbp->sb_icount, itotal); 3979 sbp->sb_icount, itotal);
3988 cmn_err(CE_NOTE, 3980 cmn_err(CE_NOTE,
3989 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", 3981 "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu",
3990 sbp->sb_ifree, ifree); 3982 sbp->sb_ifree, ifree);
3991 cmn_err(CE_NOTE, 3983 cmn_err(CE_NOTE,
3992 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", 3984 "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu",
3993 sbp->sb_fdblocks, freeblks); 3985 sbp->sb_fdblocks, freeblks);
3994 #if 0 3986 #if 0
3995 /* 3987 /*
3996 * This is turned off until I account for the allocation 3988 * This is turned off until I account for the allocation
3997 * btree blocks which live in free space. 3989 * btree blocks which live in free space.
3998 */ 3990 */
3999 ASSERT(sbp->sb_icount == itotal); 3991 ASSERT(sbp->sb_icount == itotal);
4000 ASSERT(sbp->sb_ifree == ifree); 3992 ASSERT(sbp->sb_ifree == ifree);
4001 ASSERT(sbp->sb_fdblocks == freeblks); 3993 ASSERT(sbp->sb_fdblocks == freeblks);
4002 #endif 3994 #endif
4003 #endif 3995 #endif
4004 xfs_buf_relse(sbbp); 3996 xfs_buf_relse(sbbp);
4005 } 3997 }
4006 #endif /* DEBUG */ 3998 #endif /* DEBUG */
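xlog_recover_check_summary() is a straight aggregation: free blocks are the sum of agf_freeblks plus agf_flcount over all allocation groups, and the inode totals come from agi_count and agi_freecount; the sums are then compared against the superblock counters. The same bookkeeping as a stand-alone sketch over an in-memory array of per-AG counters:

#include <stdint.h>
#include <stddef.h>

/* Per-AG counters as read from the AGF and AGI headers. */
struct ag_counts {
    uint32_t freeblks;   /* agf_freeblks */
    uint32_t flcount;    /* agf_flcount (free-list blocks) */
    uint32_t icount;     /* agi_count */
    uint32_t ifree;      /* agi_freecount */
};

struct fs_summary {
    uint64_t freeblks;
    uint64_t itotal;
    uint64_t ifree;
};

static struct fs_summary sum_ag_counters(const struct ag_counts *ag,
                                         size_t agcount)
{
    struct fs_summary s = { 0, 0, 0 };
    size_t agno;

    for (agno = 0; agno < agcount; agno++) {
        s.freeblks += (uint64_t)ag[agno].freeblks + ag[agno].flcount;
        s.itotal   += ag[agno].icount;
        s.ifree    += ag[agno].ifree;
    }
    return s;    /* compare against sb_fdblocks, sb_icount, sb_ifree */
}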
4007 3999
fs/xfs/xfs_rw.c
1 /* 1 /*
2 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3 * All Rights Reserved. 3 * All Rights Reserved.
4 * 4 *
5 * This program is free software; you can redistribute it and/or 5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as 6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation. 7 * published by the Free Software Foundation.
8 * 8 *
9 * This program is distributed in the hope that it would be useful, 9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation, 15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */ 17 */
18 #include "xfs.h" 18 #include "xfs.h"
19 #include "xfs_fs.h" 19 #include "xfs_fs.h"
20 #include "xfs_types.h" 20 #include "xfs_types.h"
21 #include "xfs_bit.h" 21 #include "xfs_bit.h"
22 #include "xfs_log.h" 22 #include "xfs_log.h"
23 #include "xfs_inum.h" 23 #include "xfs_inum.h"
24 #include "xfs_trans.h" 24 #include "xfs_trans.h"
25 #include "xfs_sb.h" 25 #include "xfs_sb.h"
26 #include "xfs_ag.h" 26 #include "xfs_ag.h"
27 #include "xfs_dir2.h" 27 #include "xfs_dir2.h"
28 #include "xfs_dmapi.h" 28 #include "xfs_dmapi.h"
29 #include "xfs_mount.h" 29 #include "xfs_mount.h"
30 #include "xfs_bmap_btree.h" 30 #include "xfs_bmap_btree.h"
31 #include "xfs_alloc_btree.h" 31 #include "xfs_alloc_btree.h"
32 #include "xfs_ialloc_btree.h" 32 #include "xfs_ialloc_btree.h"
33 #include "xfs_dir2_sf.h" 33 #include "xfs_dir2_sf.h"
34 #include "xfs_attr_sf.h" 34 #include "xfs_attr_sf.h"
35 #include "xfs_dinode.h" 35 #include "xfs_dinode.h"
36 #include "xfs_inode.h" 36 #include "xfs_inode.h"
37 #include "xfs_inode_item.h" 37 #include "xfs_inode_item.h"
38 #include "xfs_itable.h" 38 #include "xfs_itable.h"
39 #include "xfs_btree.h" 39 #include "xfs_btree.h"
40 #include "xfs_alloc.h" 40 #include "xfs_alloc.h"
41 #include "xfs_ialloc.h" 41 #include "xfs_ialloc.h"
42 #include "xfs_attr.h" 42 #include "xfs_attr.h"
43 #include "xfs_bmap.h" 43 #include "xfs_bmap.h"
44 #include "xfs_acl.h" 44 #include "xfs_acl.h"
45 #include "xfs_error.h" 45 #include "xfs_error.h"
46 #include "xfs_buf_item.h" 46 #include "xfs_buf_item.h"
47 #include "xfs_rw.h" 47 #include "xfs_rw.h"
48 48
49 /* 49 /*
50 * This is a subroutine for xfs_write() and other writers (xfs_ioctl) 50 * This is a subroutine for xfs_write() and other writers (xfs_ioctl)
51 * which clears the setuid and setgid bits when a file is written. 51 * which clears the setuid and setgid bits when a file is written.
52 */ 52 */
53 int 53 int
54 xfs_write_clear_setuid( 54 xfs_write_clear_setuid(
55 xfs_inode_t *ip) 55 xfs_inode_t *ip)
56 { 56 {
57 xfs_mount_t *mp; 57 xfs_mount_t *mp;
58 xfs_trans_t *tp; 58 xfs_trans_t *tp;
59 int error; 59 int error;
60 60
61 mp = ip->i_mount; 61 mp = ip->i_mount;
62 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID); 62 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
63 if ((error = xfs_trans_reserve(tp, 0, 63 if ((error = xfs_trans_reserve(tp, 0,
64 XFS_WRITEID_LOG_RES(mp), 64 XFS_WRITEID_LOG_RES(mp),
65 0, 0, 0))) { 65 0, 0, 0))) {
66 xfs_trans_cancel(tp, 0); 66 xfs_trans_cancel(tp, 0);
67 return error; 67 return error;
68 } 68 }
69 xfs_ilock(ip, XFS_ILOCK_EXCL); 69 xfs_ilock(ip, XFS_ILOCK_EXCL);
70 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 70 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
71 xfs_trans_ihold(tp, ip); 71 xfs_trans_ihold(tp, ip);
72 ip->i_d.di_mode &= ~S_ISUID; 72 ip->i_d.di_mode &= ~S_ISUID;
73 73
74 /* 74 /*
75 * Note that we don't have to worry about mandatory 75 * Note that we don't have to worry about mandatory
76 * file locking being disabled here because we only 76 * file locking being disabled here because we only
77 * clear the S_ISGID bit if the Group execute bit is 77 * clear the S_ISGID bit if the Group execute bit is
78 * on, but if it was on then mandatory locking wouldn't 78 * on, but if it was on then mandatory locking wouldn't
79 * have been enabled. 79 * have been enabled.
80 */ 80 */
81 if (ip->i_d.di_mode & S_IXGRP) { 81 if (ip->i_d.di_mode & S_IXGRP) {
82 ip->i_d.di_mode &= ~S_ISGID; 82 ip->i_d.di_mode &= ~S_ISGID;
83 } 83 }
84 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 84 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
85 xfs_trans_set_sync(tp); 85 xfs_trans_set_sync(tp);
86 error = xfs_trans_commit(tp, 0); 86 error = xfs_trans_commit(tp, 0);
87 xfs_iunlock(ip, XFS_ILOCK_EXCL); 87 xfs_iunlock(ip, XFS_ILOCK_EXCL);
88 return 0; 88 return 0;
89 } 89 }
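The rule xfs_write_clear_setuid() implements is the usual one: a write always drops S_ISUID, while S_ISGID is only dropped when the group-execute bit is set, for the reason given in the comment above. A stand-alone sketch of just the mode arithmetic:

#include <sys/stat.h>    /* S_ISUID, S_ISGID, S_IXGRP */

/* Mode a file should carry after an untrusted write to it. */
static mode_t mode_after_write(mode_t mode)
{
    mode &= ~S_ISUID;          /* always drop setuid */
    if (mode & S_IXGRP)        /* setgid only meaningful with group-exec */
        mode &= ~S_ISGID;
    return mode;
}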
90 90
91 /* 91 /*
92 * Handle logging requirements of various synchronous types of write. 92 * Handle logging requirements of various synchronous types of write.
93 */ 93 */
94 int 94 int
95 xfs_write_sync_logforce( 95 xfs_write_sync_logforce(
96 xfs_mount_t *mp, 96 xfs_mount_t *mp,
97 xfs_inode_t *ip) 97 xfs_inode_t *ip)
98 { 98 {
99 int error = 0; 99 int error = 0;
100 100
101 /* 101 /*
102 * If we're treating this as O_DSYNC and we have not updated the 102 * If we're treating this as O_DSYNC and we have not updated the
103 * size, force the log. 103 * size, force the log.
104 */ 104 */
105 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && 105 if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) &&
106 !(ip->i_update_size)) { 106 !(ip->i_update_size)) {
107 xfs_inode_log_item_t *iip = ip->i_itemp; 107 xfs_inode_log_item_t *iip = ip->i_itemp;
108 108
109 /* 109 /*
110 * If an allocation transaction occurred 110 * If an allocation transaction occurred
111 * without extending the size, then we have to force 111 * without extending the size, then we have to force
112 * the log up the proper point to ensure that the 112 * the log up the proper point to ensure that the
113 * allocation is permanent. We can't count on 113 * allocation is permanent. We can't count on
114 * the fact that buffered writes lock out direct I/O 114 * the fact that buffered writes lock out direct I/O
115 * writes - the direct I/O write could have extended 115 * writes - the direct I/O write could have extended
116 * the size nontransactionally, then finished before 116 * the size nontransactionally, then finished before
117 * we started. xfs_write_file will think that the file 117 * we started. xfs_write_file will think that the file
118 * didn't grow but the update isn't safe unless the 118 * didn't grow but the update isn't safe unless the
119 * size change is logged. 119 * size change is logged.
120 * 120 *
121 * Force the log if we've committed a transaction 121 * Force the log if we've committed a transaction
122 * against the inode or if someone else has and 122 * against the inode or if someone else has and
123 * the commit record hasn't gone to disk (e.g. 123 * the commit record hasn't gone to disk (e.g.
124 * the inode is pinned). This guarantees that 124 * the inode is pinned). This guarantees that
125 * all changes affecting the inode are permanent 125 * all changes affecting the inode are permanent
126 * when we return. 126 * when we return.
127 */ 127 */
128 if (iip && iip->ili_last_lsn) { 128 if (iip && iip->ili_last_lsn) {
129 error = _xfs_log_force(mp, iip->ili_last_lsn, 129 error = _xfs_log_force(mp, iip->ili_last_lsn,
130 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); 130 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
131 } else if (xfs_ipincount(ip) > 0) { 131 } else if (xfs_ipincount(ip) > 0) {
132 error = _xfs_log_force(mp, (xfs_lsn_t)0, 132 error = _xfs_log_force(mp, (xfs_lsn_t)0,
133 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); 133 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
134 } 134 }
135 135
136 } else { 136 } else {
137 xfs_trans_t *tp; 137 xfs_trans_t *tp;
138 138
139 /* 139 /*
140 * O_SYNC or O_DSYNC _with_ a size update are handled 140 * O_SYNC or O_DSYNC _with_ a size update are handled
141 * the same way. 141 * the same way.
142 * 142 *
143 * If the write was synchronous then we need to make 143 * If the write was synchronous then we need to make
144 * sure that the inode modification time is permanent. 144 * sure that the inode modification time is permanent.
145 * We'll have updated the timestamp above, so here 145 * We'll have updated the timestamp above, so here
146 * we use a synchronous transaction to log the inode. 146 * we use a synchronous transaction to log the inode.
147 * It's not fast, but it's necessary. 147 * It's not fast, but it's necessary.
148 * 148 *
149 * If this is a dsync write and the size got changed 149 * If this is a dsync write and the size got changed
150 * non-transactionally, then we need to ensure that 150 * non-transactionally, then we need to ensure that
151 * the size change gets logged in a synchronous 151 * the size change gets logged in a synchronous
152 * transaction. 152 * transaction.
153 */ 153 */
154 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); 154 tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC);
155 if ((error = xfs_trans_reserve(tp, 0, 155 if ((error = xfs_trans_reserve(tp, 0,
156 XFS_SWRITE_LOG_RES(mp), 156 XFS_SWRITE_LOG_RES(mp),
157 0, 0, 0))) { 157 0, 0, 0))) {
158 /* Transaction reserve failed */ 158 /* Transaction reserve failed */
159 xfs_trans_cancel(tp, 0); 159 xfs_trans_cancel(tp, 0);
160 } else { 160 } else {
161 /* Transaction reserve successful */ 161 /* Transaction reserve successful */
162 xfs_ilock(ip, XFS_ILOCK_EXCL); 162 xfs_ilock(ip, XFS_ILOCK_EXCL);
163 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 163 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
164 xfs_trans_ihold(tp, ip); 164 xfs_trans_ihold(tp, ip);
165 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 165 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
166 xfs_trans_set_sync(tp); 166 xfs_trans_set_sync(tp);
167 error = xfs_trans_commit(tp, 0); 167 error = xfs_trans_commit(tp, 0);
168 xfs_iunlock(ip, XFS_ILOCK_EXCL); 168 xfs_iunlock(ip, XFS_ILOCK_EXCL);
169 } 169 }
170 } 170 }
171 171
172 return error; 172 return error;
173 } 173 }
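xfs_write_sync_logforce() is essentially a three-way decision, per its comments: for an O_DSYNC-style write with no size change, force the log (to the inode's last commit LSN if one is recorded, otherwise force everything if the inode is pinned); for a synchronous write that did change the size, log the inode in a synchronous transaction instead. The shape of that decision with the XFS specifics abstracted into booleans (an illustrative sketch, not the kernel interface):

#include <stdbool.h>

enum sync_action {
    SYNC_NOTHING,        /* nothing committed or pinned; no force needed */
    SYNC_FORCE_TO_LSN,   /* force the log up to the inode's last commit LSN */
    SYNC_FORCE_ALL,      /* inode pinned but no LSN recorded; force everything */
    SYNC_LOG_INODE       /* size changed: log the inode in a sync transaction */
};

static enum sync_action
choose_sync_action(bool osync_is_osync, bool size_updated,
                   bool have_last_lsn, bool inode_pinned)
{
    if (!osync_is_osync && !size_updated) {
        if (have_last_lsn)
            return SYNC_FORCE_TO_LSN;
        if (inode_pinned)
            return SYNC_FORCE_ALL;
        return SYNC_NOTHING;
    }
    return SYNC_LOG_INODE;
}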
174 174
175 /* 175 /*
176 * Force a shutdown of the filesystem instantly while keeping 176 * Force a shutdown of the filesystem instantly while keeping
177 * the filesystem consistent. We don't do an unmount here; just shutdown 177 * the filesystem consistent. We don't do an unmount here; just shutdown
178 * the shop, make sure that absolutely nothing persistent happens to 178 * the shop, make sure that absolutely nothing persistent happens to
179 * this filesystem after this point. 179 * this filesystem after this point.
180 */ 180 */
181 void 181 void
182 xfs_do_force_shutdown( 182 xfs_do_force_shutdown(
183 xfs_mount_t *mp, 183 xfs_mount_t *mp,
184 int flags, 184 int flags,
185 char *fname, 185 char *fname,
186 int lnnum) 186 int lnnum)
187 { 187 {
188 int logerror; 188 int logerror;
189 189
190 logerror = flags & SHUTDOWN_LOG_IO_ERROR; 190 logerror = flags & SHUTDOWN_LOG_IO_ERROR;
191 191
192 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 192 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
193 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from " 193 cmn_err(CE_NOTE, "xfs_force_shutdown(%s,0x%x) called from "
194 "line %d of file %s. Return address = 0x%p", 194 "line %d of file %s. Return address = 0x%p",
195 mp->m_fsname, flags, lnnum, fname, __return_address); 195 mp->m_fsname, flags, lnnum, fname, __return_address);
196 } 196 }
197 /* 197 /*
198 * No need to duplicate efforts. 198 * No need to duplicate efforts.
199 */ 199 */
200 if (XFS_FORCED_SHUTDOWN(mp) && !logerror) 200 if (XFS_FORCED_SHUTDOWN(mp) && !logerror)
201 return; 201 return;
202 202
203 /* 203 /*
204 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't 204 * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't
205 * queue up anybody new on the log reservations, and wakes up 205 * queue up anybody new on the log reservations, and wakes up
206 * everybody who's sleeping on log reservations to tell them 206 * everybody who's sleeping on log reservations to tell them
207 * the bad news. 207 * the bad news.
208 */ 208 */
209 if (xfs_log_force_umount(mp, logerror)) 209 if (xfs_log_force_umount(mp, logerror))
210 return; 210 return;
211 211
212 if (flags & SHUTDOWN_CORRUPT_INCORE) { 212 if (flags & SHUTDOWN_CORRUPT_INCORE) {
213 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp, 213 xfs_cmn_err(XFS_PTAG_SHUTDOWN_CORRUPT, CE_ALERT, mp,
214 "Corruption of in-memory data detected. Shutting down filesystem: %s", 214 "Corruption of in-memory data detected. Shutting down filesystem: %s",
215 mp->m_fsname); 215 mp->m_fsname);
216 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) { 216 if (XFS_ERRLEVEL_HIGH <= xfs_error_level) {
217 xfs_stack_trace(); 217 xfs_stack_trace();
218 } 218 }
219 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 219 } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
220 if (logerror) { 220 if (logerror) {
221 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp, 221 xfs_cmn_err(XFS_PTAG_SHUTDOWN_LOGERROR, CE_ALERT, mp,
222 "Log I/O Error Detected. Shutting down filesystem: %s", 222 "Log I/O Error Detected. Shutting down filesystem: %s",
223 mp->m_fsname); 223 mp->m_fsname);
224 } else if (flags & SHUTDOWN_DEVICE_REQ) { 224 } else if (flags & SHUTDOWN_DEVICE_REQ) {
225 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 225 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
226 "All device paths lost. Shutting down filesystem: %s", 226 "All device paths lost. Shutting down filesystem: %s",
227 mp->m_fsname); 227 mp->m_fsname);
228 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) { 228 } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
229 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp, 229 xfs_cmn_err(XFS_PTAG_SHUTDOWN_IOERROR, CE_ALERT, mp,
230 "I/O Error Detected. Shutting down filesystem: %s", 230 "I/O Error Detected. Shutting down filesystem: %s",
231 mp->m_fsname); 231 mp->m_fsname);
232 } 232 }
233 } 233 }
234 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) { 234 if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
235 cmn_err(CE_ALERT, "Please umount the filesystem, " 235 cmn_err(CE_ALERT, "Please umount the filesystem, "
236 "and rectify the problem(s)"); 236 "and rectify the problem(s)");
237 } 237 }
238 } 238 }
239 239
240 240
241 /* 241 /*
242 * Called when we want to stop a buffer from getting written or read. 242 * Called when we want to stop a buffer from getting written or read.
243 * We attach the EIO error, muck with its flags, and call biodone 243 * We attach the EIO error, muck with its flags, and call biodone
244 * so that the proper iodone callbacks get called. 244 * so that the proper iodone callbacks get called.
245 */ 245 */
246 int 246 int
247 xfs_bioerror( 247 xfs_bioerror(
248 xfs_buf_t *bp) 248 xfs_buf_t *bp)
249 { 249 {
250 250
251 #ifdef XFSERRORDEBUG 251 #ifdef XFSERRORDEBUG
252 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); 252 ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
253 #endif 253 #endif
254 254
255 /* 255 /*
256 * No need to wait until the buffer is unpinned. 256 * No need to wait until the buffer is unpinned.
257 * We aren't flushing it. 257 * We aren't flushing it.
258 */ 258 */
259 xfs_buftrace("XFS IOERROR", bp); 259 xfs_buftrace("XFS IOERROR", bp);
260 XFS_BUF_ERROR(bp, EIO); 260 XFS_BUF_ERROR(bp, EIO);
261 /* 261 /*
262 * We're calling biodone, so delete B_DONE flag. Either way 262 * We're calling biodone, so delete B_DONE flag. Either way
263 * we have to call the iodone callback, and calling biodone 263 * we have to call the iodone callback, and calling biodone
264 * probably is the best way since it takes care of 264 * probably is the best way since it takes care of
265 * GRIO as well. 265 * GRIO as well.
266 */ 266 */
267 XFS_BUF_UNREAD(bp); 267 XFS_BUF_UNREAD(bp);
268 XFS_BUF_UNDELAYWRITE(bp); 268 XFS_BUF_UNDELAYWRITE(bp);
269 XFS_BUF_UNDONE(bp); 269 XFS_BUF_UNDONE(bp);
270 XFS_BUF_STALE(bp); 270 XFS_BUF_STALE(bp);
271 271
272 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 272 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
273 xfs_biodone(bp); 273 xfs_biodone(bp);
274 274
275 return (EIO); 275 return (EIO);
276 } 276 }
277 277
278 /* 278 /*
279 * Same as xfs_bioerror, except that we are releasing the buffer 279 * Same as xfs_bioerror, except that we are releasing the buffer
280 * here ourselves, and avoiding the biodone call. 280 * here ourselves, and avoiding the biodone call.
281 * This is meant for userdata errors; metadata bufs come with 281 * This is meant for userdata errors; metadata bufs come with
282 * iodone functions attached, so that we can track down errors. 282 * iodone functions attached, so that we can track down errors.
283 */ 283 */
284 int 284 int
285 xfs_bioerror_relse( 285 xfs_bioerror_relse(
286 xfs_buf_t *bp) 286 xfs_buf_t *bp)
287 { 287 {
288 int64_t fl; 288 int64_t fl;
289 289
290 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks); 290 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xfs_buf_iodone_callbacks);
291 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone); 291 ASSERT(XFS_BUF_IODONE_FUNC(bp) != xlog_iodone);
292 292
293 xfs_buftrace("XFS IOERRELSE", bp); 293 xfs_buftrace("XFS IOERRELSE", bp);
294 fl = XFS_BUF_BFLAGS(bp); 294 fl = XFS_BUF_BFLAGS(bp);
295 /* 295 /*
296 * No need to wait until the buffer is unpinned. 296 * No need to wait until the buffer is unpinned.
297 * We aren't flushing it. 297 * We aren't flushing it.
298 * 298 *
299 * chunkhold expects B_DONE to be set, whether 299 * chunkhold expects B_DONE to be set, whether
300 * we actually finish the I/O or not. We don't want to 300 * we actually finish the I/O or not. We don't want to
301 * change that interface. 301 * change that interface.
302 */ 302 */
303 XFS_BUF_UNREAD(bp); 303 XFS_BUF_UNREAD(bp);
304 XFS_BUF_UNDELAYWRITE(bp); 304 XFS_BUF_UNDELAYWRITE(bp);
305 XFS_BUF_DONE(bp); 305 XFS_BUF_DONE(bp);
306 XFS_BUF_STALE(bp); 306 XFS_BUF_STALE(bp);
307 XFS_BUF_CLR_IODONE_FUNC(bp); 307 XFS_BUF_CLR_IODONE_FUNC(bp);
308 XFS_BUF_CLR_BDSTRAT_FUNC(bp); 308 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
309 if (!(fl & XFS_B_ASYNC)) { 309 if (!(fl & XFS_B_ASYNC)) {
310 /* 310 /*
311 * Mark b_error and B_ERROR _both_. 311 * Mark b_error and B_ERROR _both_.
312 * Lots of chunkcache code assumes that. 312 * Lots of chunkcache code assumes that.
313 * There's no reason to mark error for 313 * There's no reason to mark error for
314 * ASYNC buffers. 314 * ASYNC buffers.
315 */ 315 */
316 XFS_BUF_ERROR(bp, EIO); 316 XFS_BUF_ERROR(bp, EIO);
317 XFS_BUF_FINISH_IOWAIT(bp); 317 XFS_BUF_FINISH_IOWAIT(bp);
318 } else { 318 } else {
319 xfs_buf_relse(bp); 319 xfs_buf_relse(bp);
320 } 320 }
321 return (EIO); 321 return (EIO);
322 } 322 }
323 323
324 /* 324 /*
325 * Prints out an ALERT message about I/O error. 325 * Prints out an ALERT message about I/O error.
326 */ 326 */
327 void 327 void
328 xfs_ioerror_alert( 328 xfs_ioerror_alert(
329 char *func, 329 char *func,
330 struct xfs_mount *mp, 330 struct xfs_mount *mp,
331 xfs_buf_t *bp, 331 xfs_buf_t *bp,
332 xfs_daddr_t blkno) 332 xfs_daddr_t blkno)
333 { 333 {
334 cmn_err(CE_ALERT, 334 cmn_err(CE_ALERT,
335 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx" 335 "I/O error in filesystem (\"%s\") meta-data dev %s block 0x%llx"
336 " (\"%s\") error %d buf count %zd", 336 " (\"%s\") error %d buf count %zd",
337 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname, 337 (!mp || !mp->m_fsname) ? "(fs name not set)" : mp->m_fsname,
338 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)), 338 XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
339 (__uint64_t)blkno, func, 339 (__uint64_t)blkno, func,
340 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp)); 340 XFS_BUF_GETERROR(bp), XFS_BUF_COUNT(bp));
341 } 341 }
342 342
343 /* 343 /*
344 * This isn't an absolute requirement, but it is 344 * This isn't an absolute requirement, but it is
345 * just a good idea to call xfs_read_buf instead of 345 * just a good idea to call xfs_read_buf instead of
346 * directly doing a read_buf call. For one, we shouldn't 346 * directly doing a read_buf call. For one, we shouldn't
347 * be doing this disk read if we are in SHUTDOWN state anyway, 347 * be doing this disk read if we are in SHUTDOWN state anyway,
348 * so this stops that from happening. Secondly, this does all 348 * so this stops that from happening. Secondly, this does all
349 * the error checking stuff and the brelse if appropriate for 349 * the error checking stuff and the brelse if appropriate for
350 * the caller, so the code can be a little leaner. 350 * the caller, so the code can be a little leaner.
351 */ 351 */
352 352
353 int 353 int
354 xfs_read_buf( 354 xfs_read_buf(
355 struct xfs_mount *mp, 355 struct xfs_mount *mp,
356 xfs_buftarg_t *target, 356 xfs_buftarg_t *target,
357 xfs_daddr_t blkno, 357 xfs_daddr_t blkno,
358 int len, 358 int len,
359 uint flags, 359 uint flags,
360 xfs_buf_t **bpp) 360 xfs_buf_t **bpp)
361 { 361 {
362 xfs_buf_t *bp; 362 xfs_buf_t *bp;
363 int error; 363 int error;
364 364
365 if (flags) 365 if (flags)
366 bp = xfs_buf_read_flags(target, blkno, len, flags); 366 bp = xfs_buf_read_flags(target, blkno, len, flags);
367 else 367 else
368 bp = xfs_buf_read(target, blkno, len, flags); 368 bp = xfs_buf_read(target, blkno, len, flags);
369 if (!bp) 369 if (!bp)
370 return XFS_ERROR(EIO); 370 return XFS_ERROR(EIO);
371 error = XFS_BUF_GETERROR(bp); 371 error = XFS_BUF_GETERROR(bp);
372 if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) { 372 if (bp && !error && !XFS_FORCED_SHUTDOWN(mp)) {
373 *bpp = bp; 373 *bpp = bp;
374 } else { 374 } else {
375 *bpp = NULL; 375 *bpp = NULL;
376 if (error) { 376 if (error) {
377 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp)); 377 xfs_ioerror_alert("xfs_read_buf", mp, bp, XFS_BUF_ADDR(bp));
378 } else { 378 } else {
379 error = XFS_ERROR(EIO); 379 error = XFS_ERROR(EIO);
380 } 380 }
381 if (bp) { 381 if (bp) {
382 XFS_BUF_UNDONE(bp); 382 XFS_BUF_UNDONE(bp);
383 XFS_BUF_UNDELAYWRITE(bp); 383 XFS_BUF_UNDELAYWRITE(bp);
384 XFS_BUF_STALE(bp); 384 XFS_BUF_STALE(bp);
385 /* 385 /*
386 * brelse clears B_ERROR and b_error 386 * brelse clears B_ERROR and b_error
387 */ 387 */
388 xfs_buf_relse(bp); 388 xfs_buf_relse(bp);
389 } 389 }
390 } 390 }
391 return (error); 391 return (error);
392 } 392 }
393 393
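As a usage sketch (hypothetical caller, not part of this commit), a read through xfs_read_buf() leaves the shutdown check, the error alert and the brelse() on failure to the helper, as the comment above describes. The function name and the use of mp->m_ddev_targp as the buffer target are assumptions for illustration:

STATIC int
xfs_example_read_block(
	struct xfs_mount	*mp,
	xfs_daddr_t		blkno)
{
	xfs_buf_t		*bp;
	int			error;

	/* read one basic block from the (assumed) data device target */
	error = xfs_read_buf(mp, mp->m_ddev_targp, blkno, 1, 0, &bp);
	if (error)
		return error;	/* buffer already released on error */

	/* ... inspect the buffer contents here ... */

	xfs_buf_relse(bp);
	return 0;
}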
394 /* 394 /*
395 * Wrapper around bwrite() so that we can trap 395 * Wrapper around bwrite() so that we can trap
396 * write errors, and act accordingly. 396 * write errors, and act accordingly.
397 */ 397 */
398 int 398 int
399 xfs_bwrite( 399 xfs_bwrite(
400 struct xfs_mount *mp, 400 struct xfs_mount *mp,
401 struct xfs_buf *bp) 401 struct xfs_buf *bp)
402 { 402 {
403 int error; 403 int error;
404 404
405 /* 405 /*
406 * XXXsup how does this work for quotas. 406 * XXXsup how does this work for quotas.
407 */ 407 */
408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb); 408 XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
409 XFS_BUF_SET_FSPRIVATE3(bp, mp); 409 bp->b_mount = mp;
410 XFS_BUF_WRITE(bp); 410 XFS_BUF_WRITE(bp);
411 411
412 if ((error = XFS_bwrite(bp))) { 412 if ((error = XFS_bwrite(bp))) {
413 ASSERT(mp); 413 ASSERT(mp);
414 /* 414 /*
415 * Cannot put a buftrace here since if the buffer is not 415 * Cannot put a buftrace here since if the buffer is not
416 * B_HOLD then we will brelse() the buffer before returning 416 * B_HOLD then we will brelse() the buffer before returning
417 * from bwrite and we could be tracing a buffer that has 417 * from bwrite and we could be tracing a buffer that has
418 * been reused. 418 * been reused.
419 */ 419 */
420 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 420 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
421 } 421 }
422 return (error); 422 return (error);
423 } 423 }
424 424
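And a corresponding write-side sketch (again hypothetical, not part of this commit): after this change xfs_bwrite() records the mount in bp->b_mount itself rather than through XFS_BUF_SET_FSPRIVATE3(), and on failure it has already forced a shutdown, so a caller only needs to propagate the error. Per the comment above, the buffer is released by the write path unless it was held, so it must not be touched after the call:

STATIC int
xfs_example_write_buf(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	int			error;

	error = xfs_bwrite(mp, bp);	/* sets bp->b_mount and issues the write */
	if (error) {
		/* xfs_bwrite() has already called xfs_force_shutdown() */
		return error;
	}
	return 0;
}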