Commit 87f979d390f9ecfa3d0038a9f9a002a62f8a1895

Authored by Alex Elder
1 parent e7e319a9c5

ceph: kill ceph_osdc_writepages() "nofail" parameter

There is only one caller of ceph_osdc_writepages(), and it always
passes the value true as its "nofail" argument.  Get rid of that
argument and replace its use in ceph_osdc_writepages() with the
constant value true.

This and a number of cleanup patches that follow resolve:
    http://tracker.ceph.com/issues/4126

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>

Showing 3 changed files with 5 additions and 5 deletions
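Only one file's diff is reproduced below (fs/ceph/addr.c, judging by its contents); the single changed line in it is marked with -/+. Per the commit message, the other two changed files carry the libceph prototype and implementation of ceph_osdc_writepages() — presumably include/linux/ceph/osd_client.h and net/ceph/osd_client.c, though neither is shown here. As a hedged sketch only, with the parameter list and types inferred from the call site below rather than taken from the diff, the interface change amounts to:

	/*
	 * Sketch of the implied interface change; parameter names and
	 * types are inferred from the one call site shown below and
	 * may not match the tree exactly.
	 *
	 * Before: the sole caller always passed nofail == true.
	 */
	int ceph_osdc_writepages(struct ceph_osd_client *osdc,
				 struct ceph_vino vino,
				 struct ceph_file_layout *layout,
				 struct ceph_snap_context *snapc,
				 u64 off, u64 len,
				 u32 truncate_seq, u64 truncate_size,
				 struct timespec *mtime,
				 struct page **pages, int num_pages,
				 int flags, int do_sync, bool nofail);

	/*
	 * After: the parameter is gone; internally the request is
	 * started as if nofail were always true.
	 */
	int ceph_osdc_writepages(struct ceph_osd_client *osdc,
				 struct ceph_vino vino,
				 struct ceph_file_layout *layout,
				 struct ceph_snap_context *snapc,
				 u64 off, u64 len,
				 u32 truncate_seq, u64 truncate_size,
				 struct timespec *mtime,
				 struct page **pages, int num_pages,
				 int flags, int do_sync);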

1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/backing-dev.h> 3 #include <linux/backing-dev.h>
4 #include <linux/fs.h> 4 #include <linux/fs.h>
5 #include <linux/mm.h> 5 #include <linux/mm.h>
6 #include <linux/pagemap.h> 6 #include <linux/pagemap.h>
7 #include <linux/writeback.h> /* generic_writepages */ 7 #include <linux/writeback.h> /* generic_writepages */
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/pagevec.h> 9 #include <linux/pagevec.h>
10 #include <linux/task_io_accounting_ops.h> 10 #include <linux/task_io_accounting_ops.h>
11 11
12 #include "super.h" 12 #include "super.h"
13 #include "mds_client.h" 13 #include "mds_client.h"
14 #include <linux/ceph/osd_client.h> 14 #include <linux/ceph/osd_client.h>
15 15
16 /* 16 /*
17 * Ceph address space ops. 17 * Ceph address space ops.
18 * 18 *
19 * There are a few funny things going on here. 19 * There are a few funny things going on here.
20 * 20 *
21 * The page->private field is used to reference a struct 21 * The page->private field is used to reference a struct
22 * ceph_snap_context for _every_ dirty page. This indicates which 22 * ceph_snap_context for _every_ dirty page. This indicates which
23 * snapshot the page was logically dirtied in, and thus which snap 23 * snapshot the page was logically dirtied in, and thus which snap
24 * context needs to be associated with the osd write during writeback. 24 * context needs to be associated with the osd write during writeback.
25 * 25 *
26 * Similarly, struct ceph_inode_info maintains a set of counters to 26 * Similarly, struct ceph_inode_info maintains a set of counters to
27 * count dirty pages on the inode. In the absence of snapshots, 27 * count dirty pages on the inode. In the absence of snapshots,
28 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. 28 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
29 * 29 *
30 * When a snapshot is taken (that is, when the client receives 30 * When a snapshot is taken (that is, when the client receives
31 * notification that a snapshot was taken), each inode with caps and 31 * notification that a snapshot was taken), each inode with caps and
32 * with dirty pages (dirty pages implies there is a cap) gets a new 32 * with dirty pages (dirty pages implies there is a cap) gets a new
33 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending 33 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
34 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is 34 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
35 * moved to capsnap->dirty. (Unless a sync write is currently in 35 * moved to capsnap->dirty. (Unless a sync write is currently in
36 * progress. In that case, the capsnap is said to be "pending", new 36 * progress. In that case, the capsnap is said to be "pending", new
37 * writes cannot start, and the capsnap isn't "finalized" until the 37 * writes cannot start, and the capsnap isn't "finalized" until the
38 * write completes (or fails) and a final size/mtime for the inode for 38 * write completes (or fails) and a final size/mtime for the inode for
39 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. 39 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
40 * 40 *
41 * On writeback, we must submit writes to the osd IN SNAP ORDER. So, 41 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
42 * we look for the first capsnap in i_cap_snaps and write out pages in 42 * we look for the first capsnap in i_cap_snaps and write out pages in
43 * that snap context _only_. Then we move on to the next capsnap, 43 * that snap context _only_. Then we move on to the next capsnap,
44 * eventually reaching the "live" or "head" context (i.e., pages that 44 * eventually reaching the "live" or "head" context (i.e., pages that
45 * are not yet snapped) and are writing the most recently dirtied 45 * are not yet snapped) and are writing the most recently dirtied
46 * pages. 46 * pages.
47 * 47 *
48 * Invalidate and so forth must take care to ensure the dirty page 48 * Invalidate and so forth must take care to ensure the dirty page
49 * accounting is preserved. 49 * accounting is preserved.
50 */ 50 */
51 51
52 #define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10)) 52 #define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
53 #define CONGESTION_OFF_THRESH(congestion_kb) \ 53 #define CONGESTION_OFF_THRESH(congestion_kb) \
54 (CONGESTION_ON_THRESH(congestion_kb) - \ 54 (CONGESTION_ON_THRESH(congestion_kb) - \
55 (CONGESTION_ON_THRESH(congestion_kb) >> 2)) 55 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
56 56
57 static inline struct ceph_snap_context *page_snap_context(struct page *page) 57 static inline struct ceph_snap_context *page_snap_context(struct page *page)
58 { 58 {
59 if (PagePrivate(page)) 59 if (PagePrivate(page))
60 return (void *)page->private; 60 return (void *)page->private;
61 return NULL; 61 return NULL;
62 } 62 }
63 63
64 /* 64 /*
65 * Dirty a page. Optimistically adjust accounting, on the assumption 65 * Dirty a page. Optimistically adjust accounting, on the assumption
66 * that we won't race with invalidate. If we do, readjust. 66 * that we won't race with invalidate. If we do, readjust.
67 */ 67 */
68 static int ceph_set_page_dirty(struct page *page) 68 static int ceph_set_page_dirty(struct page *page)
69 { 69 {
70 struct address_space *mapping = page->mapping; 70 struct address_space *mapping = page->mapping;
71 struct inode *inode; 71 struct inode *inode;
72 struct ceph_inode_info *ci; 72 struct ceph_inode_info *ci;
73 int undo = 0; 73 int undo = 0;
74 struct ceph_snap_context *snapc; 74 struct ceph_snap_context *snapc;
75 75
76 if (unlikely(!mapping)) 76 if (unlikely(!mapping))
77 return !TestSetPageDirty(page); 77 return !TestSetPageDirty(page);
78 78
79 if (TestSetPageDirty(page)) { 79 if (TestSetPageDirty(page)) {
80 dout("%p set_page_dirty %p idx %lu -- already dirty\n", 80 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
81 mapping->host, page, page->index); 81 mapping->host, page, page->index);
82 return 0; 82 return 0;
83 } 83 }
84 84
85 inode = mapping->host; 85 inode = mapping->host;
86 ci = ceph_inode(inode); 86 ci = ceph_inode(inode);
87 87
88 /* 88 /*
89 * Note that we're grabbing a snapc ref here without holding 89 * Note that we're grabbing a snapc ref here without holding
90 * any locks! 90 * any locks!
91 */ 91 */
92 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); 92 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
93 93
94 /* dirty the head */ 94 /* dirty the head */
95 spin_lock(&ci->i_ceph_lock); 95 spin_lock(&ci->i_ceph_lock);
96 if (ci->i_head_snapc == NULL) 96 if (ci->i_head_snapc == NULL)
97 ci->i_head_snapc = ceph_get_snap_context(snapc); 97 ci->i_head_snapc = ceph_get_snap_context(snapc);
98 ++ci->i_wrbuffer_ref_head; 98 ++ci->i_wrbuffer_ref_head;
99 if (ci->i_wrbuffer_ref == 0) 99 if (ci->i_wrbuffer_ref == 0)
100 ihold(inode); 100 ihold(inode);
101 ++ci->i_wrbuffer_ref; 101 ++ci->i_wrbuffer_ref;
102 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d " 102 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
103 "snapc %p seq %lld (%d snaps)\n", 103 "snapc %p seq %lld (%d snaps)\n",
104 mapping->host, page, page->index, 104 mapping->host, page, page->index,
105 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1, 105 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
106 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, 106 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
107 snapc, snapc->seq, snapc->num_snaps); 107 snapc, snapc->seq, snapc->num_snaps);
108 spin_unlock(&ci->i_ceph_lock); 108 spin_unlock(&ci->i_ceph_lock);
109 109
110 /* now adjust page */ 110 /* now adjust page */
111 spin_lock_irq(&mapping->tree_lock); 111 spin_lock_irq(&mapping->tree_lock);
112 if (page->mapping) { /* Race with truncate? */ 112 if (page->mapping) { /* Race with truncate? */
113 WARN_ON_ONCE(!PageUptodate(page)); 113 WARN_ON_ONCE(!PageUptodate(page));
114 account_page_dirtied(page, page->mapping); 114 account_page_dirtied(page, page->mapping);
115 radix_tree_tag_set(&mapping->page_tree, 115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY); 116 page_index(page), PAGECACHE_TAG_DIRTY);
117 117
118 /* 118 /*
119 * Reference snap context in page->private. Also set 119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback. 120 * PagePrivate so that we get invalidatepage callback.
121 */ 121 */
122 page->private = (unsigned long)snapc; 122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page); 123 SetPagePrivate(page);
124 } else { 124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page); 125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1; 126 undo = 1;
127 } 127 }
128 128
129 spin_unlock_irq(&mapping->tree_lock); 129 spin_unlock_irq(&mapping->tree_lock);
130 130
131 if (undo) 131 if (undo)
132 /* whoops, we failed to dirty the page */ 132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134 134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136 136
137 BUG_ON(!PageDirty(page)); 137 BUG_ON(!PageDirty(page));
138 return 1; 138 return 1;
139 } 139 }
140 140
141 /* 141 /*
142 * If we are truncating the full page (i.e. offset == 0), adjust the 142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private 143 * dirty page counters appropriately. Only called if there is private
144 * data on the page. 144 * data on the page.
145 */ 145 */
146 static void ceph_invalidatepage(struct page *page, unsigned long offset) 146 static void ceph_invalidatepage(struct page *page, unsigned long offset)
147 { 147 {
148 struct inode *inode; 148 struct inode *inode;
149 struct ceph_inode_info *ci; 149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = page_snap_context(page); 150 struct ceph_snap_context *snapc = page_snap_context(page);
151 151
152 BUG_ON(!PageLocked(page)); 152 BUG_ON(!PageLocked(page));
153 BUG_ON(!PagePrivate(page)); 153 BUG_ON(!PagePrivate(page));
154 BUG_ON(!page->mapping); 154 BUG_ON(!page->mapping);
155 155
156 inode = page->mapping->host; 156 inode = page->mapping->host;
157 157
158 /* 158 /*
159 * We can get non-dirty pages here due to races between 159 * We can get non-dirty pages here due to races between
160 * set_page_dirty and truncate_complete_page; just spit out a 160 * set_page_dirty and truncate_complete_page; just spit out a
161 * warning, in case we end up with accounting problems later. 161 * warning, in case we end up with accounting problems later.
162 */ 162 */
163 if (!PageDirty(page)) 163 if (!PageDirty(page))
164 pr_err("%p invalidatepage %p page not dirty\n", inode, page); 164 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
165 165
166 if (offset == 0) 166 if (offset == 0)
167 ClearPageChecked(page); 167 ClearPageChecked(page);
168 168
169 ci = ceph_inode(inode); 169 ci = ceph_inode(inode);
170 if (offset == 0) { 170 if (offset == 0) {
171 dout("%p invalidatepage %p idx %lu full dirty page %lu\n", 171 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
172 inode, page, page->index, offset); 172 inode, page, page->index, offset);
173 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 173 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
174 ceph_put_snap_context(snapc); 174 ceph_put_snap_context(snapc);
175 page->private = 0; 175 page->private = 0;
176 ClearPagePrivate(page); 176 ClearPagePrivate(page);
177 } else { 177 } else {
178 dout("%p invalidatepage %p idx %lu partial dirty page\n", 178 dout("%p invalidatepage %p idx %lu partial dirty page\n",
179 inode, page, page->index); 179 inode, page, page->index);
180 } 180 }
181 } 181 }
182 182
183 /* just a sanity check */ 183 /* just a sanity check */
184 static int ceph_releasepage(struct page *page, gfp_t g) 184 static int ceph_releasepage(struct page *page, gfp_t g)
185 { 185 {
186 struct inode *inode = page->mapping ? page->mapping->host : NULL; 186 struct inode *inode = page->mapping ? page->mapping->host : NULL;
187 dout("%p releasepage %p idx %lu\n", inode, page, page->index); 187 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
188 WARN_ON(PageDirty(page)); 188 WARN_ON(PageDirty(page));
189 WARN_ON(PagePrivate(page)); 189 WARN_ON(PagePrivate(page));
190 return 0; 190 return 0;
191 } 191 }
192 192
193 /* 193 /*
194 * read a single page, without unlocking it. 194 * read a single page, without unlocking it.
195 */ 195 */
196 static int readpage_nounlock(struct file *filp, struct page *page) 196 static int readpage_nounlock(struct file *filp, struct page *page)
197 { 197 {
198 struct inode *inode = filp->f_dentry->d_inode; 198 struct inode *inode = filp->f_dentry->d_inode;
199 struct ceph_inode_info *ci = ceph_inode(inode); 199 struct ceph_inode_info *ci = ceph_inode(inode);
200 struct ceph_osd_client *osdc = 200 struct ceph_osd_client *osdc =
201 &ceph_inode_to_client(inode)->client->osdc; 201 &ceph_inode_to_client(inode)->client->osdc;
202 int err = 0; 202 int err = 0;
203 u64 len = PAGE_CACHE_SIZE; 203 u64 len = PAGE_CACHE_SIZE;
204 204
205 dout("readpage inode %p file %p page %p index %lu\n", 205 dout("readpage inode %p file %p page %p index %lu\n",
206 inode, filp, page, page->index); 206 inode, filp, page, page->index);
207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
208 (u64) page_offset(page), &len, 208 (u64) page_offset(page), &len,
209 ci->i_truncate_seq, ci->i_truncate_size, 209 ci->i_truncate_seq, ci->i_truncate_size,
210 &page, 1, 0); 210 &page, 1, 0);
211 if (err == -ENOENT) 211 if (err == -ENOENT)
212 err = 0; 212 err = 0;
213 if (err < 0) { 213 if (err < 0) {
214 SetPageError(page); 214 SetPageError(page);
215 goto out; 215 goto out;
216 } else if (err < PAGE_CACHE_SIZE) { 216 } else if (err < PAGE_CACHE_SIZE) {
217 /* zero fill remainder of page */ 217 /* zero fill remainder of page */
218 zero_user_segment(page, err, PAGE_CACHE_SIZE); 218 zero_user_segment(page, err, PAGE_CACHE_SIZE);
219 } 219 }
220 SetPageUptodate(page); 220 SetPageUptodate(page);
221 221
222 out: 222 out:
223 return err < 0 ? err : 0; 223 return err < 0 ? err : 0;
224 } 224 }
225 225
226 static int ceph_readpage(struct file *filp, struct page *page) 226 static int ceph_readpage(struct file *filp, struct page *page)
227 { 227 {
228 int r = readpage_nounlock(filp, page); 228 int r = readpage_nounlock(filp, page);
229 unlock_page(page); 229 unlock_page(page);
230 return r; 230 return r;
231 } 231 }
232 232
233 /* 233 /*
234 * Finish an async read(ahead) op. 234 * Finish an async read(ahead) op.
235 */ 235 */
236 static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) 236 static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
237 { 237 {
238 struct inode *inode = req->r_inode; 238 struct inode *inode = req->r_inode;
239 struct ceph_osd_reply_head *replyhead; 239 struct ceph_osd_reply_head *replyhead;
240 int rc, bytes; 240 int rc, bytes;
241 int i; 241 int i;
242 242
243 /* parse reply */ 243 /* parse reply */
244 replyhead = msg->front.iov_base; 244 replyhead = msg->front.iov_base;
245 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 245 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
246 rc = le32_to_cpu(replyhead->result); 246 rc = le32_to_cpu(replyhead->result);
247 bytes = le32_to_cpu(msg->hdr.data_len); 247 bytes = le32_to_cpu(msg->hdr.data_len);
248 248
249 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 249 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
250 250
251 /* unlock all pages, zeroing any data we didn't read */ 251 /* unlock all pages, zeroing any data we didn't read */
252 for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { 252 for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) {
253 struct page *page = req->r_pages[i]; 253 struct page *page = req->r_pages[i];
254 254
255 if (bytes < (int)PAGE_CACHE_SIZE) { 255 if (bytes < (int)PAGE_CACHE_SIZE) {
256 /* zero (remainder of) page */ 256 /* zero (remainder of) page */
257 int s = bytes < 0 ? 0 : bytes; 257 int s = bytes < 0 ? 0 : bytes;
258 zero_user_segment(page, s, PAGE_CACHE_SIZE); 258 zero_user_segment(page, s, PAGE_CACHE_SIZE);
259 } 259 }
260 dout("finish_read %p uptodate %p idx %lu\n", inode, page, 260 dout("finish_read %p uptodate %p idx %lu\n", inode, page,
261 page->index); 261 page->index);
262 flush_dcache_page(page); 262 flush_dcache_page(page);
263 SetPageUptodate(page); 263 SetPageUptodate(page);
264 unlock_page(page); 264 unlock_page(page);
265 page_cache_release(page); 265 page_cache_release(page);
266 } 266 }
267 kfree(req->r_pages); 267 kfree(req->r_pages);
268 } 268 }
269 269
270 static void ceph_unlock_page_vector(struct page **pages, int num_pages) 270 static void ceph_unlock_page_vector(struct page **pages, int num_pages)
271 { 271 {
272 int i; 272 int i;
273 273
274 for (i = 0; i < num_pages; i++) 274 for (i = 0; i < num_pages; i++)
275 unlock_page(pages[i]); 275 unlock_page(pages[i]);
276 } 276 }
277 277
278 /* 278 /*
279 * start an async read(ahead) operation. return nr_pages we submitted 279 * start an async read(ahead) operation. return nr_pages we submitted
280 * a read for on success, or negative error code. 280 * a read for on success, or negative error code.
281 */ 281 */
282 static int start_read(struct inode *inode, struct list_head *page_list, int max) 282 static int start_read(struct inode *inode, struct list_head *page_list, int max)
283 { 283 {
284 struct ceph_osd_client *osdc = 284 struct ceph_osd_client *osdc =
285 &ceph_inode_to_client(inode)->client->osdc; 285 &ceph_inode_to_client(inode)->client->osdc;
286 struct ceph_inode_info *ci = ceph_inode(inode); 286 struct ceph_inode_info *ci = ceph_inode(inode);
287 struct page *page = list_entry(page_list->prev, struct page, lru); 287 struct page *page = list_entry(page_list->prev, struct page, lru);
288 struct ceph_osd_request *req; 288 struct ceph_osd_request *req;
289 u64 off; 289 u64 off;
290 u64 len; 290 u64 len;
291 int i; 291 int i;
292 struct page **pages; 292 struct page **pages;
293 pgoff_t next_index; 293 pgoff_t next_index;
294 int nr_pages = 0; 294 int nr_pages = 0;
295 int ret; 295 int ret;
296 296
297 off = (u64) page_offset(page); 297 off = (u64) page_offset(page);
298 298
299 /* count pages */ 299 /* count pages */
300 next_index = page->index; 300 next_index = page->index;
301 list_for_each_entry_reverse(page, page_list, lru) { 301 list_for_each_entry_reverse(page, page_list, lru) {
302 if (page->index != next_index) 302 if (page->index != next_index)
303 break; 303 break;
304 nr_pages++; 304 nr_pages++;
305 next_index++; 305 next_index++;
306 if (max && nr_pages == max) 306 if (max && nr_pages == max)
307 break; 307 break;
308 } 308 }
309 len = nr_pages << PAGE_CACHE_SHIFT; 309 len = nr_pages << PAGE_CACHE_SHIFT;
310 dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, 310 dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
311 off, len); 311 off, len);
312 312
313 req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), 313 req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
314 off, &len, 314 off, &len,
315 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 315 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
316 NULL, 0, 316 NULL, 0,
317 ci->i_truncate_seq, ci->i_truncate_size, 317 ci->i_truncate_seq, ci->i_truncate_size,
318 NULL, false, 1, 0); 318 NULL, false, 1, 0);
319 if (IS_ERR(req)) 319 if (IS_ERR(req))
320 return PTR_ERR(req); 320 return PTR_ERR(req);
321 321
322 /* build page vector */ 322 /* build page vector */
323 nr_pages = len >> PAGE_CACHE_SHIFT; 323 nr_pages = len >> PAGE_CACHE_SHIFT;
324 pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); 324 pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
325 ret = -ENOMEM; 325 ret = -ENOMEM;
326 if (!pages) 326 if (!pages)
327 goto out; 327 goto out;
328 for (i = 0; i < nr_pages; ++i) { 328 for (i = 0; i < nr_pages; ++i) {
329 page = list_entry(page_list->prev, struct page, lru); 329 page = list_entry(page_list->prev, struct page, lru);
330 BUG_ON(PageLocked(page)); 330 BUG_ON(PageLocked(page));
331 list_del(&page->lru); 331 list_del(&page->lru);
332 332
333 dout("start_read %p adding %p idx %lu\n", inode, page, 333 dout("start_read %p adding %p idx %lu\n", inode, page,
334 page->index); 334 page->index);
335 if (add_to_page_cache_lru(page, &inode->i_data, page->index, 335 if (add_to_page_cache_lru(page, &inode->i_data, page->index,
336 GFP_NOFS)) { 336 GFP_NOFS)) {
337 page_cache_release(page); 337 page_cache_release(page);
338 dout("start_read %p add_to_page_cache failed %p\n", 338 dout("start_read %p add_to_page_cache failed %p\n",
339 inode, page); 339 inode, page);
340 nr_pages = i; 340 nr_pages = i;
341 goto out_pages; 341 goto out_pages;
342 } 342 }
343 pages[i] = page; 343 pages[i] = page;
344 } 344 }
345 req->r_pages = pages; 345 req->r_pages = pages;
346 req->r_num_pages = nr_pages; 346 req->r_num_pages = nr_pages;
347 req->r_callback = finish_read; 347 req->r_callback = finish_read;
348 req->r_inode = inode; 348 req->r_inode = inode;
349 349
350 dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); 350 dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
351 ret = ceph_osdc_start_request(osdc, req, false); 351 ret = ceph_osdc_start_request(osdc, req, false);
352 if (ret < 0) 352 if (ret < 0)
353 goto out_pages; 353 goto out_pages;
354 ceph_osdc_put_request(req); 354 ceph_osdc_put_request(req);
355 return nr_pages; 355 return nr_pages;
356 356
357 out_pages: 357 out_pages:
358 ceph_unlock_page_vector(pages, nr_pages); 358 ceph_unlock_page_vector(pages, nr_pages);
359 ceph_release_page_vector(pages, nr_pages); 359 ceph_release_page_vector(pages, nr_pages);
360 out: 360 out:
361 ceph_osdc_put_request(req); 361 ceph_osdc_put_request(req);
362 return ret; 362 return ret;
363 } 363 }
364 364
365 365
366 /* 366 /*
367 * Read multiple pages. Leave pages we don't read + unlock in page_list; 367 * Read multiple pages. Leave pages we don't read + unlock in page_list;
368 * the caller (VM) cleans them up. 368 * the caller (VM) cleans them up.
369 */ 369 */
370 static int ceph_readpages(struct file *file, struct address_space *mapping, 370 static int ceph_readpages(struct file *file, struct address_space *mapping,
371 struct list_head *page_list, unsigned nr_pages) 371 struct list_head *page_list, unsigned nr_pages)
372 { 372 {
373 struct inode *inode = file->f_dentry->d_inode; 373 struct inode *inode = file->f_dentry->d_inode;
374 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 374 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
375 int rc = 0; 375 int rc = 0;
376 int max = 0; 376 int max = 0;
377 377
378 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) 378 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
379 max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) 379 max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
380 >> PAGE_SHIFT; 380 >> PAGE_SHIFT;
381 381
382 dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, 382 dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages,
383 max); 383 max);
384 while (!list_empty(page_list)) { 384 while (!list_empty(page_list)) {
385 rc = start_read(inode, page_list, max); 385 rc = start_read(inode, page_list, max);
386 if (rc < 0) 386 if (rc < 0)
387 goto out; 387 goto out;
388 BUG_ON(rc == 0); 388 BUG_ON(rc == 0);
389 } 389 }
390 out: 390 out:
391 dout("readpages %p file %p ret %d\n", inode, file, rc); 391 dout("readpages %p file %p ret %d\n", inode, file, rc);
392 return rc; 392 return rc;
393 } 393 }
394 394
395 /* 395 /*
396 * Get ref for the oldest snapc for an inode with dirty data... that is, the 396 * Get ref for the oldest snapc for an inode with dirty data... that is, the
397 * only snap context we are allowed to write back. 397 * only snap context we are allowed to write back.
398 */ 398 */
399 static struct ceph_snap_context *get_oldest_context(struct inode *inode, 399 static struct ceph_snap_context *get_oldest_context(struct inode *inode,
400 u64 *snap_size) 400 u64 *snap_size)
401 { 401 {
402 struct ceph_inode_info *ci = ceph_inode(inode); 402 struct ceph_inode_info *ci = ceph_inode(inode);
403 struct ceph_snap_context *snapc = NULL; 403 struct ceph_snap_context *snapc = NULL;
404 struct ceph_cap_snap *capsnap = NULL; 404 struct ceph_cap_snap *capsnap = NULL;
405 405
406 spin_lock(&ci->i_ceph_lock); 406 spin_lock(&ci->i_ceph_lock);
407 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 407 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
408 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, 408 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
409 capsnap->context, capsnap->dirty_pages); 409 capsnap->context, capsnap->dirty_pages);
410 if (capsnap->dirty_pages) { 410 if (capsnap->dirty_pages) {
411 snapc = ceph_get_snap_context(capsnap->context); 411 snapc = ceph_get_snap_context(capsnap->context);
412 if (snap_size) 412 if (snap_size)
413 *snap_size = capsnap->size; 413 *snap_size = capsnap->size;
414 break; 414 break;
415 } 415 }
416 } 416 }
417 if (!snapc && ci->i_wrbuffer_ref_head) { 417 if (!snapc && ci->i_wrbuffer_ref_head) {
418 snapc = ceph_get_snap_context(ci->i_head_snapc); 418 snapc = ceph_get_snap_context(ci->i_head_snapc);
419 dout(" head snapc %p has %d dirty pages\n", 419 dout(" head snapc %p has %d dirty pages\n",
420 snapc, ci->i_wrbuffer_ref_head); 420 snapc, ci->i_wrbuffer_ref_head);
421 } 421 }
422 spin_unlock(&ci->i_ceph_lock); 422 spin_unlock(&ci->i_ceph_lock);
423 return snapc; 423 return snapc;
424 } 424 }
425 425
426 /* 426 /*
427 * Write a single page, but leave the page locked. 427 * Write a single page, but leave the page locked.
428 * 428 *
429 * If we get a write error, set the page error bit, but still adjust the 429 * If we get a write error, set the page error bit, but still adjust the
430 * dirty page accounting (i.e., page is no longer dirty). 430 * dirty page accounting (i.e., page is no longer dirty).
431 */ 431 */
432 static int writepage_nounlock(struct page *page, struct writeback_control *wbc) 432 static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
433 { 433 {
434 struct inode *inode; 434 struct inode *inode;
435 struct ceph_inode_info *ci; 435 struct ceph_inode_info *ci;
436 struct ceph_fs_client *fsc; 436 struct ceph_fs_client *fsc;
437 struct ceph_osd_client *osdc; 437 struct ceph_osd_client *osdc;
438 loff_t page_off = page_offset(page); 438 loff_t page_off = page_offset(page);
439 int len = PAGE_CACHE_SIZE; 439 int len = PAGE_CACHE_SIZE;
440 loff_t i_size; 440 loff_t i_size;
441 int err = 0; 441 int err = 0;
442 struct ceph_snap_context *snapc, *oldest; 442 struct ceph_snap_context *snapc, *oldest;
443 u64 snap_size = 0; 443 u64 snap_size = 0;
444 long writeback_stat; 444 long writeback_stat;
445 445
446 dout("writepage %p idx %lu\n", page, page->index); 446 dout("writepage %p idx %lu\n", page, page->index);
447 447
448 if (!page->mapping || !page->mapping->host) { 448 if (!page->mapping || !page->mapping->host) {
449 dout("writepage %p - no mapping\n", page); 449 dout("writepage %p - no mapping\n", page);
450 return -EFAULT; 450 return -EFAULT;
451 } 451 }
452 inode = page->mapping->host; 452 inode = page->mapping->host;
453 ci = ceph_inode(inode); 453 ci = ceph_inode(inode);
454 fsc = ceph_inode_to_client(inode); 454 fsc = ceph_inode_to_client(inode);
455 osdc = &fsc->client->osdc; 455 osdc = &fsc->client->osdc;
456 456
457 /* verify this is a writeable snap context */ 457 /* verify this is a writeable snap context */
458 snapc = page_snap_context(page); 458 snapc = page_snap_context(page);
459 if (snapc == NULL) { 459 if (snapc == NULL) {
460 dout("writepage %p page %p not dirty?\n", inode, page); 460 dout("writepage %p page %p not dirty?\n", inode, page);
461 goto out; 461 goto out;
462 } 462 }
463 oldest = get_oldest_context(inode, &snap_size); 463 oldest = get_oldest_context(inode, &snap_size);
464 if (snapc->seq > oldest->seq) { 464 if (snapc->seq > oldest->seq) {
465 dout("writepage %p page %p snapc %p not writeable - noop\n", 465 dout("writepage %p page %p snapc %p not writeable - noop\n",
466 inode, page, snapc); 466 inode, page, snapc);
467 /* we should only noop if called by kswapd */ 467 /* we should only noop if called by kswapd */
468 WARN_ON((current->flags & PF_MEMALLOC) == 0); 468 WARN_ON((current->flags & PF_MEMALLOC) == 0);
469 ceph_put_snap_context(oldest); 469 ceph_put_snap_context(oldest);
470 goto out; 470 goto out;
471 } 471 }
472 ceph_put_snap_context(oldest); 472 ceph_put_snap_context(oldest);
473 473
474 /* is this a partial page at end of file? */ 474 /* is this a partial page at end of file? */
475 if (snap_size) 475 if (snap_size)
476 i_size = snap_size; 476 i_size = snap_size;
477 else 477 else
478 i_size = i_size_read(inode); 478 i_size = i_size_read(inode);
479 if (i_size < page_off + len) 479 if (i_size < page_off + len)
480 len = i_size - page_off; 480 len = i_size - page_off;
481 481
482 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 482 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
483 inode, page, page->index, page_off, len, snapc); 483 inode, page, page->index, page_off, len, snapc);
484 484
485 writeback_stat = atomic_long_inc_return(&fsc->writeback_count); 485 writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
486 if (writeback_stat > 486 if (writeback_stat >
487 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) 487 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
488 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); 488 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
489 489
490 set_page_writeback(page); 490 set_page_writeback(page);
491 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 491 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
492 &ci->i_layout, snapc, 492 &ci->i_layout, snapc,
493 page_off, len, 493 page_off, len,
494 ci->i_truncate_seq, ci->i_truncate_size, 494 ci->i_truncate_seq, ci->i_truncate_size,
495 &inode->i_mtime, 495 &inode->i_mtime,
496 &page, 1, 0, 0, true); 496 &page, 1, 0, 0);
497 if (err < 0) { 497 if (err < 0) {
498 dout("writepage setting page/mapping error %d %p\n", err, page); 498 dout("writepage setting page/mapping error %d %p\n", err, page);
499 SetPageError(page); 499 SetPageError(page);
500 mapping_set_error(&inode->i_data, err); 500 mapping_set_error(&inode->i_data, err);
501 if (wbc) 501 if (wbc)
502 wbc->pages_skipped++; 502 wbc->pages_skipped++;
503 } else { 503 } else {
504 dout("writepage cleaned page %p\n", page); 504 dout("writepage cleaned page %p\n", page);
505 err = 0; /* vfs expects us to return 0 */ 505 err = 0; /* vfs expects us to return 0 */
506 } 506 }
507 page->private = 0; 507 page->private = 0;
508 ClearPagePrivate(page); 508 ClearPagePrivate(page);
509 end_page_writeback(page); 509 end_page_writeback(page);
510 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 510 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
511 ceph_put_snap_context(snapc); /* page's reference */ 511 ceph_put_snap_context(snapc); /* page's reference */
512 out: 512 out:
513 return err; 513 return err;
514 } 514 }
515 515
516 static int ceph_writepage(struct page *page, struct writeback_control *wbc) 516 static int ceph_writepage(struct page *page, struct writeback_control *wbc)
517 { 517 {
518 int err; 518 int err;
519 struct inode *inode = page->mapping->host; 519 struct inode *inode = page->mapping->host;
520 BUG_ON(!inode); 520 BUG_ON(!inode);
521 ihold(inode); 521 ihold(inode);
522 err = writepage_nounlock(page, wbc); 522 err = writepage_nounlock(page, wbc);
523 unlock_page(page); 523 unlock_page(page);
524 iput(inode); 524 iput(inode);
525 return err; 525 return err;
526 } 526 }
527 527
528 528
529 /* 529 /*
530 * lame release_pages helper. release_pages() isn't exported to 530 * lame release_pages helper. release_pages() isn't exported to
531 * modules. 531 * modules.
532 */ 532 */
533 static void ceph_release_pages(struct page **pages, int num) 533 static void ceph_release_pages(struct page **pages, int num)
534 { 534 {
535 struct pagevec pvec; 535 struct pagevec pvec;
536 int i; 536 int i;
537 537
538 pagevec_init(&pvec, 0); 538 pagevec_init(&pvec, 0);
539 for (i = 0; i < num; i++) { 539 for (i = 0; i < num; i++) {
540 if (pagevec_add(&pvec, pages[i]) == 0) 540 if (pagevec_add(&pvec, pages[i]) == 0)
541 pagevec_release(&pvec); 541 pagevec_release(&pvec);
542 } 542 }
543 pagevec_release(&pvec); 543 pagevec_release(&pvec);
544 } 544 }
545 545
546 546
547 /* 547 /*
548 * async writeback completion handler. 548 * async writeback completion handler.
549 * 549 *
550 * If we get an error, set the mapping error bit, but not the individual 550 * If we get an error, set the mapping error bit, but not the individual
551 * page error bits. 551 * page error bits.
552 */ 552 */
553 static void writepages_finish(struct ceph_osd_request *req, 553 static void writepages_finish(struct ceph_osd_request *req,
554 struct ceph_msg *msg) 554 struct ceph_msg *msg)
555 { 555 {
556 struct inode *inode = req->r_inode; 556 struct inode *inode = req->r_inode;
557 struct ceph_osd_reply_head *replyhead; 557 struct ceph_osd_reply_head *replyhead;
558 struct ceph_osd_op *op; 558 struct ceph_osd_op *op;
559 struct ceph_inode_info *ci = ceph_inode(inode); 559 struct ceph_inode_info *ci = ceph_inode(inode);
560 unsigned wrote; 560 unsigned wrote;
561 struct page *page; 561 struct page *page;
562 int i; 562 int i;
563 struct ceph_snap_context *snapc = req->r_snapc; 563 struct ceph_snap_context *snapc = req->r_snapc;
564 struct address_space *mapping = inode->i_mapping; 564 struct address_space *mapping = inode->i_mapping;
565 __s32 rc = -EIO; 565 __s32 rc = -EIO;
566 u64 bytes = 0; 566 u64 bytes = 0;
567 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 567 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
568 long writeback_stat; 568 long writeback_stat;
569 unsigned issued = ceph_caps_issued(ci); 569 unsigned issued = ceph_caps_issued(ci);
570 570
571 /* parse reply */ 571 /* parse reply */
572 replyhead = msg->front.iov_base; 572 replyhead = msg->front.iov_base;
573 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); 573 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
574 op = (void *)(replyhead + 1); 574 op = (void *)(replyhead + 1);
575 rc = le32_to_cpu(replyhead->result); 575 rc = le32_to_cpu(replyhead->result);
576 bytes = le64_to_cpu(op->extent.length); 576 bytes = le64_to_cpu(op->extent.length);
577 577
578 if (rc >= 0) { 578 if (rc >= 0) {
579 /* 579 /*
580 * Assume we wrote the pages we originally sent. The 580 * Assume we wrote the pages we originally sent. The
581 * osd might reply with fewer pages if our writeback 581 * osd might reply with fewer pages if our writeback
582 * raced with a truncation and was adjusted at the osd, 582 * raced with a truncation and was adjusted at the osd,
583 * so don't believe the reply. 583 * so don't believe the reply.
584 */ 584 */
585 wrote = req->r_num_pages; 585 wrote = req->r_num_pages;
586 } else { 586 } else {
587 wrote = 0; 587 wrote = 0;
588 mapping_set_error(mapping, rc); 588 mapping_set_error(mapping, rc);
589 } 589 }
590 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", 590 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
591 inode, rc, bytes, wrote); 591 inode, rc, bytes, wrote);
592 592
593 /* clean all pages */ 593 /* clean all pages */
594 for (i = 0; i < req->r_num_pages; i++) { 594 for (i = 0; i < req->r_num_pages; i++) {
595 page = req->r_pages[i]; 595 page = req->r_pages[i];
596 BUG_ON(!page); 596 BUG_ON(!page);
597 WARN_ON(!PageUptodate(page)); 597 WARN_ON(!PageUptodate(page));
598 598
599 writeback_stat = 599 writeback_stat =
600 atomic_long_dec_return(&fsc->writeback_count); 600 atomic_long_dec_return(&fsc->writeback_count);
601 if (writeback_stat < 601 if (writeback_stat <
602 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) 602 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
603 clear_bdi_congested(&fsc->backing_dev_info, 603 clear_bdi_congested(&fsc->backing_dev_info,
604 BLK_RW_ASYNC); 604 BLK_RW_ASYNC);
605 605
606 ceph_put_snap_context(page_snap_context(page)); 606 ceph_put_snap_context(page_snap_context(page));
607 page->private = 0; 607 page->private = 0;
608 ClearPagePrivate(page); 608 ClearPagePrivate(page);
609 dout("unlocking %d %p\n", i, page); 609 dout("unlocking %d %p\n", i, page);
610 end_page_writeback(page); 610 end_page_writeback(page);
611 611
612 /* 612 /*
613 * We lost the cache cap, need to truncate the page before 613 * We lost the cache cap, need to truncate the page before
614 * it is unlocked, otherwise we'd truncate it later in the 614 * it is unlocked, otherwise we'd truncate it later in the
615 * page truncation thread, possibly losing some data that 615 * page truncation thread, possibly losing some data that
616 * raced its way in 616 * raced its way in
617 */ 617 */
618 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) 618 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
619 generic_error_remove_page(inode->i_mapping, page); 619 generic_error_remove_page(inode->i_mapping, page);
620 620
621 unlock_page(page); 621 unlock_page(page);
622 } 622 }
623 dout("%p wrote+cleaned %d pages\n", inode, wrote); 623 dout("%p wrote+cleaned %d pages\n", inode, wrote);
624 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); 624 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
625 625
626 ceph_release_pages(req->r_pages, req->r_num_pages); 626 ceph_release_pages(req->r_pages, req->r_num_pages);
627 if (req->r_pages_from_pool) 627 if (req->r_pages_from_pool)
628 mempool_free(req->r_pages, 628 mempool_free(req->r_pages,
629 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); 629 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
630 else 630 else
631 kfree(req->r_pages); 631 kfree(req->r_pages);
632 ceph_osdc_put_request(req); 632 ceph_osdc_put_request(req);
633 } 633 }
634 634
635 /* 635 /*
636 * allocate a page vec, either directly, or if necessary, via a the 636 * allocate a page vec, either directly, or if necessary, via a the
637 * mempool. we avoid the mempool if we can because req->r_num_pages 637 * mempool. we avoid the mempool if we can because req->r_num_pages
638 * may be less than the maximum write size. 638 * may be less than the maximum write size.
639 */ 639 */
640 static void alloc_page_vec(struct ceph_fs_client *fsc, 640 static void alloc_page_vec(struct ceph_fs_client *fsc,
641 struct ceph_osd_request *req) 641 struct ceph_osd_request *req)
642 { 642 {
643 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 643 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
644 GFP_NOFS); 644 GFP_NOFS);
645 if (!req->r_pages) { 645 if (!req->r_pages) {
646 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); 646 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
647 req->r_pages_from_pool = 1; 647 req->r_pages_from_pool = 1;
648 WARN_ON(!req->r_pages); 648 WARN_ON(!req->r_pages);
649 } 649 }
650 } 650 }
651 651
652 /* 652 /*
653 * initiate async writeback 653 * initiate async writeback
654 */ 654 */
655 static int ceph_writepages_start(struct address_space *mapping, 655 static int ceph_writepages_start(struct address_space *mapping,
656 struct writeback_control *wbc) 656 struct writeback_control *wbc)
657 { 657 {
658 struct inode *inode = mapping->host; 658 struct inode *inode = mapping->host;
659 struct ceph_inode_info *ci = ceph_inode(inode); 659 struct ceph_inode_info *ci = ceph_inode(inode);
660 struct ceph_fs_client *fsc; 660 struct ceph_fs_client *fsc;
661 pgoff_t index, start, end; 661 pgoff_t index, start, end;
662 int range_whole = 0; 662 int range_whole = 0;
663 int should_loop = 1; 663 int should_loop = 1;
664 pgoff_t max_pages = 0, max_pages_ever = 0; 664 pgoff_t max_pages = 0, max_pages_ever = 0;
665 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; 665 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
666 struct pagevec pvec; 666 struct pagevec pvec;
667 int done = 0; 667 int done = 0;
668 int rc = 0; 668 int rc = 0;
669 unsigned wsize = 1 << inode->i_blkbits; 669 unsigned wsize = 1 << inode->i_blkbits;
670 struct ceph_osd_request *req = NULL; 670 struct ceph_osd_request *req = NULL;
671 int do_sync; 671 int do_sync;
672 u64 snap_size = 0; 672 u64 snap_size = 0;
673 673
674 /* 674 /*
675 * Include a 'sync' in the OSD request if this is a data 675 * Include a 'sync' in the OSD request if this is a data
676 * integrity write (e.g., O_SYNC write or fsync()), or if our 676 * integrity write (e.g., O_SYNC write or fsync()), or if our
677 * cap is being revoked. 677 * cap is being revoked.
678 */ 678 */
679 do_sync = wbc->sync_mode == WB_SYNC_ALL; 679 do_sync = wbc->sync_mode == WB_SYNC_ALL;
680 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) 680 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
681 do_sync = 1; 681 do_sync = 1;
682 dout("writepages_start %p dosync=%d (mode=%s)\n", 682 dout("writepages_start %p dosync=%d (mode=%s)\n",
683 inode, do_sync, 683 inode, do_sync,
684 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 684 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
685 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 685 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
686 686
687 fsc = ceph_inode_to_client(inode); 687 fsc = ceph_inode_to_client(inode);
688 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { 688 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
689 pr_warning("writepage_start %p on forced umount\n", inode); 689 pr_warning("writepage_start %p on forced umount\n", inode);
690 return -EIO; /* we're in a forced umount, don't write! */ 690 return -EIO; /* we're in a forced umount, don't write! */
691 } 691 }
692 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) 692 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
693 wsize = fsc->mount_options->wsize; 693 wsize = fsc->mount_options->wsize;
694 if (wsize < PAGE_CACHE_SIZE) 694 if (wsize < PAGE_CACHE_SIZE)
695 wsize = PAGE_CACHE_SIZE; 695 wsize = PAGE_CACHE_SIZE;
696 max_pages_ever = wsize >> PAGE_CACHE_SHIFT; 696 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
697 697
698 pagevec_init(&pvec, 0); 698 pagevec_init(&pvec, 0);
699 699
700 /* where to start/end? */ 700 /* where to start/end? */
701 if (wbc->range_cyclic) { 701 if (wbc->range_cyclic) {
702 start = mapping->writeback_index; /* Start from prev offset */ 702 start = mapping->writeback_index; /* Start from prev offset */
703 end = -1; 703 end = -1;
704 dout(" cyclic, start at %lu\n", start); 704 dout(" cyclic, start at %lu\n", start);
705 } else { 705 } else {
706 start = wbc->range_start >> PAGE_CACHE_SHIFT; 706 start = wbc->range_start >> PAGE_CACHE_SHIFT;
707 end = wbc->range_end >> PAGE_CACHE_SHIFT; 707 end = wbc->range_end >> PAGE_CACHE_SHIFT;
708 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 708 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
709 range_whole = 1; 709 range_whole = 1;
710 should_loop = 0; 710 should_loop = 0;
711 dout(" not cyclic, %lu to %lu\n", start, end); 711 dout(" not cyclic, %lu to %lu\n", start, end);
712 } 712 }
713 index = start; 713 index = start;
714 714
715 retry: 715 retry:
716 /* find oldest snap context with dirty data */ 716 /* find oldest snap context with dirty data */
717 ceph_put_snap_context(snapc); 717 ceph_put_snap_context(snapc);
718 snapc = get_oldest_context(inode, &snap_size); 718 snapc = get_oldest_context(inode, &snap_size);
719 if (!snapc) { 719 if (!snapc) {
720 /* hmm, why does writepages get called when there 720 /* hmm, why does writepages get called when there
721 is no dirty data? */ 721 is no dirty data? */
722 dout(" no snap context with dirty data?\n"); 722 dout(" no snap context with dirty data?\n");
723 goto out; 723 goto out;
724 } 724 }
725 dout(" oldest snapc is %p seq %lld (%d snaps)\n", 725 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
726 snapc, snapc->seq, snapc->num_snaps); 726 snapc, snapc->seq, snapc->num_snaps);
727 if (last_snapc && snapc != last_snapc) { 727 if (last_snapc && snapc != last_snapc) {
728 /* if we switched to a newer snapc, restart our scan at the 728 /* if we switched to a newer snapc, restart our scan at the
729 * start of the original file range. */ 729 * start of the original file range. */
730 dout(" snapc differs from last pass, restarting at %lu\n", 730 dout(" snapc differs from last pass, restarting at %lu\n",
731 index); 731 index);
732 index = start; 732 index = start;
733 } 733 }
734 last_snapc = snapc; 734 last_snapc = snapc;
735 735
736 while (!done && index <= end) { 736 while (!done && index <= end) {
737 unsigned i; 737 unsigned i;
738 int first; 738 int first;
739 pgoff_t next; 739 pgoff_t next;
740 int pvec_pages, locked_pages; 740 int pvec_pages, locked_pages;
741 struct page *page; 741 struct page *page;
742 int want; 742 int want;
743 u64 offset, len; 743 u64 offset, len;
744 struct ceph_osd_request_head *reqhead; 744 struct ceph_osd_request_head *reqhead;
745 struct ceph_osd_op *op; 745 struct ceph_osd_op *op;
746 long writeback_stat; 746 long writeback_stat;
747 747
748 next = 0; 748 next = 0;
749 locked_pages = 0; 749 locked_pages = 0;
750 max_pages = max_pages_ever; 750 max_pages = max_pages_ever;
751 751
752 get_more_pages: 752 get_more_pages:
753 first = -1; 753 first = -1;
754 want = min(end - index, 754 want = min(end - index,
755 min((pgoff_t)PAGEVEC_SIZE, 755 min((pgoff_t)PAGEVEC_SIZE,
756 max_pages - (pgoff_t)locked_pages) - 1) 756 max_pages - (pgoff_t)locked_pages) - 1)
757 + 1; 757 + 1;
758 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, 758 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
759 PAGECACHE_TAG_DIRTY, 759 PAGECACHE_TAG_DIRTY,
760 want); 760 want);
761 dout("pagevec_lookup_tag got %d\n", pvec_pages); 761 dout("pagevec_lookup_tag got %d\n", pvec_pages);
762 if (!pvec_pages && !locked_pages) 762 if (!pvec_pages && !locked_pages)
763 break; 763 break;
764 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { 764 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
765 page = pvec.pages[i]; 765 page = pvec.pages[i];
766 dout("? %p idx %lu\n", page, page->index); 766 dout("? %p idx %lu\n", page, page->index);
767 if (locked_pages == 0) 767 if (locked_pages == 0)
768 lock_page(page); /* first page */ 768 lock_page(page); /* first page */
769 else if (!trylock_page(page)) 769 else if (!trylock_page(page))
770 break; 770 break;
771 771
772 /* only dirty pages, or our accounting breaks */ 772 /* only dirty pages, or our accounting breaks */
773 if (unlikely(!PageDirty(page)) || 773 if (unlikely(!PageDirty(page)) ||
774 unlikely(page->mapping != mapping)) { 774 unlikely(page->mapping != mapping)) {
775 dout("!dirty or !mapping %p\n", page); 775 dout("!dirty or !mapping %p\n", page);
776 unlock_page(page); 776 unlock_page(page);
777 break; 777 break;
778 } 778 }
779 if (!wbc->range_cyclic && page->index > end) { 779 if (!wbc->range_cyclic && page->index > end) {
780 dout("end of range %p\n", page); 780 dout("end of range %p\n", page);
781 done = 1; 781 done = 1;
782 unlock_page(page); 782 unlock_page(page);
783 break; 783 break;
784 } 784 }
785 if (next && (page->index != next)) { 785 if (next && (page->index != next)) {
786 dout("not consecutive %p\n", page); 786 dout("not consecutive %p\n", page);
787 unlock_page(page); 787 unlock_page(page);
788 break; 788 break;
789 } 789 }
790 if (wbc->sync_mode != WB_SYNC_NONE) { 790 if (wbc->sync_mode != WB_SYNC_NONE) {
791 dout("waiting on writeback %p\n", page); 791 dout("waiting on writeback %p\n", page);
792 wait_on_page_writeback(page); 792 wait_on_page_writeback(page);
793 } 793 }
794 if ((snap_size && page_offset(page) > snap_size) || 794 if ((snap_size && page_offset(page) > snap_size) ||
795 (!snap_size && 795 (!snap_size &&
796 page_offset(page) > i_size_read(inode))) { 796 page_offset(page) > i_size_read(inode))) {
797 dout("%p page eof %llu\n", page, snap_size ? 797 dout("%p page eof %llu\n", page, snap_size ?
798 snap_size : i_size_read(inode)); 798 snap_size : i_size_read(inode));
799 done = 1; 799 done = 1;
800 unlock_page(page); 800 unlock_page(page);
801 break; 801 break;
802 } 802 }
803 if (PageWriteback(page)) { 803 if (PageWriteback(page)) {
804 dout("%p under writeback\n", page); 804 dout("%p under writeback\n", page);
805 unlock_page(page); 805 unlock_page(page);
806 break; 806 break;
807 } 807 }
808 808
809 /* only if matching snap context */ 809 /* only if matching snap context */
810 pgsnapc = page_snap_context(page); 810 pgsnapc = page_snap_context(page);
811 if (pgsnapc->seq > snapc->seq) { 811 if (pgsnapc->seq > snapc->seq) {
812 dout("page snapc %p %lld > oldest %p %lld\n", 812 dout("page snapc %p %lld > oldest %p %lld\n",
813 pgsnapc, pgsnapc->seq, snapc, snapc->seq); 813 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
814 unlock_page(page); 814 unlock_page(page);
815 if (!locked_pages) 815 if (!locked_pages)
816 continue; /* keep looking for snap */ 816 continue; /* keep looking for snap */
817 break; 817 break;
818 } 818 }
819 819
820 if (!clear_page_dirty_for_io(page)) { 820 if (!clear_page_dirty_for_io(page)) {
821 dout("%p !clear_page_dirty_for_io\n", page); 821 dout("%p !clear_page_dirty_for_io\n", page);
822 unlock_page(page); 822 unlock_page(page);
823 break; 823 break;
824 } 824 }
825 825
826 /* ok */ 826 /* ok */
827 if (locked_pages == 0) { 827 if (locked_pages == 0) {
828 /* prepare async write request */ 828 /* prepare async write request */
829 offset = (u64) page_offset(page); 829 offset = (u64) page_offset(page);
830 len = wsize; 830 len = wsize;
831 req = ceph_osdc_new_request(&fsc->client->osdc, 831 req = ceph_osdc_new_request(&fsc->client->osdc,
832 &ci->i_layout, 832 &ci->i_layout,
833 ceph_vino(inode), 833 ceph_vino(inode),
834 offset, &len, 834 offset, &len,
835 CEPH_OSD_OP_WRITE, 835 CEPH_OSD_OP_WRITE,
836 CEPH_OSD_FLAG_WRITE | 836 CEPH_OSD_FLAG_WRITE |
837 CEPH_OSD_FLAG_ONDISK, 837 CEPH_OSD_FLAG_ONDISK,
838 snapc, do_sync, 838 snapc, do_sync,
839 ci->i_truncate_seq, 839 ci->i_truncate_seq,
840 ci->i_truncate_size, 840 ci->i_truncate_size,
841 &inode->i_mtime, true, 1, 0); 841 &inode->i_mtime, true, 1, 0);
842 842
843 if (IS_ERR(req)) { 843 if (IS_ERR(req)) {
844 rc = PTR_ERR(req); 844 rc = PTR_ERR(req);
845 unlock_page(page); 845 unlock_page(page);
846 break; 846 break;
847 } 847 }
848 848
849 max_pages = req->r_num_pages; 849 max_pages = req->r_num_pages;
850 850
851 alloc_page_vec(fsc, req); 851 alloc_page_vec(fsc, req);
852 req->r_callback = writepages_finish; 852 req->r_callback = writepages_finish;
853 req->r_inode = inode; 853 req->r_inode = inode;
854 } 854 }
855 855
856 /* note position of first page in pvec */ 856 /* note position of first page in pvec */
857 if (first < 0) 857 if (first < 0)
858 first = i; 858 first = i;
859 dout("%p will write page %p idx %lu\n", 859 dout("%p will write page %p idx %lu\n",
860 inode, page, page->index); 860 inode, page, page->index);
861 861
862 writeback_stat = 862 writeback_stat =
863 atomic_long_inc_return(&fsc->writeback_count); 863 atomic_long_inc_return(&fsc->writeback_count);
864 if (writeback_stat > CONGESTION_ON_THRESH( 864 if (writeback_stat > CONGESTION_ON_THRESH(
865 fsc->mount_options->congestion_kb)) { 865 fsc->mount_options->congestion_kb)) {
866 set_bdi_congested(&fsc->backing_dev_info, 866 set_bdi_congested(&fsc->backing_dev_info,
867 BLK_RW_ASYNC); 867 BLK_RW_ASYNC);
868 } 868 }
869 869
870 set_page_writeback(page); 870 set_page_writeback(page);
871 req->r_pages[locked_pages] = page; 871 req->r_pages[locked_pages] = page;
872 locked_pages++; 872 locked_pages++;
873 next = page->index + 1; 873 next = page->index + 1;
874 } 874 }
875 875
876 /* did we get anything? */ 876 /* did we get anything? */
877 if (!locked_pages) 877 if (!locked_pages)
878 goto release_pvec_pages; 878 goto release_pvec_pages;
879 if (i) { 879 if (i) {
880 int j; 880 int j;
881 BUG_ON(!locked_pages || first < 0); 881 BUG_ON(!locked_pages || first < 0);
882 882
883 if (pvec_pages && i == pvec_pages && 883 if (pvec_pages && i == pvec_pages &&
884 locked_pages < max_pages) { 884 locked_pages < max_pages) {
885 dout("reached end pvec, trying for more\n"); 885 dout("reached end pvec, trying for more\n");
886 pagevec_reinit(&pvec); 886 pagevec_reinit(&pvec);
887 goto get_more_pages; 887 goto get_more_pages;
888 } 888 }
889 889
890 /* shift unused pages over in the pvec... we 890 /* shift unused pages over in the pvec... we
891 * will need to release them below. */ 891 * will need to release them below. */
892 for (j = i; j < pvec_pages; j++) { 892 for (j = i; j < pvec_pages; j++) {
893 dout(" pvec leftover page %p\n", 893 dout(" pvec leftover page %p\n",
894 pvec.pages[j]); 894 pvec.pages[j]);
895 pvec.pages[j-i+first] = pvec.pages[j]; 895 pvec.pages[j-i+first] = pvec.pages[j];
896 } 896 }
897 pvec.nr -= i-first; 897 pvec.nr -= i-first;
898 } 898 }
899 899
900 /* submit the write */ 900 /* submit the write */
901 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; 901 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
902 len = min((snap_size ? snap_size : i_size_read(inode)) - offset, 902 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
903 (u64)locked_pages << PAGE_CACHE_SHIFT); 903 (u64)locked_pages << PAGE_CACHE_SHIFT);
904 dout("writepages got %d pages at %llu~%llu\n", 904 dout("writepages got %d pages at %llu~%llu\n",
905 locked_pages, offset, len); 905 locked_pages, offset, len);
906 906
907 /* revise final length, page count */ 907 /* revise final length, page count */
908 req->r_num_pages = locked_pages; 908 req->r_num_pages = locked_pages;
909 reqhead = req->r_request->front.iov_base; 909 reqhead = req->r_request->front.iov_base;
910 op = (void *)(reqhead + 1); 910 op = (void *)(reqhead + 1);
911 op->extent.length = cpu_to_le64(len); 911 op->extent.length = cpu_to_le64(len);
912 op->payload_len = cpu_to_le32(len); 912 op->payload_len = cpu_to_le32(len);
913 req->r_request->hdr.data_len = cpu_to_le32(len); 913 req->r_request->hdr.data_len = cpu_to_le32(len);
914 914
915 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); 915 rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
916 BUG_ON(rc); 916 BUG_ON(rc);
917 req = NULL; 917 req = NULL;
918 918
919 /* continue? */ 919 /* continue? */
920 index = next; 920 index = next;
921 wbc->nr_to_write -= locked_pages; 921 wbc->nr_to_write -= locked_pages;
922 if (wbc->nr_to_write <= 0) 922 if (wbc->nr_to_write <= 0)
923 done = 1; 923 done = 1;
924 924
925 release_pvec_pages: 925 release_pvec_pages:
926 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, 926 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
927 pvec.nr ? pvec.pages[0] : NULL); 927 pvec.nr ? pvec.pages[0] : NULL);
928 pagevec_release(&pvec); 928 pagevec_release(&pvec);
929 929
930 if (locked_pages && !done) 930 if (locked_pages && !done)
931 goto retry; 931 goto retry;
932 } 932 }
933 933
934 if (should_loop && !done) { 934 if (should_loop && !done) {
935 /* more to do; loop back to beginning of file */ 935 /* more to do; loop back to beginning of file */
936 dout("writepages looping back to beginning of file\n"); 936 dout("writepages looping back to beginning of file\n");
937 should_loop = 0; 937 should_loop = 0;
938 index = 0; 938 index = 0;
939 goto retry; 939 goto retry;
940 } 940 }
941 941
942 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 942 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
943 mapping->writeback_index = index; 943 mapping->writeback_index = index;
944 944
945 out: 945 out:
946 if (req) 946 if (req)
947 ceph_osdc_put_request(req); 947 ceph_osdc_put_request(req);
948 ceph_put_snap_context(snapc); 948 ceph_put_snap_context(snapc);
949 dout("writepages done, rc = %d\n", rc); 949 dout("writepages done, rc = %d\n", rc);
950 return rc; 950 return rc;
951 } 951 }
952 952
953 953
954 954
955 /* 955 /*
956 * See if a given @snapc is either writeable, or already written. 956 * See if a given @snapc is either writeable, or already written.
957 */ 957 */
958 static int context_is_writeable_or_written(struct inode *inode, 958 static int context_is_writeable_or_written(struct inode *inode,
959 struct ceph_snap_context *snapc) 959 struct ceph_snap_context *snapc)
960 { 960 {
961 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); 961 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
962 int ret = !oldest || snapc->seq <= oldest->seq; 962 int ret = !oldest || snapc->seq <= oldest->seq;
963 963
964 ceph_put_snap_context(oldest); 964 ceph_put_snap_context(oldest);
965 return ret; 965 return ret;
966 } 966 }
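
The predicate above reduces to a pure sequence-number comparison: a snap
context is safe to write into if nothing older has dirty data, or if it is
itself no newer than the oldest such context. A minimal userspace sketch
(demo_snapc is an illustrative stand-in for struct ceph_snap_context, not
the real type):

#include <stdio.h>
#include <stdint.h>

struct demo_snapc { uint64_t seq; };

static int writeable_or_written(const struct demo_snapc *snapc,
                                const struct demo_snapc *oldest)
{
        /* mirrors: !oldest || snapc->seq <= oldest->seq */
        return !oldest || snapc->seq <= oldest->seq;
}

int main(void)
{
        struct demo_snapc old = { .seq = 3 }, cur = { .seq = 5 };

        printf("%d\n", writeable_or_written(&old, &old)); /* 1: oldest itself */
        printf("%d\n", writeable_or_written(&cur, &old)); /* 0: newer, must wait */
        printf("%d\n", writeable_or_written(&cur, NULL)); /* 1: nothing older */
        return 0;
}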
967 967
968 /* 968 /*
969 * We are only allowed to write into/dirty the page if the page is 969 * We are only allowed to write into/dirty the page if the page is
970 * clean, or already dirty within the same snap context. 970 * clean, or already dirty within the same snap context.
971 * 971 *
972 * called with page locked. 972 * called with page locked.
973 * return success with page locked, 973 * return success with page locked,
974 * or any failure (incl -EAGAIN) with page unlocked. 974 * or any failure (incl -EAGAIN) with page unlocked.
975 */ 975 */
976 static int ceph_update_writeable_page(struct file *file, 976 static int ceph_update_writeable_page(struct file *file,
977 loff_t pos, unsigned len, 977 loff_t pos, unsigned len,
978 struct page *page) 978 struct page *page)
979 { 979 {
980 struct inode *inode = file->f_dentry->d_inode; 980 struct inode *inode = file->f_dentry->d_inode;
981 struct ceph_inode_info *ci = ceph_inode(inode); 981 struct ceph_inode_info *ci = ceph_inode(inode);
982 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 982 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
983 loff_t page_off = pos & PAGE_CACHE_MASK; 983 loff_t page_off = pos & PAGE_CACHE_MASK;
984 int pos_in_page = pos & ~PAGE_CACHE_MASK; 984 int pos_in_page = pos & ~PAGE_CACHE_MASK;
985 int end_in_page = pos_in_page + len; 985 int end_in_page = pos_in_page + len;
986 loff_t i_size; 986 loff_t i_size;
987 int r; 987 int r;
988 struct ceph_snap_context *snapc, *oldest; 988 struct ceph_snap_context *snapc, *oldest;
989 989
990 retry_locked: 990 retry_locked:
991 /* writepages currently holds the page lock; wait for writeback anyway in case that changes later */ 991 /* writepages currently holds the page lock; wait for writeback anyway in case that changes later */
992 wait_on_page_writeback(page); 992 wait_on_page_writeback(page);
993 993
994 /* check snap context */ 994 /* check snap context */
995 BUG_ON(!ci->i_snap_realm); 995 BUG_ON(!ci->i_snap_realm);
996 down_read(&mdsc->snap_rwsem); 996 down_read(&mdsc->snap_rwsem);
997 BUG_ON(!ci->i_snap_realm->cached_context); 997 BUG_ON(!ci->i_snap_realm->cached_context);
998 snapc = page_snap_context(page); 998 snapc = page_snap_context(page);
999 if (snapc && snapc != ci->i_head_snapc) { 999 if (snapc && snapc != ci->i_head_snapc) {
1000 /* 1000 /*
1001 * this page is already dirty in another (older) snap 1001 * this page is already dirty in another (older) snap
1002 * context! is it writeable now? 1002 * context! is it writeable now?
1003 */ 1003 */
1004 oldest = get_oldest_context(inode, NULL); 1004 oldest = get_oldest_context(inode, NULL);
1005 up_read(&mdsc->snap_rwsem); 1005 up_read(&mdsc->snap_rwsem);
1006 1006
1007 if (snapc->seq > oldest->seq) { 1007 if (snapc->seq > oldest->seq) {
1008 ceph_put_snap_context(oldest); 1008 ceph_put_snap_context(oldest);
1009 dout(" page %p snapc %p not current or oldest\n", 1009 dout(" page %p snapc %p not current or oldest\n",
1010 page, snapc); 1010 page, snapc);
1011 /* 1011 /*
1012 * queue for writeback, and wait for snapc to 1012 * queue for writeback, and wait for snapc to
1013 * be writeable or written 1013 * be writeable or written
1014 */ 1014 */
1015 snapc = ceph_get_snap_context(snapc); 1015 snapc = ceph_get_snap_context(snapc);
1016 unlock_page(page); 1016 unlock_page(page);
1017 ceph_queue_writeback(inode); 1017 ceph_queue_writeback(inode);
1018 r = wait_event_interruptible(ci->i_cap_wq, 1018 r = wait_event_interruptible(ci->i_cap_wq,
1019 context_is_writeable_or_written(inode, snapc)); 1019 context_is_writeable_or_written(inode, snapc));
1020 ceph_put_snap_context(snapc); 1020 ceph_put_snap_context(snapc);
1021 if (r == -ERESTARTSYS) 1021 if (r == -ERESTARTSYS)
1022 return r; 1022 return r;
1023 return -EAGAIN; 1023 return -EAGAIN;
1024 } 1024 }
1025 ceph_put_snap_context(oldest); 1025 ceph_put_snap_context(oldest);
1026 1026
1027 /* yay, writeable, do it now (without dropping page lock) */ 1027 /* yay, writeable, do it now (without dropping page lock) */
1028 dout(" page %p snapc %p not current, but oldest\n", 1028 dout(" page %p snapc %p not current, but oldest\n",
1029 page, snapc); 1029 page, snapc);
1030 if (!clear_page_dirty_for_io(page)) 1030 if (!clear_page_dirty_for_io(page))
1031 goto retry_locked; 1031 goto retry_locked;
1032 r = writepage_nounlock(page, NULL); 1032 r = writepage_nounlock(page, NULL);
1033 if (r < 0) 1033 if (r < 0)
1034 goto fail_nosnap; 1034 goto fail_nosnap;
1035 goto retry_locked; 1035 goto retry_locked;
1036 } 1036 }
1037 1037
1038 if (PageUptodate(page)) { 1038 if (PageUptodate(page)) {
1039 dout(" page %p already uptodate\n", page); 1039 dout(" page %p already uptodate\n", page);
1040 return 0; 1040 return 0;
1041 } 1041 }
1042 1042
1043 /* full page? */ 1043 /* full page? */
1044 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE) 1044 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
1045 return 0; 1045 return 0;
1046 1046
1047 /* past end of file? */ 1047 /* past end of file? */
1048 i_size = inode->i_size; /* caller holds i_mutex */ 1048 i_size = inode->i_size; /* caller holds i_mutex */
1049 1049
1050 if (i_size + len > inode->i_sb->s_maxbytes) { 1050 if (i_size + len > inode->i_sb->s_maxbytes) {
1051 /* file is too big */ 1051 /* file is too big */
1052 r = -EINVAL; 1052 r = -EINVAL;
1053 goto fail; 1053 goto fail;
1054 } 1054 }
1055 1055
1056 if (page_off >= i_size || 1056 if (page_off >= i_size ||
1057 (pos_in_page == 0 && (pos+len) >= i_size && 1057 (pos_in_page == 0 && (pos+len) >= i_size &&
1058 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { 1058 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1059 dout(" zeroing %p 0 - %d and %d - %d\n", 1059 dout(" zeroing %p 0 - %d and %d - %d\n",
1060 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE); 1060 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1061 zero_user_segments(page, 1061 zero_user_segments(page,
1062 0, pos_in_page, 1062 0, pos_in_page,
1063 end_in_page, PAGE_CACHE_SIZE); 1063 end_in_page, PAGE_CACHE_SIZE);
1064 return 0; 1064 return 0;
1065 } 1065 }
1066 1066
1067 /* we need to read it. */ 1067 /* we need to read it. */
1068 up_read(&mdsc->snap_rwsem); 1068 up_read(&mdsc->snap_rwsem);
1069 r = readpage_nounlock(file, page); 1069 r = readpage_nounlock(file, page);
1070 if (r < 0) 1070 if (r < 0)
1071 goto fail_nosnap; 1071 goto fail_nosnap;
1072 goto retry_locked; 1072 goto retry_locked;
1073 1073
1074 fail: 1074 fail:
1075 up_read(&mdsc->snap_rwsem); 1075 up_read(&mdsc->snap_rwsem);
1076 fail_nosnap: 1076 fail_nosnap:
1077 unlock_page(page); 1077 unlock_page(page);
1078 return r; 1078 return r;
1079 } 1079 }
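
The offset arithmetic at the top of ceph_update_writeable_page() splits a
file position into a page-aligned offset plus an intra-page range. A
runnable userspace sketch of that math, assuming 4 KB pages (page_size and
page_mask are stand-ins for PAGE_CACHE_SIZE and PAGE_CACHE_MASK):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint64_t page_size = 4096;             /* PAGE_CACHE_SIZE stand-in */
        const uint64_t page_mask = ~(page_size - 1); /* PAGE_CACHE_MASK stand-in */
        uint64_t pos = 10000;                        /* hypothetical write position */
        unsigned int len = 300;

        uint64_t page_off = pos & page_mask;          /* 8192: file offset of the page */
        unsigned int pos_in_page = pos & ~page_mask;  /* 1808: offset within the page */
        unsigned int end_in_page = pos_in_page + len; /* 2108 */

        printf("page_off=%llu pos_in_page=%u end_in_page=%u\n",
               (unsigned long long)page_off, pos_in_page, end_in_page);
        return 0;
}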
1080 1080
1081 /* 1081 /*
1082 * We are only allowed to write into/dirty the page if the page is 1082 * We are only allowed to write into/dirty the page if the page is
1083 * clean, or already dirty within the same snap context. 1083 * clean, or already dirty within the same snap context.
1084 */ 1084 */
1085 static int ceph_write_begin(struct file *file, struct address_space *mapping, 1085 static int ceph_write_begin(struct file *file, struct address_space *mapping,
1086 loff_t pos, unsigned len, unsigned flags, 1086 loff_t pos, unsigned len, unsigned flags,
1087 struct page **pagep, void **fsdata) 1087 struct page **pagep, void **fsdata)
1088 { 1088 {
1089 struct inode *inode = file->f_dentry->d_inode; 1089 struct inode *inode = file->f_dentry->d_inode;
1090 struct ceph_inode_info *ci = ceph_inode(inode); 1090 struct ceph_inode_info *ci = ceph_inode(inode);
1091 struct ceph_file_info *fi = file->private_data; 1091 struct ceph_file_info *fi = file->private_data;
1092 struct page *page; 1092 struct page *page;
1093 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1093 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1094 int r, want, got = 0; 1094 int r, want, got = 0;
1095 1095
1096 if (fi->fmode & CEPH_FILE_MODE_LAZY) 1096 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1097 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 1097 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1098 else 1098 else
1099 want = CEPH_CAP_FILE_BUFFER; 1099 want = CEPH_CAP_FILE_BUFFER;
1100 1100
1101 dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 1101 dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
1102 inode, ceph_vinop(inode), pos, len, inode->i_size); 1102 inode, ceph_vinop(inode), pos, len, inode->i_size);
1103 r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); 1103 r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len);
1104 if (r < 0) 1104 if (r < 0)
1105 return r; 1105 return r;
1106 dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", 1106 dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n",
1107 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); 1107 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
1108 if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { 1108 if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) {
1109 ceph_put_cap_refs(ci, got); 1109 ceph_put_cap_refs(ci, got);
1110 return -EAGAIN; 1110 return -EAGAIN;
1111 } 1111 }
1112 1112
1113 do { 1113 do {
1114 /* get a page */ 1114 /* get a page */
1115 page = grab_cache_page_write_begin(mapping, index, 0); 1115 page = grab_cache_page_write_begin(mapping, index, 0);
1116 if (!page) { 1116 if (!page) {
1117 r = -ENOMEM; 1117 r = -ENOMEM;
1118 break; 1118 break;
1119 } 1119 }
1120 1120
1121 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1121 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1122 inode, page, (int)pos, (int)len); 1122 inode, page, (int)pos, (int)len);
1123 1123
1124 r = ceph_update_writeable_page(file, pos, len, page); 1124 r = ceph_update_writeable_page(file, pos, len, page);
1125 if (r) 1125 if (r)
1126 page_cache_release(page); 1126 page_cache_release(page);
1127 } while (r == -EAGAIN); 1127 } while (r == -EAGAIN);
1128 1128
1129 if (r) { 1129 if (r) {
1130 ceph_put_cap_refs(ci, got); 1130 ceph_put_cap_refs(ci, got);
1131 } else { 1131 } else {
1132 *pagep = page; 1132 *pagep = page;
1133 *(int *)fsdata = got; 1133 *(int *)fsdata = got;
1134 } 1134 }
1135 return r; 1135 return r;
1136 } 1136 }
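
Note the int-in-pointer trick: write_begin stores the acquired cap bits
directly in the fsdata slot, and write_end (below) recovers them with a
cast. A userspace sketch of the round trip; it leans on the same implicit
assumptions as the code above (a pointer at least as wide as int, and a
little-endian layout so the low bytes line up):

#include <stdio.h>

static int demo_write_begin(void **fsdata)
{
        int got = 0x5;                  /* pretend cap bits */

        *(int *)fsdata = got;           /* stash the int in the pointer slot */
        return 0;
}

static void demo_write_end(void *fsdata)
{
        int got = (int)(unsigned long)fsdata;  /* recover the int by value */

        printf("got caps back: %#x\n", got);
}

int main(void)
{
        void *fsdata = NULL;            /* zeroed so unwritten bytes stay 0 */

        demo_write_begin(&fsdata);
        demo_write_end(fsdata);
        return 0;
}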
1137 1137
1138 /* 1138 /*
1139 * we don't do anything in here that simple_write_end doesn't do 1139 * we don't do anything in here that simple_write_end doesn't do
1140 * except adjust dirty page accounting and drop read lock on 1140 * except adjust dirty page accounting and drop read lock on
1141 * mdsc->snap_rwsem. 1141 * mdsc->snap_rwsem.
1142 */ 1142 */
1143 static int ceph_write_end(struct file *file, struct address_space *mapping, 1143 static int ceph_write_end(struct file *file, struct address_space *mapping,
1144 loff_t pos, unsigned len, unsigned copied, 1144 loff_t pos, unsigned len, unsigned copied,
1145 struct page *page, void *fsdata) 1145 struct page *page, void *fsdata)
1146 { 1146 {
1147 struct inode *inode = file->f_dentry->d_inode; 1147 struct inode *inode = file->f_dentry->d_inode;
1148 struct ceph_inode_info *ci = ceph_inode(inode); 1148 struct ceph_inode_info *ci = ceph_inode(inode);
1149 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1149 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1150 struct ceph_mds_client *mdsc = fsc->mdsc; 1150 struct ceph_mds_client *mdsc = fsc->mdsc;
1151 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1151 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1152 int check_cap = 0; 1152 int check_cap = 0;
1153 int got = (unsigned long)fsdata; 1153 int got = (unsigned long)fsdata;
1154 1154
1155 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, 1155 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1156 inode, page, (int)pos, (int)copied, (int)len); 1156 inode, page, (int)pos, (int)copied, (int)len);
1157 1157
1158 /* zero the stale part of the page if we did a short copy */ 1158 /* zero the stale part of the page if we did a short copy */
1159 if (copied < len) 1159 if (copied < len)
1160 zero_user_segment(page, from+copied, len); 1160 zero_user_segment(page, from+copied, len);
1161 1161
1162 /* did file size increase? */ 1162 /* did file size increase? */
1163 /* (no need for i_size_read(); the caller holds i_mutex) */ 1163 /* (no need for i_size_read(); the caller holds i_mutex) */
1164 if (pos+copied > inode->i_size) 1164 if (pos+copied > inode->i_size)
1165 check_cap = ceph_inode_set_size(inode, pos+copied); 1165 check_cap = ceph_inode_set_size(inode, pos+copied);
1166 1166
1167 if (!PageUptodate(page)) 1167 if (!PageUptodate(page))
1168 SetPageUptodate(page); 1168 SetPageUptodate(page);
1169 1169
1170 set_page_dirty(page); 1170 set_page_dirty(page);
1171 1171
1172 unlock_page(page); 1172 unlock_page(page);
1173 up_read(&mdsc->snap_rwsem); 1173 up_read(&mdsc->snap_rwsem);
1174 page_cache_release(page); 1174 page_cache_release(page);
1175 1175
1176 if (copied > 0) { 1176 if (copied > 0) {
1177 int dirty; 1177 int dirty;
1178 spin_lock(&ci->i_ceph_lock); 1178 spin_lock(&ci->i_ceph_lock);
1179 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1179 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
1180 spin_unlock(&ci->i_ceph_lock); 1180 spin_unlock(&ci->i_ceph_lock);
1181 if (dirty) 1181 if (dirty)
1182 __mark_inode_dirty(inode, dirty); 1182 __mark_inode_dirty(inode, dirty);
1183 } 1183 }
1184 1184
1185 dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", 1185 dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
1186 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); 1186 inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
1187 ceph_put_cap_refs(ci, got); 1187 ceph_put_cap_refs(ci, got);
1188 1188
1189 if (check_cap) 1189 if (check_cap)
1190 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); 1190 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1191 1191
1192 return copied; 1192 return copied;
1193 } 1193 }
1194 1194
1195 /* 1195 /*
1196 * we set .direct_IO to indicate direct io is supported, but since we 1196 * we set .direct_IO to indicate direct io is supported, but since we
1197 * intercept O_DIRECT reads and writes early, this function should 1197 * intercept O_DIRECT reads and writes early, this function should
1198 * never get called. 1198 * never get called.
1199 */ 1199 */
1200 static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, 1200 static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1201 const struct iovec *iov, 1201 const struct iovec *iov,
1202 loff_t pos, unsigned long nr_segs) 1202 loff_t pos, unsigned long nr_segs)
1203 { 1203 {
1204 WARN_ON(1); 1204 WARN_ON(1);
1205 return -EINVAL; 1205 return -EINVAL;
1206 } 1206 }
1207 1207
1208 const struct address_space_operations ceph_aops = { 1208 const struct address_space_operations ceph_aops = {
1209 .readpage = ceph_readpage, 1209 .readpage = ceph_readpage,
1210 .readpages = ceph_readpages, 1210 .readpages = ceph_readpages,
1211 .writepage = ceph_writepage, 1211 .writepage = ceph_writepage,
1212 .writepages = ceph_writepages_start, 1212 .writepages = ceph_writepages_start,
1213 .write_begin = ceph_write_begin, 1213 .write_begin = ceph_write_begin,
1214 .write_end = ceph_write_end, 1214 .write_end = ceph_write_end,
1215 .set_page_dirty = ceph_set_page_dirty, 1215 .set_page_dirty = ceph_set_page_dirty,
1216 .invalidatepage = ceph_invalidatepage, 1216 .invalidatepage = ceph_invalidatepage,
1217 .releasepage = ceph_releasepage, 1217 .releasepage = ceph_releasepage,
1218 .direct_IO = ceph_direct_io, 1218 .direct_IO = ceph_direct_io,
1219 }; 1219 };
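
The VFS never calls these functions by name; it dispatches through the ops
table. A toy userspace sketch of that indirection (demo_aops is
illustrative, not the real address_space_operations):

#include <stdio.h>

struct demo_aops {
        int (*writepage)(int pgindex);
};

static int demo_writepage(int pgindex)
{
        printf("writing page %d\n", pgindex);
        return 0;
}

static const struct demo_aops demo_ops = {
        .writepage = demo_writepage,    /* wired up like ceph_aops above */
};

int main(void)
{
        demo_ops.writepage(42);         /* VFS-style indirect call */
        return 0;
}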
1220 1220
1221 1221
1222 /* 1222 /*
1223 * vm ops 1223 * vm ops
1224 */ 1224 */
1225 1225
1226 /* 1226 /*
1227 * Reuse write_begin here for simplicity. 1227 * Reuse write_begin here for simplicity.
1228 */ 1228 */
1229 static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1229 static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1230 { 1230 {
1231 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1231 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1232 struct page *page = vmf->page; 1232 struct page *page = vmf->page;
1233 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1233 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1234 loff_t off = page_offset(page); 1234 loff_t off = page_offset(page);
1235 loff_t size, len; 1235 loff_t size, len;
1236 int ret; 1236 int ret;
1237 1237
1238 /* Update time before taking page lock */ 1238 /* Update time before taking page lock */
1239 file_update_time(vma->vm_file); 1239 file_update_time(vma->vm_file);
1240 1240
1241 size = i_size_read(inode); 1241 size = i_size_read(inode);
1242 if (off + PAGE_CACHE_SIZE <= size) 1242 if (off + PAGE_CACHE_SIZE <= size)
1243 len = PAGE_CACHE_SIZE; 1243 len = PAGE_CACHE_SIZE;
1244 else 1244 else
1245 len = size & ~PAGE_CACHE_MASK; 1245 len = size & ~PAGE_CACHE_MASK;
1246 1246
1247 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, 1247 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1248 off, len, page, page->index); 1248 off, len, page, page->index);
1249 1249
1250 lock_page(page); 1250 lock_page(page);
1251 1251
1252 ret = VM_FAULT_NOPAGE; 1252 ret = VM_FAULT_NOPAGE;
1253 if ((off > size) || 1253 if ((off > size) ||
1254 (page->mapping != inode->i_mapping)) 1254 (page->mapping != inode->i_mapping))
1255 goto out; 1255 goto out;
1256 1256
1257 ret = ceph_update_writeable_page(vma->vm_file, off, len, page); 1257 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1258 if (ret == 0) { 1258 if (ret == 0) {
1259 /* success. we'll keep the page locked. */ 1259 /* success. we'll keep the page locked. */
1260 set_page_dirty(page); 1260 set_page_dirty(page);
1261 up_read(&mdsc->snap_rwsem); 1261 up_read(&mdsc->snap_rwsem);
1262 ret = VM_FAULT_LOCKED; 1262 ret = VM_FAULT_LOCKED;
1263 } else { 1263 } else {
1264 if (ret == -ENOMEM) 1264 if (ret == -ENOMEM)
1265 ret = VM_FAULT_OOM; 1265 ret = VM_FAULT_OOM;
1266 else 1266 else
1267 ret = VM_FAULT_SIGBUS; 1267 ret = VM_FAULT_SIGBUS;
1268 } 1268 }
1269 out: 1269 out:
1270 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); 1270 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1271 if (ret != VM_FAULT_LOCKED) 1271 if (ret != VM_FAULT_LOCKED)
1272 unlock_page(page); 1272 unlock_page(page);
1273 return ret; 1273 return ret;
1274 } 1274 }
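
The length computed above is a full page unless the page straddles EOF, in
which case only the bytes below i_size are made writable. A userspace
sketch of that computation, assuming 4 KB pages and hypothetical values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint64_t page_size = 4096;
        uint64_t size = 10000;            /* hypothetical i_size */
        uint64_t off = 8192;              /* page_offset(page) */
        uint64_t len;

        if (off + page_size <= size)
                len = page_size;              /* page wholly below EOF */
        else
                len = size & (page_size - 1); /* partial tail: 1808 bytes */

        printf("len=%llu\n", (unsigned long long)len);
        return 0;
}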
1275 1275
1276 static struct vm_operations_struct ceph_vmops = { 1276 static struct vm_operations_struct ceph_vmops = {
1277 .fault = filemap_fault, 1277 .fault = filemap_fault,
1278 .page_mkwrite = ceph_page_mkwrite, 1278 .page_mkwrite = ceph_page_mkwrite,
1279 .remap_pages = generic_file_remap_pages, 1279 .remap_pages = generic_file_remap_pages,
1280 }; 1280 };
1281 1281
1282 int ceph_mmap(struct file *file, struct vm_area_struct *vma) 1282 int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1283 { 1283 {
1284 struct address_space *mapping = file->f_mapping; 1284 struct address_space *mapping = file->f_mapping;
1285 1285
1286 if (!mapping->a_ops->readpage) 1286 if (!mapping->a_ops->readpage)
1287 return -ENOEXEC; 1287 return -ENOEXEC;
1288 file_accessed(file); 1288 file_accessed(file);
1289 vma->vm_ops = &ceph_vmops; 1289 vma->vm_ops = &ceph_vmops;
1290 return 0; 1290 return 0;
1291 } 1291 }
1292 1292
include/linux/ceph/osd_client.h
1 #ifndef _FS_CEPH_OSD_CLIENT_H 1 #ifndef _FS_CEPH_OSD_CLIENT_H
2 #define _FS_CEPH_OSD_CLIENT_H 2 #define _FS_CEPH_OSD_CLIENT_H
3 3
4 #include <linux/completion.h> 4 #include <linux/completion.h>
5 #include <linux/kref.h> 5 #include <linux/kref.h>
6 #include <linux/mempool.h> 6 #include <linux/mempool.h>
7 #include <linux/rbtree.h> 7 #include <linux/rbtree.h>
8 8
9 #include <linux/ceph/types.h> 9 #include <linux/ceph/types.h>
10 #include <linux/ceph/osdmap.h> 10 #include <linux/ceph/osdmap.h>
11 #include <linux/ceph/messenger.h> 11 #include <linux/ceph/messenger.h>
12 #include <linux/ceph/auth.h> 12 #include <linux/ceph/auth.h>
13 #include <linux/ceph/pagelist.h> 13 #include <linux/ceph/pagelist.h>
14 14
15 /* 15 /*
16 * Maximum object name size 16 * Maximum object name size
17 * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) 17 * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100)
18 */ 18 */
19 #define MAX_OBJ_NAME_SIZE 100 19 #define MAX_OBJ_NAME_SIZE 100
20 20
21 struct ceph_msg; 21 struct ceph_msg;
22 struct ceph_snap_context; 22 struct ceph_snap_context;
23 struct ceph_osd_request; 23 struct ceph_osd_request;
24 struct ceph_osd_client; 24 struct ceph_osd_client;
25 struct ceph_authorizer; 25 struct ceph_authorizer;
26 26
27 /* 27 /*
28 * completion callback for async writepages 28 * completion callback for async writepages
29 */ 29 */
30 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, 30 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
31 struct ceph_msg *); 31 struct ceph_msg *);
32 32
33 /* a given osd we're communicating with */ 33 /* a given osd we're communicating with */
34 struct ceph_osd { 34 struct ceph_osd {
35 atomic_t o_ref; 35 atomic_t o_ref;
36 struct ceph_osd_client *o_osdc; 36 struct ceph_osd_client *o_osdc;
37 int o_osd; 37 int o_osd;
38 int o_incarnation; 38 int o_incarnation;
39 struct rb_node o_node; 39 struct rb_node o_node;
40 struct ceph_connection o_con; 40 struct ceph_connection o_con;
41 struct list_head o_requests; 41 struct list_head o_requests;
42 struct list_head o_linger_requests; 42 struct list_head o_linger_requests;
43 struct list_head o_osd_lru; 43 struct list_head o_osd_lru;
44 struct ceph_auth_handshake o_auth; 44 struct ceph_auth_handshake o_auth;
45 unsigned long lru_ttl; 45 unsigned long lru_ttl;
46 int o_marked_for_keepalive; 46 int o_marked_for_keepalive;
47 struct list_head o_keepalive_item; 47 struct list_head o_keepalive_item;
48 }; 48 };
49 49
50 /* an in-flight request */ 50 /* an in-flight request */
51 struct ceph_osd_request { 51 struct ceph_osd_request {
52 u64 r_tid; /* unique for this client */ 52 u64 r_tid; /* unique for this client */
53 struct rb_node r_node; 53 struct rb_node r_node;
54 struct list_head r_req_lru_item; 54 struct list_head r_req_lru_item;
55 struct list_head r_osd_item; 55 struct list_head r_osd_item;
56 struct list_head r_linger_item; 56 struct list_head r_linger_item;
57 struct list_head r_linger_osd; 57 struct list_head r_linger_osd;
58 struct ceph_osd *r_osd; 58 struct ceph_osd *r_osd;
59 struct ceph_pg r_pgid; 59 struct ceph_pg r_pgid;
60 int r_pg_osds[CEPH_PG_MAX_SIZE]; 60 int r_pg_osds[CEPH_PG_MAX_SIZE];
61 int r_num_pg_osds; 61 int r_num_pg_osds;
62 62
63 struct ceph_connection *r_con_filling_msg; 63 struct ceph_connection *r_con_filling_msg;
64 64
65 struct ceph_msg *r_request, *r_reply; 65 struct ceph_msg *r_request, *r_reply;
66 int r_result; 66 int r_result;
67 int r_flags; /* any additional flags for the osd */ 67 int r_flags; /* any additional flags for the osd */
68 u32 r_sent; /* >0 if r_request is sending/sent */ 68 u32 r_sent; /* >0 if r_request is sending/sent */
69 int r_got_reply; 69 int r_got_reply;
70 int r_linger; 70 int r_linger;
71 71
72 struct ceph_osd_client *r_osdc; 72 struct ceph_osd_client *r_osdc;
73 struct kref r_kref; 73 struct kref r_kref;
74 bool r_mempool; 74 bool r_mempool;
75 struct completion r_completion, r_safe_completion; 75 struct completion r_completion, r_safe_completion;
76 ceph_osdc_callback_t r_callback, r_safe_callback; 76 ceph_osdc_callback_t r_callback, r_safe_callback;
77 struct ceph_eversion r_reassert_version; 77 struct ceph_eversion r_reassert_version;
78 struct list_head r_unsafe_item; 78 struct list_head r_unsafe_item;
79 79
80 struct inode *r_inode; /* for use by callbacks */ 80 struct inode *r_inode; /* for use by callbacks */
81 void *r_priv; /* ditto */ 81 void *r_priv; /* ditto */
82 82
83 char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ 83 char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */
84 int r_oid_len; 84 int r_oid_len;
85 unsigned long r_stamp; /* send OR check time */ 85 unsigned long r_stamp; /* send OR check time */
86 86
87 struct ceph_file_layout r_file_layout; 87 struct ceph_file_layout r_file_layout;
88 struct ceph_snap_context *r_snapc; /* snap context for writes */ 88 struct ceph_snap_context *r_snapc; /* snap context for writes */
89 unsigned r_num_pages; /* size of page array (follows) */ 89 unsigned r_num_pages; /* size of page array (follows) */
90 unsigned r_page_alignment; /* io offset in first page */ 90 unsigned r_page_alignment; /* io offset in first page */
91 struct page **r_pages; /* pages for data payload */ 91 struct page **r_pages; /* pages for data payload */
92 int r_pages_from_pool; 92 int r_pages_from_pool;
93 int r_own_pages; /* if true, i own page list */ 93 int r_own_pages; /* if true, i own page list */
94 #ifdef CONFIG_BLOCK 94 #ifdef CONFIG_BLOCK
95 struct bio *r_bio; /* instead of pages */ 95 struct bio *r_bio; /* instead of pages */
96 #endif 96 #endif
97 97
98 struct ceph_pagelist r_trail; /* trailing part of the data */ 98 struct ceph_pagelist r_trail; /* trailing part of the data */
99 }; 99 };
100 100
101 struct ceph_osd_event { 101 struct ceph_osd_event {
102 u64 cookie; 102 u64 cookie;
103 int one_shot; 103 int one_shot;
104 struct ceph_osd_client *osdc; 104 struct ceph_osd_client *osdc;
105 void (*cb)(u64, u64, u8, void *); 105 void (*cb)(u64, u64, u8, void *);
106 void *data; 106 void *data;
107 struct rb_node node; 107 struct rb_node node;
108 struct list_head osd_node; 108 struct list_head osd_node;
109 struct kref kref; 109 struct kref kref;
110 struct completion completion; 110 struct completion completion;
111 }; 111 };
112 112
113 struct ceph_osd_event_work { 113 struct ceph_osd_event_work {
114 struct work_struct work; 114 struct work_struct work;
115 struct ceph_osd_event *event; 115 struct ceph_osd_event *event;
116 u64 ver; 116 u64 ver;
117 u64 notify_id; 117 u64 notify_id;
118 u8 opcode; 118 u8 opcode;
119 }; 119 };
120 120
121 struct ceph_osd_client { 121 struct ceph_osd_client {
122 struct ceph_client *client; 122 struct ceph_client *client;
123 123
124 struct ceph_osdmap *osdmap; /* current map */ 124 struct ceph_osdmap *osdmap; /* current map */
125 struct rw_semaphore map_sem; 125 struct rw_semaphore map_sem;
126 struct completion map_waiters; 126 struct completion map_waiters;
127 u64 last_requested_map; 127 u64 last_requested_map;
128 128
129 struct mutex request_mutex; 129 struct mutex request_mutex;
130 struct rb_root osds; /* osds */ 130 struct rb_root osds; /* osds */
131 struct list_head osd_lru; /* idle osds */ 131 struct list_head osd_lru; /* idle osds */
132 u64 timeout_tid; /* tid of timeout triggering rq */ 132 u64 timeout_tid; /* tid of timeout triggering rq */
133 u64 last_tid; /* tid of last request */ 133 u64 last_tid; /* tid of last request */
134 struct rb_root requests; /* pending requests */ 134 struct rb_root requests; /* pending requests */
135 struct list_head req_lru; /* in-flight lru */ 135 struct list_head req_lru; /* in-flight lru */
136 struct list_head req_unsent; /* unsent/need-resend queue */ 136 struct list_head req_unsent; /* unsent/need-resend queue */
137 struct list_head req_notarget; /* map to no osd */ 137 struct list_head req_notarget; /* map to no osd */
138 struct list_head req_linger; /* lingering requests */ 138 struct list_head req_linger; /* lingering requests */
139 int num_requests; 139 int num_requests;
140 struct delayed_work timeout_work; 140 struct delayed_work timeout_work;
141 struct delayed_work osds_timeout_work; 141 struct delayed_work osds_timeout_work;
142 #ifdef CONFIG_DEBUG_FS 142 #ifdef CONFIG_DEBUG_FS
143 struct dentry *debugfs_file; 143 struct dentry *debugfs_file;
144 #endif 144 #endif
145 145
146 mempool_t *req_mempool; 146 mempool_t *req_mempool;
147 147
148 struct ceph_msgpool msgpool_op; 148 struct ceph_msgpool msgpool_op;
149 struct ceph_msgpool msgpool_op_reply; 149 struct ceph_msgpool msgpool_op_reply;
150 150
151 spinlock_t event_lock; 151 spinlock_t event_lock;
152 struct rb_root event_tree; 152 struct rb_root event_tree;
153 u64 event_count; 153 u64 event_count;
154 154
155 struct workqueue_struct *notify_wq; 155 struct workqueue_struct *notify_wq;
156 }; 156 };
157 157
158 struct ceph_osd_req_op { 158 struct ceph_osd_req_op {
159 u16 op; /* CEPH_OSD_OP_* */ 159 u16 op; /* CEPH_OSD_OP_* */
160 u32 payload_len; 160 u32 payload_len;
161 union { 161 union {
162 struct { 162 struct {
163 u64 offset, length; 163 u64 offset, length;
164 u64 truncate_size; 164 u64 truncate_size;
165 u32 truncate_seq; 165 u32 truncate_seq;
166 } extent; 166 } extent;
167 struct { 167 struct {
168 const char *name; 168 const char *name;
169 const char *val; 169 const char *val;
170 u32 name_len; 170 u32 name_len;
171 u32 value_len; 171 u32 value_len;
172 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 172 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
173 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 173 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
174 } xattr; 174 } xattr;
175 struct { 175 struct {
176 const char *class_name; 176 const char *class_name;
177 const char *method_name; 177 const char *method_name;
178 const char *indata; 178 const char *indata;
179 u32 indata_len; 179 u32 indata_len;
180 __u8 class_len; 180 __u8 class_len;
181 __u8 method_len; 181 __u8 method_len;
182 __u8 argc; 182 __u8 argc;
183 } cls; 183 } cls;
184 struct { 184 struct {
185 u64 cookie; 185 u64 cookie;
186 u64 count; 186 u64 count;
187 } pgls; 187 } pgls;
188 struct { 188 struct {
189 u64 snapid; 189 u64 snapid;
190 } snap; 190 } snap;
191 struct { 191 struct {
192 u64 cookie; 192 u64 cookie;
193 u64 ver; 193 u64 ver;
194 u32 prot_ver; 194 u32 prot_ver;
195 u32 timeout; 195 u32 timeout;
196 __u8 flag; 196 __u8 flag;
197 } watch; 197 } watch;
198 }; 198 };
199 }; 199 };
200 200
201 extern int ceph_osdc_init(struct ceph_osd_client *osdc, 201 extern int ceph_osdc_init(struct ceph_osd_client *osdc,
202 struct ceph_client *client); 202 struct ceph_client *client);
203 extern void ceph_osdc_stop(struct ceph_osd_client *osdc); 203 extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
204 204
205 extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, 205 extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
206 struct ceph_msg *msg); 206 struct ceph_msg *msg);
207 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 207 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
208 struct ceph_msg *msg); 208 struct ceph_msg *msg);
209 209
210 extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, 210 extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
211 u64 off, u64 *plen, u64 *bno, 211 u64 off, u64 *plen, u64 *bno,
212 struct ceph_osd_request *req, 212 struct ceph_osd_request *req,
213 struct ceph_osd_req_op *op); 213 struct ceph_osd_req_op *op);
214 214
215 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 215 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
216 struct ceph_snap_context *snapc, 216 struct ceph_snap_context *snapc,
217 unsigned int num_op, 217 unsigned int num_op,
218 bool use_mempool, 218 bool use_mempool,
219 gfp_t gfp_flags); 219 gfp_t gfp_flags);
220 220
221 extern void ceph_osdc_build_request(struct ceph_osd_request *req, 221 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
222 u64 off, u64 len, 222 u64 off, u64 len,
223 unsigned int num_op, 223 unsigned int num_op,
224 struct ceph_osd_req_op *src_ops, 224 struct ceph_osd_req_op *src_ops,
225 struct ceph_snap_context *snapc, 225 struct ceph_snap_context *snapc,
226 u64 snap_id, 226 u64 snap_id,
227 struct timespec *mtime); 227 struct timespec *mtime);
228 228
229 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 229 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
230 struct ceph_file_layout *layout, 230 struct ceph_file_layout *layout,
231 struct ceph_vino vino, 231 struct ceph_vino vino,
232 u64 offset, u64 *len, int op, int flags, 232 u64 offset, u64 *len, int op, int flags,
233 struct ceph_snap_context *snapc, 233 struct ceph_snap_context *snapc,
234 int do_sync, u32 truncate_seq, 234 int do_sync, u32 truncate_seq,
235 u64 truncate_size, 235 u64 truncate_size,
236 struct timespec *mtime, 236 struct timespec *mtime,
237 bool use_mempool, int num_reply, 237 bool use_mempool, int num_reply,
238 int page_align); 238 int page_align);
239 239
240 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 240 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
241 struct ceph_osd_request *req); 241 struct ceph_osd_request *req);
242 extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, 242 extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *req); 243 struct ceph_osd_request *req);
244 244
245 static inline void ceph_osdc_get_request(struct ceph_osd_request *req) 245 static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
246 { 246 {
247 kref_get(&req->r_kref); 247 kref_get(&req->r_kref);
248 } 248 }
249 extern void ceph_osdc_release_request(struct kref *kref); 249 extern void ceph_osdc_release_request(struct kref *kref);
250 static inline void ceph_osdc_put_request(struct ceph_osd_request *req) 250 static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
251 { 251 {
252 kref_put(&req->r_kref, ceph_osdc_release_request); 252 kref_put(&req->r_kref, ceph_osdc_release_request);
253 } 253 }
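
These two inlines implement the usual last-reference-frees pattern around
kref. A self-contained userspace analogue (a plain counter standing in for
the kernel kref API, so a sketch rather than the real thing):

#include <stdio.h>
#include <stdlib.h>

struct demo_req {
        int refcount;
};

static struct demo_req *demo_alloc(void)
{
        struct demo_req *req = calloc(1, sizeof(*req));

        if (req)
                req->refcount = 1;       /* caller owns the initial ref */
        return req;
}

static void demo_get(struct demo_req *req)
{
        req->refcount++;
}

static void demo_put(struct demo_req *req)
{
        if (--req->refcount == 0) {      /* last ref releases the object */
                printf("releasing request\n");
                free(req);
        }
}

int main(void)
{
        struct demo_req *req = demo_alloc();

        demo_get(req);   /* e.g. the osd client holds a ref while in flight */
        demo_put(req);   /* in-flight ref dropped on completion */
        demo_put(req);   /* caller's ref; object freed here */
        return 0;
}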
254 254
255 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, 255 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
256 struct ceph_osd_request *req, 256 struct ceph_osd_request *req,
257 bool nofail); 257 bool nofail);
258 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 258 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
259 struct ceph_osd_request *req); 259 struct ceph_osd_request *req);
260 extern void ceph_osdc_sync(struct ceph_osd_client *osdc); 260 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
261 261
262 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, 262 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
263 struct ceph_vino vino, 263 struct ceph_vino vino,
264 struct ceph_file_layout *layout, 264 struct ceph_file_layout *layout,
265 u64 off, u64 *plen, 265 u64 off, u64 *plen,
266 u32 truncate_seq, u64 truncate_size, 266 u32 truncate_seq, u64 truncate_size,
267 struct page **pages, int nr_pages, 267 struct page **pages, int nr_pages,
268 int page_align); 268 int page_align);
269 269
270 extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, 270 extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
271 struct ceph_vino vino, 271 struct ceph_vino vino,
272 struct ceph_file_layout *layout, 272 struct ceph_file_layout *layout,
273 struct ceph_snap_context *sc, 273 struct ceph_snap_context *sc,
274 u64 off, u64 len, 274 u64 off, u64 len,
275 u32 truncate_seq, u64 truncate_size, 275 u32 truncate_seq, u64 truncate_size,
276 struct timespec *mtime, 276 struct timespec *mtime,
277 struct page **pages, int nr_pages, 277 struct page **pages, int nr_pages,
278 int flags, int do_sync, bool nofail); 278 int flags, int do_sync);
279 279
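
This prototype edit is the point of the commit: the nofail flag leaves the
public interface, and ceph_osdc_writepages() now passes the constant true
to ceph_osdc_start_request() internally. The lone call site changes
correspondingly; a sketch with illustrative argument names matching the
parameters above:

        /* before */
        rc = ceph_osdc_writepages(osdc, vino, layout, snapc, off, len,
                                  truncate_seq, truncate_size, mtime,
                                  pages, nr_pages, flags, do_sync, true);

        /* after */
        rc = ceph_osdc_writepages(osdc, vino, layout, snapc, off, len,
                                  truncate_seq, truncate_size, mtime,
                                  pages, nr_pages, flags, do_sync);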
280 /* watch/notify events */ 280 /* watch/notify events */
281 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 281 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
282 void (*event_cb)(u64, u64, u8, void *), 282 void (*event_cb)(u64, u64, u8, void *),
283 int one_shot, void *data, 283 int one_shot, void *data,
284 struct ceph_osd_event **pevent); 284 struct ceph_osd_event **pevent);
285 extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 285 extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
286 extern int ceph_osdc_wait_event(struct ceph_osd_event *event, 286 extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
287 unsigned long timeout); 287 unsigned long timeout);
288 extern void ceph_osdc_put_event(struct ceph_osd_event *event); 288 extern void ceph_osdc_put_event(struct ceph_osd_event *event);
289 #endif 289 #endif
290 290
291 291
net/ceph/osd_client.c
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/module.h> 3 #include <linux/module.h>
4 #include <linux/err.h> 4 #include <linux/err.h>
5 #include <linux/highmem.h> 5 #include <linux/highmem.h>
6 #include <linux/mm.h> 6 #include <linux/mm.h>
7 #include <linux/pagemap.h> 7 #include <linux/pagemap.h>
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/uaccess.h> 9 #include <linux/uaccess.h>
10 #ifdef CONFIG_BLOCK 10 #ifdef CONFIG_BLOCK
11 #include <linux/bio.h> 11 #include <linux/bio.h>
12 #endif 12 #endif
13 13
14 #include <linux/ceph/libceph.h> 14 #include <linux/ceph/libceph.h>
15 #include <linux/ceph/osd_client.h> 15 #include <linux/ceph/osd_client.h>
16 #include <linux/ceph/messenger.h> 16 #include <linux/ceph/messenger.h>
17 #include <linux/ceph/decode.h> 17 #include <linux/ceph/decode.h>
18 #include <linux/ceph/auth.h> 18 #include <linux/ceph/auth.h>
19 #include <linux/ceph/pagelist.h> 19 #include <linux/ceph/pagelist.h>
20 20
21 #define OSD_OP_FRONT_LEN 4096 21 #define OSD_OP_FRONT_LEN 4096
22 #define OSD_OPREPLY_FRONT_LEN 512 22 #define OSD_OPREPLY_FRONT_LEN 512
23 23
24 static const struct ceph_connection_operations osd_con_ops; 24 static const struct ceph_connection_operations osd_con_ops;
25 25
26 static void send_queued(struct ceph_osd_client *osdc); 26 static void send_queued(struct ceph_osd_client *osdc);
27 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); 27 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
28 static void __register_request(struct ceph_osd_client *osdc, 28 static void __register_request(struct ceph_osd_client *osdc,
29 struct ceph_osd_request *req); 29 struct ceph_osd_request *req);
30 static void __unregister_linger_request(struct ceph_osd_client *osdc, 30 static void __unregister_linger_request(struct ceph_osd_client *osdc,
31 struct ceph_osd_request *req); 31 struct ceph_osd_request *req);
32 static void __send_request(struct ceph_osd_client *osdc, 32 static void __send_request(struct ceph_osd_client *osdc,
33 struct ceph_osd_request *req); 33 struct ceph_osd_request *req);
34 34
35 static int op_has_extent(int op) 35 static int op_has_extent(int op)
36 { 36 {
37 return (op == CEPH_OSD_OP_READ || 37 return (op == CEPH_OSD_OP_READ ||
38 op == CEPH_OSD_OP_WRITE); 38 op == CEPH_OSD_OP_WRITE);
39 } 39 }
40 40
41 int ceph_calc_raw_layout(struct ceph_file_layout *layout, 41 int ceph_calc_raw_layout(struct ceph_file_layout *layout,
42 u64 off, u64 *plen, u64 *bno, 42 u64 off, u64 *plen, u64 *bno,
43 struct ceph_osd_request *req, 43 struct ceph_osd_request *req,
44 struct ceph_osd_req_op *op) 44 struct ceph_osd_req_op *op)
45 { 45 {
46 u64 orig_len = *plen; 46 u64 orig_len = *plen;
47 u64 objoff, objlen; /* extent in object */ 47 u64 objoff, objlen; /* extent in object */
48 int r; 48 int r;
49 49
50 /* object extent? */ 50 /* object extent? */
51 r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, 51 r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
52 &objoff, &objlen); 52 &objoff, &objlen);
53 if (r < 0) 53 if (r < 0)
54 return r; 54 return r;
55 if (objlen < orig_len) { 55 if (objlen < orig_len) {
56 *plen = objlen; 56 *plen = objlen;
57 dout(" skipping last %llu, final file extent %llu~%llu\n", 57 dout(" skipping last %llu, final file extent %llu~%llu\n",
58 orig_len - *plen, off, *plen); 58 orig_len - *plen, off, *plen);
59 } 59 }
60 60
61 if (op_has_extent(op->op)) { 61 if (op_has_extent(op->op)) {
62 u32 osize = le32_to_cpu(layout->fl_object_size); 62 u32 osize = le32_to_cpu(layout->fl_object_size);
63 op->extent.offset = objoff; 63 op->extent.offset = objoff;
64 op->extent.length = objlen; 64 op->extent.length = objlen;
65 if (op->extent.truncate_size <= off - objoff) { 65 if (op->extent.truncate_size <= off - objoff) {
66 op->extent.truncate_size = 0; 66 op->extent.truncate_size = 0;
67 } else { 67 } else {
68 op->extent.truncate_size -= off - objoff; 68 op->extent.truncate_size -= off - objoff;
69 if (op->extent.truncate_size > osize) 69 if (op->extent.truncate_size > osize)
70 op->extent.truncate_size = osize; 70 op->extent.truncate_size = osize;
71 } 71 }
72 } 72 }
73 req->r_num_pages = calc_pages_for(off, *plen); 73 req->r_num_pages = calc_pages_for(off, *plen);
74 req->r_page_alignment = off & ~PAGE_MASK; 74 req->r_page_alignment = off & ~PAGE_MASK;
75 if (op->op == CEPH_OSD_OP_WRITE) 75 if (op->op == CEPH_OSD_OP_WRITE)
76 op->payload_len = *plen; 76 op->payload_len = *plen;
77 77
78 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", 78 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
79 *bno, objoff, objlen, req->r_num_pages); 79 *bno, objoff, objlen, req->r_num_pages);
80 return 0; 80 return 0;
81 } 81 }
82 EXPORT_SYMBOL(ceph_calc_raw_layout); 82 EXPORT_SYMBOL(ceph_calc_raw_layout);
83 83
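
ceph_calc_raw_layout() sizes the page array with calc_pages_for(): the
number of distinct pages a byte range touches. A runnable userspace sketch
of that arithmetic, assuming 4 KB pages (the rounding mirrors what libceph
does, but this helper is a re-implementation, not the kernel one):

#include <stdio.h>
#include <stdint.h>

static unsigned int calc_pages_demo(uint64_t off, uint64_t len)
{
        const unsigned int shift = 12;   /* log2(4096) */

        /* last page index past the range, minus first page index */
        return (unsigned int)(((off + len + 4095) >> shift) - (off >> shift));
}

int main(void)
{
        /* a 300-byte write at offset 10000 stays within one page ... */
        printf("%u\n", calc_pages_demo(10000, 300));   /* 1 */
        /* ... while 8000 bytes starting mid-page span three */
        printf("%u\n", calc_pages_demo(10000, 8000));  /* 3 */
        return 0;
}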
84 /* 84 /*
85 * Implement client access to distributed object storage cluster. 85 * Implement client access to distributed object storage cluster.
86 * 86 *
87 * All data objects are stored within a cluster/cloud of OSDs, or 87 * All data objects are stored within a cluster/cloud of OSDs, or
88 * "object storage devices." (Note that Ceph OSDs have _nothing_ to 88 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
89 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply 89 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
90 * remote daemons serving up and coordinating consistent and safe 90 * remote daemons serving up and coordinating consistent and safe
91 * access to storage. 91 * access to storage.
92 * 92 *
93 * Cluster membership and the mapping of data objects onto storage devices 93 * Cluster membership and the mapping of data objects onto storage devices
94 * are described by the osd map. 94 * are described by the osd map.
95 * 95 *
96 * We keep track of pending OSD requests (read, write), resubmit 96 * We keep track of pending OSD requests (read, write), resubmit
97 * requests to different OSDs when the cluster topology/data layout 97 * requests to different OSDs when the cluster topology/data layout
98 * change, or retry the affected requests when the communications 98 * change, or retry the affected requests when the communications
99 * channel with an OSD is reset. 99 * channel with an OSD is reset.
100 */ 100 */
101 101
102 /* 102 /*
103 * calculate the mapping of a file extent onto an object, and fill out the 103 * calculate the mapping of a file extent onto an object, and fill out the
104 * request accordingly. shorten extent as necessary if it crosses an 104 * request accordingly. shorten extent as necessary if it crosses an
105 * object boundary. 105 * object boundary.
106 * 106 *
107 * fill osd op in request message. 107 * fill osd op in request message.
108 */ 108 */
109 static int calc_layout(struct ceph_vino vino, 109 static int calc_layout(struct ceph_vino vino,
110 struct ceph_file_layout *layout, 110 struct ceph_file_layout *layout,
111 u64 off, u64 *plen, 111 u64 off, u64 *plen,
112 struct ceph_osd_request *req, 112 struct ceph_osd_request *req,
113 struct ceph_osd_req_op *op) 113 struct ceph_osd_req_op *op)
114 { 114 {
115 u64 bno; 115 u64 bno;
116 int r; 116 int r;
117 117
118 r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op); 118 r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op);
119 if (r < 0) 119 if (r < 0)
120 return r; 120 return r;
121 121
122 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); 122 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
123 req->r_oid_len = strlen(req->r_oid); 123 req->r_oid_len = strlen(req->r_oid);
124 124
125 return r; 125 return r;
126 } 126 }
127 127
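
calc_layout() derives the object name from the inode number and the object
(block) number via the "%llx.%08llx" format above. A userspace sketch with
hypothetical values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t ino = 0x10000000000ULL;  /* hypothetical vino.ino */
        uint64_t bno = 2;                 /* third object of the file */
        char oid[100];                    /* cf. MAX_OBJ_NAME_SIZE */

        snprintf(oid, sizeof(oid), "%llx.%08llx",
                 (unsigned long long)ino, (unsigned long long)bno);
        printf("%s\n", oid);              /* -> 10000000000.00000002 */
        return 0;
}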
128 /* 128 /*
129 * requests 129 * requests
130 */ 130 */
131 void ceph_osdc_release_request(struct kref *kref) 131 void ceph_osdc_release_request(struct kref *kref)
132 { 132 {
133 struct ceph_osd_request *req = container_of(kref, 133 struct ceph_osd_request *req = container_of(kref,
134 struct ceph_osd_request, 134 struct ceph_osd_request,
135 r_kref); 135 r_kref);
136 136
137 if (req->r_request) 137 if (req->r_request)
138 ceph_msg_put(req->r_request); 138 ceph_msg_put(req->r_request);
139 if (req->r_con_filling_msg) { 139 if (req->r_con_filling_msg) {
140 dout("%s revoking msg %p from con %p\n", __func__, 140 dout("%s revoking msg %p from con %p\n", __func__,
141 req->r_reply, req->r_con_filling_msg); 141 req->r_reply, req->r_con_filling_msg);
142 ceph_msg_revoke_incoming(req->r_reply); 142 ceph_msg_revoke_incoming(req->r_reply);
143 req->r_con_filling_msg->ops->put(req->r_con_filling_msg); 143 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
144 req->r_con_filling_msg = NULL; 144 req->r_con_filling_msg = NULL;
145 } 145 }
146 if (req->r_reply) 146 if (req->r_reply)
147 ceph_msg_put(req->r_reply); 147 ceph_msg_put(req->r_reply);
148 if (req->r_own_pages) 148 if (req->r_own_pages)
149 ceph_release_page_vector(req->r_pages, 149 ceph_release_page_vector(req->r_pages,
150 req->r_num_pages); 150 req->r_num_pages);
151 ceph_put_snap_context(req->r_snapc); 151 ceph_put_snap_context(req->r_snapc);
152 ceph_pagelist_release(&req->r_trail); 152 ceph_pagelist_release(&req->r_trail);
153 if (req->r_mempool) 153 if (req->r_mempool)
154 mempool_free(req, req->r_osdc->req_mempool); 154 mempool_free(req, req->r_osdc->req_mempool);
155 else 155 else
156 kfree(req); 156 kfree(req);
157 } 157 }
158 EXPORT_SYMBOL(ceph_osdc_release_request); 158 EXPORT_SYMBOL(ceph_osdc_release_request);
159 159
160 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 160 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
161 struct ceph_snap_context *snapc, 161 struct ceph_snap_context *snapc,
162 unsigned int num_op, 162 unsigned int num_op,
163 bool use_mempool, 163 bool use_mempool,
164 gfp_t gfp_flags) 164 gfp_t gfp_flags)
165 { 165 {
166 struct ceph_osd_request *req; 166 struct ceph_osd_request *req;
167 struct ceph_msg *msg; 167 struct ceph_msg *msg;
168 size_t msg_size = sizeof(struct ceph_osd_request_head); 168 size_t msg_size = sizeof(struct ceph_osd_request_head);
169 169
170 msg_size += num_op*sizeof(struct ceph_osd_op); 170 msg_size += num_op*sizeof(struct ceph_osd_op);
171 171
172 if (use_mempool) { 172 if (use_mempool) {
173 req = mempool_alloc(osdc->req_mempool, gfp_flags); 173 req = mempool_alloc(osdc->req_mempool, gfp_flags);
174 memset(req, 0, sizeof(*req)); 174 memset(req, 0, sizeof(*req));
175 } else { 175 } else {
176 req = kzalloc(sizeof(*req), gfp_flags); 176 req = kzalloc(sizeof(*req), gfp_flags);
177 } 177 }
178 if (req == NULL) 178 if (req == NULL)
179 return NULL; 179 return NULL;
180 180
181 req->r_osdc = osdc; 181 req->r_osdc = osdc;
182 req->r_mempool = use_mempool; 182 req->r_mempool = use_mempool;
183 183
184 kref_init(&req->r_kref); 184 kref_init(&req->r_kref);
185 init_completion(&req->r_completion); 185 init_completion(&req->r_completion);
186 init_completion(&req->r_safe_completion); 186 init_completion(&req->r_safe_completion);
187 RB_CLEAR_NODE(&req->r_node); 187 RB_CLEAR_NODE(&req->r_node);
188 INIT_LIST_HEAD(&req->r_unsafe_item); 188 INIT_LIST_HEAD(&req->r_unsafe_item);
189 INIT_LIST_HEAD(&req->r_linger_item); 189 INIT_LIST_HEAD(&req->r_linger_item);
190 INIT_LIST_HEAD(&req->r_linger_osd); 190 INIT_LIST_HEAD(&req->r_linger_osd);
191 INIT_LIST_HEAD(&req->r_req_lru_item); 191 INIT_LIST_HEAD(&req->r_req_lru_item);
192 INIT_LIST_HEAD(&req->r_osd_item); 192 INIT_LIST_HEAD(&req->r_osd_item);
193 193
194 /* create reply message */ 194 /* create reply message */
195 if (use_mempool) 195 if (use_mempool)
196 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 196 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
197 else 197 else
198 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 198 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
199 OSD_OPREPLY_FRONT_LEN, gfp_flags, true); 199 OSD_OPREPLY_FRONT_LEN, gfp_flags, true);
200 if (!msg) { 200 if (!msg) {
201 ceph_osdc_put_request(req); 201 ceph_osdc_put_request(req);
202 return NULL; 202 return NULL;
203 } 203 }
204 req->r_reply = msg; 204 req->r_reply = msg;
205 205
206 ceph_pagelist_init(&req->r_trail); 206 ceph_pagelist_init(&req->r_trail);
207 207
208 /* create request message; allow space for oid */ 208 /* create request message; allow space for oid */
209 msg_size += MAX_OBJ_NAME_SIZE; 209 msg_size += MAX_OBJ_NAME_SIZE;
210 if (snapc) 210 if (snapc)
211 msg_size += sizeof(u64) * snapc->num_snaps; 211 msg_size += sizeof(u64) * snapc->num_snaps;
212 if (use_mempool) 212 if (use_mempool)
213 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 213 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
214 else 214 else
215 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); 215 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
216 if (!msg) { 216 if (!msg) {
217 ceph_osdc_put_request(req); 217 ceph_osdc_put_request(req);
218 return NULL; 218 return NULL;
219 } 219 }
220 220
221 memset(msg->front.iov_base, 0, msg->front.iov_len); 221 memset(msg->front.iov_base, 0, msg->front.iov_len);
222 222
223 req->r_request = msg; 223 req->r_request = msg;
224 224
225 return req; 225 return req;
226 } 226 }
227 EXPORT_SYMBOL(ceph_osdc_alloc_request); 227 EXPORT_SYMBOL(ceph_osdc_alloc_request);
228 228
229 static void osd_req_encode_op(struct ceph_osd_request *req, 229 static void osd_req_encode_op(struct ceph_osd_request *req,
230 struct ceph_osd_op *dst, 230 struct ceph_osd_op *dst,
231 struct ceph_osd_req_op *src) 231 struct ceph_osd_req_op *src)
232 { 232 {
233 dst->op = cpu_to_le16(src->op); 233 dst->op = cpu_to_le16(src->op);
234 234
235 switch (src->op) { 235 switch (src->op) {
236 case CEPH_OSD_OP_READ: 236 case CEPH_OSD_OP_READ:
237 case CEPH_OSD_OP_WRITE: 237 case CEPH_OSD_OP_WRITE:
238 dst->extent.offset = 238 dst->extent.offset =
239 cpu_to_le64(src->extent.offset); 239 cpu_to_le64(src->extent.offset);
240 dst->extent.length = 240 dst->extent.length =
241 cpu_to_le64(src->extent.length); 241 cpu_to_le64(src->extent.length);
242 dst->extent.truncate_size = 242 dst->extent.truncate_size =
243 cpu_to_le64(src->extent.truncate_size); 243 cpu_to_le64(src->extent.truncate_size);
244 dst->extent.truncate_seq = 244 dst->extent.truncate_seq =
245 cpu_to_le32(src->extent.truncate_seq); 245 cpu_to_le32(src->extent.truncate_seq);
246 break; 246 break;
247 247
248 case CEPH_OSD_OP_GETXATTR: 248 case CEPH_OSD_OP_GETXATTR:
249 case CEPH_OSD_OP_SETXATTR: 249 case CEPH_OSD_OP_SETXATTR:
250 case CEPH_OSD_OP_CMPXATTR: 250 case CEPH_OSD_OP_CMPXATTR:
251 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); 251 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
252 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); 252 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
253 dst->xattr.cmp_op = src->xattr.cmp_op; 253 dst->xattr.cmp_op = src->xattr.cmp_op;
254 dst->xattr.cmp_mode = src->xattr.cmp_mode; 254 dst->xattr.cmp_mode = src->xattr.cmp_mode;
255 ceph_pagelist_append(&req->r_trail, src->xattr.name, 255 ceph_pagelist_append(&req->r_trail, src->xattr.name,
256 src->xattr.name_len); 256 src->xattr.name_len);
257 ceph_pagelist_append(&req->r_trail, src->xattr.val, 257 ceph_pagelist_append(&req->r_trail, src->xattr.val,
258 src->xattr.value_len); 258 src->xattr.value_len);
259 break; 259 break;
260 case CEPH_OSD_OP_CALL: 260 case CEPH_OSD_OP_CALL:
261 dst->cls.class_len = src->cls.class_len; 261 dst->cls.class_len = src->cls.class_len;
262 dst->cls.method_len = src->cls.method_len; 262 dst->cls.method_len = src->cls.method_len;
263 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len); 263 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
264 264
265 ceph_pagelist_append(&req->r_trail, src->cls.class_name, 265 ceph_pagelist_append(&req->r_trail, src->cls.class_name,
266 src->cls.class_len); 266 src->cls.class_len);
267 ceph_pagelist_append(&req->r_trail, src->cls.method_name, 267 ceph_pagelist_append(&req->r_trail, src->cls.method_name,
268 src->cls.method_len); 268 src->cls.method_len);
269 ceph_pagelist_append(&req->r_trail, src->cls.indata, 269 ceph_pagelist_append(&req->r_trail, src->cls.indata,
270 src->cls.indata_len); 270 src->cls.indata_len);
271 break; 271 break;
272 case CEPH_OSD_OP_ROLLBACK: 272 case CEPH_OSD_OP_ROLLBACK:
273 dst->snap.snapid = cpu_to_le64(src->snap.snapid); 273 dst->snap.snapid = cpu_to_le64(src->snap.snapid);
274 break; 274 break;
275 case CEPH_OSD_OP_STARTSYNC: 275 case CEPH_OSD_OP_STARTSYNC:
276 break; 276 break;
277 case CEPH_OSD_OP_NOTIFY: 277 case CEPH_OSD_OP_NOTIFY:
278 { 278 {
279 __le32 prot_ver = cpu_to_le32(src->watch.prot_ver); 279 __le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
280 __le32 timeout = cpu_to_le32(src->watch.timeout); 280 __le32 timeout = cpu_to_le32(src->watch.timeout);
281 281
282 ceph_pagelist_append(&req->r_trail, 282 ceph_pagelist_append(&req->r_trail,
283 &prot_ver, sizeof(prot_ver)); 283 &prot_ver, sizeof(prot_ver));
284 ceph_pagelist_append(&req->r_trail, 284 ceph_pagelist_append(&req->r_trail,
285 &timeout, sizeof(timeout)); 285 &timeout, sizeof(timeout));
286 } 286 }
287 case CEPH_OSD_OP_NOTIFY_ACK: 287 case CEPH_OSD_OP_NOTIFY_ACK:
288 case CEPH_OSD_OP_WATCH: 288 case CEPH_OSD_OP_WATCH:
289 dst->watch.cookie = cpu_to_le64(src->watch.cookie); 289 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
290 dst->watch.ver = cpu_to_le64(src->watch.ver); 290 dst->watch.ver = cpu_to_le64(src->watch.ver);
291 dst->watch.flag = src->watch.flag; 291 dst->watch.flag = src->watch.flag;
292 break; 292 break;
293 default: 293 default:
294 pr_err("unrecognized osd opcode %d\n", dst->op); 294 pr_err("unrecognized osd opcode %d\n", dst->op);
295 WARN_ON(1); 295 WARN_ON(1);
296 break; 296 break;
297 } 297 }
298 dst->payload_len = cpu_to_le32(src->payload_len); 298 dst->payload_len = cpu_to_le32(src->payload_len);
299 } 299 }
300 300
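
Every multi-byte field encoded above goes through cpu_to_le16/32/64, so the
wire format is little-endian regardless of the host CPU. A userspace sketch
of what that guarantees (put_le64 is an illustrative re-implementation, not
the kernel helper):

#include <stdio.h>
#include <stdint.h>

static void put_le64(unsigned char *p, uint64_t v)
{
        int i;

        for (i = 0; i < 8; i++)
                p[i] = (unsigned char)(v >> (8 * i));  /* LSB first */
}

int main(void)
{
        unsigned char buf[8];

        put_le64(buf, 0x1122334455667788ULL);
        /* prints "88 77 ... 11" on any host endianness */
        printf("%02x %02x ... %02x\n", buf[0], buf[1], buf[7]);
        return 0;
}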
301 /* 301 /*
302 * build new request AND message 302 * build new request AND message
303 * 303 *
304 */ 304 */
305 void ceph_osdc_build_request(struct ceph_osd_request *req, 305 void ceph_osdc_build_request(struct ceph_osd_request *req,
306 u64 off, u64 len, unsigned int num_op, 306 u64 off, u64 len, unsigned int num_op,
307 struct ceph_osd_req_op *src_ops, 307 struct ceph_osd_req_op *src_ops,
308 struct ceph_snap_context *snapc, u64 snap_id, 308 struct ceph_snap_context *snapc, u64 snap_id,
309 struct timespec *mtime) 309 struct timespec *mtime)
310 { 310 {
311 struct ceph_msg *msg = req->r_request; 311 struct ceph_msg *msg = req->r_request;
312 struct ceph_osd_request_head *head; 312 struct ceph_osd_request_head *head;
313 struct ceph_osd_req_op *src_op; 313 struct ceph_osd_req_op *src_op;
314 struct ceph_osd_op *op; 314 struct ceph_osd_op *op;
315 void *p; 315 void *p;
316 size_t msg_size = sizeof(*head) + num_op*sizeof(*op); 316 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
317 int flags = req->r_flags; 317 int flags = req->r_flags;
318 u64 data_len = 0; 318 u64 data_len = 0;
319 int i; 319 int i;
320 320
321 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); 321 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
322 322
323 head = msg->front.iov_base; 323 head = msg->front.iov_base;
324 head->snapid = cpu_to_le64(snap_id); 324 head->snapid = cpu_to_le64(snap_id);
325 op = (void *)(head + 1); 325 op = (void *)(head + 1);
326 p = (void *)(op + num_op); 326 p = (void *)(op + num_op);
327 327
328 req->r_snapc = ceph_get_snap_context(snapc); 328 req->r_snapc = ceph_get_snap_context(snapc);
329 329
330 head->client_inc = cpu_to_le32(1); /* always, for now. */ 330 head->client_inc = cpu_to_le32(1); /* always, for now. */
331 head->flags = cpu_to_le32(flags); 331 head->flags = cpu_to_le32(flags);
332 if (flags & CEPH_OSD_FLAG_WRITE) 332 if (flags & CEPH_OSD_FLAG_WRITE)
333 ceph_encode_timespec(&head->mtime, mtime); 333 ceph_encode_timespec(&head->mtime, mtime);
334 BUG_ON(num_op > (unsigned int) ((u16) -1)); 334 BUG_ON(num_op > (unsigned int) ((u16) -1));
335 head->num_ops = cpu_to_le16(num_op); 335 head->num_ops = cpu_to_le16(num_op);
336 336
337 /* fill in oid */ 337 /* fill in oid */
338 head->object_len = cpu_to_le32(req->r_oid_len); 338 head->object_len = cpu_to_le32(req->r_oid_len);
339 memcpy(p, req->r_oid, req->r_oid_len); 339 memcpy(p, req->r_oid, req->r_oid_len);
340 p += req->r_oid_len; 340 p += req->r_oid_len;
341 341
342 src_op = src_ops; 342 src_op = src_ops;
343 while (num_op--) 343 while (num_op--)
344 osd_req_encode_op(req, op++, src_op++); 344 osd_req_encode_op(req, op++, src_op++);
345 345
346 data_len += req->r_trail.length; 346 data_len += req->r_trail.length;
347 347
348 if (snapc) { 348 if (snapc) {
349 head->snap_seq = cpu_to_le64(snapc->seq); 349 head->snap_seq = cpu_to_le64(snapc->seq);
350 head->num_snaps = cpu_to_le32(snapc->num_snaps); 350 head->num_snaps = cpu_to_le32(snapc->num_snaps);
351 for (i = 0; i < snapc->num_snaps; i++) { 351 for (i = 0; i < snapc->num_snaps; i++) {
352 put_unaligned_le64(snapc->snaps[i], p); 352 put_unaligned_le64(snapc->snaps[i], p);
353 p += sizeof(u64); 353 p += sizeof(u64);
354 } 354 }
355 } 355 }
356 356
357 if (flags & CEPH_OSD_FLAG_WRITE) { 357 if (flags & CEPH_OSD_FLAG_WRITE) {
358 req->r_request->hdr.data_off = cpu_to_le16(off); 358 req->r_request->hdr.data_off = cpu_to_le16(off);
359 req->r_request->hdr.data_len = cpu_to_le32(len + data_len); 359 req->r_request->hdr.data_len = cpu_to_le32(len + data_len);
360 } else if (data_len) { 360 } else if (data_len) {
361 req->r_request->hdr.data_off = 0; 361 req->r_request->hdr.data_off = 0;
362 req->r_request->hdr.data_len = cpu_to_le32(data_len); 362 req->r_request->hdr.data_len = cpu_to_le32(data_len);
363 } 363 }
364 364
365 req->r_request->page_alignment = req->r_page_alignment; 365 req->r_request->page_alignment = req->r_page_alignment;
366 366
367 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 367 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
368 msg_size = p - msg->front.iov_base; 368 msg_size = p - msg->front.iov_base;
369 msg->front.iov_len = msg_size; 369 msg->front.iov_len = msg_size;
370 msg->hdr.front_len = cpu_to_le32(msg_size); 370 msg->hdr.front_len = cpu_to_le32(msg_size);
371 return; 371 return;
372 } 372 }
373 EXPORT_SYMBOL(ceph_osdc_build_request); 373 EXPORT_SYMBOL(ceph_osdc_build_request);
374 374
375 /* 375 /*
376 * build new request AND message, calculate layout, and adjust file 376 * build new request AND message, calculate layout, and adjust file
377 * extent as needed. 377 * extent as needed.
378 * 378 *
379 * if the file was recently truncated, we include information about its 379 * if the file was recently truncated, we include information about its
380 * old and new size so that the object can be updated appropriately. (we 380 * old and new size so that the object can be updated appropriately. (we
381 * avoid synchronously deleting truncated objects because it's slow.) 381 * avoid synchronously deleting truncated objects because it's slow.)
382 * 382 *
383 * if @do_sync, include a 'startsync' command so that the osd will flush 383 * if @do_sync, include a 'startsync' command so that the osd will flush
384 * data quickly. 384 * data quickly.
385 */ 385 */
386 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 386 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
387 struct ceph_file_layout *layout, 387 struct ceph_file_layout *layout,
388 struct ceph_vino vino, 388 struct ceph_vino vino,
389 u64 off, u64 *plen, 389 u64 off, u64 *plen,
390 int opcode, int flags, 390 int opcode, int flags,
391 struct ceph_snap_context *snapc, 391 struct ceph_snap_context *snapc,
392 int do_sync, 392 int do_sync,
393 u32 truncate_seq, 393 u32 truncate_seq,
394 u64 truncate_size, 394 u64 truncate_size,
395 struct timespec *mtime, 395 struct timespec *mtime,
396 bool use_mempool, int num_reply, 396 bool use_mempool, int num_reply,
397 int page_align) 397 int page_align)
398 { 398 {
399 struct ceph_osd_req_op ops[2]; 399 struct ceph_osd_req_op ops[2];
400 struct ceph_osd_request *req; 400 struct ceph_osd_request *req;
401 unsigned int num_op = 1; 401 unsigned int num_op = 1;
402 int r; 402 int r;
403 403
404 memset(&ops, 0, sizeof ops); 404 memset(&ops, 0, sizeof ops);
405 405
406 ops[0].op = opcode; 406 ops[0].op = opcode;
407 ops[0].extent.truncate_seq = truncate_seq; 407 ops[0].extent.truncate_seq = truncate_seq;
408 ops[0].extent.truncate_size = truncate_size; 408 ops[0].extent.truncate_size = truncate_size;
409 409
410 if (do_sync) { 410 if (do_sync) {
411 ops[1].op = CEPH_OSD_OP_STARTSYNC; 411 ops[1].op = CEPH_OSD_OP_STARTSYNC;
412 num_op++; 412 num_op++;
413 } 413 }
414 414
415 req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, 415 req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool,
416 GFP_NOFS); 416 GFP_NOFS);
417 if (!req) 417 if (!req)
418 return ERR_PTR(-ENOMEM); 418 return ERR_PTR(-ENOMEM);
419 req->r_flags = flags; 419 req->r_flags = flags;
420 420
421 /* calculate max write size */ 421 /* calculate max write size */
422 r = calc_layout(vino, layout, off, plen, req, ops); 422 r = calc_layout(vino, layout, off, plen, req, ops);
423 if (r < 0) { 423 if (r < 0) {
		ceph_osdc_put_request(req);	/* don't leak req when the layout calculation fails */
424 return ERR_PTR(r); 424 return ERR_PTR(r);
	}
425 req->r_file_layout = *layout; /* keep a copy */ 425 req->r_file_layout = *layout; /* keep a copy */
426 426
427 /* in case it differs from natural (file) alignment that 427 /* in case it differs from natural (file) alignment that
428 calc_layout filled in for us */ 428 calc_layout filled in for us */
429 req->r_num_pages = calc_pages_for(page_align, *plen); 429 req->r_num_pages = calc_pages_for(page_align, *plen);
430 req->r_page_alignment = page_align; 430 req->r_page_alignment = page_align;
431 431
432 ceph_osdc_build_request(req, off, *plen, num_op, ops, 432 ceph_osdc_build_request(req, off, *plen, num_op, ops,
433 snapc, vino.snap, mtime); 433 snapc, vino.snap, mtime);
434 434
435 return req; 435 return req;
436 } 436 }
437 EXPORT_SYMBOL(ceph_osdc_new_request); 437 EXPORT_SYMBOL(ceph_osdc_new_request);
438 438
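ceph_osdc_new_request() reports failure through the pointer itself, ERR_PTR(-ENOMEM) and ERR_PTR(r) above, so a single return value carries either the request or the errno; callers test with IS_ERR() and recover the error with PTR_ERR(). A minimal userspace sketch of that convention (err_ptr/is_err/ptr_err here are illustrative stand-ins for the kernel's ERR_PTR/IS_ERR/PTR_ERR):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long err) { return (void *)err; }
static inline int is_err(const void *p)
{
	/* errno values live in the top page of the address space */
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}
static inline long ptr_err(const void *p) { return (long)p; }

static void *alloc_request(int fail)
{
	if (fail)
		return err_ptr(-ENOMEM);	/* like ERR_PTR(-ENOMEM) */
	return malloc(64);
}

int main(void)
{
	void *req = alloc_request(1);

	if (is_err(req)) {
		printf("alloc failed: %ld\n", ptr_err(req));
		return 1;
	}
	free(req);
	return 0;
}
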
439 /* 439 /*
440 * We keep osd requests in an rbtree, sorted by ->r_tid. 440 * We keep osd requests in an rbtree, sorted by ->r_tid.
441 */ 441 */
442 static void __insert_request(struct ceph_osd_client *osdc, 442 static void __insert_request(struct ceph_osd_client *osdc,
443 struct ceph_osd_request *new) 443 struct ceph_osd_request *new)
444 { 444 {
445 struct rb_node **p = &osdc->requests.rb_node; 445 struct rb_node **p = &osdc->requests.rb_node;
446 struct rb_node *parent = NULL; 446 struct rb_node *parent = NULL;
447 struct ceph_osd_request *req = NULL; 447 struct ceph_osd_request *req = NULL;
448 448
449 while (*p) { 449 while (*p) {
450 parent = *p; 450 parent = *p;
451 req = rb_entry(parent, struct ceph_osd_request, r_node); 451 req = rb_entry(parent, struct ceph_osd_request, r_node);
452 if (new->r_tid < req->r_tid) 452 if (new->r_tid < req->r_tid)
453 p = &(*p)->rb_left; 453 p = &(*p)->rb_left;
454 else if (new->r_tid > req->r_tid) 454 else if (new->r_tid > req->r_tid)
455 p = &(*p)->rb_right; 455 p = &(*p)->rb_right;
456 else 456 else
457 BUG(); 457 BUG();
458 } 458 }
459 459
460 rb_link_node(&new->r_node, parent, p); 460 rb_link_node(&new->r_node, parent, p);
461 rb_insert_color(&new->r_node, &osdc->requests); 461 rb_insert_color(&new->r_node, &osdc->requests);
462 } 462 }
463 463
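The rb_entry() calls in __insert_request() are container_of() in disguise: the rbtree links only the struct rb_node embedded in each request, and pointer arithmetic recovers the enclosing struct, so no separate tree-node allocation is ever needed. A userspace sketch of the recovery step (rb_node_sketch is a stand-in type, not the kernel's struct rb_node):

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct rb_node_sketch { int color; };	/* stand-in for struct rb_node */

struct request {
	unsigned long long r_tid;
	struct rb_node_sketch r_node;	/* embedded, no extra allocation */
};

int main(void)
{
	struct request req = { .r_tid = 42 };
	struct rb_node_sketch *node = &req.r_node;

	/* what rb_entry(node, struct request, r_node) expands to */
	struct request *back = container_of(node, struct request, r_node);

	printf("tid via embedded node: %llu\n", back->r_tid);
	return 0;
}
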
464 static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, 464 static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
465 u64 tid) 465 u64 tid)
466 { 466 {
467 struct ceph_osd_request *req; 467 struct ceph_osd_request *req;
468 struct rb_node *n = osdc->requests.rb_node; 468 struct rb_node *n = osdc->requests.rb_node;
469 469
470 while (n) { 470 while (n) {
471 req = rb_entry(n, struct ceph_osd_request, r_node); 471 req = rb_entry(n, struct ceph_osd_request, r_node);
472 if (tid < req->r_tid) 472 if (tid < req->r_tid)
473 n = n->rb_left; 473 n = n->rb_left;
474 else if (tid > req->r_tid) 474 else if (tid > req->r_tid)
475 n = n->rb_right; 475 n = n->rb_right;
476 else 476 else
477 return req; 477 return req;
478 } 478 }
479 return NULL; 479 return NULL;
480 } 480 }
481 481
482 static struct ceph_osd_request * 482 static struct ceph_osd_request *
483 __lookup_request_ge(struct ceph_osd_client *osdc, 483 __lookup_request_ge(struct ceph_osd_client *osdc,
484 u64 tid) 484 u64 tid)
485 { 485 {
486 struct ceph_osd_request *req; 486 struct ceph_osd_request *req;
487 struct rb_node *n = osdc->requests.rb_node; 487 struct rb_node *n = osdc->requests.rb_node;
488 488
489 while (n) { 489 while (n) {
490 req = rb_entry(n, struct ceph_osd_request, r_node); 490 req = rb_entry(n, struct ceph_osd_request, r_node);
491 if (tid < req->r_tid) { 491 if (tid < req->r_tid) {
492 if (!n->rb_left) 492 if (!n->rb_left)
493 return req; 493 return req;
494 n = n->rb_left; 494 n = n->rb_left;
495 } else if (tid > req->r_tid) { 495 } else if (tid > req->r_tid) {
496 n = n->rb_right; 496 n = n->rb_right;
497 } else { 497 } else {
498 return req; 498 return req;
499 } 499 }
500 } 500 }
501 return NULL; 501 return NULL;
502 } 502 }
503 503
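__lookup_request_ge() answers "first request with tid >= X", which the resend paths use to walk requests in tid order from a starting point. The rbtree descent above is a lower-bound search; a small userspace sketch over a sorted array shows the same semantics (illustrative only):

#include <stdint.h>
#include <stdio.h>

static int lookup_ge(const uint64_t *tids, int n, uint64_t want)
{
	int lo = 0, hi = n;	/* find first index with tids[i] >= want */

	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (tids[mid] < want)
			lo = mid + 1;
		else
			hi = mid;
	}
	return lo < n ? lo : -1;	/* -1 mirrors the NULL return */
}

int main(void)
{
	uint64_t tids[] = { 3, 7, 7, 12, 40 };

	printf("%d\n", lookup_ge(tids, 5, 8));	/* -> index 3 (tid 12) */
	printf("%d\n", lookup_ge(tids, 5, 41));	/* -> -1, nothing >= 41 */
	return 0;
}
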
504 /* 504 /*
505 * Resubmit requests pending on the given osd. 505 * Resubmit requests pending on the given osd.
506 */ 506 */
507 static void __kick_osd_requests(struct ceph_osd_client *osdc, 507 static void __kick_osd_requests(struct ceph_osd_client *osdc,
508 struct ceph_osd *osd) 508 struct ceph_osd *osd)
509 { 509 {
510 struct ceph_osd_request *req, *nreq; 510 struct ceph_osd_request *req, *nreq;
511 int err; 511 int err;
512 512
513 dout("__kick_osd_requests osd%d\n", osd->o_osd); 513 dout("__kick_osd_requests osd%d\n", osd->o_osd);
514 err = __reset_osd(osdc, osd); 514 err = __reset_osd(osdc, osd);
515 if (err) 515 if (err)
516 return; 516 return;
517 517
518 list_for_each_entry(req, &osd->o_requests, r_osd_item) { 518 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
519 list_move(&req->r_req_lru_item, &osdc->req_unsent); 519 list_move(&req->r_req_lru_item, &osdc->req_unsent);
520 dout("requeued %p tid %llu osd%d\n", req, req->r_tid, 520 dout("requeued %p tid %llu osd%d\n", req, req->r_tid,
521 osd->o_osd); 521 osd->o_osd);
522 if (!req->r_linger) 522 if (!req->r_linger)
523 req->r_flags |= CEPH_OSD_FLAG_RETRY; 523 req->r_flags |= CEPH_OSD_FLAG_RETRY;
524 } 524 }
525 525
526 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, 526 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
527 r_linger_osd) { 527 r_linger_osd) {
528 /* 528 /*
529 * reregister request prior to unregistering linger so 529 * reregister request prior to unregistering linger so
530 * that r_osd is preserved. 530 * that r_osd is preserved.
531 */ 531 */
532 BUG_ON(!list_empty(&req->r_req_lru_item)); 532 BUG_ON(!list_empty(&req->r_req_lru_item));
533 __register_request(osdc, req); 533 __register_request(osdc, req);
534 list_add(&req->r_req_lru_item, &osdc->req_unsent); 534 list_add(&req->r_req_lru_item, &osdc->req_unsent);
535 list_add(&req->r_osd_item, &req->r_osd->o_requests); 535 list_add(&req->r_osd_item, &req->r_osd->o_requests);
536 __unregister_linger_request(osdc, req); 536 __unregister_linger_request(osdc, req);
537 dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, 537 dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid,
538 osd->o_osd); 538 osd->o_osd);
539 } 539 }
540 } 540 }
541 541
542 /* 542 /*
543 * If the osd connection drops, we need to resubmit all requests. 543 * If the osd connection drops, we need to resubmit all requests.
544 */ 544 */
545 static void osd_reset(struct ceph_connection *con) 545 static void osd_reset(struct ceph_connection *con)
546 { 546 {
547 struct ceph_osd *osd = con->private; 547 struct ceph_osd *osd = con->private;
548 struct ceph_osd_client *osdc; 548 struct ceph_osd_client *osdc;
549 549
550 if (!osd) 550 if (!osd)
551 return; 551 return;
552 dout("osd_reset osd%d\n", osd->o_osd); 552 dout("osd_reset osd%d\n", osd->o_osd);
553 osdc = osd->o_osdc; 553 osdc = osd->o_osdc;
554 down_read(&osdc->map_sem); 554 down_read(&osdc->map_sem);
555 mutex_lock(&osdc->request_mutex); 555 mutex_lock(&osdc->request_mutex);
556 __kick_osd_requests(osdc, osd); 556 __kick_osd_requests(osdc, osd);
557 mutex_unlock(&osdc->request_mutex); 557 mutex_unlock(&osdc->request_mutex);
558 send_queued(osdc); 558 send_queued(osdc);
559 up_read(&osdc->map_sem); 559 up_read(&osdc->map_sem);
560 } 560 }
561 561
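osd_reset() shows the locking discipline used throughout this file: map_sem is taken for read before request_mutex and released after it, the same order handle_timeout() and ceph_osdc_handle_map() follow, which rules out lock-order deadlocks between map handling and request kicking. A userspace sketch of that ordering, with pthread primitives standing in for the kernel's rwsem and mutex (build with: cc file.c -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_sem = PTHREAD_RWLOCK_INITIALIZER;
static pthread_mutex_t request_mutex = PTHREAD_MUTEX_INITIALIZER;

static void kick_all(void)
{
	pthread_rwlock_rdlock(&map_sem);	/* down_read(&map_sem) */
	pthread_mutex_lock(&request_mutex);
	printf("requeue requests under both locks\n");
	pthread_mutex_unlock(&request_mutex);
	pthread_rwlock_unlock(&map_sem);	/* up_read(&map_sem) */
}

int main(void)
{
	kick_all();
	return 0;
}
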
562 /* 562 /*
563 * Track open sessions with osds. 563 * Track open sessions with osds.
564 */ 564 */
565 static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) 565 static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
566 { 566 {
567 struct ceph_osd *osd; 567 struct ceph_osd *osd;
568 568
569 osd = kzalloc(sizeof(*osd), GFP_NOFS); 569 osd = kzalloc(sizeof(*osd), GFP_NOFS);
570 if (!osd) 570 if (!osd)
571 return NULL; 571 return NULL;
572 572
573 atomic_set(&osd->o_ref, 1); 573 atomic_set(&osd->o_ref, 1);
574 osd->o_osdc = osdc; 574 osd->o_osdc = osdc;
575 osd->o_osd = onum; 575 osd->o_osd = onum;
576 RB_CLEAR_NODE(&osd->o_node); 576 RB_CLEAR_NODE(&osd->o_node);
577 INIT_LIST_HEAD(&osd->o_requests); 577 INIT_LIST_HEAD(&osd->o_requests);
578 INIT_LIST_HEAD(&osd->o_linger_requests); 578 INIT_LIST_HEAD(&osd->o_linger_requests);
579 INIT_LIST_HEAD(&osd->o_osd_lru); 579 INIT_LIST_HEAD(&osd->o_osd_lru);
580 osd->o_incarnation = 1; 580 osd->o_incarnation = 1;
581 581
582 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); 582 ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
583 583
584 INIT_LIST_HEAD(&osd->o_keepalive_item); 584 INIT_LIST_HEAD(&osd->o_keepalive_item);
585 return osd; 585 return osd;
586 } 586 }
587 587
588 static struct ceph_osd *get_osd(struct ceph_osd *osd) 588 static struct ceph_osd *get_osd(struct ceph_osd *osd)
589 { 589 {
590 if (atomic_inc_not_zero(&osd->o_ref)) { 590 if (atomic_inc_not_zero(&osd->o_ref)) {
591 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, 591 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
592 atomic_read(&osd->o_ref)); 592 atomic_read(&osd->o_ref));
593 return osd; 593 return osd;
594 } else { 594 } else {
595 dout("get_osd %p FAIL\n", osd); 595 dout("get_osd %p FAIL\n", osd);
596 return NULL; 596 return NULL;
597 } 597 }
598 } 598 }
599 599
600 static void put_osd(struct ceph_osd *osd) 600 static void put_osd(struct ceph_osd *osd)
601 { 601 {
602 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 602 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
603 atomic_read(&osd->o_ref) - 1); 603 atomic_read(&osd->o_ref) - 1);
604 if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { 604 if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) {
605 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; 605 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
606 606
607 if (ac->ops && ac->ops->destroy_authorizer) 607 if (ac->ops && ac->ops->destroy_authorizer)
608 ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); 608 ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer);
609 kfree(osd); 609 kfree(osd);
610 } 610 }
611 } 611 }
612 612
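get_osd() uses atomic_inc_not_zero() rather than a plain increment: a lookup racing with the final put_osd() must not "resurrect" an osd whose refcount has already hit zero, because that osd is about to be freed. A userspace sketch of the same guarantee using C11 atomics (inc_not_zero here is a hand-rolled stand-in, not the kernel helper):

#include <stdatomic.h>
#include <stdio.h>

static int inc_not_zero(atomic_int *ref)
{
	int old = atomic_load(ref);

	while (old != 0) {
		/* on failure, 'old' is reloaded and the loop retries */
		if (atomic_compare_exchange_weak(ref, &old, old + 1))
			return 1;	/* got a reference */
	}
	return 0;			/* object is already dying */
}

int main(void)
{
	atomic_int ref = 1;

	printf("get: %d (ref now %d)\n", inc_not_zero(&ref),
	       atomic_load(&ref));
	atomic_store(&ref, 0);		/* pretend the remaining puts ran */
	printf("get: %d\n", inc_not_zero(&ref));
	return 0;
}
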
613 /* 613 /*
614 * remove an osd from our map 614 * remove an osd from our map
615 */ 615 */
616 static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 616 static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
617 { 617 {
618 dout("__remove_osd %p\n", osd); 618 dout("__remove_osd %p\n", osd);
619 BUG_ON(!list_empty(&osd->o_requests)); 619 BUG_ON(!list_empty(&osd->o_requests));
620 rb_erase(&osd->o_node, &osdc->osds); 620 rb_erase(&osd->o_node, &osdc->osds);
621 list_del_init(&osd->o_osd_lru); 621 list_del_init(&osd->o_osd_lru);
622 ceph_con_close(&osd->o_con); 622 ceph_con_close(&osd->o_con);
623 put_osd(osd); 623 put_osd(osd);
624 } 624 }
625 625
626 static void remove_all_osds(struct ceph_osd_client *osdc) 626 static void remove_all_osds(struct ceph_osd_client *osdc)
627 { 627 {
628 dout("%s %p\n", __func__, osdc); 628 dout("%s %p\n", __func__, osdc);
629 mutex_lock(&osdc->request_mutex); 629 mutex_lock(&osdc->request_mutex);
630 while (!RB_EMPTY_ROOT(&osdc->osds)) { 630 while (!RB_EMPTY_ROOT(&osdc->osds)) {
631 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), 631 struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
632 struct ceph_osd, o_node); 632 struct ceph_osd, o_node);
633 __remove_osd(osdc, osd); 633 __remove_osd(osdc, osd);
634 } 634 }
635 mutex_unlock(&osdc->request_mutex); 635 mutex_unlock(&osdc->request_mutex);
636 } 636 }
637 637
638 static void __move_osd_to_lru(struct ceph_osd_client *osdc, 638 static void __move_osd_to_lru(struct ceph_osd_client *osdc,
639 struct ceph_osd *osd) 639 struct ceph_osd *osd)
640 { 640 {
641 dout("__move_osd_to_lru %p\n", osd); 641 dout("__move_osd_to_lru %p\n", osd);
642 BUG_ON(!list_empty(&osd->o_osd_lru)); 642 BUG_ON(!list_empty(&osd->o_osd_lru));
643 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 643 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
644 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; 644 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
645 } 645 }
646 646
647 static void __remove_osd_from_lru(struct ceph_osd *osd) 647 static void __remove_osd_from_lru(struct ceph_osd *osd)
648 { 648 {
649 dout("__remove_osd_from_lru %p\n", osd); 649 dout("__remove_osd_from_lru %p\n", osd);
650 if (!list_empty(&osd->o_osd_lru)) 650 if (!list_empty(&osd->o_osd_lru))
651 list_del_init(&osd->o_osd_lru); 651 list_del_init(&osd->o_osd_lru);
652 } 652 }
653 653
654 static void remove_old_osds(struct ceph_osd_client *osdc) 654 static void remove_old_osds(struct ceph_osd_client *osdc)
655 { 655 {
656 struct ceph_osd *osd, *nosd; 656 struct ceph_osd *osd, *nosd;
657 657
658 dout("__remove_old_osds %p\n", osdc); 658 dout("__remove_old_osds %p\n", osdc);
659 mutex_lock(&osdc->request_mutex); 659 mutex_lock(&osdc->request_mutex);
660 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { 660 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
661 if (time_before(jiffies, osd->lru_ttl)) 661 if (time_before(jiffies, osd->lru_ttl))
662 break; 662 break;
663 __remove_osd(osdc, osd); 663 __remove_osd(osdc, osd);
664 } 664 }
665 mutex_unlock(&osdc->request_mutex); 665 mutex_unlock(&osdc->request_mutex);
666 } 666 }
667 667
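The lru_ttl test in remove_old_osds() leans on time_before(), which compares jiffies values with wraparound-safe signed arithmetic instead of a plain '<'. A userspace sketch of why that matters near counter wrap (a 32-bit counter is assumed for illustration):

#include <stdint.h>
#include <stdio.h>

static int time_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;	/* stays correct across wrap */
}

int main(void)
{
	uint32_t jiffies = 0xfffffff0u;		/* about to wrap */
	uint32_t ttl = jiffies + 0x40;		/* wraps past zero */

	/* a naive 'jiffies < ttl' would be false here */
	printf("still idle-fresh: %d\n", time_before(jiffies, ttl));
	return 0;
}
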
668 /* 668 /*
669 * reset the osd connection 669 * reset the osd connection
670 */ 670 */
671 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) 671 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
672 { 672 {
673 struct ceph_entity_addr *peer_addr; 673 struct ceph_entity_addr *peer_addr;
674 674
675 dout("__reset_osd %p osd%d\n", osd, osd->o_osd); 675 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
676 if (list_empty(&osd->o_requests) && 676 if (list_empty(&osd->o_requests) &&
677 list_empty(&osd->o_linger_requests)) { 677 list_empty(&osd->o_linger_requests)) {
678 __remove_osd(osdc, osd); 678 __remove_osd(osdc, osd);
679 679
680 return -ENODEV; 680 return -ENODEV;
681 } 681 }
682 682
683 peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; 683 peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
684 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && 684 if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
685 !ceph_con_opened(&osd->o_con)) { 685 !ceph_con_opened(&osd->o_con)) {
686 struct ceph_osd_request *req; 686 struct ceph_osd_request *req;
687 687
688 dout(" osd addr hasn't changed and connection never opened," 688 dout(" osd addr hasn't changed and connection never opened,"
689 " letting msgr retry"); 689 " letting msgr retry");
690 /* touch each r_stamp for handle_timeout()'s benefit */ 690 /* touch each r_stamp for handle_timeout()'s benefit */
691 list_for_each_entry(req, &osd->o_requests, r_osd_item) 691 list_for_each_entry(req, &osd->o_requests, r_osd_item)
692 req->r_stamp = jiffies; 692 req->r_stamp = jiffies;
693 693
694 return -EAGAIN; 694 return -EAGAIN;
695 } 695 }
696 696
697 ceph_con_close(&osd->o_con); 697 ceph_con_close(&osd->o_con);
698 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr); 698 ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
699 osd->o_incarnation++; 699 osd->o_incarnation++;
700 700
701 return 0; 701 return 0;
702 } 702 }
703 703
704 static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) 704 static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
705 { 705 {
706 struct rb_node **p = &osdc->osds.rb_node; 706 struct rb_node **p = &osdc->osds.rb_node;
707 struct rb_node *parent = NULL; 707 struct rb_node *parent = NULL;
708 struct ceph_osd *osd = NULL; 708 struct ceph_osd *osd = NULL;
709 709
710 dout("__insert_osd %p osd%d\n", new, new->o_osd); 710 dout("__insert_osd %p osd%d\n", new, new->o_osd);
711 while (*p) { 711 while (*p) {
712 parent = *p; 712 parent = *p;
713 osd = rb_entry(parent, struct ceph_osd, o_node); 713 osd = rb_entry(parent, struct ceph_osd, o_node);
714 if (new->o_osd < osd->o_osd) 714 if (new->o_osd < osd->o_osd)
715 p = &(*p)->rb_left; 715 p = &(*p)->rb_left;
716 else if (new->o_osd > osd->o_osd) 716 else if (new->o_osd > osd->o_osd)
717 p = &(*p)->rb_right; 717 p = &(*p)->rb_right;
718 else 718 else
719 BUG(); 719 BUG();
720 } 720 }
721 721
722 rb_link_node(&new->o_node, parent, p); 722 rb_link_node(&new->o_node, parent, p);
723 rb_insert_color(&new->o_node, &osdc->osds); 723 rb_insert_color(&new->o_node, &osdc->osds);
724 } 724 }
725 725
726 static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) 726 static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
727 { 727 {
728 struct ceph_osd *osd; 728 struct ceph_osd *osd;
729 struct rb_node *n = osdc->osds.rb_node; 729 struct rb_node *n = osdc->osds.rb_node;
730 730
731 while (n) { 731 while (n) {
732 osd = rb_entry(n, struct ceph_osd, o_node); 732 osd = rb_entry(n, struct ceph_osd, o_node);
733 if (o < osd->o_osd) 733 if (o < osd->o_osd)
734 n = n->rb_left; 734 n = n->rb_left;
735 else if (o > osd->o_osd) 735 else if (o > osd->o_osd)
736 n = n->rb_right; 736 n = n->rb_right;
737 else 737 else
738 return osd; 738 return osd;
739 } 739 }
740 return NULL; 740 return NULL;
741 } 741 }
742 742
743 static void __schedule_osd_timeout(struct ceph_osd_client *osdc) 743 static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
744 { 744 {
745 schedule_delayed_work(&osdc->timeout_work, 745 schedule_delayed_work(&osdc->timeout_work,
746 osdc->client->options->osd_keepalive_timeout * HZ); 746 osdc->client->options->osd_keepalive_timeout * HZ);
747 } 747 }
748 748
749 static void __cancel_osd_timeout(struct ceph_osd_client *osdc) 749 static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
750 { 750 {
751 cancel_delayed_work(&osdc->timeout_work); 751 cancel_delayed_work(&osdc->timeout_work);
752 } 752 }
753 753
754 /* 754 /*
755 * Register request, assign tid. If this is the first request, set up 755 * Register request, assign tid. If this is the first request, set up
756 * the timeout event. 756 * the timeout event.
757 */ 757 */
758 static void __register_request(struct ceph_osd_client *osdc, 758 static void __register_request(struct ceph_osd_client *osdc,
759 struct ceph_osd_request *req) 759 struct ceph_osd_request *req)
760 { 760 {
761 req->r_tid = ++osdc->last_tid; 761 req->r_tid = ++osdc->last_tid;
762 req->r_request->hdr.tid = cpu_to_le64(req->r_tid); 762 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
763 dout("__register_request %p tid %lld\n", req, req->r_tid); 763 dout("__register_request %p tid %lld\n", req, req->r_tid);
764 __insert_request(osdc, req); 764 __insert_request(osdc, req);
765 ceph_osdc_get_request(req); 765 ceph_osdc_get_request(req);
766 osdc->num_requests++; 766 osdc->num_requests++;
767 if (osdc->num_requests == 1) { 767 if (osdc->num_requests == 1) {
768 dout(" first request, scheduling timeout\n"); 768 dout(" first request, scheduling timeout\n");
769 __schedule_osd_timeout(osdc); 769 __schedule_osd_timeout(osdc);
770 } 770 }
771 } 771 }
772 772
773 static void register_request(struct ceph_osd_client *osdc, 773 static void register_request(struct ceph_osd_client *osdc,
774 struct ceph_osd_request *req) 774 struct ceph_osd_request *req)
775 { 775 {
776 mutex_lock(&osdc->request_mutex); 776 mutex_lock(&osdc->request_mutex);
777 __register_request(osdc, req); 777 __register_request(osdc, req);
778 mutex_unlock(&osdc->request_mutex); 778 mutex_unlock(&osdc->request_mutex);
779 } 779 }
780 780
781 /* 781 /*
782 * called under osdc->request_mutex 782 * called under osdc->request_mutex
783 */ 783 */
784 static void __unregister_request(struct ceph_osd_client *osdc, 784 static void __unregister_request(struct ceph_osd_client *osdc,
785 struct ceph_osd_request *req) 785 struct ceph_osd_request *req)
786 { 786 {
787 if (RB_EMPTY_NODE(&req->r_node)) { 787 if (RB_EMPTY_NODE(&req->r_node)) {
788 dout("__unregister_request %p tid %lld not registered\n", 788 dout("__unregister_request %p tid %lld not registered\n",
789 req, req->r_tid); 789 req, req->r_tid);
790 return; 790 return;
791 } 791 }
792 792
793 dout("__unregister_request %p tid %lld\n", req, req->r_tid); 793 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
794 rb_erase(&req->r_node, &osdc->requests); 794 rb_erase(&req->r_node, &osdc->requests);
795 osdc->num_requests--; 795 osdc->num_requests--;
796 796
797 if (req->r_osd) { 797 if (req->r_osd) {
798 /* make sure the original request isn't in flight. */ 798 /* make sure the original request isn't in flight. */
799 ceph_msg_revoke(req->r_request); 799 ceph_msg_revoke(req->r_request);
800 800
801 list_del_init(&req->r_osd_item); 801 list_del_init(&req->r_osd_item);
802 if (list_empty(&req->r_osd->o_requests) && 802 if (list_empty(&req->r_osd->o_requests) &&
803 list_empty(&req->r_osd->o_linger_requests)) { 803 list_empty(&req->r_osd->o_linger_requests)) {
804 dout("moving osd to %p lru\n", req->r_osd); 804 dout("moving osd to %p lru\n", req->r_osd);
805 __move_osd_to_lru(osdc, req->r_osd); 805 __move_osd_to_lru(osdc, req->r_osd);
806 } 806 }
807 if (list_empty(&req->r_linger_item)) 807 if (list_empty(&req->r_linger_item))
808 req->r_osd = NULL; 808 req->r_osd = NULL;
809 } 809 }
810 810
811 list_del_init(&req->r_req_lru_item); 811 list_del_init(&req->r_req_lru_item);
812 ceph_osdc_put_request(req); 812 ceph_osdc_put_request(req);
813 813
814 if (osdc->num_requests == 0) { 814 if (osdc->num_requests == 0) {
815 dout(" no requests, canceling timeout\n"); 815 dout(" no requests, canceling timeout\n");
816 __cancel_osd_timeout(osdc); 816 __cancel_osd_timeout(osdc);
817 } 817 }
818 } 818 }
819 819
820 /* 820 /*
821 * Cancel a previously queued request message 821 * Cancel a previously queued request message
822 */ 822 */
823 static void __cancel_request(struct ceph_osd_request *req) 823 static void __cancel_request(struct ceph_osd_request *req)
824 { 824 {
825 if (req->r_sent && req->r_osd) { 825 if (req->r_sent && req->r_osd) {
826 ceph_msg_revoke(req->r_request); 826 ceph_msg_revoke(req->r_request);
827 req->r_sent = 0; 827 req->r_sent = 0;
828 } 828 }
829 } 829 }
830 830
831 static void __register_linger_request(struct ceph_osd_client *osdc, 831 static void __register_linger_request(struct ceph_osd_client *osdc,
832 struct ceph_osd_request *req) 832 struct ceph_osd_request *req)
833 { 833 {
834 dout("__register_linger_request %p\n", req); 834 dout("__register_linger_request %p\n", req);
835 list_add_tail(&req->r_linger_item, &osdc->req_linger); 835 list_add_tail(&req->r_linger_item, &osdc->req_linger);
836 if (req->r_osd) 836 if (req->r_osd)
837 list_add_tail(&req->r_linger_osd, 837 list_add_tail(&req->r_linger_osd,
838 &req->r_osd->o_linger_requests); 838 &req->r_osd->o_linger_requests);
839 } 839 }
840 840
841 static void __unregister_linger_request(struct ceph_osd_client *osdc, 841 static void __unregister_linger_request(struct ceph_osd_client *osdc,
842 struct ceph_osd_request *req) 842 struct ceph_osd_request *req)
843 { 843 {
844 dout("__unregister_linger_request %p\n", req); 844 dout("__unregister_linger_request %p\n", req);
845 list_del_init(&req->r_linger_item); 845 list_del_init(&req->r_linger_item);
846 if (req->r_osd) { 846 if (req->r_osd) {
847 list_del_init(&req->r_linger_osd); 847 list_del_init(&req->r_linger_osd);
848 848
849 if (list_empty(&req->r_osd->o_requests) && 849 if (list_empty(&req->r_osd->o_requests) &&
850 list_empty(&req->r_osd->o_linger_requests)) { 850 list_empty(&req->r_osd->o_linger_requests)) {
851 dout("moving osd to %p lru\n", req->r_osd); 851 dout("moving osd to %p lru\n", req->r_osd);
852 __move_osd_to_lru(osdc, req->r_osd); 852 __move_osd_to_lru(osdc, req->r_osd);
853 } 853 }
854 if (list_empty(&req->r_osd_item)) 854 if (list_empty(&req->r_osd_item))
855 req->r_osd = NULL; 855 req->r_osd = NULL;
856 } 856 }
857 } 857 }
858 858
859 void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc, 859 void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
860 struct ceph_osd_request *req) 860 struct ceph_osd_request *req)
861 { 861 {
862 mutex_lock(&osdc->request_mutex); 862 mutex_lock(&osdc->request_mutex);
863 if (req->r_linger) { 863 if (req->r_linger) {
864 __unregister_linger_request(osdc, req); 864 __unregister_linger_request(osdc, req);
865 ceph_osdc_put_request(req); 865 ceph_osdc_put_request(req);
866 } 866 }
867 mutex_unlock(&osdc->request_mutex); 867 mutex_unlock(&osdc->request_mutex);
868 } 868 }
869 EXPORT_SYMBOL(ceph_osdc_unregister_linger_request); 869 EXPORT_SYMBOL(ceph_osdc_unregister_linger_request);
870 870
871 void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 871 void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
872 struct ceph_osd_request *req) 872 struct ceph_osd_request *req)
873 { 873 {
874 if (!req->r_linger) { 874 if (!req->r_linger) {
875 dout("set_request_linger %p\n", req); 875 dout("set_request_linger %p\n", req);
876 req->r_linger = 1; 876 req->r_linger = 1;
877 /* 877 /*
878 * caller is now responsible for calling 878 * caller is now responsible for calling
879 * unregister_linger_request 879 * unregister_linger_request
880 */ 880 */
881 ceph_osdc_get_request(req); 881 ceph_osdc_get_request(req);
882 } 882 }
883 } 883 }
884 EXPORT_SYMBOL(ceph_osdc_set_request_linger); 884 EXPORT_SYMBOL(ceph_osdc_set_request_linger);
885 885
886 /* 886 /*
887 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct 887 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
888 * (as needed), and set the request r_osd appropriately. If there is 888 * (as needed), and set the request r_osd appropriately. If there is
889 * no up osd, set r_osd to NULL. Move the request to the appropriate list 889 * no up osd, set r_osd to NULL. Move the request to the appropriate list
890 * (unsent, homeless) or leave it on the in-flight lru. 890 * (unsent, homeless) or leave it on the in-flight lru.
891 * 891 *
892 * Return 0 if unchanged, 1 if changed, or negative on error. 892 * Return 0 if unchanged, 1 if changed, or negative on error.
893 * 893 *
894 * Caller should hold map_sem for read and request_mutex. 894 * Caller should hold map_sem for read and request_mutex.
895 */ 895 */
896 static int __map_request(struct ceph_osd_client *osdc, 896 static int __map_request(struct ceph_osd_client *osdc,
897 struct ceph_osd_request *req, int force_resend) 897 struct ceph_osd_request *req, int force_resend)
898 { 898 {
899 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 899 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
900 struct ceph_pg pgid; 900 struct ceph_pg pgid;
901 int acting[CEPH_PG_MAX_SIZE]; 901 int acting[CEPH_PG_MAX_SIZE];
902 int o = -1, num = 0; 902 int o = -1, num = 0;
903 int err; 903 int err;
904 904
905 dout("map_request %p tid %lld\n", req, req->r_tid); 905 dout("map_request %p tid %lld\n", req, req->r_tid);
906 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid, 906 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
907 &req->r_file_layout, osdc->osdmap); 907 &req->r_file_layout, osdc->osdmap);
908 if (err) { 908 if (err) {
909 list_move(&req->r_req_lru_item, &osdc->req_notarget); 909 list_move(&req->r_req_lru_item, &osdc->req_notarget);
910 return err; 910 return err;
911 } 911 }
912 pgid = reqhead->layout.ol_pgid; 912 pgid = reqhead->layout.ol_pgid;
913 req->r_pgid = pgid; 913 req->r_pgid = pgid;
914 914
915 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); 915 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
916 if (err > 0) { 916 if (err > 0) {
917 o = acting[0]; 917 o = acting[0];
918 num = err; 918 num = err;
919 } 919 }
920 920
921 if ((!force_resend && 921 if ((!force_resend &&
922 req->r_osd && req->r_osd->o_osd == o && 922 req->r_osd && req->r_osd->o_osd == o &&
923 req->r_sent >= req->r_osd->o_incarnation && 923 req->r_sent >= req->r_osd->o_incarnation &&
924 req->r_num_pg_osds == num && 924 req->r_num_pg_osds == num &&
925 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || 925 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
926 (req->r_osd == NULL && o == -1)) 926 (req->r_osd == NULL && o == -1))
927 return 0; /* no change */ 927 return 0; /* no change */
928 928
929 dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n", 929 dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n",
930 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, 930 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
931 req->r_osd ? req->r_osd->o_osd : -1); 931 req->r_osd ? req->r_osd->o_osd : -1);
932 932
933 /* record full pg acting set */ 933 /* record full pg acting set */
934 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); 934 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
935 req->r_num_pg_osds = num; 935 req->r_num_pg_osds = num;
936 936
937 if (req->r_osd) { 937 if (req->r_osd) {
938 __cancel_request(req); 938 __cancel_request(req);
939 list_del_init(&req->r_osd_item); 939 list_del_init(&req->r_osd_item);
940 req->r_osd = NULL; 940 req->r_osd = NULL;
941 } 941 }
942 942
943 req->r_osd = __lookup_osd(osdc, o); 943 req->r_osd = __lookup_osd(osdc, o);
944 if (!req->r_osd && o >= 0) { 944 if (!req->r_osd && o >= 0) {
945 err = -ENOMEM; 945 err = -ENOMEM;
946 req->r_osd = create_osd(osdc, o); 946 req->r_osd = create_osd(osdc, o);
947 if (!req->r_osd) { 947 if (!req->r_osd) {
948 list_move(&req->r_req_lru_item, &osdc->req_notarget); 948 list_move(&req->r_req_lru_item, &osdc->req_notarget);
949 goto out; 949 goto out;
950 } 950 }
951 951
952 dout("map_request osd %p is osd%d\n", req->r_osd, o); 952 dout("map_request osd %p is osd%d\n", req->r_osd, o);
953 __insert_osd(osdc, req->r_osd); 953 __insert_osd(osdc, req->r_osd);
954 954
955 ceph_con_open(&req->r_osd->o_con, 955 ceph_con_open(&req->r_osd->o_con,
956 CEPH_ENTITY_TYPE_OSD, o, 956 CEPH_ENTITY_TYPE_OSD, o,
957 &osdc->osdmap->osd_addr[o]); 957 &osdc->osdmap->osd_addr[o]);
958 } 958 }
959 959
960 if (req->r_osd) { 960 if (req->r_osd) {
961 __remove_osd_from_lru(req->r_osd); 961 __remove_osd_from_lru(req->r_osd);
962 list_add(&req->r_osd_item, &req->r_osd->o_requests); 962 list_add(&req->r_osd_item, &req->r_osd->o_requests);
963 list_move(&req->r_req_lru_item, &osdc->req_unsent); 963 list_move(&req->r_req_lru_item, &osdc->req_unsent);
964 } else { 964 } else {
965 list_move(&req->r_req_lru_item, &osdc->req_notarget); 965 list_move(&req->r_req_lru_item, &osdc->req_notarget);
966 } 966 }
967 err = 1; /* osd or pg changed */ 967 err = 1; /* osd or pg changed */
968 968
969 out: 969 out:
970 return err; 970 return err;
971 } 971 }
972 972
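The long condition in the middle of __map_request() is a "did anything change?" test: the request stays where it is only if the primary osd, the size of the acting set, and the acting set contents all match what it was last sent with. A userspace sketch of the same comparison (struct target and its field names are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

#define PG_MAX 8

struct target {
	int primary;
	int acting[PG_MAX];
	int num;
};

static int mapping_changed(const struct target *old,
			   const struct target *new)
{
	return old->primary != new->primary ||
	       old->num != new->num ||
	       memcmp(old->acting, new->acting,
		      sizeof(new->acting[0]) * new->num) != 0;
}

int main(void)
{
	struct target was = { 3, { 3, 5, 9 }, 3 };
	struct target now = { 5, { 5, 3, 9 }, 3 };	/* new primary */

	printf("resend needed: %d\n", mapping_changed(&was, &now));
	return 0;
}
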
973 /* 973 /*
974 * caller should hold map_sem (for read) and request_mutex 974 * caller should hold map_sem (for read) and request_mutex
975 */ 975 */
976 static void __send_request(struct ceph_osd_client *osdc, 976 static void __send_request(struct ceph_osd_client *osdc,
977 struct ceph_osd_request *req) 977 struct ceph_osd_request *req)
978 { 978 {
979 struct ceph_osd_request_head *reqhead; 979 struct ceph_osd_request_head *reqhead;
980 980
981 dout("send_request %p tid %llu to osd%d flags %d\n", 981 dout("send_request %p tid %llu to osd%d flags %d\n",
982 req, req->r_tid, req->r_osd->o_osd, req->r_flags); 982 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
983 983
984 reqhead = req->r_request->front.iov_base; 984 reqhead = req->r_request->front.iov_base;
985 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch); 985 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
986 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */ 986 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
987 reqhead->reassert_version = req->r_reassert_version; 987 reqhead->reassert_version = req->r_reassert_version;
988 988
989 req->r_stamp = jiffies; 989 req->r_stamp = jiffies;
990 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 990 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
991 991
992 ceph_msg_get(req->r_request); /* send consumes a ref */ 992 ceph_msg_get(req->r_request); /* send consumes a ref */
993 ceph_con_send(&req->r_osd->o_con, req->r_request); 993 ceph_con_send(&req->r_osd->o_con, req->r_request);
994 req->r_sent = req->r_osd->o_incarnation; 994 req->r_sent = req->r_osd->o_incarnation;
995 } 995 }
996 996
997 /* 997 /*
998 * Send any requests in the queue (req_unsent). 998 * Send any requests in the queue (req_unsent).
999 */ 999 */
1000 static void send_queued(struct ceph_osd_client *osdc) 1000 static void send_queued(struct ceph_osd_client *osdc)
1001 { 1001 {
1002 struct ceph_osd_request *req, *tmp; 1002 struct ceph_osd_request *req, *tmp;
1003 1003
1004 dout("send_queued\n"); 1004 dout("send_queued\n");
1005 mutex_lock(&osdc->request_mutex); 1005 mutex_lock(&osdc->request_mutex);
1006 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) { 1006 list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
1007 __send_request(osdc, req); 1007 __send_request(osdc, req);
1008 } 1008 }
1009 mutex_unlock(&osdc->request_mutex); 1009 mutex_unlock(&osdc->request_mutex);
1010 } 1010 }
1011 1011
1012 /* 1012 /*
1013 * Timeout callback, called every N seconds when 1 or more osd 1013 * Timeout callback, called every N seconds when 1 or more osd
1014 * requests have been active for more than N seconds. When this 1014 * requests have been active for more than N seconds. When this
1015 * happens, we ping all OSDs with requests that have timed out to 1015 * happens, we ping all OSDs with requests that have timed out to
1016 * ensure any communications channel reset is detected. Reset the 1016 * ensure any communications channel reset is detected. Reset the
1017 * request timeouts another N seconds in the future as we go. 1017 * request timeouts another N seconds in the future as we go.
1018 * Reschedule the timeout event another N seconds in the future (unless 1018 * Reschedule the timeout event another N seconds in the future (unless
1019 * there are no open requests). 1019 * there are no open requests).
1020 */ 1020 */
1021 static void handle_timeout(struct work_struct *work) 1021 static void handle_timeout(struct work_struct *work)
1022 { 1022 {
1023 struct ceph_osd_client *osdc = 1023 struct ceph_osd_client *osdc =
1024 container_of(work, struct ceph_osd_client, timeout_work.work); 1024 container_of(work, struct ceph_osd_client, timeout_work.work);
1025 struct ceph_osd_request *req; 1025 struct ceph_osd_request *req;
1026 struct ceph_osd *osd; 1026 struct ceph_osd *osd;
1027 unsigned long keepalive = 1027 unsigned long keepalive =
1028 osdc->client->options->osd_keepalive_timeout * HZ; 1028 osdc->client->options->osd_keepalive_timeout * HZ;
1029 struct list_head slow_osds; 1029 struct list_head slow_osds;
1030 dout("timeout\n"); 1030 dout("timeout\n");
1031 down_read(&osdc->map_sem); 1031 down_read(&osdc->map_sem);
1032 1032
1033 ceph_monc_request_next_osdmap(&osdc->client->monc); 1033 ceph_monc_request_next_osdmap(&osdc->client->monc);
1034 1034
1035 mutex_lock(&osdc->request_mutex); 1035 mutex_lock(&osdc->request_mutex);
1036 1036
1037 /* 1037 /*
1038 * ping osds that are a bit slow. this ensures that if there 1038 * ping osds that are a bit slow. this ensures that if there
1039 * is a break in the TCP connection we will notice, and reopen 1039 * is a break in the TCP connection we will notice, and reopen
1040 * a connection with that osd (from the fault callback). 1040 * a connection with that osd (from the fault callback).
1041 */ 1041 */
1042 INIT_LIST_HEAD(&slow_osds); 1042 INIT_LIST_HEAD(&slow_osds);
1043 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { 1043 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
1044 if (time_before(jiffies, req->r_stamp + keepalive)) 1044 if (time_before(jiffies, req->r_stamp + keepalive))
1045 break; 1045 break;
1046 1046
1047 osd = req->r_osd; 1047 osd = req->r_osd;
1048 BUG_ON(!osd); 1048 BUG_ON(!osd);
1049 dout(" tid %llu is slow, will send keepalive on osd%d\n", 1049 dout(" tid %llu is slow, will send keepalive on osd%d\n",
1050 req->r_tid, osd->o_osd); 1050 req->r_tid, osd->o_osd);
1051 list_move_tail(&osd->o_keepalive_item, &slow_osds); 1051 list_move_tail(&osd->o_keepalive_item, &slow_osds);
1052 } 1052 }
1053 while (!list_empty(&slow_osds)) { 1053 while (!list_empty(&slow_osds)) {
1054 osd = list_entry(slow_osds.next, struct ceph_osd, 1054 osd = list_entry(slow_osds.next, struct ceph_osd,
1055 o_keepalive_item); 1055 o_keepalive_item);
1056 list_del_init(&osd->o_keepalive_item); 1056 list_del_init(&osd->o_keepalive_item);
1057 ceph_con_keepalive(&osd->o_con); 1057 ceph_con_keepalive(&osd->o_con);
1058 } 1058 }
1059 1059
1060 __schedule_osd_timeout(osdc); 1060 __schedule_osd_timeout(osdc);
1061 mutex_unlock(&osdc->request_mutex); 1061 mutex_unlock(&osdc->request_mutex);
1062 send_queued(osdc); 1062 send_queued(osdc);
1063 up_read(&osdc->map_sem); 1063 up_read(&osdc->map_sem);
1064 } 1064 }
1065 1065
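The list_for_each_entry() loop in handle_timeout() can break at the first fresh request only because __send_request() moves each request to the tail of req_lru as it is (re)sent, keeping the list ordered oldest-first by r_stamp. A userspace sketch of that early-exit scan (plain arrays stand in for the kernel lists):

#include <stdio.h>

int main(void)
{
	/* stamps in send order, oldest first, like osdc->req_lru */
	unsigned long stamps[] = { 100, 180, 250, 400 };
	unsigned long jiffies = 500, keepalive = 200;
	int i;

	for (i = 0; i < 4; i++) {
		if (jiffies < stamps[i] + keepalive)
			break;		/* everything after is fresher */
		printf("slot %d is slow, send keepalive\n", i);
	}
	return 0;
}
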
1066 static void handle_osds_timeout(struct work_struct *work) 1066 static void handle_osds_timeout(struct work_struct *work)
1067 { 1067 {
1068 struct ceph_osd_client *osdc = 1068 struct ceph_osd_client *osdc =
1069 container_of(work, struct ceph_osd_client, 1069 container_of(work, struct ceph_osd_client,
1070 osds_timeout_work.work); 1070 osds_timeout_work.work);
1071 unsigned long delay = 1071 unsigned long delay =
1072 osdc->client->options->osd_idle_ttl * HZ >> 2; 1072 osdc->client->options->osd_idle_ttl * HZ >> 2;
1073 1073
1074 dout("osds timeout\n"); 1074 dout("osds timeout\n");
1075 down_read(&osdc->map_sem); 1075 down_read(&osdc->map_sem);
1076 remove_old_osds(osdc); 1076 remove_old_osds(osdc);
1077 up_read(&osdc->map_sem); 1077 up_read(&osdc->map_sem);
1078 1078
1079 schedule_delayed_work(&osdc->osds_timeout_work, 1079 schedule_delayed_work(&osdc->osds_timeout_work,
1080 round_jiffies_relative(delay)); 1080 round_jiffies_relative(delay));
1081 } 1081 }
1082 1082
1083 static void complete_request(struct ceph_osd_request *req) 1083 static void complete_request(struct ceph_osd_request *req)
1084 { 1084 {
1085 if (req->r_safe_callback) 1085 if (req->r_safe_callback)
1086 req->r_safe_callback(req, NULL); 1086 req->r_safe_callback(req, NULL);
1087 complete_all(&req->r_safe_completion); /* fsync waiter */ 1087 complete_all(&req->r_safe_completion); /* fsync waiter */
1088 } 1088 }
1089 1089
1090 /* 1090 /*
1091 * handle osd op reply. either call the callback if it is specified, 1091 * handle osd op reply. either call the callback if it is specified,
1092 * or do the completion to wake up the waiting thread. 1092 * or do the completion to wake up the waiting thread.
1093 */ 1093 */
1094 static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, 1094 static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1095 struct ceph_connection *con) 1095 struct ceph_connection *con)
1096 { 1096 {
1097 struct ceph_osd_reply_head *rhead = msg->front.iov_base; 1097 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
1098 struct ceph_osd_request *req; 1098 struct ceph_osd_request *req;
1099 u64 tid; 1099 u64 tid;
1100 int numops, object_len, flags; 1100 int numops, object_len, flags;
1101 s32 result; 1101 s32 result;
1102 1102
1103 tid = le64_to_cpu(msg->hdr.tid); 1103 tid = le64_to_cpu(msg->hdr.tid);
1104 if (msg->front.iov_len < sizeof(*rhead)) 1104 if (msg->front.iov_len < sizeof(*rhead))
1105 goto bad; 1105 goto bad;
1106 numops = le32_to_cpu(rhead->num_ops); 1106 numops = le32_to_cpu(rhead->num_ops);
1107 object_len = le32_to_cpu(rhead->object_len); 1107 object_len = le32_to_cpu(rhead->object_len);
1108 result = le32_to_cpu(rhead->result); 1108 result = le32_to_cpu(rhead->result);
1109 if (msg->front.iov_len != sizeof(*rhead) + object_len + 1109 if (msg->front.iov_len != sizeof(*rhead) + object_len +
1110 numops * sizeof(struct ceph_osd_op)) 1110 numops * sizeof(struct ceph_osd_op))
1111 goto bad; 1111 goto bad;
1112 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result); 1112 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
1113 /* lookup */ 1113 /* lookup */
1114 mutex_lock(&osdc->request_mutex); 1114 mutex_lock(&osdc->request_mutex);
1115 req = __lookup_request(osdc, tid); 1115 req = __lookup_request(osdc, tid);
1116 if (req == NULL) { 1116 if (req == NULL) {
1117 dout("handle_reply tid %llu dne\n", tid); 1117 dout("handle_reply tid %llu dne\n", tid);
1118 mutex_unlock(&osdc->request_mutex); 1118 mutex_unlock(&osdc->request_mutex);
1119 return; 1119 return;
1120 } 1120 }
1121 ceph_osdc_get_request(req); 1121 ceph_osdc_get_request(req);
1122 flags = le32_to_cpu(rhead->flags); 1122 flags = le32_to_cpu(rhead->flags);
1123 1123
1124 /* 1124 /*
1125 * if this connection filled our message, drop our reference now, to 1125 * if this connection filled our message, drop our reference now, to
1126 * avoid a (safe but slower) revoke later. 1126 * avoid a (safe but slower) revoke later.
1127 */ 1127 */
1128 if (req->r_con_filling_msg == con && req->r_reply == msg) { 1128 if (req->r_con_filling_msg == con && req->r_reply == msg) {
1129 dout(" dropping con_filling_msg ref %p\n", con); 1129 dout(" dropping con_filling_msg ref %p\n", con);
1130 req->r_con_filling_msg = NULL; 1130 req->r_con_filling_msg = NULL;
1131 con->ops->put(con); 1131 con->ops->put(con);
1132 } 1132 }
1133 1133
1134 if (!req->r_got_reply) { 1134 if (!req->r_got_reply) {
1135 unsigned int bytes; 1135 unsigned int bytes;
1136 1136
1137 req->r_result = le32_to_cpu(rhead->result); 1137 req->r_result = le32_to_cpu(rhead->result);
1138 bytes = le32_to_cpu(msg->hdr.data_len); 1138 bytes = le32_to_cpu(msg->hdr.data_len);
1139 dout("handle_reply result %d bytes %d\n", req->r_result, 1139 dout("handle_reply result %d bytes %d\n", req->r_result,
1140 bytes); 1140 bytes);
1141 if (req->r_result == 0) 1141 if (req->r_result == 0)
1142 req->r_result = bytes; 1142 req->r_result = bytes;
1143 1143
1144 /* in case this is a write and we need to replay it */ 1144 /* in case this is a write and we need to replay it */
1145 req->r_reassert_version = rhead->reassert_version; 1145 req->r_reassert_version = rhead->reassert_version;
1146 1146
1147 req->r_got_reply = 1; 1147 req->r_got_reply = 1;
1148 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { 1148 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
1149 dout("handle_reply tid %llu dup ack\n", tid); 1149 dout("handle_reply tid %llu dup ack\n", tid);
1150 mutex_unlock(&osdc->request_mutex); 1150 mutex_unlock(&osdc->request_mutex);
1151 goto done; 1151 goto done;
1152 } 1152 }
1153 1153
1154 dout("handle_reply tid %llu flags %d\n", tid, flags); 1154 dout("handle_reply tid %llu flags %d\n", tid, flags);
1155 1155
1156 if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) 1156 if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
1157 __register_linger_request(osdc, req); 1157 __register_linger_request(osdc, req);
1158 1158
1159 /* either this is a read, or we got the safe response */ 1159 /* either this is a read, or we got the safe response */
1160 if (result < 0 || 1160 if (result < 0 ||
1161 (flags & CEPH_OSD_FLAG_ONDISK) || 1161 (flags & CEPH_OSD_FLAG_ONDISK) ||
1162 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 1162 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
1163 __unregister_request(osdc, req); 1163 __unregister_request(osdc, req);
1164 1164
1165 mutex_unlock(&osdc->request_mutex); 1165 mutex_unlock(&osdc->request_mutex);
1166 1166
1167 if (req->r_callback) 1167 if (req->r_callback)
1168 req->r_callback(req, msg); 1168 req->r_callback(req, msg);
1169 else 1169 else
1170 complete_all(&req->r_completion); 1170 complete_all(&req->r_completion);
1171 1171
1172 if (flags & CEPH_OSD_FLAG_ONDISK) 1172 if (flags & CEPH_OSD_FLAG_ONDISK)
1173 complete_request(req); 1173 complete_request(req);
1174 1174
1175 done: 1175 done:
1176 dout("req=%p req->r_linger=%d\n", req, req->r_linger); 1176 dout("req=%p req->r_linger=%d\n", req, req->r_linger);
1177 ceph_osdc_put_request(req); 1177 ceph_osdc_put_request(req);
1178 return; 1178 return;
1179 1179
1180 bad: 1180 bad:
1181 pr_err("corrupt osd_op_reply got %d %d expected %d\n", 1181 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
1182 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len), 1182 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
1183 (int)sizeof(*rhead)); 1183 (int)sizeof(*rhead));
1184 ceph_msg_dump(msg); 1184 ceph_msg_dump(msg);
1185 } 1185 }
1186 1186
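Before handle_reply() trusts any decoded field, it checks that the message front is exactly sizeof(*rhead) + object_len + numops * sizeof(struct ceph_osd_op); anything shorter or longer jumps to the 'bad' label instead of being parsed. A userspace sketch of that defensive length check (reply_head and OP_SIZE are illustrative stand-ins):

#include <stdint.h>
#include <stdio.h>

struct reply_head { uint32_t num_ops; uint32_t object_len; };
#define OP_SIZE 16	/* stand-in for sizeof(struct ceph_osd_op) */

static int reply_sane(size_t recvd, const struct reply_head *h)
{
	if (recvd < sizeof(*h))		/* can't even read the header */
		return 0;
	return recvd == sizeof(*h) + h->object_len +
			(size_t)h->num_ops * OP_SIZE;
}

int main(void)
{
	struct reply_head h = { .num_ops = 2, .object_len = 10 };

	printf("ok: %d\n", reply_sane(sizeof(h) + 10 + 2 * OP_SIZE, &h));
	printf("corrupt: %d\n", reply_sane(40, &h));
	return 0;
}
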
1187 static void reset_changed_osds(struct ceph_osd_client *osdc) 1187 static void reset_changed_osds(struct ceph_osd_client *osdc)
1188 { 1188 {
1189 struct rb_node *p, *n; 1189 struct rb_node *p, *n;
1190 1190
1191 for (p = rb_first(&osdc->osds); p; p = n) { 1191 for (p = rb_first(&osdc->osds); p; p = n) {
1192 struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); 1192 struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node);
1193 1193
1194 n = rb_next(p); 1194 n = rb_next(p);
1195 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || 1195 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
1196 memcmp(&osd->o_con.peer_addr, 1196 memcmp(&osd->o_con.peer_addr,
1197 ceph_osd_addr(osdc->osdmap, 1197 ceph_osd_addr(osdc->osdmap,
1198 osd->o_osd), 1198 osd->o_osd),
1199 sizeof(struct ceph_entity_addr)) != 0) 1199 sizeof(struct ceph_entity_addr)) != 0)
1200 __reset_osd(osdc, osd); 1200 __reset_osd(osdc, osd);
1201 } 1201 }
1202 } 1202 }
1203 1203
1204 /* 1204 /*
1205 * Requeue requests whose mapping to an OSD has changed. If requests map to 1205 * Requeue requests whose mapping to an OSD has changed. If requests map to
1206 * no osd, request a new map. 1206 * no osd, request a new map.
1207 * 1207 *
1208 * Caller should hold map_sem for read. 1208 * Caller should hold map_sem for read.
1209 */ 1209 */
1210 static void kick_requests(struct ceph_osd_client *osdc, int force_resend) 1210 static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
1211 { 1211 {
1212 struct ceph_osd_request *req, *nreq; 1212 struct ceph_osd_request *req, *nreq;
1213 struct rb_node *p; 1213 struct rb_node *p;
1214 int needmap = 0; 1214 int needmap = 0;
1215 int err; 1215 int err;
1216 1216
1217 dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); 1217 dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
1218 mutex_lock(&osdc->request_mutex); 1218 mutex_lock(&osdc->request_mutex);
1219 for (p = rb_first(&osdc->requests); p; ) { 1219 for (p = rb_first(&osdc->requests); p; ) {
1220 req = rb_entry(p, struct ceph_osd_request, r_node); 1220 req = rb_entry(p, struct ceph_osd_request, r_node);
1221 p = rb_next(p); 1221 p = rb_next(p);
1222 1222
1223 /* 1223 /*
1224 * For linger requests that have not yet been 1224 * For linger requests that have not yet been
1225 * registered, move them to the linger list; they'll 1225 * registered, move them to the linger list; they'll
1226 * be sent to the osd in the loop below. Unregister 1226 * be sent to the osd in the loop below. Unregister
1227 * the request before re-registering it as a linger 1227 * the request before re-registering it as a linger
1228 * request to ensure the __map_request() below 1228 * request to ensure the __map_request() below
1229 * will decide it needs to be sent. 1229 * will decide it needs to be sent.
1230 */ 1230 */
1231 if (req->r_linger && list_empty(&req->r_linger_item)) { 1231 if (req->r_linger && list_empty(&req->r_linger_item)) {
1232 dout("%p tid %llu restart on osd%d\n", 1232 dout("%p tid %llu restart on osd%d\n",
1233 req, req->r_tid, 1233 req, req->r_tid,
1234 req->r_osd ? req->r_osd->o_osd : -1); 1234 req->r_osd ? req->r_osd->o_osd : -1);
1235 __unregister_request(osdc, req); 1235 __unregister_request(osdc, req);
1236 __register_linger_request(osdc, req); 1236 __register_linger_request(osdc, req);
1237 continue; 1237 continue;
1238 } 1238 }
1239 1239
1240 err = __map_request(osdc, req, force_resend); 1240 err = __map_request(osdc, req, force_resend);
1241 if (err < 0) 1241 if (err < 0)
1242 continue; /* error */ 1242 continue; /* error */
1243 if (req->r_osd == NULL) { 1243 if (req->r_osd == NULL) {
1244 dout("%p tid %llu maps to no osd\n", req, req->r_tid); 1244 dout("%p tid %llu maps to no osd\n", req, req->r_tid);
1245 needmap++; /* request a newer map */ 1245 needmap++; /* request a newer map */
1246 } else if (err > 0) { 1246 } else if (err > 0) {
1247 if (!req->r_linger) { 1247 if (!req->r_linger) {
1248 dout("%p tid %llu requeued on osd%d\n", req, 1248 dout("%p tid %llu requeued on osd%d\n", req,
1249 req->r_tid, 1249 req->r_tid,
1250 req->r_osd ? req->r_osd->o_osd : -1); 1250 req->r_osd ? req->r_osd->o_osd : -1);
1251 req->r_flags |= CEPH_OSD_FLAG_RETRY; 1251 req->r_flags |= CEPH_OSD_FLAG_RETRY;
1252 } 1252 }
1253 } 1253 }
1254 } 1254 }
1255 1255
1256 list_for_each_entry_safe(req, nreq, &osdc->req_linger, 1256 list_for_each_entry_safe(req, nreq, &osdc->req_linger,
1257 r_linger_item) { 1257 r_linger_item) {
1258 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); 1258 dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
1259 1259
1260 err = __map_request(osdc, req, force_resend); 1260 err = __map_request(osdc, req, force_resend);
1261 dout("__map_request returned %d\n", err); 1261 dout("__map_request returned %d\n", err);
1262 if (err == 0) 1262 if (err == 0)
1263 continue; /* no change and no osd was specified */ 1263 continue; /* no change and no osd was specified */
1264 if (err < 0) 1264 if (err < 0)
1265 continue; /* hrm! */ 1265 continue; /* hrm! */
1266 if (req->r_osd == NULL) { 1266 if (req->r_osd == NULL) {
1267 dout("tid %llu maps to no valid osd\n", req->r_tid); 1267 dout("tid %llu maps to no valid osd\n", req->r_tid);
1268 needmap++; /* request a newer map */ 1268 needmap++; /* request a newer map */
1269 continue; 1269 continue;
1270 } 1270 }
1271 1271
1272 dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid, 1272 dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
1273 req->r_osd ? req->r_osd->o_osd : -1); 1273 req->r_osd ? req->r_osd->o_osd : -1);
1274 __register_request(osdc, req); 1274 __register_request(osdc, req);
1275 __unregister_linger_request(osdc, req); 1275 __unregister_linger_request(osdc, req);
1276 } 1276 }
1277 mutex_unlock(&osdc->request_mutex); 1277 mutex_unlock(&osdc->request_mutex);
1278 1278
1279 if (needmap) { 1279 if (needmap) {
1280 dout("%d requests for down osds, need new map\n", needmap); 1280 dout("%d requests for down osds, need new map\n", needmap);
1281 ceph_monc_request_next_osdmap(&osdc->client->monc); 1281 ceph_monc_request_next_osdmap(&osdc->client->monc);
1282 } 1282 }
1283 reset_changed_osds(osdc); 1283 reset_changed_osds(osdc);
1284 } 1284 }
1285 1285
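kick_requests() runs after each map change; the incremental-map loop in ceph_osdc_handle_map() below only applies an incremental whose epoch is exactly current+1 and ignores everything else, falling back to the full maps that follow when a gap remains. A userspace sketch of that epoch gate (illustrative values only):

#include <stdio.h>

int main(void)
{
	unsigned have = 10;			/* osdc->osdmap->epoch */
	unsigned incs[] = { 10, 11, 13, 12 };	/* epochs off the wire */
	int i;

	for (i = 0; i < 4; i++) {
		if (incs[i] == have + 1) {
			have = incs[i];		/* apply incremental */
			printf("applied inc map %u\n", have);
		} else {
			printf("ignoring inc map %u (have %u)\n",
			       incs[i], have);
		}
	}
	return 0;
}
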
1286 1286
1287 /* 1287 /*
1288 * Process updated osd map. 1288 * Process updated osd map.
1289 * 1289 *
1290 * The message contains any number of incremental and full maps, normally 1290 * The message contains any number of incremental and full maps, normally
1291 * indicating some sort of topology change in the cluster. Kick requests 1291 * indicating some sort of topology change in the cluster. Kick requests
1292 * off to different OSDs as needed. 1292 * off to different OSDs as needed.
1293 */ 1293 */
1294 void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) 1294 void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1295 { 1295 {
1296 void *p, *end, *next; 1296 void *p, *end, *next;
1297 u32 nr_maps, maplen; 1297 u32 nr_maps, maplen;
1298 u32 epoch; 1298 u32 epoch;
1299 struct ceph_osdmap *newmap = NULL, *oldmap; 1299 struct ceph_osdmap *newmap = NULL, *oldmap;
1300 int err; 1300 int err;
1301 struct ceph_fsid fsid; 1301 struct ceph_fsid fsid;
1302 1302
1303 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); 1303 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
1304 p = msg->front.iov_base; 1304 p = msg->front.iov_base;
1305 end = p + msg->front.iov_len; 1305 end = p + msg->front.iov_len;
1306 1306
1307 /* verify fsid */ 1307 /* verify fsid */
1308 ceph_decode_need(&p, end, sizeof(fsid), bad); 1308 ceph_decode_need(&p, end, sizeof(fsid), bad);
1309 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 1309 ceph_decode_copy(&p, &fsid, sizeof(fsid));
1310 if (ceph_check_fsid(osdc->client, &fsid) < 0) 1310 if (ceph_check_fsid(osdc->client, &fsid) < 0)
1311 return; 1311 return;
1312 1312
1313 down_write(&osdc->map_sem); 1313 down_write(&osdc->map_sem);
1314 1314
1315 /* incremental maps */ 1315 /* incremental maps */
1316 ceph_decode_32_safe(&p, end, nr_maps, bad); 1316 ceph_decode_32_safe(&p, end, nr_maps, bad);
1317 dout(" %d inc maps\n", nr_maps); 1317 dout(" %d inc maps\n", nr_maps);
1318 while (nr_maps > 0) { 1318 while (nr_maps > 0) {
1319 ceph_decode_need(&p, end, 2*sizeof(u32), bad); 1319 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1320 epoch = ceph_decode_32(&p); 1320 epoch = ceph_decode_32(&p);
1321 maplen = ceph_decode_32(&p); 1321 maplen = ceph_decode_32(&p);
1322 ceph_decode_need(&p, end, maplen, bad); 1322 ceph_decode_need(&p, end, maplen, bad);
1323 next = p + maplen; 1323 next = p + maplen;
1324 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { 1324 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1325 dout("applying incremental map %u len %d\n", 1325 dout("applying incremental map %u len %d\n",
1326 epoch, maplen); 1326 epoch, maplen);
1327 newmap = osdmap_apply_incremental(&p, next, 1327 newmap = osdmap_apply_incremental(&p, next,
1328 osdc->osdmap, 1328 osdc->osdmap,
1329 &osdc->client->msgr); 1329 &osdc->client->msgr);
1330 if (IS_ERR(newmap)) { 1330 if (IS_ERR(newmap)) {
1331 err = PTR_ERR(newmap); 1331 err = PTR_ERR(newmap);
1332 goto bad; 1332 goto bad;
1333 } 1333 }
1334 BUG_ON(!newmap); 1334 BUG_ON(!newmap);
1335 if (newmap != osdc->osdmap) { 1335 if (newmap != osdc->osdmap) {
1336 ceph_osdmap_destroy(osdc->osdmap); 1336 ceph_osdmap_destroy(osdc->osdmap);
1337 osdc->osdmap = newmap; 1337 osdc->osdmap = newmap;
1338 } 1338 }
1339 kick_requests(osdc, 0); 1339 kick_requests(osdc, 0);
1340 } else { 1340 } else {
1341 dout("ignoring incremental map %u len %d\n", 1341 dout("ignoring incremental map %u len %d\n",
1342 epoch, maplen); 1342 epoch, maplen);
1343 } 1343 }
1344 p = next; 1344 p = next;
1345 nr_maps--; 1345 nr_maps--;
1346 } 1346 }
1347 if (newmap) 1347 if (newmap)
1348 goto done; 1348 goto done;
1349 1349
1350 /* full maps */ 1350 /* full maps */
1351 ceph_decode_32_safe(&p, end, nr_maps, bad); 1351 ceph_decode_32_safe(&p, end, nr_maps, bad);
1352 dout(" %d full maps\n", nr_maps); 1352 dout(" %d full maps\n", nr_maps);
1353 while (nr_maps) { 1353 while (nr_maps) {
1354 ceph_decode_need(&p, end, 2*sizeof(u32), bad); 1354 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1355 epoch = ceph_decode_32(&p); 1355 epoch = ceph_decode_32(&p);
1356 maplen = ceph_decode_32(&p); 1356 maplen = ceph_decode_32(&p);
1357 ceph_decode_need(&p, end, maplen, bad); 1357 ceph_decode_need(&p, end, maplen, bad);
1358 if (nr_maps > 1) { 1358 if (nr_maps > 1) {
1359 dout("skipping non-latest full map %u len %d\n", 1359 dout("skipping non-latest full map %u len %d\n",
1360 epoch, maplen); 1360 epoch, maplen);
1361 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { 1361 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1362 dout("skipping full map %u len %d, " 1362 dout("skipping full map %u len %d, "
1363 "older than our %u\n", epoch, maplen, 1363 "older than our %u\n", epoch, maplen,
1364 osdc->osdmap->epoch); 1364 osdc->osdmap->epoch);
1365 } else { 1365 } else {
1366 int skipped_map = 0; 1366 int skipped_map = 0;
1367 1367
1368 dout("taking full map %u len %d\n", epoch, maplen); 1368 dout("taking full map %u len %d\n", epoch, maplen);
1369 newmap = osdmap_decode(&p, p+maplen); 1369 newmap = osdmap_decode(&p, p+maplen);
1370 if (IS_ERR(newmap)) { 1370 if (IS_ERR(newmap)) {
1371 err = PTR_ERR(newmap); 1371 err = PTR_ERR(newmap);
1372 goto bad; 1372 goto bad;
1373 } 1373 }
1374 BUG_ON(!newmap); 1374 BUG_ON(!newmap);
1375 oldmap = osdc->osdmap; 1375 oldmap = osdc->osdmap;
1376 osdc->osdmap = newmap; 1376 osdc->osdmap = newmap;
1377 if (oldmap) { 1377 if (oldmap) {
1378 if (oldmap->epoch + 1 < newmap->epoch) 1378 if (oldmap->epoch + 1 < newmap->epoch)
1379 skipped_map = 1; 1379 skipped_map = 1;
1380 ceph_osdmap_destroy(oldmap); 1380 ceph_osdmap_destroy(oldmap);
1381 } 1381 }
1382 kick_requests(osdc, skipped_map); 1382 kick_requests(osdc, skipped_map);
1383 } 1383 }
1384 p += maplen; 1384 p += maplen;
1385 nr_maps--; 1385 nr_maps--;
1386 } 1386 }
1387 1387
1388 done: 1388 done:
1389 downgrade_write(&osdc->map_sem); 1389 downgrade_write(&osdc->map_sem);
1390 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); 1390 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1391 1391
1392 /* 1392 /*
1393 * subscribe to subsequent osdmap updates if full to ensure 1393 * subscribe to subsequent osdmap updates if full to ensure
1394 * we find out when we are no longer full and stop returning 1394 * we find out when we are no longer full and stop returning
1395 * ENOSPC. 1395 * ENOSPC.
1396 */ 1396 */
1397 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) 1397 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
1398 ceph_monc_request_next_osdmap(&osdc->client->monc); 1398 ceph_monc_request_next_osdmap(&osdc->client->monc);
1399 1399
1400 send_queued(osdc); 1400 send_queued(osdc);
1401 up_read(&osdc->map_sem); 1401 up_read(&osdc->map_sem);
1402 wake_up_all(&osdc->client->auth_wq); 1402 wake_up_all(&osdc->client->auth_wq);
1403 return; 1403 return;
1404 1404
1405 bad: 1405 bad:
1406 pr_err("osdc handle_map corrupt msg\n"); 1406 pr_err("osdc handle_map corrupt msg\n");
1407 ceph_msg_dump(msg); 1407 ceph_msg_dump(msg);
1408 up_write(&osdc->map_sem); 1408 up_write(&osdc->map_sem);
1409 return; 1409 return;
1410 } 1410 }
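
For reference, the front payload that ceph_osdc_handle_map() walks above is an fsid, then a count of (epoch, len, data) incremental maps, then the same again for full maps. A userspace sketch of that walk, reconstructed from the ceph_decode_* calls (not kernel code; bounds checking is elided and a little-endian host is assumed):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	static uint32_t get_le32(const uint8_t **p)
	{
		uint32_t v;

		memcpy(&v, *p, sizeof(v));	/* assuming little-endian host */
		*p += sizeof(v);
		return v;
	}

	static void walk_osdmap_msg(const uint8_t *p, const uint8_t *end)
	{
		p += 16;	/* struct ceph_fsid, verified by the real code */

		for (int pass = 0; pass < 2 && p < end; pass++) {
			uint32_t n = get_le32(&p);	/* nr of maps */

			while (n-- > 0 && p < end) {
				uint32_t epoch = get_le32(&p);
				uint32_t len = get_le32(&p);

				printf("%s map epoch %u, %u bytes\n",
				       pass ? "full" : "incremental",
				       epoch, len);
				p += len;	/* the encoded osdmap itself */
			}
		}
	}
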
1411 1411
1412 /* 1412 /*
1413 * watch/notify callback event infrastructure 1413 * watch/notify callback event infrastructure
1414 * 1414 *
1415 * These callbacks are used both for watch and notify operations. 1415 * These callbacks are used both for watch and notify operations.
1416 */ 1416 */
1417 static void __release_event(struct kref *kref) 1417 static void __release_event(struct kref *kref)
1418 { 1418 {
1419 struct ceph_osd_event *event = 1419 struct ceph_osd_event *event =
1420 container_of(kref, struct ceph_osd_event, kref); 1420 container_of(kref, struct ceph_osd_event, kref);
1421 1421
1422 dout("__release_event %p\n", event); 1422 dout("__release_event %p\n", event);
1423 kfree(event); 1423 kfree(event);
1424 } 1424 }
1425 1425
1426 static void get_event(struct ceph_osd_event *event) 1426 static void get_event(struct ceph_osd_event *event)
1427 { 1427 {
1428 kref_get(&event->kref); 1428 kref_get(&event->kref);
1429 } 1429 }
1430 1430
1431 void ceph_osdc_put_event(struct ceph_osd_event *event) 1431 void ceph_osdc_put_event(struct ceph_osd_event *event)
1432 { 1432 {
1433 kref_put(&event->kref, __release_event); 1433 kref_put(&event->kref, __release_event);
1434 } 1434 }
1435 EXPORT_SYMBOL(ceph_osdc_put_event); 1435 EXPORT_SYMBOL(ceph_osdc_put_event);
1436 1436
1437 static void __insert_event(struct ceph_osd_client *osdc, 1437 static void __insert_event(struct ceph_osd_client *osdc,
1438 struct ceph_osd_event *new) 1438 struct ceph_osd_event *new)
1439 { 1439 {
1440 struct rb_node **p = &osdc->event_tree.rb_node; 1440 struct rb_node **p = &osdc->event_tree.rb_node;
1441 struct rb_node *parent = NULL; 1441 struct rb_node *parent = NULL;
1442 struct ceph_osd_event *event = NULL; 1442 struct ceph_osd_event *event = NULL;
1443 1443
1444 while (*p) { 1444 while (*p) {
1445 parent = *p; 1445 parent = *p;
1446 event = rb_entry(parent, struct ceph_osd_event, node); 1446 event = rb_entry(parent, struct ceph_osd_event, node);
1447 if (new->cookie < event->cookie) 1447 if (new->cookie < event->cookie)
1448 p = &(*p)->rb_left; 1448 p = &(*p)->rb_left;
1449 else if (new->cookie > event->cookie) 1449 else if (new->cookie > event->cookie)
1450 p = &(*p)->rb_right; 1450 p = &(*p)->rb_right;
1451 else 1451 else
1452 BUG(); 1452 BUG();
1453 } 1453 }
1454 1454
1455 rb_link_node(&new->node, parent, p); 1455 rb_link_node(&new->node, parent, p);
1456 rb_insert_color(&new->node, &osdc->event_tree); 1456 rb_insert_color(&new->node, &osdc->event_tree);
1457 } 1457 }
1458 1458
1459 static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, 1459 static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc,
1460 u64 cookie) 1460 u64 cookie)
1461 { 1461 {
1462 struct rb_node **p = &osdc->event_tree.rb_node; 1462 struct rb_node **p = &osdc->event_tree.rb_node;
1463 struct rb_node *parent = NULL; 1463 struct rb_node *parent = NULL;
1464 struct ceph_osd_event *event = NULL; 1464 struct ceph_osd_event *event = NULL;
1465 1465
1466 while (*p) { 1466 while (*p) {
1467 parent = *p; 1467 parent = *p;
1468 event = rb_entry(parent, struct ceph_osd_event, node); 1468 event = rb_entry(parent, struct ceph_osd_event, node);
1469 if (cookie < event->cookie) 1469 if (cookie < event->cookie)
1470 p = &(*p)->rb_left; 1470 p = &(*p)->rb_left;
1471 else if (cookie > event->cookie) 1471 else if (cookie > event->cookie)
1472 p = &(*p)->rb_right; 1472 p = &(*p)->rb_right;
1473 else 1473 else
1474 return event; 1474 return event;
1475 } 1475 }
1476 return NULL; 1476 return NULL;
1477 } 1477 }
1478 1478
1479 static void __remove_event(struct ceph_osd_event *event) 1479 static void __remove_event(struct ceph_osd_event *event)
1480 { 1480 {
1481 struct ceph_osd_client *osdc = event->osdc; 1481 struct ceph_osd_client *osdc = event->osdc;
1482 1482
1483 if (!RB_EMPTY_NODE(&event->node)) { 1483 if (!RB_EMPTY_NODE(&event->node)) {
1484 dout("__remove_event removed %p\n", event); 1484 dout("__remove_event removed %p\n", event);
1485 rb_erase(&event->node, &osdc->event_tree); 1485 rb_erase(&event->node, &osdc->event_tree);
1486 ceph_osdc_put_event(event); 1486 ceph_osdc_put_event(event);
1487 } else { 1487 } else {
1488 dout("__remove_event didn't remove %p\n", event); 1488 dout("__remove_event didn't remove %p\n", event);
1489 } 1489 }
1490 } 1490 }
1491 1491
1492 int ceph_osdc_create_event(struct ceph_osd_client *osdc, 1492 int ceph_osdc_create_event(struct ceph_osd_client *osdc,
1493 void (*event_cb)(u64, u64, u8, void *), 1493 void (*event_cb)(u64, u64, u8, void *),
1494 int one_shot, void *data, 1494 int one_shot, void *data,
1495 struct ceph_osd_event **pevent) 1495 struct ceph_osd_event **pevent)
1496 { 1496 {
1497 struct ceph_osd_event *event; 1497 struct ceph_osd_event *event;
1498 1498
1499 event = kmalloc(sizeof(*event), GFP_NOIO); 1499 event = kmalloc(sizeof(*event), GFP_NOIO);
1500 if (!event) 1500 if (!event)
1501 return -ENOMEM; 1501 return -ENOMEM;
1502 1502
1503 dout("create_event %p\n", event); 1503 dout("create_event %p\n", event);
1504 event->cb = event_cb; 1504 event->cb = event_cb;
1505 event->one_shot = one_shot; 1505 event->one_shot = one_shot;
1506 event->data = data; 1506 event->data = data;
1507 event->osdc = osdc; 1507 event->osdc = osdc;
1508 INIT_LIST_HEAD(&event->osd_node); 1508 INIT_LIST_HEAD(&event->osd_node);
1509 RB_CLEAR_NODE(&event->node); 1509 RB_CLEAR_NODE(&event->node);
1510 kref_init(&event->kref); /* one ref for us */ 1510 kref_init(&event->kref); /* one ref for us */
1511 kref_get(&event->kref); /* one ref for the caller */ 1511 kref_get(&event->kref); /* one ref for the caller */
1512 init_completion(&event->completion); 1512 init_completion(&event->completion);
1513 1513
1514 spin_lock(&osdc->event_lock); 1514 spin_lock(&osdc->event_lock);
1515 event->cookie = ++osdc->event_count; 1515 event->cookie = ++osdc->event_count;
1516 __insert_event(osdc, event); 1516 __insert_event(osdc, event);
1517 spin_unlock(&osdc->event_lock); 1517 spin_unlock(&osdc->event_lock);
1518 1518
1519 *pevent = event; 1519 *pevent = event;
1520 return 0; 1520 return 0;
1521 } 1521 }
1522 EXPORT_SYMBOL(ceph_osdc_create_event); 1522 EXPORT_SYMBOL(ceph_osdc_create_event);
1523 1523
1524 void ceph_osdc_cancel_event(struct ceph_osd_event *event) 1524 void ceph_osdc_cancel_event(struct ceph_osd_event *event)
1525 { 1525 {
1526 struct ceph_osd_client *osdc = event->osdc; 1526 struct ceph_osd_client *osdc = event->osdc;
1527 1527
1528 dout("cancel_event %p\n", event); 1528 dout("cancel_event %p\n", event);
1529 spin_lock(&osdc->event_lock); 1529 spin_lock(&osdc->event_lock);
1530 __remove_event(event); 1530 __remove_event(event);
1531 spin_unlock(&osdc->event_lock); 1531 spin_unlock(&osdc->event_lock);
1532 ceph_osdc_put_event(event); /* caller's */ 1532 ceph_osdc_put_event(event); /* caller's */
1533 } 1533 }
1534 EXPORT_SYMBOL(ceph_osdc_cancel_event); 1534 EXPORT_SYMBOL(ceph_osdc_cancel_event);
1535 1535
1536 1536
1537 static void do_event_work(struct work_struct *work) 1537 static void do_event_work(struct work_struct *work)
1538 { 1538 {
1539 struct ceph_osd_event_work *event_work = 1539 struct ceph_osd_event_work *event_work =
1540 container_of(work, struct ceph_osd_event_work, work); 1540 container_of(work, struct ceph_osd_event_work, work);
1541 struct ceph_osd_event *event = event_work->event; 1541 struct ceph_osd_event *event = event_work->event;
1542 u64 ver = event_work->ver; 1542 u64 ver = event_work->ver;
1543 u64 notify_id = event_work->notify_id; 1543 u64 notify_id = event_work->notify_id;
1544 u8 opcode = event_work->opcode; 1544 u8 opcode = event_work->opcode;
1545 1545
1546 dout("do_event_work completing %p\n", event); 1546 dout("do_event_work completing %p\n", event);
1547 event->cb(ver, notify_id, opcode, event->data); 1547 event->cb(ver, notify_id, opcode, event->data);
1548 complete(&event->completion); 1548 complete(&event->completion);
1549 dout("do_event_work completed %p\n", event); 1549 dout("do_event_work completed %p\n", event);
1550 ceph_osdc_put_event(event); 1550 ceph_osdc_put_event(event);
1551 kfree(event_work); 1551 kfree(event_work);
1552 } 1552 }
1553 1553
1554 1554
1555 /* 1555 /*
1556 * Process osd watch notifications 1556 * Process osd watch notifications
1557 */ 1557 */
1558 void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg) 1558 void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
1559 { 1559 {
1560 void *p, *end; 1560 void *p, *end;
1561 u8 proto_ver; 1561 u8 proto_ver;
1562 u64 cookie, ver, notify_id; 1562 u64 cookie, ver, notify_id;
1563 u8 opcode; 1563 u8 opcode;
1564 struct ceph_osd_event *event; 1564 struct ceph_osd_event *event;
1565 struct ceph_osd_event_work *event_work; 1565 struct ceph_osd_event_work *event_work;
1566 1566
1567 p = msg->front.iov_base; 1567 p = msg->front.iov_base;
1568 end = p + msg->front.iov_len; 1568 end = p + msg->front.iov_len;
1569 1569
1570 ceph_decode_8_safe(&p, end, proto_ver, bad); 1570 ceph_decode_8_safe(&p, end, proto_ver, bad);
1571 ceph_decode_8_safe(&p, end, opcode, bad); 1571 ceph_decode_8_safe(&p, end, opcode, bad);
1572 ceph_decode_64_safe(&p, end, cookie, bad); 1572 ceph_decode_64_safe(&p, end, cookie, bad);
1573 ceph_decode_64_safe(&p, end, ver, bad); 1573 ceph_decode_64_safe(&p, end, ver, bad);
1574 ceph_decode_64_safe(&p, end, notify_id, bad); 1574 ceph_decode_64_safe(&p, end, notify_id, bad);
1575 1575
1576 spin_lock(&osdc->event_lock); 1576 spin_lock(&osdc->event_lock);
1577 event = __find_event(osdc, cookie); 1577 event = __find_event(osdc, cookie);
1578 if (event) { 1578 if (event) {
1579 get_event(event); 1579 get_event(event);
1580 if (event->one_shot) 1580 if (event->one_shot)
1581 __remove_event(event); 1581 __remove_event(event);
1582 } 1582 }
1583 spin_unlock(&osdc->event_lock); 1583 spin_unlock(&osdc->event_lock);
1584 dout("handle_watch_notify cookie %lld ver %lld event %p\n", 1584 dout("handle_watch_notify cookie %lld ver %lld event %p\n",
1585 cookie, ver, event); 1585 cookie, ver, event);
1586 if (event) { 1586 if (event) {
1587 event_work = kmalloc(sizeof(*event_work), GFP_NOIO); 1587 event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
1588 if (!event_work) { 1588 if (!event_work) {
1589 dout("ERROR: could not allocate event_work\n"); 1589 dout("ERROR: could not allocate event_work\n");
1590 goto done_err; 1590 goto done_err;
1591 } 1591 }
1592 INIT_WORK(&event_work->work, do_event_work); 1592 INIT_WORK(&event_work->work, do_event_work);
1593 event_work->event = event; 1593 event_work->event = event;
1594 event_work->ver = ver; 1594 event_work->ver = ver;
1595 event_work->notify_id = notify_id; 1595 event_work->notify_id = notify_id;
1596 event_work->opcode = opcode; 1596 event_work->opcode = opcode;
1597 if (!queue_work(osdc->notify_wq, &event_work->work)) { 1597 if (!queue_work(osdc->notify_wq, &event_work->work)) {
1598 dout("WARNING: failed to queue notify event work\n"); 1598 dout("WARNING: failed to queue notify event work\n");
1599 goto done_err; 1599 goto done_err;
1600 } 1600 }
1601 } 1601 }
1602 1602
1603 return; 1603 return;
1604 1604
1605 done_err: 1605 done_err:
1606 complete(&event->completion); 1606 complete(&event->completion);
1607 ceph_osdc_put_event(event); 1607 ceph_osdc_put_event(event);
1608 return; 1608 return;
1609 1609
1610 bad: 1610 bad:
1611 pr_err("osdc handle_watch_notify corrupt msg\n"); 1611 pr_err("osdc handle_watch_notify corrupt msg\n");
1612 return; 1612 return;
1613 } 1613 }
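
Written out as an illustrative packed struct, the CEPH_MSG_WATCH_NOTIFY front payload decoded above looks like this (this type is not in the Ceph headers; the real code decodes field by field with ceph_decode_*_safe):

	#include <stdint.h>

	struct watch_notify_front {
		uint8_t  proto_ver;
		uint8_t  opcode;
		uint64_t cookie;	/* keys the lookup in osdc->event_tree */
		uint64_t ver;
		uint64_t notify_id;
	} __attribute__((packed));
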
1614 1614
1615 int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) 1615 int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout)
1616 { 1616 {
1617 int err; 1617 int err;
1618 1618
1619 dout("wait_event %p\n", event); 1619 dout("wait_event %p\n", event);
1620 err = wait_for_completion_interruptible_timeout(&event->completion, 1620 err = wait_for_completion_interruptible_timeout(&event->completion,
1621 timeout * HZ); 1621 timeout * HZ);
1622 ceph_osdc_put_event(event); 1622 ceph_osdc_put_event(event);
1623 if (err > 0) 1623 if (err > 0)
1624 err = 0; 1624 err = 0;
1625 dout("wait_event %p returns %d\n", event, err); 1625 dout("wait_event %p returns %d\n", event, err);
1626 return err; 1626 return err;
1627 } 1627 }
1628 EXPORT_SYMBOL(ceph_osdc_wait_event); 1628 EXPORT_SYMBOL(ceph_osdc_wait_event);
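
A minimal sketch of driving the event API exported above; osdc is assumed to be initialized, and my_notify_cb plus the 30-second timeout are hypothetical. The callback runs from osdc->notify_wq via do_event_work().

	static void my_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
	{
		/* react to the watch/notify message here */
	}

	static int wait_for_one_notify(struct ceph_osd_client *osdc)
	{
		struct ceph_osd_event *event;
		int ret;

		ret = ceph_osdc_create_event(osdc, my_notify_cb,
					     1 /* one_shot */, NULL, &event);
		if (ret)
			return ret;

		/* ... embed event->cookie in a watch/notify osd op ... */

		return ceph_osdc_wait_event(event, 30); /* drops caller's ref */
	}
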
1629 1629
1630 /* 1630 /*
1631 * Register request, send initial attempt. 1631 * Register request, send initial attempt.
1632 */ 1632 */
1633 int ceph_osdc_start_request(struct ceph_osd_client *osdc, 1633 int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1634 struct ceph_osd_request *req, 1634 struct ceph_osd_request *req,
1635 bool nofail) 1635 bool nofail)
1636 { 1636 {
1637 int rc = 0; 1637 int rc = 0;
1638 1638
1639 req->r_request->pages = req->r_pages; 1639 req->r_request->pages = req->r_pages;
1640 req->r_request->nr_pages = req->r_num_pages; 1640 req->r_request->nr_pages = req->r_num_pages;
1641 #ifdef CONFIG_BLOCK 1641 #ifdef CONFIG_BLOCK
1642 req->r_request->bio = req->r_bio; 1642 req->r_request->bio = req->r_bio;
1643 #endif 1643 #endif
1644 req->r_request->trail = &req->r_trail; 1644 req->r_request->trail = &req->r_trail;
1645 1645
1646 register_request(osdc, req); 1646 register_request(osdc, req);
1647 1647
1648 down_read(&osdc->map_sem); 1648 down_read(&osdc->map_sem);
1649 mutex_lock(&osdc->request_mutex); 1649 mutex_lock(&osdc->request_mutex);
1650 /* 1650 /*
1651 * a racing kick_requests() may have sent the message for us 1651 * a racing kick_requests() may have sent the message for us
1652 * while we dropped request_mutex above, so only send now if 1652 * while we dropped request_mutex above, so only send now if
1653 * the request still hasn't been touched yet. 1653 * the request still hasn't been touched yet.
1654 */ 1654 */
1655 if (req->r_sent == 0) { 1655 if (req->r_sent == 0) {
1656 rc = __map_request(osdc, req, 0); 1656 rc = __map_request(osdc, req, 0);
1657 if (rc < 0) { 1657 if (rc < 0) {
1658 if (nofail) { 1658 if (nofail) {
1659 dout("osdc_start_request failed map, " 1659 dout("osdc_start_request failed map, "
1660 " will retry %lld\n", req->r_tid); 1660 " will retry %lld\n", req->r_tid);
1661 rc = 0; 1661 rc = 0;
1662 } 1662 }
1663 goto out_unlock; 1663 goto out_unlock;
1664 } 1664 }
1665 if (req->r_osd == NULL) { 1665 if (req->r_osd == NULL) {
1666 dout("send_request %p no up osds in pg\n", req); 1666 dout("send_request %p no up osds in pg\n", req);
1667 ceph_monc_request_next_osdmap(&osdc->client->monc); 1667 ceph_monc_request_next_osdmap(&osdc->client->monc);
1668 } else { 1668 } else {
1669 __send_request(osdc, req); 1669 __send_request(osdc, req);
1670 } 1670 }
1671 rc = 0; 1671 rc = 0;
1672 } 1672 }
1673 1673
1674 out_unlock: 1674 out_unlock:
1675 mutex_unlock(&osdc->request_mutex); 1675 mutex_unlock(&osdc->request_mutex);
1676 up_read(&osdc->map_sem); 1676 up_read(&osdc->map_sem);
1677 return rc; 1677 return rc;
1678 } 1678 }
1679 EXPORT_SYMBOL(ceph_osdc_start_request); 1679 EXPORT_SYMBOL(ceph_osdc_start_request);
1680 1680
1681 /* 1681 /*
1682 * wait for a request to complete 1682 * wait for a request to complete
1683 */ 1683 */
1684 int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 1684 int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1685 struct ceph_osd_request *req) 1685 struct ceph_osd_request *req)
1686 { 1686 {
1687 int rc; 1687 int rc;
1688 1688
1689 rc = wait_for_completion_interruptible(&req->r_completion); 1689 rc = wait_for_completion_interruptible(&req->r_completion);
1690 if (rc < 0) { 1690 if (rc < 0) {
1691 mutex_lock(&osdc->request_mutex); 1691 mutex_lock(&osdc->request_mutex);
1692 __cancel_request(req); 1692 __cancel_request(req);
1693 __unregister_request(osdc, req); 1693 __unregister_request(osdc, req);
1694 mutex_unlock(&osdc->request_mutex); 1694 mutex_unlock(&osdc->request_mutex);
1695 complete_request(req); 1695 complete_request(req);
1696 dout("wait_request tid %llu canceled/timed out\n", req->r_tid); 1696 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1697 return rc; 1697 return rc;
1698 } 1698 }
1699 1699
1700 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); 1700 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1701 return req->r_result; 1701 return req->r_result;
1702 } 1702 }
1703 EXPORT_SYMBOL(ceph_osdc_wait_request); 1703 EXPORT_SYMBOL(ceph_osdc_wait_request);
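
A sketch of the canonical synchronous pattern built from the two calls above; it mirrors what ceph_osdc_readpages()/_writepages() below do:

	static int submit_and_wait(struct ceph_osd_client *osdc,
				   struct ceph_osd_request *req)
	{
		int rc;

		rc = ceph_osdc_start_request(osdc, req, false);
		if (!rc)
			rc = ceph_osdc_wait_request(osdc, req); /* r_result */
		ceph_osdc_put_request(req);
		return rc;
	}
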
1704 1704
1705 /* 1705 /*
1706 * sync - wait for all in-flight requests to flush. avoid starvation. 1706 * sync - wait for all in-flight requests to flush. avoid starvation.
1707 */ 1707 */
1708 void ceph_osdc_sync(struct ceph_osd_client *osdc) 1708 void ceph_osdc_sync(struct ceph_osd_client *osdc)
1709 { 1709 {
1710 struct ceph_osd_request *req; 1710 struct ceph_osd_request *req;
1711 u64 last_tid, next_tid = 0; 1711 u64 last_tid, next_tid = 0;
1712 1712
1713 mutex_lock(&osdc->request_mutex); 1713 mutex_lock(&osdc->request_mutex);
1714 last_tid = osdc->last_tid; 1714 last_tid = osdc->last_tid;
1715 while (1) { 1715 while (1) {
1716 req = __lookup_request_ge(osdc, next_tid); 1716 req = __lookup_request_ge(osdc, next_tid);
1717 if (!req) 1717 if (!req)
1718 break; 1718 break;
1719 if (req->r_tid > last_tid) 1719 if (req->r_tid > last_tid)
1720 break; 1720 break;
1721 1721
1722 next_tid = req->r_tid + 1; 1722 next_tid = req->r_tid + 1;
1723 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) 1723 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1724 continue; 1724 continue;
1725 1725
1726 ceph_osdc_get_request(req); 1726 ceph_osdc_get_request(req);
1727 mutex_unlock(&osdc->request_mutex); 1727 mutex_unlock(&osdc->request_mutex);
1728 dout("sync waiting on tid %llu (last is %llu)\n", 1728 dout("sync waiting on tid %llu (last is %llu)\n",
1729 req->r_tid, last_tid); 1729 req->r_tid, last_tid);
1730 wait_for_completion(&req->r_safe_completion); 1730 wait_for_completion(&req->r_safe_completion);
1731 mutex_lock(&osdc->request_mutex); 1731 mutex_lock(&osdc->request_mutex);
1732 ceph_osdc_put_request(req); 1732 ceph_osdc_put_request(req);
1733 } 1733 }
1734 mutex_unlock(&osdc->request_mutex); 1734 mutex_unlock(&osdc->request_mutex);
1735 dout("sync done (thru tid %llu)\n", last_tid); 1735 dout("sync done (thru tid %llu)\n", last_tid);
1736 } 1736 }
1737 EXPORT_SYMBOL(ceph_osdc_sync); 1737 EXPORT_SYMBOL(ceph_osdc_sync);
1738 1738
1739 /* 1739 /*
1740 * init, shutdown 1740 * init, shutdown
1741 */ 1741 */
1742 int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) 1742 int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1743 { 1743 {
1744 int err; 1744 int err;
1745 1745
1746 dout("init\n"); 1746 dout("init\n");
1747 osdc->client = client; 1747 osdc->client = client;
1748 osdc->osdmap = NULL; 1748 osdc->osdmap = NULL;
1749 init_rwsem(&osdc->map_sem); 1749 init_rwsem(&osdc->map_sem);
1750 init_completion(&osdc->map_waiters); 1750 init_completion(&osdc->map_waiters);
1751 osdc->last_requested_map = 0; 1751 osdc->last_requested_map = 0;
1752 mutex_init(&osdc->request_mutex); 1752 mutex_init(&osdc->request_mutex);
1753 osdc->last_tid = 0; 1753 osdc->last_tid = 0;
1754 osdc->osds = RB_ROOT; 1754 osdc->osds = RB_ROOT;
1755 INIT_LIST_HEAD(&osdc->osd_lru); 1755 INIT_LIST_HEAD(&osdc->osd_lru);
1756 osdc->requests = RB_ROOT; 1756 osdc->requests = RB_ROOT;
1757 INIT_LIST_HEAD(&osdc->req_lru); 1757 INIT_LIST_HEAD(&osdc->req_lru);
1758 INIT_LIST_HEAD(&osdc->req_unsent); 1758 INIT_LIST_HEAD(&osdc->req_unsent);
1759 INIT_LIST_HEAD(&osdc->req_notarget); 1759 INIT_LIST_HEAD(&osdc->req_notarget);
1760 INIT_LIST_HEAD(&osdc->req_linger); 1760 INIT_LIST_HEAD(&osdc->req_linger);
1761 osdc->num_requests = 0; 1761 osdc->num_requests = 0;
1762 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); 1762 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1763 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); 1763 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1764 spin_lock_init(&osdc->event_lock); 1764 spin_lock_init(&osdc->event_lock);
1765 osdc->event_tree = RB_ROOT; 1765 osdc->event_tree = RB_ROOT;
1766 osdc->event_count = 0; 1766 osdc->event_count = 0;
1767 1767
1768 schedule_delayed_work(&osdc->osds_timeout_work, 1768 schedule_delayed_work(&osdc->osds_timeout_work,
1769 round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); 1769 round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
1770 1770
1771 err = -ENOMEM; 1771 err = -ENOMEM;
1772 osdc->req_mempool = mempool_create_kmalloc_pool(10, 1772 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1773 sizeof(struct ceph_osd_request)); 1773 sizeof(struct ceph_osd_request));
1774 if (!osdc->req_mempool) 1774 if (!osdc->req_mempool)
1775 goto out; 1775 goto out;
1776 1776
1777 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, 1777 err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
1778 OSD_OP_FRONT_LEN, 10, true, 1778 OSD_OP_FRONT_LEN, 10, true,
1779 "osd_op"); 1779 "osd_op");
1780 if (err < 0) 1780 if (err < 0)
1781 goto out_mempool; 1781 goto out_mempool;
1782 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, 1782 err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
1783 OSD_OPREPLY_FRONT_LEN, 10, true, 1783 OSD_OPREPLY_FRONT_LEN, 10, true,
1784 "osd_op_reply"); 1784 "osd_op_reply");
1785 if (err < 0) 1785 if (err < 0)
1786 goto out_msgpool; 1786 goto out_msgpool;
1787 1787
1788 osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); 1788 osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify");
1789 if (IS_ERR(osdc->notify_wq)) { 1789 if (IS_ERR(osdc->notify_wq)) {
1790 err = PTR_ERR(osdc->notify_wq); 1790 err = PTR_ERR(osdc->notify_wq);
1791 osdc->notify_wq = NULL; 1791 osdc->notify_wq = NULL;
1792 goto out_msgpool; 1792 goto out_msgpool;
1793 } 1793 }
1794 return 0; 1794 return 0;
1795 1795
1796 out_msgpool: 1796 out_msgpool:
1797 ceph_msgpool_destroy(&osdc->msgpool_op); 1797 ceph_msgpool_destroy(&osdc->msgpool_op);
1798 out_mempool: 1798 out_mempool:
1799 mempool_destroy(osdc->req_mempool); 1799 mempool_destroy(osdc->req_mempool);
1800 out: 1800 out:
1801 return err; 1801 return err;
1802 } 1802 }
1803 EXPORT_SYMBOL(ceph_osdc_init); 1803 EXPORT_SYMBOL(ceph_osdc_init);
1804 1804
1805 void ceph_osdc_stop(struct ceph_osd_client *osdc) 1805 void ceph_osdc_stop(struct ceph_osd_client *osdc)
1806 { 1806 {
1807 flush_workqueue(osdc->notify_wq); 1807 flush_workqueue(osdc->notify_wq);
1808 destroy_workqueue(osdc->notify_wq); 1808 destroy_workqueue(osdc->notify_wq);
1809 cancel_delayed_work_sync(&osdc->timeout_work); 1809 cancel_delayed_work_sync(&osdc->timeout_work);
1810 cancel_delayed_work_sync(&osdc->osds_timeout_work); 1810 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1811 if (osdc->osdmap) { 1811 if (osdc->osdmap) {
1812 ceph_osdmap_destroy(osdc->osdmap); 1812 ceph_osdmap_destroy(osdc->osdmap);
1813 osdc->osdmap = NULL; 1813 osdc->osdmap = NULL;
1814 } 1814 }
1815 remove_all_osds(osdc); 1815 remove_all_osds(osdc);
1816 mempool_destroy(osdc->req_mempool); 1816 mempool_destroy(osdc->req_mempool);
1817 ceph_msgpool_destroy(&osdc->msgpool_op); 1817 ceph_msgpool_destroy(&osdc->msgpool_op);
1818 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 1818 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1819 } 1819 }
1820 EXPORT_SYMBOL(ceph_osdc_stop); 1820 EXPORT_SYMBOL(ceph_osdc_stop);
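
A sketch of the expected pairing of the two entry points above, assuming osdc is embedded in struct ceph_client as it is in-tree:

	static int bring_up_osdc(struct ceph_client *client)
	{
		int err;

		err = ceph_osdc_init(&client->osdc, client);
		if (err)
			return err;	/* init cleans up after itself */

		/* ... issue requests, then on teardown: ... */

		ceph_osdc_stop(&client->osdc);
		return 0;
	}
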
1821 1821
1822 /* 1822 /*
1823 * Read some contiguous pages. If we cross a stripe boundary, shorten 1823 * Read some contiguous pages. If we cross a stripe boundary, shorten
1824 * *plen. Return number of bytes read, or error. 1824 * *plen. Return number of bytes read, or error.
1825 */ 1825 */
1826 int ceph_osdc_readpages(struct ceph_osd_client *osdc, 1826 int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1827 struct ceph_vino vino, struct ceph_file_layout *layout, 1827 struct ceph_vino vino, struct ceph_file_layout *layout,
1828 u64 off, u64 *plen, 1828 u64 off, u64 *plen,
1829 u32 truncate_seq, u64 truncate_size, 1829 u32 truncate_seq, u64 truncate_size,
1830 struct page **pages, int num_pages, int page_align) 1830 struct page **pages, int num_pages, int page_align)
1831 { 1831 {
1832 struct ceph_osd_request *req; 1832 struct ceph_osd_request *req;
1833 int rc = 0; 1833 int rc = 0;
1834 1834
1835 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, 1835 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1836 vino.snap, off, *plen); 1836 vino.snap, off, *plen);
1837 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1837 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1838 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1838 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1839 NULL, 0, truncate_seq, truncate_size, NULL, 1839 NULL, 0, truncate_seq, truncate_size, NULL,
1840 false, 1, page_align); 1840 false, 1, page_align);
1841 if (IS_ERR(req)) 1841 if (IS_ERR(req))
1842 return PTR_ERR(req); 1842 return PTR_ERR(req);
1843 1843
1844 /* it may be a short read due to an object boundary */ 1844 /* it may be a short read due to an object boundary */
1845 req->r_pages = pages; 1845 req->r_pages = pages;
1846 1846
1847 dout("readpages final extent is %llu~%llu (%d pages align %d)\n", 1847 dout("readpages final extent is %llu~%llu (%d pages align %d)\n",
1848 off, *plen, req->r_num_pages, page_align); 1848 off, *plen, req->r_num_pages, page_align);
1849 1849
1850 rc = ceph_osdc_start_request(osdc, req, false); 1850 rc = ceph_osdc_start_request(osdc, req, false);
1851 if (!rc) 1851 if (!rc)
1852 rc = ceph_osdc_wait_request(osdc, req); 1852 rc = ceph_osdc_wait_request(osdc, req);
1853 1853
1854 ceph_osdc_put_request(req); 1854 ceph_osdc_put_request(req);
1855 dout("readpages result %d\n", rc); 1855 dout("readpages result %d\n", rc);
1856 return rc; 1856 return rc;
1857 } 1857 }
1858 EXPORT_SYMBOL(ceph_osdc_readpages); 1858 EXPORT_SYMBOL(ceph_osdc_readpages);
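
A hypothetical caller sketch for ceph_osdc_readpages(); vino, layout and the page are assumed to come from the filesystem side. len may come back shortened at a striping boundary, hence the pointer.

	static int read_one_page(struct ceph_osd_client *osdc,
				 struct ceph_vino vino,
				 struct ceph_file_layout *layout,
				 u64 off, struct page *page)
	{
		u64 len = PAGE_CACHE_SIZE;

		return ceph_osdc_readpages(osdc, vino, layout, off, &len,
					   0, 0,	/* truncate seq/size */
					   &page, 1, 0); /* 1 page, aligned */
	}
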
1859 1859
1860 /* 1860 /*
1861 * do a synchronous write on N pages 1861 * do a synchronous write on N pages
1862 */ 1862 */
1863 int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, 1863 int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1864 struct ceph_file_layout *layout, 1864 struct ceph_file_layout *layout,
1865 struct ceph_snap_context *snapc, 1865 struct ceph_snap_context *snapc,
1866 u64 off, u64 len, 1866 u64 off, u64 len,
1867 u32 truncate_seq, u64 truncate_size, 1867 u32 truncate_seq, u64 truncate_size,
1868 struct timespec *mtime, 1868 struct timespec *mtime,
1869 struct page **pages, int num_pages, 1869 struct page **pages, int num_pages,
1870 int flags, int do_sync, bool nofail) 1870 int flags, int do_sync)
1871 { 1871 {
1872 struct ceph_osd_request *req; 1872 struct ceph_osd_request *req;
1873 int rc = 0; 1873 int rc = 0;
1874 int page_align = off & ~PAGE_MASK; 1874 int page_align = off & ~PAGE_MASK;
1875 1875
1876 BUG_ON(vino.snap != CEPH_NOSNAP); 1876 BUG_ON(vino.snap != CEPH_NOSNAP);
1877 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1877 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1878 CEPH_OSD_OP_WRITE, 1878 CEPH_OSD_OP_WRITE,
1879 flags | CEPH_OSD_FLAG_ONDISK | 1879 flags | CEPH_OSD_FLAG_ONDISK |
1880 CEPH_OSD_FLAG_WRITE, 1880 CEPH_OSD_FLAG_WRITE,
1881 snapc, do_sync, 1881 snapc, do_sync,
1882 truncate_seq, truncate_size, mtime, 1882 truncate_seq, truncate_size, mtime,
1883 nofail, 1, page_align); 1883 true, 1, page_align);
1884 if (IS_ERR(req)) 1884 if (IS_ERR(req))
1885 return PTR_ERR(req); 1885 return PTR_ERR(req);
1886 1886
1887 /* it may be a short write due to an object boundary */ 1887 /* it may be a short write due to an object boundary */
1888 req->r_pages = pages; 1888 req->r_pages = pages;
1889 dout("writepages %llu~%llu (%d pages)\n", off, len, 1889 dout("writepages %llu~%llu (%d pages)\n", off, len,
1890 req->r_num_pages); 1890 req->r_num_pages);
1891 1891
1892 rc = ceph_osdc_start_request(osdc, req, nofail); 1892 rc = ceph_osdc_start_request(osdc, req, true);
1893 if (!rc) 1893 if (!rc)
1894 rc = ceph_osdc_wait_request(osdc, req); 1894 rc = ceph_osdc_wait_request(osdc, req);
1895 1895
1896 ceph_osdc_put_request(req); 1896 ceph_osdc_put_request(req);
1897 if (rc == 0) 1897 if (rc == 0)
1898 rc = len; 1898 rc = len;
1899 dout("writepages result %d\n", rc); 1899 dout("writepages result %d\n", rc);
1900 return rc; 1900 return rc;
1901 } 1901 }
1902 EXPORT_SYMBOL(ceph_osdc_writepages); 1902 EXPORT_SYMBOL(ceph_osdc_writepages);
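
The effect of this commit at ceph_osdc_writepages()'s lone call site (the writepage path in fs/ceph/addr.c) is simply dropping the trailing constant; schematically, with argument names illustrative:

	/* before this commit -- nofail was always true: */
	rc = ceph_osdc_writepages(osdc, vino, layout, snapc, off, len,
				  truncate_seq, truncate_size, mtime,
				  &page, 1, flags, do_sync, true);

	/* after -- true is hardwired inside ceph_osdc_writepages(): */
	rc = ceph_osdc_writepages(osdc, vino, layout, snapc, off, len,
				  truncate_seq, truncate_size, mtime,
				  &page, 1, flags, do_sync);
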
1903 1903
1904 /* 1904 /*
1905 * handle incoming message 1905 * handle incoming message
1906 */ 1906 */
1907 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 1907 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1908 { 1908 {
1909 struct ceph_osd *osd = con->private; 1909 struct ceph_osd *osd = con->private;
1910 struct ceph_osd_client *osdc; 1910 struct ceph_osd_client *osdc;
1911 int type = le16_to_cpu(msg->hdr.type); 1911 int type = le16_to_cpu(msg->hdr.type);
1912 1912
1913 if (!osd) 1913 if (!osd)
1914 goto out; 1914 goto out;
1915 osdc = osd->o_osdc; 1915 osdc = osd->o_osdc;
1916 1916
1917 switch (type) { 1917 switch (type) {
1918 case CEPH_MSG_OSD_MAP: 1918 case CEPH_MSG_OSD_MAP:
1919 ceph_osdc_handle_map(osdc, msg); 1919 ceph_osdc_handle_map(osdc, msg);
1920 break; 1920 break;
1921 case CEPH_MSG_OSD_OPREPLY: 1921 case CEPH_MSG_OSD_OPREPLY:
1922 handle_reply(osdc, msg, con); 1922 handle_reply(osdc, msg, con);
1923 break; 1923 break;
1924 case CEPH_MSG_WATCH_NOTIFY: 1924 case CEPH_MSG_WATCH_NOTIFY:
1925 handle_watch_notify(osdc, msg); 1925 handle_watch_notify(osdc, msg);
1926 break; 1926 break;
1927 1927
1928 default: 1928 default:
1929 pr_err("received unknown message type %d %s\n", type, 1929 pr_err("received unknown message type %d %s\n", type,
1930 ceph_msg_type_name(type)); 1930 ceph_msg_type_name(type));
1931 } 1931 }
1932 out: 1932 out:
1933 ceph_msg_put(msg); 1933 ceph_msg_put(msg);
1934 } 1934 }
1935 1935
1936 /* 1936 /*
1937 * lookup and return message for incoming reply. set up reply message 1937 * lookup and return message for incoming reply. set up reply message
1938 * pages. 1938 * pages.
1939 */ 1939 */
1940 static struct ceph_msg *get_reply(struct ceph_connection *con, 1940 static struct ceph_msg *get_reply(struct ceph_connection *con,
1941 struct ceph_msg_header *hdr, 1941 struct ceph_msg_header *hdr,
1942 int *skip) 1942 int *skip)
1943 { 1943 {
1944 struct ceph_osd *osd = con->private; 1944 struct ceph_osd *osd = con->private;
1945 struct ceph_osd_client *osdc = osd->o_osdc; 1945 struct ceph_osd_client *osdc = osd->o_osdc;
1946 struct ceph_msg *m; 1946 struct ceph_msg *m;
1947 struct ceph_osd_request *req; 1947 struct ceph_osd_request *req;
1948 int front = le32_to_cpu(hdr->front_len); 1948 int front = le32_to_cpu(hdr->front_len);
1949 int data_len = le32_to_cpu(hdr->data_len); 1949 int data_len = le32_to_cpu(hdr->data_len);
1950 u64 tid; 1950 u64 tid;
1951 1951
1952 tid = le64_to_cpu(hdr->tid); 1952 tid = le64_to_cpu(hdr->tid);
1953 mutex_lock(&osdc->request_mutex); 1953 mutex_lock(&osdc->request_mutex);
1954 req = __lookup_request(osdc, tid); 1954 req = __lookup_request(osdc, tid);
1955 if (!req) { 1955 if (!req) {
1956 *skip = 1; 1956 *skip = 1;
1957 m = NULL; 1957 m = NULL;
1958 dout("get_reply unknown tid %llu from osd%d\n", tid, 1958 dout("get_reply unknown tid %llu from osd%d\n", tid,
1959 osd->o_osd); 1959 osd->o_osd);
1960 goto out; 1960 goto out;
1961 } 1961 }
1962 1962
1963 if (req->r_con_filling_msg) { 1963 if (req->r_con_filling_msg) {
1964 dout("%s revoking msg %p from old con %p\n", __func__, 1964 dout("%s revoking msg %p from old con %p\n", __func__,
1965 req->r_reply, req->r_con_filling_msg); 1965 req->r_reply, req->r_con_filling_msg);
1966 ceph_msg_revoke_incoming(req->r_reply); 1966 ceph_msg_revoke_incoming(req->r_reply);
1967 req->r_con_filling_msg->ops->put(req->r_con_filling_msg); 1967 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
1968 req->r_con_filling_msg = NULL; 1968 req->r_con_filling_msg = NULL;
1969 } 1969 }
1970 1970
1971 if (front > req->r_reply->front.iov_len) { 1971 if (front > req->r_reply->front.iov_len) {
1972 pr_warning("get_reply front %d > preallocated %d\n", 1972 pr_warning("get_reply front %d > preallocated %d\n",
1973 front, (int)req->r_reply->front.iov_len); 1973 front, (int)req->r_reply->front.iov_len);
1974 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false); 1974 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false);
1975 if (!m) 1975 if (!m)
1976 goto out; 1976 goto out;
1977 ceph_msg_put(req->r_reply); 1977 ceph_msg_put(req->r_reply);
1978 req->r_reply = m; 1978 req->r_reply = m;
1979 } 1979 }
1980 m = ceph_msg_get(req->r_reply); 1980 m = ceph_msg_get(req->r_reply);
1981 1981
1982 if (data_len > 0) { 1982 if (data_len > 0) {
1983 int want = calc_pages_for(req->r_page_alignment, data_len); 1983 int want = calc_pages_for(req->r_page_alignment, data_len);
1984 1984
1985 if (req->r_pages && unlikely(req->r_num_pages < want)) { 1985 if (req->r_pages && unlikely(req->r_num_pages < want)) {
1986 pr_warning("tid %lld reply has %d bytes %d pages, we" 1986 pr_warning("tid %lld reply has %d bytes %d pages, we"
1987 " had only %d pages ready\n", tid, data_len, 1987 " had only %d pages ready\n", tid, data_len,
1988 want, req->r_num_pages); 1988 want, req->r_num_pages);
1989 *skip = 1; 1989 *skip = 1;
1990 ceph_msg_put(m); 1990 ceph_msg_put(m);
1991 m = NULL; 1991 m = NULL;
1992 goto out; 1992 goto out;
1993 } 1993 }
1994 m->pages = req->r_pages; 1994 m->pages = req->r_pages;
1995 m->nr_pages = req->r_num_pages; 1995 m->nr_pages = req->r_num_pages;
1996 m->page_alignment = req->r_page_alignment; 1996 m->page_alignment = req->r_page_alignment;
1997 #ifdef CONFIG_BLOCK 1997 #ifdef CONFIG_BLOCK
1998 m->bio = req->r_bio; 1998 m->bio = req->r_bio;
1999 #endif 1999 #endif
2000 } 2000 }
2001 *skip = 0; 2001 *skip = 0;
2002 req->r_con_filling_msg = con->ops->get(con); 2002 req->r_con_filling_msg = con->ops->get(con);
2003 dout("get_reply tid %lld %p\n", tid, m); 2003 dout("get_reply tid %lld %p\n", tid, m);
2004 2004
2005 out: 2005 out:
2006 mutex_unlock(&osdc->request_mutex); 2006 mutex_unlock(&osdc->request_mutex);
2007 return m; 2007 return m;
2008 2008
2009 } 2009 }
2010 2010
2011 static struct ceph_msg *alloc_msg(struct ceph_connection *con, 2011 static struct ceph_msg *alloc_msg(struct ceph_connection *con,
2012 struct ceph_msg_header *hdr, 2012 struct ceph_msg_header *hdr,
2013 int *skip) 2013 int *skip)
2014 { 2014 {
2015 struct ceph_osd *osd = con->private; 2015 struct ceph_osd *osd = con->private;
2016 int type = le16_to_cpu(hdr->type); 2016 int type = le16_to_cpu(hdr->type);
2017 int front = le32_to_cpu(hdr->front_len); 2017 int front = le32_to_cpu(hdr->front_len);
2018 2018
2019 *skip = 0; 2019 *skip = 0;
2020 switch (type) { 2020 switch (type) {
2021 case CEPH_MSG_OSD_MAP: 2021 case CEPH_MSG_OSD_MAP:
2022 case CEPH_MSG_WATCH_NOTIFY: 2022 case CEPH_MSG_WATCH_NOTIFY:
2023 return ceph_msg_new(type, front, GFP_NOFS, false); 2023 return ceph_msg_new(type, front, GFP_NOFS, false);
2024 case CEPH_MSG_OSD_OPREPLY: 2024 case CEPH_MSG_OSD_OPREPLY:
2025 return get_reply(con, hdr, skip); 2025 return get_reply(con, hdr, skip);
2026 default: 2026 default:
2027 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, 2027 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
2028 osd->o_osd); 2028 osd->o_osd);
2029 *skip = 1; 2029 *skip = 1;
2030 return NULL; 2030 return NULL;
2031 } 2031 }
2032 } 2032 }
2033 2033
2034 /* 2034 /*
2035 * Wrappers to refcount containing ceph_osd struct 2035 * Wrappers to refcount containing ceph_osd struct
2036 */ 2036 */
2037 static struct ceph_connection *get_osd_con(struct ceph_connection *con) 2037 static struct ceph_connection *get_osd_con(struct ceph_connection *con)
2038 { 2038 {
2039 struct ceph_osd *osd = con->private; 2039 struct ceph_osd *osd = con->private;
2040 if (get_osd(osd)) 2040 if (get_osd(osd))
2041 return con; 2041 return con;
2042 return NULL; 2042 return NULL;
2043 } 2043 }
2044 2044
2045 static void put_osd_con(struct ceph_connection *con) 2045 static void put_osd_con(struct ceph_connection *con)
2046 { 2046 {
2047 struct ceph_osd *osd = con->private; 2047 struct ceph_osd *osd = con->private;
2048 put_osd(osd); 2048 put_osd(osd);
2049 } 2049 }
2050 2050
2051 /* 2051 /*
2052 * authentication 2052 * authentication
2053 */ 2053 */
2054 /* 2054 /*
2055 * Note: returned pointer is the address of a structure that's 2055 * Note: returned pointer is the address of a structure that's
2056 * managed separately. Caller must *not* attempt to free it. 2056 * managed separately. Caller must *not* attempt to free it.
2057 */ 2057 */
2058 static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, 2058 static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
2059 int *proto, int force_new) 2059 int *proto, int force_new)
2060 { 2060 {
2061 struct ceph_osd *o = con->private; 2061 struct ceph_osd *o = con->private;
2062 struct ceph_osd_client *osdc = o->o_osdc; 2062 struct ceph_osd_client *osdc = o->o_osdc;
2063 struct ceph_auth_client *ac = osdc->client->monc.auth; 2063 struct ceph_auth_client *ac = osdc->client->monc.auth;
2064 struct ceph_auth_handshake *auth = &o->o_auth; 2064 struct ceph_auth_handshake *auth = &o->o_auth;
2065 2065
2066 if (force_new && auth->authorizer) { 2066 if (force_new && auth->authorizer) {
2067 if (ac->ops && ac->ops->destroy_authorizer) 2067 if (ac->ops && ac->ops->destroy_authorizer)
2068 ac->ops->destroy_authorizer(ac, auth->authorizer); 2068 ac->ops->destroy_authorizer(ac, auth->authorizer);
2069 auth->authorizer = NULL; 2069 auth->authorizer = NULL;
2070 } 2070 }
2071 if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { 2071 if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) {
2072 int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, 2072 int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
2073 auth); 2073 auth);
2074 if (ret) 2074 if (ret)
2075 return ERR_PTR(ret); 2075 return ERR_PTR(ret);
2076 } 2076 }
2077 *proto = ac->protocol; 2077 *proto = ac->protocol;
2078 2078
2079 return auth; 2079 return auth;
2080 } 2080 }
2081 2081
2082 2082
2083 static int verify_authorizer_reply(struct ceph_connection *con, int len) 2083 static int verify_authorizer_reply(struct ceph_connection *con, int len)
2084 { 2084 {
2085 struct ceph_osd *o = con->private; 2085 struct ceph_osd *o = con->private;
2086 struct ceph_osd_client *osdc = o->o_osdc; 2086 struct ceph_osd_client *osdc = o->o_osdc;
2087 struct ceph_auth_client *ac = osdc->client->monc.auth; 2087 struct ceph_auth_client *ac = osdc->client->monc.auth;
2088 2088
2089 /* 2089 /*
2090 * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, 2090 * XXX If ac->ops or ac->ops->verify_authorizer_reply is null,
2091 * XXX which do we do: succeed or fail? 2091 * XXX which do we do: succeed or fail?
2092 */ 2092 */
2093 return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); 2093 return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len);
2094 } 2094 }
2095 2095
2096 static int invalidate_authorizer(struct ceph_connection *con) 2096 static int invalidate_authorizer(struct ceph_connection *con)
2097 { 2097 {
2098 struct ceph_osd *o = con->private; 2098 struct ceph_osd *o = con->private;
2099 struct ceph_osd_client *osdc = o->o_osdc; 2099 struct ceph_osd_client *osdc = o->o_osdc;
2100 struct ceph_auth_client *ac = osdc->client->monc.auth; 2100 struct ceph_auth_client *ac = osdc->client->monc.auth;
2101 2101
2102 if (ac->ops && ac->ops->invalidate_authorizer) 2102 if (ac->ops && ac->ops->invalidate_authorizer)
2103 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); 2103 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
2104 2104
2105 return ceph_monc_validate_auth(&osdc->client->monc); 2105 return ceph_monc_validate_auth(&osdc->client->monc);
2106 } 2106 }
2107 2107
2108 static const struct ceph_connection_operations osd_con_ops = { 2108 static const struct ceph_connection_operations osd_con_ops = {
2109 .get = get_osd_con, 2109 .get = get_osd_con,
2110 .put = put_osd_con, 2110 .put = put_osd_con,
2111 .dispatch = dispatch, 2111 .dispatch = dispatch,
2112 .get_authorizer = get_authorizer, 2112 .get_authorizer = get_authorizer,
2113 .verify_authorizer_reply = verify_authorizer_reply, 2113 .verify_authorizer_reply = verify_authorizer_reply,
2114 .invalidate_authorizer = invalidate_authorizer, 2114 .invalidate_authorizer = invalidate_authorizer,
2115 .alloc_msg = alloc_msg, 2115 .alloc_msg = alloc_msg,
2116 .fault = osd_reset, 2116 .fault = osd_reset,
2117 }; 2117 };
2118 2118