Commit 87f979d390f9ecfa3d0038a9f9a002a62f8a1895
1 parent
e7e319a9c5
Exists in
smarc-l5.0.0_1.0.0-ga
and in
5 other branches
ceph: kill ceph_osdc_writepages() "nofail" parameter
There is only one caller of ceph_osdc_writepages(), and it always passes the value true as its "nofail" argument. Get rid of that argument and replace its use in ceph_osdc_writepages() with the constant value true.

This and a number of cleanup patches that follow resolve:
    http://tracker.ceph.com/issues/4126

Signed-off-by: Alex Elder <elder@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
Showing 3 changed files with 5 additions and 5 deletions Inline Diff
fs/ceph/addr.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/backing-dev.h> | 3 | #include <linux/backing-dev.h> |
4 | #include <linux/fs.h> | 4 | #include <linux/fs.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/pagemap.h> | 6 | #include <linux/pagemap.h> |
7 | #include <linux/writeback.h> /* generic_writepages */ | 7 | #include <linux/writeback.h> /* generic_writepages */ |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/pagevec.h> | 9 | #include <linux/pagevec.h> |
10 | #include <linux/task_io_accounting_ops.h> | 10 | #include <linux/task_io_accounting_ops.h> |
11 | 11 | ||
12 | #include "super.h" | 12 | #include "super.h" |
13 | #include "mds_client.h" | 13 | #include "mds_client.h" |
14 | #include <linux/ceph/osd_client.h> | 14 | #include <linux/ceph/osd_client.h> |
15 | 15 | ||
16 | /* | 16 | /* |
17 | * Ceph address space ops. | 17 | * Ceph address space ops. |
18 | * | 18 | * |
19 | * There are a few funny things going on here. | 19 | * There are a few funny things going on here. |
20 | * | 20 | * |
21 | * The page->private field is used to reference a struct | 21 | * The page->private field is used to reference a struct |
22 | * ceph_snap_context for _every_ dirty page. This indicates which | 22 | * ceph_snap_context for _every_ dirty page. This indicates which |
23 | * snapshot the page was logically dirtied in, and thus which snap | 23 | * snapshot the page was logically dirtied in, and thus which snap |
24 | * context needs to be associated with the osd write during writeback. | 24 | * context needs to be associated with the osd write during writeback. |
25 | * | 25 | * |
26 | * Similarly, struct ceph_inode_info maintains a set of counters to | 26 | * Similarly, struct ceph_inode_info maintains a set of counters to |
27 | * count dirty pages on the inode. In the absence of snapshots, | 27 | * count dirty pages on the inode. In the absence of snapshots, |
28 | * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. | 28 | * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. |
29 | * | 29 | * |
30 | * When a snapshot is taken (that is, when the client receives | 30 | * When a snapshot is taken (that is, when the client receives |
31 | * notification that a snapshot was taken), each inode with caps and | 31 | * notification that a snapshot was taken), each inode with caps and |
32 | * with dirty pages (dirty pages implies there is a cap) gets a new | 32 | * with dirty pages (dirty pages implies there is a cap) gets a new |
33 | * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending | 33 | * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending |
34 | * order, new snaps go to the tail). The i_wrbuffer_ref_head count is | 34 | * order, new snaps go to the tail). The i_wrbuffer_ref_head count is |
35 | * moved to capsnap->dirty. (Unless a sync write is currently in | 35 | * moved to capsnap->dirty. (Unless a sync write is currently in |
36 | * progress. In that case, the capsnap is said to be "pending", new | 36 | * progress. In that case, the capsnap is said to be "pending", new |
37 | * writes cannot start, and the capsnap isn't "finalized" until the | 37 | * writes cannot start, and the capsnap isn't "finalized" until the |
38 | * write completes (or fails) and a final size/mtime for the inode for | 38 | * write completes (or fails) and a final size/mtime for the inode for |
39 | * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. | 39 | * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. |
40 | * | 40 | * |
41 | * On writeback, we must submit writes to the osd IN SNAP ORDER. So, | 41 | * On writeback, we must submit writes to the osd IN SNAP ORDER. So, |
42 | * we look for the first capsnap in i_cap_snaps and write out pages in | 42 | * we look for the first capsnap in i_cap_snaps and write out pages in |
43 | * that snap context _only_. Then we move on to the next capsnap, | 43 | * that snap context _only_. Then we move on to the next capsnap, |
44 | * eventually reaching the "live" or "head" context (i.e., pages that | 44 | * eventually reaching the "live" or "head" context (i.e., pages that |
45 | * are not yet snapped) and are writing the most recently dirtied | 45 | * are not yet snapped) and are writing the most recently dirtied |
46 | * pages. | 46 | * pages. |
47 | * | 47 | * |
48 | * Invalidate and so forth must take care to ensure the dirty page | 48 | * Invalidate and so forth must take care to ensure the dirty page |
49 | * accounting is preserved. | 49 | * accounting is preserved. |
50 | */ | 50 | */ |
51 | 51 | ||
/*
 * Writeback congestion thresholds, derived from the congestion_kb
 * value (kilobytes).  CONGESTION_ON_THRESH converts kilobytes into a
 * page count by shifting with (PAGE_SHIFT - 10); CONGESTION_OFF_THRESH
 * is set 25% below the on-threshold, providing hysteresis so congestion
 * state does not flap around a single boundary value.
 */
#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
#define CONGESTION_OFF_THRESH(congestion_kb)				\
	(CONGESTION_ON_THRESH(congestion_kb) -				\
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
56 | 56 | ||
57 | static inline struct ceph_snap_context *page_snap_context(struct page *page) | 57 | static inline struct ceph_snap_context *page_snap_context(struct page *page) |
58 | { | 58 | { |
59 | if (PagePrivate(page)) | 59 | if (PagePrivate(page)) |
60 | return (void *)page->private; | 60 | return (void *)page->private; |
61 | return NULL; | 61 | return NULL; |
62 | } | 62 | } |
63 | 63 | ||
/*
 * Dirty a page.  Optimistically adjust accounting, on the assumption
 * that we won't race with invalidate.  If we do, readjust.
 *
 * Returns 1 if the page transitioned to dirty, 0 if it was already
 * dirty.  On success the page's snap context reference is stored in
 * page->private and the inode's dirty-buffer counters are bumped.
 */
static int ceph_set_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode;
	struct ceph_inode_info *ci;
	int undo = 0;
	struct ceph_snap_context *snapc;

	/* no mapping: nothing to account, just set the flag */
	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	if (TestSetPageDirty(page)) {
		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
		     mapping->host, page, page->index);
		return 0;
	}

	inode = mapping->host;
	ci = ceph_inode(inode);

	/*
	 * Note that we're grabbing a snapc ref here without holding
	 * any locks!
	 */
	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);

	/* dirty the head */
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_head_snapc == NULL)
		ci->i_head_snapc = ceph_get_snap_context(snapc);
	++ci->i_wrbuffer_ref_head;
	/* first dirty buffer on this inode pins an inode reference */
	if (ci->i_wrbuffer_ref == 0)
		ihold(inode);
	++ci->i_wrbuffer_ref;
	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
	     "snapc %p seq %lld (%d snaps)\n",
	     mapping->host, page, page->index,
	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
	     snapc, snapc->seq, snapc->num_snaps);
	spin_unlock(&ci->i_ceph_lock);

	/* now adjust page */
	spin_lock_irq(&mapping->tree_lock);
	if (page->mapping) { /* Race with truncate? */
		WARN_ON_ONCE(!PageUptodate(page));
		account_page_dirtied(page, page->mapping);
		radix_tree_tag_set(&mapping->page_tree,
				page_index(page), PAGECACHE_TAG_DIRTY);

		/*
		 * Reference snap context in page->private.  Also set
		 * PagePrivate so that we get invalidatepage callback.
		 */
		page->private = (unsigned long)snapc;
		SetPagePrivate(page);
	} else {
		/* page lost its mapping between the checks above */
		dout("ANON set_page_dirty %p (raced truncate?)\n", page);
		undo = 1;
	}

	spin_unlock_irq(&mapping->tree_lock);

	if (undo)
		/* whoops, we failed to dirty the page */
		ceph_put_wrbuffer_cap_refs(ci, 1, snapc);

	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	BUG_ON(!PageDirty(page));
	return 1;
}
140 | 140 | ||
141 | /* | 141 | /* |
142 | * If we are truncating the full page (i.e. offset == 0), adjust the | 142 | * If we are truncating the full page (i.e. offset == 0), adjust the |
143 | * dirty page counters appropriately. Only called if there is private | 143 | * dirty page counters appropriately. Only called if there is private |
144 | * data on the page. | 144 | * data on the page. |
145 | */ | 145 | */ |
146 | static void ceph_invalidatepage(struct page *page, unsigned long offset) | 146 | static void ceph_invalidatepage(struct page *page, unsigned long offset) |
147 | { | 147 | { |
148 | struct inode *inode; | 148 | struct inode *inode; |
149 | struct ceph_inode_info *ci; | 149 | struct ceph_inode_info *ci; |
150 | struct ceph_snap_context *snapc = page_snap_context(page); | 150 | struct ceph_snap_context *snapc = page_snap_context(page); |
151 | 151 | ||
152 | BUG_ON(!PageLocked(page)); | 152 | BUG_ON(!PageLocked(page)); |
153 | BUG_ON(!PagePrivate(page)); | 153 | BUG_ON(!PagePrivate(page)); |
154 | BUG_ON(!page->mapping); | 154 | BUG_ON(!page->mapping); |
155 | 155 | ||
156 | inode = page->mapping->host; | 156 | inode = page->mapping->host; |
157 | 157 | ||
158 | /* | 158 | /* |
159 | * We can get non-dirty pages here due to races between | 159 | * We can get non-dirty pages here due to races between |
160 | * set_page_dirty and truncate_complete_page; just spit out a | 160 | * set_page_dirty and truncate_complete_page; just spit out a |
161 | * warning, in case we end up with accounting problems later. | 161 | * warning, in case we end up with accounting problems later. |
162 | */ | 162 | */ |
163 | if (!PageDirty(page)) | 163 | if (!PageDirty(page)) |
164 | pr_err("%p invalidatepage %p page not dirty\n", inode, page); | 164 | pr_err("%p invalidatepage %p page not dirty\n", inode, page); |
165 | 165 | ||
166 | if (offset == 0) | 166 | if (offset == 0) |
167 | ClearPageChecked(page); | 167 | ClearPageChecked(page); |
168 | 168 | ||
169 | ci = ceph_inode(inode); | 169 | ci = ceph_inode(inode); |
170 | if (offset == 0) { | 170 | if (offset == 0) { |
171 | dout("%p invalidatepage %p idx %lu full dirty page %lu\n", | 171 | dout("%p invalidatepage %p idx %lu full dirty page %lu\n", |
172 | inode, page, page->index, offset); | 172 | inode, page, page->index, offset); |
173 | ceph_put_wrbuffer_cap_refs(ci, 1, snapc); | 173 | ceph_put_wrbuffer_cap_refs(ci, 1, snapc); |
174 | ceph_put_snap_context(snapc); | 174 | ceph_put_snap_context(snapc); |
175 | page->private = 0; | 175 | page->private = 0; |
176 | ClearPagePrivate(page); | 176 | ClearPagePrivate(page); |
177 | } else { | 177 | } else { |
178 | dout("%p invalidatepage %p idx %lu partial dirty page\n", | 178 | dout("%p invalidatepage %p idx %lu partial dirty page\n", |
179 | inode, page, page->index); | 179 | inode, page, page->index); |
180 | } | 180 | } |
181 | } | 181 | } |
182 | 182 | ||
183 | /* just a sanity check */ | 183 | /* just a sanity check */ |
184 | static int ceph_releasepage(struct page *page, gfp_t g) | 184 | static int ceph_releasepage(struct page *page, gfp_t g) |
185 | { | 185 | { |
186 | struct inode *inode = page->mapping ? page->mapping->host : NULL; | 186 | struct inode *inode = page->mapping ? page->mapping->host : NULL; |
187 | dout("%p releasepage %p idx %lu\n", inode, page, page->index); | 187 | dout("%p releasepage %p idx %lu\n", inode, page, page->index); |
188 | WARN_ON(PageDirty(page)); | 188 | WARN_ON(PageDirty(page)); |
189 | WARN_ON(PagePrivate(page)); | 189 | WARN_ON(PagePrivate(page)); |
190 | return 0; | 190 | return 0; |
191 | } | 191 | } |
192 | 192 | ||
193 | /* | 193 | /* |
194 | * read a single page, without unlocking it. | 194 | * read a single page, without unlocking it. |
195 | */ | 195 | */ |
196 | static int readpage_nounlock(struct file *filp, struct page *page) | 196 | static int readpage_nounlock(struct file *filp, struct page *page) |
197 | { | 197 | { |
198 | struct inode *inode = filp->f_dentry->d_inode; | 198 | struct inode *inode = filp->f_dentry->d_inode; |
199 | struct ceph_inode_info *ci = ceph_inode(inode); | 199 | struct ceph_inode_info *ci = ceph_inode(inode); |
200 | struct ceph_osd_client *osdc = | 200 | struct ceph_osd_client *osdc = |
201 | &ceph_inode_to_client(inode)->client->osdc; | 201 | &ceph_inode_to_client(inode)->client->osdc; |
202 | int err = 0; | 202 | int err = 0; |
203 | u64 len = PAGE_CACHE_SIZE; | 203 | u64 len = PAGE_CACHE_SIZE; |
204 | 204 | ||
205 | dout("readpage inode %p file %p page %p index %lu\n", | 205 | dout("readpage inode %p file %p page %p index %lu\n", |
206 | inode, filp, page, page->index); | 206 | inode, filp, page, page->index); |
207 | err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, | 207 | err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, |
208 | (u64) page_offset(page), &len, | 208 | (u64) page_offset(page), &len, |
209 | ci->i_truncate_seq, ci->i_truncate_size, | 209 | ci->i_truncate_seq, ci->i_truncate_size, |
210 | &page, 1, 0); | 210 | &page, 1, 0); |
211 | if (err == -ENOENT) | 211 | if (err == -ENOENT) |
212 | err = 0; | 212 | err = 0; |
213 | if (err < 0) { | 213 | if (err < 0) { |
214 | SetPageError(page); | 214 | SetPageError(page); |
215 | goto out; | 215 | goto out; |
216 | } else if (err < PAGE_CACHE_SIZE) { | 216 | } else if (err < PAGE_CACHE_SIZE) { |
217 | /* zero fill remainder of page */ | 217 | /* zero fill remainder of page */ |
218 | zero_user_segment(page, err, PAGE_CACHE_SIZE); | 218 | zero_user_segment(page, err, PAGE_CACHE_SIZE); |
219 | } | 219 | } |
220 | SetPageUptodate(page); | 220 | SetPageUptodate(page); |
221 | 221 | ||
222 | out: | 222 | out: |
223 | return err < 0 ? err : 0; | 223 | return err < 0 ? err : 0; |
224 | } | 224 | } |
225 | 225 | ||
/* Read one page and unlock it, as ->readpage requires. */
static int ceph_readpage(struct file *filp, struct page *page)
{
	int ret;

	ret = readpage_nounlock(filp, page);
	unlock_page(page);
	return ret;
}
232 | 232 | ||
233 | /* | 233 | /* |
234 | * Finish an async read(ahead) op. | 234 | * Finish an async read(ahead) op. |
235 | */ | 235 | */ |
236 | static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) | 236 | static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) |
237 | { | 237 | { |
238 | struct inode *inode = req->r_inode; | 238 | struct inode *inode = req->r_inode; |
239 | struct ceph_osd_reply_head *replyhead; | 239 | struct ceph_osd_reply_head *replyhead; |
240 | int rc, bytes; | 240 | int rc, bytes; |
241 | int i; | 241 | int i; |
242 | 242 | ||
243 | /* parse reply */ | 243 | /* parse reply */ |
244 | replyhead = msg->front.iov_base; | 244 | replyhead = msg->front.iov_base; |
245 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); | 245 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); |
246 | rc = le32_to_cpu(replyhead->result); | 246 | rc = le32_to_cpu(replyhead->result); |
247 | bytes = le32_to_cpu(msg->hdr.data_len); | 247 | bytes = le32_to_cpu(msg->hdr.data_len); |
248 | 248 | ||
249 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); | 249 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); |
250 | 250 | ||
251 | /* unlock all pages, zeroing any data we didn't read */ | 251 | /* unlock all pages, zeroing any data we didn't read */ |
252 | for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { | 252 | for (i = 0; i < req->r_num_pages; i++, bytes -= PAGE_CACHE_SIZE) { |
253 | struct page *page = req->r_pages[i]; | 253 | struct page *page = req->r_pages[i]; |
254 | 254 | ||
255 | if (bytes < (int)PAGE_CACHE_SIZE) { | 255 | if (bytes < (int)PAGE_CACHE_SIZE) { |
256 | /* zero (remainder of) page */ | 256 | /* zero (remainder of) page */ |
257 | int s = bytes < 0 ? 0 : bytes; | 257 | int s = bytes < 0 ? 0 : bytes; |
258 | zero_user_segment(page, s, PAGE_CACHE_SIZE); | 258 | zero_user_segment(page, s, PAGE_CACHE_SIZE); |
259 | } | 259 | } |
260 | dout("finish_read %p uptodate %p idx %lu\n", inode, page, | 260 | dout("finish_read %p uptodate %p idx %lu\n", inode, page, |
261 | page->index); | 261 | page->index); |
262 | flush_dcache_page(page); | 262 | flush_dcache_page(page); |
263 | SetPageUptodate(page); | 263 | SetPageUptodate(page); |
264 | unlock_page(page); | 264 | unlock_page(page); |
265 | page_cache_release(page); | 265 | page_cache_release(page); |
266 | } | 266 | } |
267 | kfree(req->r_pages); | 267 | kfree(req->r_pages); |
268 | } | 268 | } |
269 | 269 | ||
/* Unlock each page in the vector, front to back. */
static void ceph_unlock_page_vector(struct page **pages, int num_pages)
{
	int remaining;

	for (remaining = num_pages; remaining > 0; remaining--)
		unlock_page(*pages++);
}
277 | 277 | ||
/*
 * start an async read(ahead) operation.  return nr_pages we submitted
 * a read for on success, or negative error code.
 *
 * Pages successfully submitted are removed from page_list and inserted
 * into the page cache; pages we do not touch are left on the list for
 * the caller to clean up.
 */
static int start_read(struct inode *inode, struct list_head *page_list, int max)
{
	struct ceph_osd_client *osdc =
		&ceph_inode_to_client(inode)->client->osdc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct page *page = list_entry(page_list->prev, struct page, lru);
	struct ceph_osd_request *req;
	u64 off;
	u64 len;
	int i;
	struct page **pages;
	pgoff_t next_index;
	int nr_pages = 0;
	int ret;

	off = (u64) page_offset(page);

	/* count pages: walk the list while indices stay contiguous */
	next_index = page->index;
	list_for_each_entry_reverse(page, page_list, lru) {
		if (page->index != next_index)
			break;
		nr_pages++;
		next_index++;
		if (max && nr_pages == max)
			break;
	}
	len = nr_pages << PAGE_CACHE_SHIFT;
	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
	     off, len);

	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
				    off, &len,
				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
				    NULL, 0,
				    ci->i_truncate_seq, ci->i_truncate_size,
				    NULL, false, 1, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* build page vector */
	/* len was passed by pointer above and is re-read here —
	 * presumably ceph_osdc_new_request() may shrink it; TODO confirm */
	nr_pages = len >> PAGE_CACHE_SHIFT;
	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
	ret = -ENOMEM;
	if (!pages)
		goto out;
	for (i = 0; i < nr_pages; ++i) {
		page = list_entry(page_list->prev, struct page, lru);
		BUG_ON(PageLocked(page));
		list_del(&page->lru);

		dout("start_read %p adding %p idx %lu\n", inode, page,
		     page->index);
		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
					  GFP_NOFS)) {
			page_cache_release(page);
			dout("start_read %p add_to_page_cache failed %p\n",
			     inode, page);
			/* only the first i pages made it into pages[] */
			nr_pages = i;
			goto out_pages;
		}
		pages[i] = page;
	}
	req->r_pages = pages;
	req->r_num_pages = nr_pages;
	req->r_callback = finish_read;
	req->r_inode = inode;

	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto out_pages;
	ceph_osdc_put_request(req);
	return nr_pages;

out_pages:
	/* drop the pages we claimed, then the request itself */
	ceph_unlock_page_vector(pages, nr_pages);
	ceph_release_page_vector(pages, nr_pages);
out:
	ceph_osdc_put_request(req);
	return ret;
}
364 | 364 | ||
365 | 365 | ||
366 | /* | 366 | /* |
367 | * Read multiple pages. Leave pages we don't read + unlock in page_list; | 367 | * Read multiple pages. Leave pages we don't read + unlock in page_list; |
368 | * the caller (VM) cleans them up. | 368 | * the caller (VM) cleans them up. |
369 | */ | 369 | */ |
370 | static int ceph_readpages(struct file *file, struct address_space *mapping, | 370 | static int ceph_readpages(struct file *file, struct address_space *mapping, |
371 | struct list_head *page_list, unsigned nr_pages) | 371 | struct list_head *page_list, unsigned nr_pages) |
372 | { | 372 | { |
373 | struct inode *inode = file->f_dentry->d_inode; | 373 | struct inode *inode = file->f_dentry->d_inode; |
374 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 374 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
375 | int rc = 0; | 375 | int rc = 0; |
376 | int max = 0; | 376 | int max = 0; |
377 | 377 | ||
378 | if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) | 378 | if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE) |
379 | max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) | 379 | max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1) |
380 | >> PAGE_SHIFT; | 380 | >> PAGE_SHIFT; |
381 | 381 | ||
382 | dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, | 382 | dout("readpages %p file %p nr_pages %d max %d\n", inode, file, nr_pages, |
383 | max); | 383 | max); |
384 | while (!list_empty(page_list)) { | 384 | while (!list_empty(page_list)) { |
385 | rc = start_read(inode, page_list, max); | 385 | rc = start_read(inode, page_list, max); |
386 | if (rc < 0) | 386 | if (rc < 0) |
387 | goto out; | 387 | goto out; |
388 | BUG_ON(rc == 0); | 388 | BUG_ON(rc == 0); |
389 | } | 389 | } |
390 | out: | 390 | out: |
391 | dout("readpages %p file %p ret %d\n", inode, file, rc); | 391 | dout("readpages %p file %p ret %d\n", inode, file, rc); |
392 | return rc; | 392 | return rc; |
393 | } | 393 | } |
394 | 394 | ||
395 | /* | 395 | /* |
396 | * Get ref for the oldest snapc for an inode with dirty data... that is, the | 396 | * Get ref for the oldest snapc for an inode with dirty data... that is, the |
397 | * only snap context we are allowed to write back. | 397 | * only snap context we are allowed to write back. |
398 | */ | 398 | */ |
399 | static struct ceph_snap_context *get_oldest_context(struct inode *inode, | 399 | static struct ceph_snap_context *get_oldest_context(struct inode *inode, |
400 | u64 *snap_size) | 400 | u64 *snap_size) |
401 | { | 401 | { |
402 | struct ceph_inode_info *ci = ceph_inode(inode); | 402 | struct ceph_inode_info *ci = ceph_inode(inode); |
403 | struct ceph_snap_context *snapc = NULL; | 403 | struct ceph_snap_context *snapc = NULL; |
404 | struct ceph_cap_snap *capsnap = NULL; | 404 | struct ceph_cap_snap *capsnap = NULL; |
405 | 405 | ||
406 | spin_lock(&ci->i_ceph_lock); | 406 | spin_lock(&ci->i_ceph_lock); |
407 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { | 407 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { |
408 | dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, | 408 | dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, |
409 | capsnap->context, capsnap->dirty_pages); | 409 | capsnap->context, capsnap->dirty_pages); |
410 | if (capsnap->dirty_pages) { | 410 | if (capsnap->dirty_pages) { |
411 | snapc = ceph_get_snap_context(capsnap->context); | 411 | snapc = ceph_get_snap_context(capsnap->context); |
412 | if (snap_size) | 412 | if (snap_size) |
413 | *snap_size = capsnap->size; | 413 | *snap_size = capsnap->size; |
414 | break; | 414 | break; |
415 | } | 415 | } |
416 | } | 416 | } |
417 | if (!snapc && ci->i_wrbuffer_ref_head) { | 417 | if (!snapc && ci->i_wrbuffer_ref_head) { |
418 | snapc = ceph_get_snap_context(ci->i_head_snapc); | 418 | snapc = ceph_get_snap_context(ci->i_head_snapc); |
419 | dout(" head snapc %p has %d dirty pages\n", | 419 | dout(" head snapc %p has %d dirty pages\n", |
420 | snapc, ci->i_wrbuffer_ref_head); | 420 | snapc, ci->i_wrbuffer_ref_head); |
421 | } | 421 | } |
422 | spin_unlock(&ci->i_ceph_lock); | 422 | spin_unlock(&ci->i_ceph_lock); |
423 | return snapc; | 423 | return snapc; |
424 | } | 424 | } |
425 | 425 | ||
426 | /* | 426 | /* |
427 | * Write a single page, but leave the page locked. | 427 | * Write a single page, but leave the page locked. |
428 | * | 428 | * |
429 | * If we get a write error, set the page error bit, but still adjust the | 429 | * If we get a write error, set the page error bit, but still adjust the |
430 | * dirty page accounting (i.e., page is no longer dirty). | 430 | * dirty page accounting (i.e., page is no longer dirty). |
431 | */ | 431 | */ |
432 | static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | 432 | static int writepage_nounlock(struct page *page, struct writeback_control *wbc) |
433 | { | 433 | { |
434 | struct inode *inode; | 434 | struct inode *inode; |
435 | struct ceph_inode_info *ci; | 435 | struct ceph_inode_info *ci; |
436 | struct ceph_fs_client *fsc; | 436 | struct ceph_fs_client *fsc; |
437 | struct ceph_osd_client *osdc; | 437 | struct ceph_osd_client *osdc; |
438 | loff_t page_off = page_offset(page); | 438 | loff_t page_off = page_offset(page); |
439 | int len = PAGE_CACHE_SIZE; | 439 | int len = PAGE_CACHE_SIZE; |
440 | loff_t i_size; | 440 | loff_t i_size; |
441 | int err = 0; | 441 | int err = 0; |
442 | struct ceph_snap_context *snapc, *oldest; | 442 | struct ceph_snap_context *snapc, *oldest; |
443 | u64 snap_size = 0; | 443 | u64 snap_size = 0; |
444 | long writeback_stat; | 444 | long writeback_stat; |
445 | 445 | ||
446 | dout("writepage %p idx %lu\n", page, page->index); | 446 | dout("writepage %p idx %lu\n", page, page->index); |
447 | 447 | ||
448 | if (!page->mapping || !page->mapping->host) { | 448 | if (!page->mapping || !page->mapping->host) { |
449 | dout("writepage %p - no mapping\n", page); | 449 | dout("writepage %p - no mapping\n", page); |
450 | return -EFAULT; | 450 | return -EFAULT; |
451 | } | 451 | } |
452 | inode = page->mapping->host; | 452 | inode = page->mapping->host; |
453 | ci = ceph_inode(inode); | 453 | ci = ceph_inode(inode); |
454 | fsc = ceph_inode_to_client(inode); | 454 | fsc = ceph_inode_to_client(inode); |
455 | osdc = &fsc->client->osdc; | 455 | osdc = &fsc->client->osdc; |
456 | 456 | ||
457 | /* verify this is a writeable snap context */ | 457 | /* verify this is a writeable snap context */ |
458 | snapc = page_snap_context(page); | 458 | snapc = page_snap_context(page); |
459 | if (snapc == NULL) { | 459 | if (snapc == NULL) { |
460 | dout("writepage %p page %p not dirty?\n", inode, page); | 460 | dout("writepage %p page %p not dirty?\n", inode, page); |
461 | goto out; | 461 | goto out; |
462 | } | 462 | } |
463 | oldest = get_oldest_context(inode, &snap_size); | 463 | oldest = get_oldest_context(inode, &snap_size); |
464 | if (snapc->seq > oldest->seq) { | 464 | if (snapc->seq > oldest->seq) { |
465 | dout("writepage %p page %p snapc %p not writeable - noop\n", | 465 | dout("writepage %p page %p snapc %p not writeable - noop\n", |
466 | inode, page, snapc); | 466 | inode, page, snapc); |
467 | /* we should only noop if called by kswapd */ | 467 | /* we should only noop if called by kswapd */ |
468 | WARN_ON((current->flags & PF_MEMALLOC) == 0); | 468 | WARN_ON((current->flags & PF_MEMALLOC) == 0); |
469 | ceph_put_snap_context(oldest); | 469 | ceph_put_snap_context(oldest); |
470 | goto out; | 470 | goto out; |
471 | } | 471 | } |
472 | ceph_put_snap_context(oldest); | 472 | ceph_put_snap_context(oldest); |
473 | 473 | ||
474 | /* is this a partial page at end of file? */ | 474 | /* is this a partial page at end of file? */ |
475 | if (snap_size) | 475 | if (snap_size) |
476 | i_size = snap_size; | 476 | i_size = snap_size; |
477 | else | 477 | else |
478 | i_size = i_size_read(inode); | 478 | i_size = i_size_read(inode); |
479 | if (i_size < page_off + len) | 479 | if (i_size < page_off + len) |
480 | len = i_size - page_off; | 480 | len = i_size - page_off; |
481 | 481 | ||
482 | dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", | 482 | dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", |
483 | inode, page, page->index, page_off, len, snapc); | 483 | inode, page, page->index, page_off, len, snapc); |
484 | 484 | ||
485 | writeback_stat = atomic_long_inc_return(&fsc->writeback_count); | 485 | writeback_stat = atomic_long_inc_return(&fsc->writeback_count); |
486 | if (writeback_stat > | 486 | if (writeback_stat > |
487 | CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) | 487 | CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) |
488 | set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); | 488 | set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); |
489 | 489 | ||
490 | set_page_writeback(page); | 490 | set_page_writeback(page); |
491 | err = ceph_osdc_writepages(osdc, ceph_vino(inode), | 491 | err = ceph_osdc_writepages(osdc, ceph_vino(inode), |
492 | &ci->i_layout, snapc, | 492 | &ci->i_layout, snapc, |
493 | page_off, len, | 493 | page_off, len, |
494 | ci->i_truncate_seq, ci->i_truncate_size, | 494 | ci->i_truncate_seq, ci->i_truncate_size, |
495 | &inode->i_mtime, | 495 | &inode->i_mtime, |
496 | &page, 1, 0, 0, true); | 496 | &page, 1, 0, 0); |
497 | if (err < 0) { | 497 | if (err < 0) { |
498 | dout("writepage setting page/mapping error %d %p\n", err, page); | 498 | dout("writepage setting page/mapping error %d %p\n", err, page); |
499 | SetPageError(page); | 499 | SetPageError(page); |
500 | mapping_set_error(&inode->i_data, err); | 500 | mapping_set_error(&inode->i_data, err); |
501 | if (wbc) | 501 | if (wbc) |
502 | wbc->pages_skipped++; | 502 | wbc->pages_skipped++; |
503 | } else { | 503 | } else { |
504 | dout("writepage cleaned page %p\n", page); | 504 | dout("writepage cleaned page %p\n", page); |
505 | err = 0; /* vfs expects us to return 0 */ | 505 | err = 0; /* vfs expects us to return 0 */ |
506 | } | 506 | } |
507 | page->private = 0; | 507 | page->private = 0; |
508 | ClearPagePrivate(page); | 508 | ClearPagePrivate(page); |
509 | end_page_writeback(page); | 509 | end_page_writeback(page); |
510 | ceph_put_wrbuffer_cap_refs(ci, 1, snapc); | 510 | ceph_put_wrbuffer_cap_refs(ci, 1, snapc); |
511 | ceph_put_snap_context(snapc); /* page's reference */ | 511 | ceph_put_snap_context(snapc); /* page's reference */ |
512 | out: | 512 | out: |
513 | return err; | 513 | return err; |
514 | } | 514 | } |
515 | 515 | ||
516 | static int ceph_writepage(struct page *page, struct writeback_control *wbc) | 516 | static int ceph_writepage(struct page *page, struct writeback_control *wbc) |
517 | { | 517 | { |
518 | int err; | 518 | int err; |
519 | struct inode *inode = page->mapping->host; | 519 | struct inode *inode = page->mapping->host; |
520 | BUG_ON(!inode); | 520 | BUG_ON(!inode); |
521 | ihold(inode); | 521 | ihold(inode); |
522 | err = writepage_nounlock(page, wbc); | 522 | err = writepage_nounlock(page, wbc); |
523 | unlock_page(page); | 523 | unlock_page(page); |
524 | iput(inode); | 524 | iput(inode); |
525 | return err; | 525 | return err; |
526 | } | 526 | } |
527 | 527 | ||
528 | 528 | ||
529 | /* | 529 | /* |
530 | * lame release_pages helper. release_pages() isn't exported to | 530 | * lame release_pages helper. release_pages() isn't exported to |
531 | * modules. | 531 | * modules. |
532 | */ | 532 | */ |
533 | static void ceph_release_pages(struct page **pages, int num) | 533 | static void ceph_release_pages(struct page **pages, int num) |
534 | { | 534 | { |
535 | struct pagevec pvec; | 535 | struct pagevec pvec; |
536 | int i; | 536 | int i; |
537 | 537 | ||
538 | pagevec_init(&pvec, 0); | 538 | pagevec_init(&pvec, 0); |
539 | for (i = 0; i < num; i++) { | 539 | for (i = 0; i < num; i++) { |
540 | if (pagevec_add(&pvec, pages[i]) == 0) | 540 | if (pagevec_add(&pvec, pages[i]) == 0) |
541 | pagevec_release(&pvec); | 541 | pagevec_release(&pvec); |
542 | } | 542 | } |
543 | pagevec_release(&pvec); | 543 | pagevec_release(&pvec); |
544 | } | 544 | } |
545 | 545 | ||
546 | 546 | ||
547 | /* | 547 | /* |
548 | * async writeback completion handler. | 548 | * async writeback completion handler. |
549 | * | 549 | * |
550 | * If we get an error, set the mapping error bit, but not the individual | 550 | * If we get an error, set the mapping error bit, but not the individual |
551 | * page error bits. | 551 | * page error bits. |
552 | */ | 552 | */ |
553 | static void writepages_finish(struct ceph_osd_request *req, | 553 | static void writepages_finish(struct ceph_osd_request *req, |
554 | struct ceph_msg *msg) | 554 | struct ceph_msg *msg) |
555 | { | 555 | { |
556 | struct inode *inode = req->r_inode; | 556 | struct inode *inode = req->r_inode; |
557 | struct ceph_osd_reply_head *replyhead; | 557 | struct ceph_osd_reply_head *replyhead; |
558 | struct ceph_osd_op *op; | 558 | struct ceph_osd_op *op; |
559 | struct ceph_inode_info *ci = ceph_inode(inode); | 559 | struct ceph_inode_info *ci = ceph_inode(inode); |
560 | unsigned wrote; | 560 | unsigned wrote; |
561 | struct page *page; | 561 | struct page *page; |
562 | int i; | 562 | int i; |
563 | struct ceph_snap_context *snapc = req->r_snapc; | 563 | struct ceph_snap_context *snapc = req->r_snapc; |
564 | struct address_space *mapping = inode->i_mapping; | 564 | struct address_space *mapping = inode->i_mapping; |
565 | __s32 rc = -EIO; | 565 | __s32 rc = -EIO; |
566 | u64 bytes = 0; | 566 | u64 bytes = 0; |
567 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 567 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
568 | long writeback_stat; | 568 | long writeback_stat; |
569 | unsigned issued = ceph_caps_issued(ci); | 569 | unsigned issued = ceph_caps_issued(ci); |
570 | 570 | ||
571 | /* parse reply */ | 571 | /* parse reply */ |
572 | replyhead = msg->front.iov_base; | 572 | replyhead = msg->front.iov_base; |
573 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); | 573 | WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); |
574 | op = (void *)(replyhead + 1); | 574 | op = (void *)(replyhead + 1); |
575 | rc = le32_to_cpu(replyhead->result); | 575 | rc = le32_to_cpu(replyhead->result); |
576 | bytes = le64_to_cpu(op->extent.length); | 576 | bytes = le64_to_cpu(op->extent.length); |
577 | 577 | ||
578 | if (rc >= 0) { | 578 | if (rc >= 0) { |
579 | /* | 579 | /* |
580 | * Assume we wrote the pages we originally sent. The | 580 | * Assume we wrote the pages we originally sent. The |
581 | * osd might reply with fewer pages if our writeback | 581 | * osd might reply with fewer pages if our writeback |
582 | * raced with a truncation and was adjusted at the osd, | 582 | * raced with a truncation and was adjusted at the osd, |
583 | * so don't believe the reply. | 583 | * so don't believe the reply. |
584 | */ | 584 | */ |
585 | wrote = req->r_num_pages; | 585 | wrote = req->r_num_pages; |
586 | } else { | 586 | } else { |
587 | wrote = 0; | 587 | wrote = 0; |
588 | mapping_set_error(mapping, rc); | 588 | mapping_set_error(mapping, rc); |
589 | } | 589 | } |
590 | dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", | 590 | dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", |
591 | inode, rc, bytes, wrote); | 591 | inode, rc, bytes, wrote); |
592 | 592 | ||
593 | /* clean all pages */ | 593 | /* clean all pages */ |
594 | for (i = 0; i < req->r_num_pages; i++) { | 594 | for (i = 0; i < req->r_num_pages; i++) { |
595 | page = req->r_pages[i]; | 595 | page = req->r_pages[i]; |
596 | BUG_ON(!page); | 596 | BUG_ON(!page); |
597 | WARN_ON(!PageUptodate(page)); | 597 | WARN_ON(!PageUptodate(page)); |
598 | 598 | ||
599 | writeback_stat = | 599 | writeback_stat = |
600 | atomic_long_dec_return(&fsc->writeback_count); | 600 | atomic_long_dec_return(&fsc->writeback_count); |
601 | if (writeback_stat < | 601 | if (writeback_stat < |
602 | CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) | 602 | CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) |
603 | clear_bdi_congested(&fsc->backing_dev_info, | 603 | clear_bdi_congested(&fsc->backing_dev_info, |
604 | BLK_RW_ASYNC); | 604 | BLK_RW_ASYNC); |
605 | 605 | ||
606 | ceph_put_snap_context(page_snap_context(page)); | 606 | ceph_put_snap_context(page_snap_context(page)); |
607 | page->private = 0; | 607 | page->private = 0; |
608 | ClearPagePrivate(page); | 608 | ClearPagePrivate(page); |
609 | dout("unlocking %d %p\n", i, page); | 609 | dout("unlocking %d %p\n", i, page); |
610 | end_page_writeback(page); | 610 | end_page_writeback(page); |
611 | 611 | ||
612 | /* | 612 | /* |
613 | * We lost the cache cap, need to truncate the page before | 613 | * We lost the cache cap, need to truncate the page before |
614 | * it is unlocked, otherwise we'd truncate it later in the | 614 | * it is unlocked, otherwise we'd truncate it later in the |
615 | * page truncation thread, possibly losing some data that | 615 | * page truncation thread, possibly losing some data that |
616 | * raced its way in | 616 | * raced its way in |
617 | */ | 617 | */ |
618 | if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) | 618 | if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) |
619 | generic_error_remove_page(inode->i_mapping, page); | 619 | generic_error_remove_page(inode->i_mapping, page); |
620 | 620 | ||
621 | unlock_page(page); | 621 | unlock_page(page); |
622 | } | 622 | } |
623 | dout("%p wrote+cleaned %d pages\n", inode, wrote); | 623 | dout("%p wrote+cleaned %d pages\n", inode, wrote); |
624 | ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); | 624 | ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc); |
625 | 625 | ||
626 | ceph_release_pages(req->r_pages, req->r_num_pages); | 626 | ceph_release_pages(req->r_pages, req->r_num_pages); |
627 | if (req->r_pages_from_pool) | 627 | if (req->r_pages_from_pool) |
628 | mempool_free(req->r_pages, | 628 | mempool_free(req->r_pages, |
629 | ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); | 629 | ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); |
630 | else | 630 | else |
631 | kfree(req->r_pages); | 631 | kfree(req->r_pages); |
632 | ceph_osdc_put_request(req); | 632 | ceph_osdc_put_request(req); |
633 | } | 633 | } |
634 | 634 | ||
635 | /* | 635 | /* |
636 | * allocate a page vec, either directly, or if necessary, via a the | 636 | * allocate a page vec, either directly, or if necessary, via a the |
637 | * mempool. we avoid the mempool if we can because req->r_num_pages | 637 | * mempool. we avoid the mempool if we can because req->r_num_pages |
638 | * may be less than the maximum write size. | 638 | * may be less than the maximum write size. |
639 | */ | 639 | */ |
640 | static void alloc_page_vec(struct ceph_fs_client *fsc, | 640 | static void alloc_page_vec(struct ceph_fs_client *fsc, |
641 | struct ceph_osd_request *req) | 641 | struct ceph_osd_request *req) |
642 | { | 642 | { |
643 | req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, | 643 | req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, |
644 | GFP_NOFS); | 644 | GFP_NOFS); |
645 | if (!req->r_pages) { | 645 | if (!req->r_pages) { |
646 | req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); | 646 | req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS); |
647 | req->r_pages_from_pool = 1; | 647 | req->r_pages_from_pool = 1; |
648 | WARN_ON(!req->r_pages); | 648 | WARN_ON(!req->r_pages); |
649 | } | 649 | } |
650 | } | 650 | } |
651 | 651 | ||
652 | /* | 652 | /* |
653 | * initiate async writeback | 653 | * initiate async writeback |
654 | */ | 654 | */ |
655 | static int ceph_writepages_start(struct address_space *mapping, | 655 | static int ceph_writepages_start(struct address_space *mapping, |
656 | struct writeback_control *wbc) | 656 | struct writeback_control *wbc) |
657 | { | 657 | { |
658 | struct inode *inode = mapping->host; | 658 | struct inode *inode = mapping->host; |
659 | struct ceph_inode_info *ci = ceph_inode(inode); | 659 | struct ceph_inode_info *ci = ceph_inode(inode); |
660 | struct ceph_fs_client *fsc; | 660 | struct ceph_fs_client *fsc; |
661 | pgoff_t index, start, end; | 661 | pgoff_t index, start, end; |
662 | int range_whole = 0; | 662 | int range_whole = 0; |
663 | int should_loop = 1; | 663 | int should_loop = 1; |
664 | pgoff_t max_pages = 0, max_pages_ever = 0; | 664 | pgoff_t max_pages = 0, max_pages_ever = 0; |
665 | struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; | 665 | struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; |
666 | struct pagevec pvec; | 666 | struct pagevec pvec; |
667 | int done = 0; | 667 | int done = 0; |
668 | int rc = 0; | 668 | int rc = 0; |
669 | unsigned wsize = 1 << inode->i_blkbits; | 669 | unsigned wsize = 1 << inode->i_blkbits; |
670 | struct ceph_osd_request *req = NULL; | 670 | struct ceph_osd_request *req = NULL; |
671 | int do_sync; | 671 | int do_sync; |
672 | u64 snap_size = 0; | 672 | u64 snap_size = 0; |
673 | 673 | ||
674 | /* | 674 | /* |
675 | * Include a 'sync' in the OSD request if this is a data | 675 | * Include a 'sync' in the OSD request if this is a data |
676 | * integrity write (e.g., O_SYNC write or fsync()), or if our | 676 | * integrity write (e.g., O_SYNC write or fsync()), or if our |
677 | * cap is being revoked. | 677 | * cap is being revoked. |
678 | */ | 678 | */ |
679 | do_sync = wbc->sync_mode == WB_SYNC_ALL; | 679 | do_sync = wbc->sync_mode == WB_SYNC_ALL; |
680 | if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) | 680 | if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) |
681 | do_sync = 1; | 681 | do_sync = 1; |
682 | dout("writepages_start %p dosync=%d (mode=%s)\n", | 682 | dout("writepages_start %p dosync=%d (mode=%s)\n", |
683 | inode, do_sync, | 683 | inode, do_sync, |
684 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : | 684 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : |
685 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); | 685 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); |
686 | 686 | ||
687 | fsc = ceph_inode_to_client(inode); | 687 | fsc = ceph_inode_to_client(inode); |
688 | if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { | 688 | if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { |
689 | pr_warning("writepage_start %p on forced umount\n", inode); | 689 | pr_warning("writepage_start %p on forced umount\n", inode); |
690 | return -EIO; /* we're in a forced umount, don't write! */ | 690 | return -EIO; /* we're in a forced umount, don't write! */ |
691 | } | 691 | } |
692 | if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) | 692 | if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) |
693 | wsize = fsc->mount_options->wsize; | 693 | wsize = fsc->mount_options->wsize; |
694 | if (wsize < PAGE_CACHE_SIZE) | 694 | if (wsize < PAGE_CACHE_SIZE) |
695 | wsize = PAGE_CACHE_SIZE; | 695 | wsize = PAGE_CACHE_SIZE; |
696 | max_pages_ever = wsize >> PAGE_CACHE_SHIFT; | 696 | max_pages_ever = wsize >> PAGE_CACHE_SHIFT; |
697 | 697 | ||
698 | pagevec_init(&pvec, 0); | 698 | pagevec_init(&pvec, 0); |
699 | 699 | ||
700 | /* where to start/end? */ | 700 | /* where to start/end? */ |
701 | if (wbc->range_cyclic) { | 701 | if (wbc->range_cyclic) { |
702 | start = mapping->writeback_index; /* Start from prev offset */ | 702 | start = mapping->writeback_index; /* Start from prev offset */ |
703 | end = -1; | 703 | end = -1; |
704 | dout(" cyclic, start at %lu\n", start); | 704 | dout(" cyclic, start at %lu\n", start); |
705 | } else { | 705 | } else { |
706 | start = wbc->range_start >> PAGE_CACHE_SHIFT; | 706 | start = wbc->range_start >> PAGE_CACHE_SHIFT; |
707 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 707 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
708 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 708 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
709 | range_whole = 1; | 709 | range_whole = 1; |
710 | should_loop = 0; | 710 | should_loop = 0; |
711 | dout(" not cyclic, %lu to %lu\n", start, end); | 711 | dout(" not cyclic, %lu to %lu\n", start, end); |
712 | } | 712 | } |
713 | index = start; | 713 | index = start; |
714 | 714 | ||
715 | retry: | 715 | retry: |
716 | /* find oldest snap context with dirty data */ | 716 | /* find oldest snap context with dirty data */ |
717 | ceph_put_snap_context(snapc); | 717 | ceph_put_snap_context(snapc); |
718 | snapc = get_oldest_context(inode, &snap_size); | 718 | snapc = get_oldest_context(inode, &snap_size); |
719 | if (!snapc) { | 719 | if (!snapc) { |
720 | /* hmm, why does writepages get called when there | 720 | /* hmm, why does writepages get called when there |
721 | is no dirty data? */ | 721 | is no dirty data? */ |
722 | dout(" no snap context with dirty data?\n"); | 722 | dout(" no snap context with dirty data?\n"); |
723 | goto out; | 723 | goto out; |
724 | } | 724 | } |
725 | dout(" oldest snapc is %p seq %lld (%d snaps)\n", | 725 | dout(" oldest snapc is %p seq %lld (%d snaps)\n", |
726 | snapc, snapc->seq, snapc->num_snaps); | 726 | snapc, snapc->seq, snapc->num_snaps); |
727 | if (last_snapc && snapc != last_snapc) { | 727 | if (last_snapc && snapc != last_snapc) { |
728 | /* if we switched to a newer snapc, restart our scan at the | 728 | /* if we switched to a newer snapc, restart our scan at the |
729 | * start of the original file range. */ | 729 | * start of the original file range. */ |
730 | dout(" snapc differs from last pass, restarting at %lu\n", | 730 | dout(" snapc differs from last pass, restarting at %lu\n", |
731 | index); | 731 | index); |
732 | index = start; | 732 | index = start; |
733 | } | 733 | } |
734 | last_snapc = snapc; | 734 | last_snapc = snapc; |
735 | 735 | ||
736 | while (!done && index <= end) { | 736 | while (!done && index <= end) { |
737 | unsigned i; | 737 | unsigned i; |
738 | int first; | 738 | int first; |
739 | pgoff_t next; | 739 | pgoff_t next; |
740 | int pvec_pages, locked_pages; | 740 | int pvec_pages, locked_pages; |
741 | struct page *page; | 741 | struct page *page; |
742 | int want; | 742 | int want; |
743 | u64 offset, len; | 743 | u64 offset, len; |
744 | struct ceph_osd_request_head *reqhead; | 744 | struct ceph_osd_request_head *reqhead; |
745 | struct ceph_osd_op *op; | 745 | struct ceph_osd_op *op; |
746 | long writeback_stat; | 746 | long writeback_stat; |
747 | 747 | ||
748 | next = 0; | 748 | next = 0; |
749 | locked_pages = 0; | 749 | locked_pages = 0; |
750 | max_pages = max_pages_ever; | 750 | max_pages = max_pages_ever; |
751 | 751 | ||
752 | get_more_pages: | 752 | get_more_pages: |
753 | first = -1; | 753 | first = -1; |
754 | want = min(end - index, | 754 | want = min(end - index, |
755 | min((pgoff_t)PAGEVEC_SIZE, | 755 | min((pgoff_t)PAGEVEC_SIZE, |
756 | max_pages - (pgoff_t)locked_pages) - 1) | 756 | max_pages - (pgoff_t)locked_pages) - 1) |
757 | + 1; | 757 | + 1; |
758 | pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 758 | pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
759 | PAGECACHE_TAG_DIRTY, | 759 | PAGECACHE_TAG_DIRTY, |
760 | want); | 760 | want); |
761 | dout("pagevec_lookup_tag got %d\n", pvec_pages); | 761 | dout("pagevec_lookup_tag got %d\n", pvec_pages); |
762 | if (!pvec_pages && !locked_pages) | 762 | if (!pvec_pages && !locked_pages) |
763 | break; | 763 | break; |
764 | for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { | 764 | for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { |
765 | page = pvec.pages[i]; | 765 | page = pvec.pages[i]; |
766 | dout("? %p idx %lu\n", page, page->index); | 766 | dout("? %p idx %lu\n", page, page->index); |
767 | if (locked_pages == 0) | 767 | if (locked_pages == 0) |
768 | lock_page(page); /* first page */ | 768 | lock_page(page); /* first page */ |
769 | else if (!trylock_page(page)) | 769 | else if (!trylock_page(page)) |
770 | break; | 770 | break; |
771 | 771 | ||
772 | /* only dirty pages, or our accounting breaks */ | 772 | /* only dirty pages, or our accounting breaks */ |
773 | if (unlikely(!PageDirty(page)) || | 773 | if (unlikely(!PageDirty(page)) || |
774 | unlikely(page->mapping != mapping)) { | 774 | unlikely(page->mapping != mapping)) { |
775 | dout("!dirty or !mapping %p\n", page); | 775 | dout("!dirty or !mapping %p\n", page); |
776 | unlock_page(page); | 776 | unlock_page(page); |
777 | break; | 777 | break; |
778 | } | 778 | } |
779 | if (!wbc->range_cyclic && page->index > end) { | 779 | if (!wbc->range_cyclic && page->index > end) { |
780 | dout("end of range %p\n", page); | 780 | dout("end of range %p\n", page); |
781 | done = 1; | 781 | done = 1; |
782 | unlock_page(page); | 782 | unlock_page(page); |
783 | break; | 783 | break; |
784 | } | 784 | } |
785 | if (next && (page->index != next)) { | 785 | if (next && (page->index != next)) { |
786 | dout("not consecutive %p\n", page); | 786 | dout("not consecutive %p\n", page); |
787 | unlock_page(page); | 787 | unlock_page(page); |
788 | break; | 788 | break; |
789 | } | 789 | } |
790 | if (wbc->sync_mode != WB_SYNC_NONE) { | 790 | if (wbc->sync_mode != WB_SYNC_NONE) { |
791 | dout("waiting on writeback %p\n", page); | 791 | dout("waiting on writeback %p\n", page); |
792 | wait_on_page_writeback(page); | 792 | wait_on_page_writeback(page); |
793 | } | 793 | } |
794 | if ((snap_size && page_offset(page) > snap_size) || | 794 | if ((snap_size && page_offset(page) > snap_size) || |
795 | (!snap_size && | 795 | (!snap_size && |
796 | page_offset(page) > i_size_read(inode))) { | 796 | page_offset(page) > i_size_read(inode))) { |
797 | dout("%p page eof %llu\n", page, snap_size ? | 797 | dout("%p page eof %llu\n", page, snap_size ? |
798 | snap_size : i_size_read(inode)); | 798 | snap_size : i_size_read(inode)); |
799 | done = 1; | 799 | done = 1; |
800 | unlock_page(page); | 800 | unlock_page(page); |
801 | break; | 801 | break; |
802 | } | 802 | } |
803 | if (PageWriteback(page)) { | 803 | if (PageWriteback(page)) { |
804 | dout("%p under writeback\n", page); | 804 | dout("%p under writeback\n", page); |
805 | unlock_page(page); | 805 | unlock_page(page); |
806 | break; | 806 | break; |
807 | } | 807 | } |
808 | 808 | ||
809 | /* only if matching snap context */ | 809 | /* only if matching snap context */ |
810 | pgsnapc = page_snap_context(page); | 810 | pgsnapc = page_snap_context(page); |
811 | if (pgsnapc->seq > snapc->seq) { | 811 | if (pgsnapc->seq > snapc->seq) { |
812 | dout("page snapc %p %lld > oldest %p %lld\n", | 812 | dout("page snapc %p %lld > oldest %p %lld\n", |
813 | pgsnapc, pgsnapc->seq, snapc, snapc->seq); | 813 | pgsnapc, pgsnapc->seq, snapc, snapc->seq); |
814 | unlock_page(page); | 814 | unlock_page(page); |
815 | if (!locked_pages) | 815 | if (!locked_pages) |
816 | continue; /* keep looking for snap */ | 816 | continue; /* keep looking for snap */ |
817 | break; | 817 | break; |
818 | } | 818 | } |
819 | 819 | ||
820 | if (!clear_page_dirty_for_io(page)) { | 820 | if (!clear_page_dirty_for_io(page)) { |
821 | dout("%p !clear_page_dirty_for_io\n", page); | 821 | dout("%p !clear_page_dirty_for_io\n", page); |
822 | unlock_page(page); | 822 | unlock_page(page); |
823 | break; | 823 | break; |
824 | } | 824 | } |
825 | 825 | ||
826 | /* ok */ | 826 | /* ok */ |
827 | if (locked_pages == 0) { | 827 | if (locked_pages == 0) { |
828 | /* prepare async write request */ | 828 | /* prepare async write request */ |
829 | offset = (u64) page_offset(page); | 829 | offset = (u64) page_offset(page); |
830 | len = wsize; | 830 | len = wsize; |
831 | req = ceph_osdc_new_request(&fsc->client->osdc, | 831 | req = ceph_osdc_new_request(&fsc->client->osdc, |
832 | &ci->i_layout, | 832 | &ci->i_layout, |
833 | ceph_vino(inode), | 833 | ceph_vino(inode), |
834 | offset, &len, | 834 | offset, &len, |
835 | CEPH_OSD_OP_WRITE, | 835 | CEPH_OSD_OP_WRITE, |
836 | CEPH_OSD_FLAG_WRITE | | 836 | CEPH_OSD_FLAG_WRITE | |
837 | CEPH_OSD_FLAG_ONDISK, | 837 | CEPH_OSD_FLAG_ONDISK, |
838 | snapc, do_sync, | 838 | snapc, do_sync, |
839 | ci->i_truncate_seq, | 839 | ci->i_truncate_seq, |
840 | ci->i_truncate_size, | 840 | ci->i_truncate_size, |
841 | &inode->i_mtime, true, 1, 0); | 841 | &inode->i_mtime, true, 1, 0); |
842 | 842 | ||
843 | if (IS_ERR(req)) { | 843 | if (IS_ERR(req)) { |
844 | rc = PTR_ERR(req); | 844 | rc = PTR_ERR(req); |
845 | unlock_page(page); | 845 | unlock_page(page); |
846 | break; | 846 | break; |
847 | } | 847 | } |
848 | 848 | ||
849 | max_pages = req->r_num_pages; | 849 | max_pages = req->r_num_pages; |
850 | 850 | ||
851 | alloc_page_vec(fsc, req); | 851 | alloc_page_vec(fsc, req); |
852 | req->r_callback = writepages_finish; | 852 | req->r_callback = writepages_finish; |
853 | req->r_inode = inode; | 853 | req->r_inode = inode; |
854 | } | 854 | } |
855 | 855 | ||
856 | /* note position of first page in pvec */ | 856 | /* note position of first page in pvec */ |
857 | if (first < 0) | 857 | if (first < 0) |
858 | first = i; | 858 | first = i; |
859 | dout("%p will write page %p idx %lu\n", | 859 | dout("%p will write page %p idx %lu\n", |
860 | inode, page, page->index); | 860 | inode, page, page->index); |
861 | 861 | ||
862 | writeback_stat = | 862 | writeback_stat = |
863 | atomic_long_inc_return(&fsc->writeback_count); | 863 | atomic_long_inc_return(&fsc->writeback_count); |
864 | if (writeback_stat > CONGESTION_ON_THRESH( | 864 | if (writeback_stat > CONGESTION_ON_THRESH( |
865 | fsc->mount_options->congestion_kb)) { | 865 | fsc->mount_options->congestion_kb)) { |
866 | set_bdi_congested(&fsc->backing_dev_info, | 866 | set_bdi_congested(&fsc->backing_dev_info, |
867 | BLK_RW_ASYNC); | 867 | BLK_RW_ASYNC); |
868 | } | 868 | } |
869 | 869 | ||
870 | set_page_writeback(page); | 870 | set_page_writeback(page); |
871 | req->r_pages[locked_pages] = page; | 871 | req->r_pages[locked_pages] = page; |
872 | locked_pages++; | 872 | locked_pages++; |
873 | next = page->index + 1; | 873 | next = page->index + 1; |
874 | } | 874 | } |
875 | 875 | ||
876 | /* did we get anything? */ | 876 | /* did we get anything? */ |
877 | if (!locked_pages) | 877 | if (!locked_pages) |
878 | goto release_pvec_pages; | 878 | goto release_pvec_pages; |
879 | if (i) { | 879 | if (i) { |
880 | int j; | 880 | int j; |
881 | BUG_ON(!locked_pages || first < 0); | 881 | BUG_ON(!locked_pages || first < 0); |
882 | 882 | ||
883 | if (pvec_pages && i == pvec_pages && | 883 | if (pvec_pages && i == pvec_pages && |
884 | locked_pages < max_pages) { | 884 | locked_pages < max_pages) { |
885 | dout("reached end pvec, trying for more\n"); | 885 | dout("reached end pvec, trying for more\n"); |
886 | pagevec_reinit(&pvec); | 886 | pagevec_reinit(&pvec); |
887 | goto get_more_pages; | 887 | goto get_more_pages; |
888 | } | 888 | } |
889 | 889 | ||
890 | /* shift unused pages over in the pvec... we | 890 | /* shift unused pages over in the pvec... we |
891 | * will need to release them below. */ | 891 | * will need to release them below. */ |
892 | for (j = i; j < pvec_pages; j++) { | 892 | for (j = i; j < pvec_pages; j++) { |
893 | dout(" pvec leftover page %p\n", | 893 | dout(" pvec leftover page %p\n", |
894 | pvec.pages[j]); | 894 | pvec.pages[j]); |
895 | pvec.pages[j-i+first] = pvec.pages[j]; | 895 | pvec.pages[j-i+first] = pvec.pages[j]; |
896 | } | 896 | } |
897 | pvec.nr -= i-first; | 897 | pvec.nr -= i-first; |
898 | } | 898 | } |
899 | 899 | ||
900 | /* submit the write */ | 900 | /* submit the write */ |
901 | offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; | 901 | offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT; |
902 | len = min((snap_size ? snap_size : i_size_read(inode)) - offset, | 902 | len = min((snap_size ? snap_size : i_size_read(inode)) - offset, |
903 | (u64)locked_pages << PAGE_CACHE_SHIFT); | 903 | (u64)locked_pages << PAGE_CACHE_SHIFT); |
904 | dout("writepages got %d pages at %llu~%llu\n", | 904 | dout("writepages got %d pages at %llu~%llu\n", |
905 | locked_pages, offset, len); | 905 | locked_pages, offset, len); |
906 | 906 | ||
907 | /* revise final length, page count */ | 907 | /* revise final length, page count */ |
908 | req->r_num_pages = locked_pages; | 908 | req->r_num_pages = locked_pages; |
909 | reqhead = req->r_request->front.iov_base; | 909 | reqhead = req->r_request->front.iov_base; |
910 | op = (void *)(reqhead + 1); | 910 | op = (void *)(reqhead + 1); |
911 | op->extent.length = cpu_to_le64(len); | 911 | op->extent.length = cpu_to_le64(len); |
912 | op->payload_len = cpu_to_le32(len); | 912 | op->payload_len = cpu_to_le32(len); |
913 | req->r_request->hdr.data_len = cpu_to_le32(len); | 913 | req->r_request->hdr.data_len = cpu_to_le32(len); |
914 | 914 | ||
915 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); | 915 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); |
916 | BUG_ON(rc); | 916 | BUG_ON(rc); |
917 | req = NULL; | 917 | req = NULL; |
918 | 918 | ||
919 | /* continue? */ | 919 | /* continue? */ |
920 | index = next; | 920 | index = next; |
921 | wbc->nr_to_write -= locked_pages; | 921 | wbc->nr_to_write -= locked_pages; |
922 | if (wbc->nr_to_write <= 0) | 922 | if (wbc->nr_to_write <= 0) |
923 | done = 1; | 923 | done = 1; |
924 | 924 | ||
925 | release_pvec_pages: | 925 | release_pvec_pages: |
926 | dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, | 926 | dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, |
927 | pvec.nr ? pvec.pages[0] : NULL); | 927 | pvec.nr ? pvec.pages[0] : NULL); |
928 | pagevec_release(&pvec); | 928 | pagevec_release(&pvec); |
929 | 929 | ||
930 | if (locked_pages && !done) | 930 | if (locked_pages && !done) |
931 | goto retry; | 931 | goto retry; |
932 | } | 932 | } |
933 | 933 | ||
934 | if (should_loop && !done) { | 934 | if (should_loop && !done) { |
935 | /* more to do; loop back to beginning of file */ | 935 | /* more to do; loop back to beginning of file */ |
936 | dout("writepages looping back to beginning of file\n"); | 936 | dout("writepages looping back to beginning of file\n"); |
937 | should_loop = 0; | 937 | should_loop = 0; |
938 | index = 0; | 938 | index = 0; |
939 | goto retry; | 939 | goto retry; |
940 | } | 940 | } |
941 | 941 | ||
942 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 942 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
943 | mapping->writeback_index = index; | 943 | mapping->writeback_index = index; |
944 | 944 | ||
945 | out: | 945 | out: |
946 | if (req) | 946 | if (req) |
947 | ceph_osdc_put_request(req); | 947 | ceph_osdc_put_request(req); |
948 | ceph_put_snap_context(snapc); | 948 | ceph_put_snap_context(snapc); |
949 | dout("writepages done, rc = %d\n", rc); | 949 | dout("writepages done, rc = %d\n", rc); |
950 | return rc; | 950 | return rc; |
951 | } | 951 | } |
952 | 952 | ||
953 | 953 | ||
954 | 954 | ||
955 | /* | 955 | /* |
956 | * See if a given @snapc is either writeable, or already written. | 956 | * See if a given @snapc is either writeable, or already written. |
957 | */ | 957 | */ |
958 | static int context_is_writeable_or_written(struct inode *inode, | 958 | static int context_is_writeable_or_written(struct inode *inode, |
959 | struct ceph_snap_context *snapc) | 959 | struct ceph_snap_context *snapc) |
960 | { | 960 | { |
961 | struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); | 961 | struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); |
962 | int ret = !oldest || snapc->seq <= oldest->seq; | 962 | int ret = !oldest || snapc->seq <= oldest->seq; |
963 | 963 | ||
964 | ceph_put_snap_context(oldest); | 964 | ceph_put_snap_context(oldest); |
965 | return ret; | 965 | return ret; |
966 | } | 966 | } |
967 | 967 | ||
/*
 * We are only allowed to write into/dirty the page if the page is
 * clean, or already dirty within the same snap context.
 *
 * called with page locked.
 * return success with page locked,
 * or any failure (incl -EAGAIN) with page unlocked.
 *
 * On success (return 0) mdsc->snap_rwsem is held for read; the caller
 * (e.g. ceph_write_end or ceph_page_mkwrite) is responsible for
 * dropping it.  On any failure the rwsem has been released.
 */
static int ceph_update_writeable_page(struct file *file,
			    loff_t pos, unsigned len,
			    struct page *page)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	loff_t page_off = pos & PAGE_CACHE_MASK;	/* page-aligned file offset */
	int pos_in_page = pos & ~PAGE_CACHE_MASK;	/* write start within page */
	int end_in_page = pos_in_page + len;		/* write end within page */
	loff_t i_size;
	int r;
	struct ceph_snap_context *snapc, *oldest;

retry_locked:
	/*
	 * writepages currently holds page lock, but if we change that
	 * later, this wait keeps us from touching a page mid-writeback.
	 */
	wait_on_page_writeback(page);

	/* check snap context */
	BUG_ON(!ci->i_snap_realm);
	down_read(&mdsc->snap_rwsem);
	BUG_ON(!ci->i_snap_realm->cached_context);
	snapc = page_snap_context(page);
	if (snapc && snapc != ci->i_head_snapc) {
		/*
		 * this page is already dirty in another (older) snap
		 * context!  is it writeable now?
		 */
		oldest = get_oldest_context(inode, NULL);
		up_read(&mdsc->snap_rwsem);

		if (snapc->seq > oldest->seq) {
			ceph_put_snap_context(oldest);
			dout(" page %p snapc %p not current or oldest\n",
			     page, snapc);
			/*
			 * queue for writeback, and wait for snapc to
			 * be writeable or written
			 */
			snapc = ceph_get_snap_context(snapc);
			unlock_page(page);
			ceph_queue_writeback(inode);
			r = wait_event_interruptible(ci->i_cap_wq,
			       context_is_writeable_or_written(inode, snapc));
			ceph_put_snap_context(snapc);
			if (r == -ERESTARTSYS)
				return r;
			/* tell the caller to retry from scratch */
			return -EAGAIN;
		}
		ceph_put_snap_context(oldest);

		/* yay, writeable, do it now (without dropping page lock) */
		dout(" page %p snapc %p not current, but oldest\n",
		     page, snapc);
		if (!clear_page_dirty_for_io(page))
			goto retry_locked;
		r = writepage_nounlock(page, NULL);
		if (r < 0)
			goto fail_nosnap;
		goto retry_locked;
	}

	if (PageUptodate(page)) {
		dout(" page %p already uptodate\n", page);
		return 0;
	}

	/* full page? no need to read anything first */
	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
		return 0;

	/* past end of file? */
	i_size = inode->i_size;	/* caller holds i_mutex */

	if (i_size + len > inode->i_sb->s_maxbytes) {
		/* file is too big */
		r = -EINVAL;
		goto fail;
	}

	/*
	 * If the write covers everything past EOF on this page, the
	 * parts not being written can simply be zeroed rather than read.
	 */
	if (page_off >= i_size ||
	    (pos_in_page == 0 && (pos+len) >= i_size &&
	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
		dout(" zeroing %p 0 - %d and %d - %d\n",
		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
		zero_user_segments(page,
				   0, pos_in_page,
				   end_in_page, PAGE_CACHE_SIZE);
		return 0;
	}

	/* we need to read it. */
	up_read(&mdsc->snap_rwsem);
	r = readpage_nounlock(file, page);
	if (r < 0)
		goto fail_nosnap;
	goto retry_locked;

fail:
	up_read(&mdsc->snap_rwsem);
fail_nosnap:
	unlock_page(page);
	return r;
}
1080 | 1080 | ||
1081 | /* | 1081 | /* |
1082 | * We are only allowed to write into/dirty the page if the page is | 1082 | * We are only allowed to write into/dirty the page if the page is |
1083 | * clean, or already dirty within the same snap context. | 1083 | * clean, or already dirty within the same snap context. |
1084 | */ | 1084 | */ |
1085 | static int ceph_write_begin(struct file *file, struct address_space *mapping, | 1085 | static int ceph_write_begin(struct file *file, struct address_space *mapping, |
1086 | loff_t pos, unsigned len, unsigned flags, | 1086 | loff_t pos, unsigned len, unsigned flags, |
1087 | struct page **pagep, void **fsdata) | 1087 | struct page **pagep, void **fsdata) |
1088 | { | 1088 | { |
1089 | struct inode *inode = file->f_dentry->d_inode; | 1089 | struct inode *inode = file->f_dentry->d_inode; |
1090 | struct ceph_inode_info *ci = ceph_inode(inode); | 1090 | struct ceph_inode_info *ci = ceph_inode(inode); |
1091 | struct ceph_file_info *fi = file->private_data; | 1091 | struct ceph_file_info *fi = file->private_data; |
1092 | struct page *page; | 1092 | struct page *page; |
1093 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1093 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1094 | int r, want, got = 0; | 1094 | int r, want, got = 0; |
1095 | 1095 | ||
1096 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | 1096 | if (fi->fmode & CEPH_FILE_MODE_LAZY) |
1097 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | 1097 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; |
1098 | else | 1098 | else |
1099 | want = CEPH_CAP_FILE_BUFFER; | 1099 | want = CEPH_CAP_FILE_BUFFER; |
1100 | 1100 | ||
1101 | dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", | 1101 | dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", |
1102 | inode, ceph_vinop(inode), pos, len, inode->i_size); | 1102 | inode, ceph_vinop(inode), pos, len, inode->i_size); |
1103 | r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); | 1103 | r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); |
1104 | if (r < 0) | 1104 | if (r < 0) |
1105 | return r; | 1105 | return r; |
1106 | dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", | 1106 | dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", |
1107 | inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); | 1107 | inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); |
1108 | if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { | 1108 | if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { |
1109 | ceph_put_cap_refs(ci, got); | 1109 | ceph_put_cap_refs(ci, got); |
1110 | return -EAGAIN; | 1110 | return -EAGAIN; |
1111 | } | 1111 | } |
1112 | 1112 | ||
1113 | do { | 1113 | do { |
1114 | /* get a page */ | 1114 | /* get a page */ |
1115 | page = grab_cache_page_write_begin(mapping, index, 0); | 1115 | page = grab_cache_page_write_begin(mapping, index, 0); |
1116 | if (!page) { | 1116 | if (!page) { |
1117 | r = -ENOMEM; | 1117 | r = -ENOMEM; |
1118 | break; | 1118 | break; |
1119 | } | 1119 | } |
1120 | 1120 | ||
1121 | dout("write_begin file %p inode %p page %p %d~%d\n", file, | 1121 | dout("write_begin file %p inode %p page %p %d~%d\n", file, |
1122 | inode, page, (int)pos, (int)len); | 1122 | inode, page, (int)pos, (int)len); |
1123 | 1123 | ||
1124 | r = ceph_update_writeable_page(file, pos, len, page); | 1124 | r = ceph_update_writeable_page(file, pos, len, page); |
1125 | if (r) | 1125 | if (r) |
1126 | page_cache_release(page); | 1126 | page_cache_release(page); |
1127 | } while (r == -EAGAIN); | 1127 | } while (r == -EAGAIN); |
1128 | 1128 | ||
1129 | if (r) { | 1129 | if (r) { |
1130 | ceph_put_cap_refs(ci, got); | 1130 | ceph_put_cap_refs(ci, got); |
1131 | } else { | 1131 | } else { |
1132 | *pagep = page; | 1132 | *pagep = page; |
1133 | *(int *)fsdata = got; | 1133 | *(int *)fsdata = got; |
1134 | } | 1134 | } |
1135 | return r; | 1135 | return r; |
1136 | } | 1136 | } |
1137 | 1137 | ||
/*
 * we don't do anything in here that simple_write_end doesn't do
 * except adjust dirty page accounting and drop read lock on
 * mdsc->snap_rwsem.
 */
static int ceph_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *page, void *fsdata)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = fsc->mdsc;
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
	int check_cap = 0;
	int got = (unsigned long)fsdata;	/* cap refs stashed by ceph_write_begin */

	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
	     inode, page, (int)pos, (int)copied, (int)len);

	/* zero the stale part of the page if we did a short copy */
	if (copied < len)
		zero_user_segment(page, from+copied, len);

	/* did file size increase? */
	/* (no need for i_size_read(); the caller holds i_mutex) */
	if (pos+copied > inode->i_size)
		check_cap = ceph_inode_set_size(inode, pos+copied);

	if (!PageUptodate(page))
		SetPageUptodate(page);

	set_page_dirty(page);

	/* drop page lock, the snap_rwsem read lock taken by write_begin,
	 * and the page reference from grab_cache_page_write_begin */
	unlock_page(page);
	up_read(&mdsc->snap_rwsem);
	page_cache_release(page);

	if (copied > 0) {
		int dirty;
		spin_lock(&ci->i_ceph_lock);
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}

	dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n",
	     inode, ceph_vinop(inode), pos, len, ceph_cap_string(got));
	ceph_put_cap_refs(ci, got);

	if (check_cap)
		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);

	return copied;
}
1194 | 1194 | ||
/*
 * we set .direct_IO to indicate direct io is supported, but since we
 * intercept O_DIRECT reads and writes early, this function should
 * never get called.
 */
static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
			      const struct iovec *iov,
			      loff_t pos, unsigned long nr_segs)
{
	WARN_ON(1);	/* reaching here means O_DIRECT interception failed */
	return -EINVAL;
}
1207 | 1207 | ||
/* address_space operations for ceph regular-file page cache I/O */
const struct address_space_operations ceph_aops = {
	.readpage = ceph_readpage,
	.readpages = ceph_readpages,
	.writepage = ceph_writepage,
	.writepages = ceph_writepages_start,
	.write_begin = ceph_write_begin,
	.write_end = ceph_write_end,
	.set_page_dirty = ceph_set_page_dirty,
	.invalidatepage = ceph_invalidatepage,
	.releasepage = ceph_releasepage,
	.direct_IO = ceph_direct_io,	/* never called; see ceph_direct_io */
};
1220 | 1220 | ||
1221 | 1221 | ||
1222 | /* | 1222 | /* |
1223 | * vm ops | 1223 | * vm ops |
1224 | */ | 1224 | */ |
1225 | 1225 | ||
1226 | /* | 1226 | /* |
1227 | * Reuse write_begin here for simplicity. | 1227 | * Reuse write_begin here for simplicity. |
1228 | */ | 1228 | */ |
1229 | static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | 1229 | static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) |
1230 | { | 1230 | { |
1231 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | 1231 | struct inode *inode = vma->vm_file->f_dentry->d_inode; |
1232 | struct page *page = vmf->page; | 1232 | struct page *page = vmf->page; |
1233 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | 1233 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
1234 | loff_t off = page_offset(page); | 1234 | loff_t off = page_offset(page); |
1235 | loff_t size, len; | 1235 | loff_t size, len; |
1236 | int ret; | 1236 | int ret; |
1237 | 1237 | ||
1238 | /* Update time before taking page lock */ | 1238 | /* Update time before taking page lock */ |
1239 | file_update_time(vma->vm_file); | 1239 | file_update_time(vma->vm_file); |
1240 | 1240 | ||
1241 | size = i_size_read(inode); | 1241 | size = i_size_read(inode); |
1242 | if (off + PAGE_CACHE_SIZE <= size) | 1242 | if (off + PAGE_CACHE_SIZE <= size) |
1243 | len = PAGE_CACHE_SIZE; | 1243 | len = PAGE_CACHE_SIZE; |
1244 | else | 1244 | else |
1245 | len = size & ~PAGE_CACHE_MASK; | 1245 | len = size & ~PAGE_CACHE_MASK; |
1246 | 1246 | ||
1247 | dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, | 1247 | dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode, |
1248 | off, len, page, page->index); | 1248 | off, len, page, page->index); |
1249 | 1249 | ||
1250 | lock_page(page); | 1250 | lock_page(page); |
1251 | 1251 | ||
1252 | ret = VM_FAULT_NOPAGE; | 1252 | ret = VM_FAULT_NOPAGE; |
1253 | if ((off > size) || | 1253 | if ((off > size) || |
1254 | (page->mapping != inode->i_mapping)) | 1254 | (page->mapping != inode->i_mapping)) |
1255 | goto out; | 1255 | goto out; |
1256 | 1256 | ||
1257 | ret = ceph_update_writeable_page(vma->vm_file, off, len, page); | 1257 | ret = ceph_update_writeable_page(vma->vm_file, off, len, page); |
1258 | if (ret == 0) { | 1258 | if (ret == 0) { |
1259 | /* success. we'll keep the page locked. */ | 1259 | /* success. we'll keep the page locked. */ |
1260 | set_page_dirty(page); | 1260 | set_page_dirty(page); |
1261 | up_read(&mdsc->snap_rwsem); | 1261 | up_read(&mdsc->snap_rwsem); |
1262 | ret = VM_FAULT_LOCKED; | 1262 | ret = VM_FAULT_LOCKED; |
1263 | } else { | 1263 | } else { |
1264 | if (ret == -ENOMEM) | 1264 | if (ret == -ENOMEM) |
1265 | ret = VM_FAULT_OOM; | 1265 | ret = VM_FAULT_OOM; |
1266 | else | 1266 | else |
1267 | ret = VM_FAULT_SIGBUS; | 1267 | ret = VM_FAULT_SIGBUS; |
1268 | } | 1268 | } |
1269 | out: | 1269 | out: |
1270 | dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); | 1270 | dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret); |
1271 | if (ret != VM_FAULT_LOCKED) | 1271 | if (ret != VM_FAULT_LOCKED) |
1272 | unlock_page(page); | 1272 | unlock_page(page); |
1273 | return ret; | 1273 | return ret; |
1274 | } | 1274 | } |
1275 | 1275 | ||
1276 | static struct vm_operations_struct ceph_vmops = { | 1276 | static struct vm_operations_struct ceph_vmops = { |
1277 | .fault = filemap_fault, | 1277 | .fault = filemap_fault, |
1278 | .page_mkwrite = ceph_page_mkwrite, | 1278 | .page_mkwrite = ceph_page_mkwrite, |
1279 | .remap_pages = generic_file_remap_pages, | 1279 | .remap_pages = generic_file_remap_pages, |
1280 | }; | 1280 | }; |
1281 | 1281 | ||
1282 | int ceph_mmap(struct file *file, struct vm_area_struct *vma) | 1282 | int ceph_mmap(struct file *file, struct vm_area_struct *vma) |
1283 | { | 1283 | { |
1284 | struct address_space *mapping = file->f_mapping; | 1284 | struct address_space *mapping = file->f_mapping; |
1285 | 1285 | ||
1286 | if (!mapping->a_ops->readpage) | 1286 | if (!mapping->a_ops->readpage) |
1287 | return -ENOEXEC; | 1287 | return -ENOEXEC; |
1288 | file_accessed(file); | 1288 | file_accessed(file); |
1289 | vma->vm_ops = &ceph_vmops; | 1289 | vma->vm_ops = &ceph_vmops; |
1290 | return 0; | 1290 | return 0; |
1291 | } | 1291 | } |
1292 | 1292 |
include/linux/ceph/osd_client.h
1 | #ifndef _FS_CEPH_OSD_CLIENT_H | 1 | #ifndef _FS_CEPH_OSD_CLIENT_H |
2 | #define _FS_CEPH_OSD_CLIENT_H | 2 | #define _FS_CEPH_OSD_CLIENT_H |
3 | 3 | ||
4 | #include <linux/completion.h> | 4 | #include <linux/completion.h> |
5 | #include <linux/kref.h> | 5 | #include <linux/kref.h> |
6 | #include <linux/mempool.h> | 6 | #include <linux/mempool.h> |
7 | #include <linux/rbtree.h> | 7 | #include <linux/rbtree.h> |
8 | 8 | ||
9 | #include <linux/ceph/types.h> | 9 | #include <linux/ceph/types.h> |
10 | #include <linux/ceph/osdmap.h> | 10 | #include <linux/ceph/osdmap.h> |
11 | #include <linux/ceph/messenger.h> | 11 | #include <linux/ceph/messenger.h> |
12 | #include <linux/ceph/auth.h> | 12 | #include <linux/ceph/auth.h> |
13 | #include <linux/ceph/pagelist.h> | 13 | #include <linux/ceph/pagelist.h> |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * Maximum object name size | 16 | * Maximum object name size |
17 | * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) | 17 | * (must be at least as big as RBD_MAX_MD_NAME_LEN -- currently 100) |
18 | */ | 18 | */ |
19 | #define MAX_OBJ_NAME_SIZE 100 | 19 | #define MAX_OBJ_NAME_SIZE 100 |
20 | 20 | ||
21 | struct ceph_msg; | 21 | struct ceph_msg; |
22 | struct ceph_snap_context; | 22 | struct ceph_snap_context; |
23 | struct ceph_osd_request; | 23 | struct ceph_osd_request; |
24 | struct ceph_osd_client; | 24 | struct ceph_osd_client; |
25 | struct ceph_authorizer; | 25 | struct ceph_authorizer; |
26 | 26 | ||
/*
 * completion callback for async writepages; invoked with the
 * completed request and a ceph_msg (presumably the osd reply --
 * confirm against the callers in osd_client.c)
 */
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
				     struct ceph_msg *);
32 | 32 | ||
/* a given osd we're communicating with */
struct ceph_osd {
	atomic_t o_ref;				/* reference count */
	struct ceph_osd_client *o_osdc;		/* owning client */
	int o_osd;				/* osd id number */
	int o_incarnation;
	struct rb_node o_node;			/* presumably links into osdc->osds */
	struct ceph_connection o_con;		/* messenger connection to this osd */
	struct list_head o_requests;		/* presumably requests targeting this osd */
	struct list_head o_linger_requests;
	struct list_head o_osd_lru;		/* presumably entry on osdc->osd_lru when idle */
	struct ceph_auth_handshake o_auth;	/* auth handshake state */
	unsigned long lru_ttl;
	int o_marked_for_keepalive;
	struct list_head o_keepalive_item;
};
49 | 49 | ||
/* an in-flight request */
struct ceph_osd_request {
	u64             r_tid;              /* unique for this client */
	struct rb_node  r_node;             /* presumably links into osdc->requests */
	struct list_head r_req_lru_item;
	struct list_head r_osd_item;        /* presumably entry on r_osd->o_requests */
	struct list_head r_linger_item;
	struct list_head r_linger_osd;
	struct ceph_osd *r_osd;             /* osd this request is mapped to */
	struct ceph_pg   r_pgid;            /* placement group */
	int              r_pg_osds[CEPH_PG_MAX_SIZE];
	int              r_num_pg_osds;

	struct ceph_connection *r_con_filling_msg;

	struct ceph_msg  *r_request, *r_reply;
	int               r_result;
	int               r_flags;     /* any additional flags for the osd */
	u32               r_sent;      /* >0 if r_request is sending/sent */
	int               r_got_reply;
	int               r_linger;

	struct ceph_osd_client *r_osdc;
	struct kref       r_kref;           /* reference count */
	bool              r_mempool;        /* allocated from osdc->req_mempool? */
	struct completion r_completion, r_safe_completion;
	ceph_osdc_callback_t r_callback, r_safe_callback;
	struct ceph_eversion	r_reassert_version;
	struct list_head  r_unsafe_item;

	struct inode *r_inode;         	      /* for use by callbacks */
	void *r_priv;			      /* ditto */

	char              r_oid[MAX_OBJ_NAME_SIZE];          /* object name */
	int               r_oid_len;
	unsigned long     r_stamp;            /* send OR check time */

	struct ceph_file_layout r_file_layout;
	struct ceph_snap_context *r_snapc;    /* snap context for writes */
	unsigned          r_num_pages;        /* size of page array (follows) */
	unsigned          r_page_alignment;   /* io offset in first page */
	struct page     **r_pages;            /* pages for data payload */
	int               r_pages_from_pool;
	int               r_own_pages;        /* if true, i own page list */
#ifdef CONFIG_BLOCK
	struct bio       *r_bio;	      /* instead of pages */
#endif

	struct ceph_pagelist r_trail;	      /* trailing part of the data */
};
100 | 100 | ||
/* registered osd event; cb is dispatched via osdc->notify_wq (see below) */
struct ceph_osd_event {
	u64 cookie;			/* event identifier */
	int one_shot;			/* presumably: deliver once then drop */
	struct ceph_osd_client *osdc;	/* owning client */
	void (*cb)(u64, u64, u8, void *);	/* user callback */
	void *data;			/* opaque argument passed to cb */
	struct rb_node node;		/* presumably links into osdc->event_tree */
	struct list_head osd_node;
	struct kref kref;		/* reference count */
	struct completion completion;
};
112 | 112 | ||
/* work item carrying one event delivery (queued on a workqueue) */
struct ceph_osd_event_work {
	struct work_struct work;	/* workqueue linkage */
	struct ceph_osd_event *event;	/* event being delivered */
	u64 ver;
	u64 notify_id;
	u8 opcode;
};
120 | 120 | ||
/*
 * per-ceph_client OSD state: the current osdmap, known osds, and all
 * in-flight/queued requests.
 */
struct ceph_osd_client {
	struct ceph_client     *client;

	struct ceph_osdmap     *osdmap;       /* current map */
	struct rw_semaphore    map_sem;       /* protects osdmap */
	struct completion      map_waiters;
	u64                    last_requested_map;

	struct mutex           request_mutex;
	struct rb_root         osds;          /* osds */
	struct list_head       osd_lru;       /* idle osds */
	u64                    timeout_tid;   /* tid of timeout triggering rq */
	u64                    last_tid;      /* tid of last request */
	struct rb_root         requests;      /* pending requests */
	struct list_head       req_lru;	      /* in-flight lru */
	struct list_head       req_unsent;    /* unsent/need-resend queue */
	struct list_head       req_notarget;  /* map to no osd */
	struct list_head       req_linger;    /* lingering requests */
	int                    num_requests;
	struct delayed_work    timeout_work;
	struct delayed_work    osds_timeout_work;
#ifdef CONFIG_DEBUG_FS
	struct dentry 	       *debugfs_file;
#endif

	mempool_t              *req_mempool;  /* emergency pool for requests */

	struct ceph_msgpool	msgpool_op;
	struct ceph_msgpool	msgpool_op_reply;

	spinlock_t		event_lock;   /* protects event_tree/event_count */
	struct rb_root          event_tree;
	u64                     event_count;

	struct workqueue_struct	*notify_wq;   /* event delivery workqueue */
};
157 | 157 | ||
/*
 * In-memory description of a single osd operation; translated to the
 * wire format by osd_req_encode_op().  Which union member is valid is
 * determined by @op.
 */
struct ceph_osd_req_op {
	u16 op;           /* CEPH_OSD_OP_* */
	u32 payload_len;  /* length of op data carried with the message */
	union {
		struct {	/* read/write extent within the object */
			u64 offset, length;
			u64 truncate_size;
			u32 truncate_seq;
		} extent;
		struct {	/* xattr get/set/compare */
			const char *name;
			const char *val;
			u32 name_len;
			u32 value_len;
			__u8 cmp_op;   /* CEPH_OSD_CMPXATTR_OP_* */
			__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
		} xattr;
		struct {	/* object class method call (CEPH_OSD_OP_CALL) */
			const char *class_name;
			const char *method_name;
			const char *indata;    /* method input data */
			u32 indata_len;
			__u8 class_len;
			__u8 method_len;
			__u8 argc;
		} cls;
		struct {	/* placement group listing */
			u64 cookie;
			u64 count;
		} pgls;
		struct {	/* snapshot rollback (CEPH_OSD_OP_ROLLBACK) */
			u64 snapid;
		} snap;
		struct {	/* watch/notify/notify-ack */
			u64 cookie;
			u64 ver;
			u32 prot_ver;	/* notify protocol version */
			u32 timeout;	/* notify timeout */
			__u8 flag;
		} watch;
	};
};
200 | 200 | ||
/* osd client setup/teardown */
extern int ceph_osdc_init(struct ceph_osd_client *osdc,
			  struct ceph_client *client);
extern void ceph_osdc_stop(struct ceph_osd_client *osdc);

/* incoming message handlers: op replies and osd map updates */
extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
				   struct ceph_msg *msg);
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
				 struct ceph_msg *msg);

/* map a file extent onto an object; fills *bno and the op's extent */
extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
			u64 off, u64 *plen, u64 *bno,
			struct ceph_osd_request *req,
			struct ceph_osd_req_op *op);

/* request allocation and message construction */
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
					       struct ceph_snap_context *snapc,
					       unsigned int num_op,
					       bool use_mempool,
					       gfp_t gfp_flags);

extern void ceph_osdc_build_request(struct ceph_osd_request *req,
				    u64 off, u64 len,
				    unsigned int num_op,
				    struct ceph_osd_req_op *src_ops,
				    struct ceph_snap_context *snapc,
				    u64 snap_id,
				    struct timespec *mtime);

extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
				      struct ceph_file_layout *layout,
				      struct ceph_vino vino,
				      u64 offset, u64 *len, int op, int flags,
				      struct ceph_snap_context *snapc,
				      int do_sync, u32 truncate_seq,
				      u64 truncate_size,
				      struct timespec *mtime,
				      bool use_mempool, int num_reply,
				      int page_align);

/* lingering (e.g. watch) request management */
extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
					 struct ceph_osd_request *req);
extern void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
						struct ceph_osd_request *req);

/* take a reference on an osd request */
static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
{
	kref_get(&req->r_kref);
}
extern void ceph_osdc_release_request(struct kref *kref);
/* drop a reference; the request is torn down when the last ref is put */
static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
{
	kref_put(&req->r_kref, ceph_osdc_release_request);
}

/* request submission and synchronization */
extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
				   struct ceph_osd_request *req,
				   bool nofail);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
				  struct ceph_osd_request *req);
extern void ceph_osdc_sync(struct ceph_osd_client *osdc);

/* synchronous page-vector read/write helpers */
extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
			       struct ceph_vino vino,
			       struct ceph_file_layout *layout,
			       u64 off, u64 *plen,
			       u32 truncate_seq, u64 truncate_size,
			       struct page **pages, int nr_pages,
			       int page_align);

extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
				struct ceph_vino vino,
				struct ceph_file_layout *layout,
				struct ceph_snap_context *sc,
				u64 off, u64 len,
				u32 truncate_seq, u64 truncate_size,
				struct timespec *mtime,
				struct page **pages, int nr_pages,
				int flags, int do_sync);

/* watch/notify events */
extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
				  void (*event_cb)(u64, u64, u8, void *),
				  int one_shot, void *data,
				  struct ceph_osd_event **pevent);
extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
				unsigned long timeout);
extern void ceph_osdc_put_event(struct ceph_osd_event *event);
#endif
290 | 290 | ||
291 | 291 |
net/ceph/osd_client.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/module.h> | 3 | #include <linux/module.h> |
4 | #include <linux/err.h> | 4 | #include <linux/err.h> |
5 | #include <linux/highmem.h> | 5 | #include <linux/highmem.h> |
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/pagemap.h> | 7 | #include <linux/pagemap.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/uaccess.h> | 9 | #include <linux/uaccess.h> |
10 | #ifdef CONFIG_BLOCK | 10 | #ifdef CONFIG_BLOCK |
11 | #include <linux/bio.h> | 11 | #include <linux/bio.h> |
12 | #endif | 12 | #endif |
13 | 13 | ||
14 | #include <linux/ceph/libceph.h> | 14 | #include <linux/ceph/libceph.h> |
15 | #include <linux/ceph/osd_client.h> | 15 | #include <linux/ceph/osd_client.h> |
16 | #include <linux/ceph/messenger.h> | 16 | #include <linux/ceph/messenger.h> |
17 | #include <linux/ceph/decode.h> | 17 | #include <linux/ceph/decode.h> |
18 | #include <linux/ceph/auth.h> | 18 | #include <linux/ceph/auth.h> |
19 | #include <linux/ceph/pagelist.h> | 19 | #include <linux/ceph/pagelist.h> |
20 | 20 | ||
/* front (header) buffer sizes for outgoing op and incoming reply msgs */
#define OSD_OP_FRONT_LEN	4096
#define OSD_OPREPLY_FRONT_LEN	512

static const struct ceph_connection_operations osd_con_ops;

/* forward declarations for helpers defined later in this file */
static void send_queued(struct ceph_osd_client *osdc);
static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd);
static void __register_request(struct ceph_osd_client *osdc,
			       struct ceph_osd_request *req);
static void __unregister_linger_request(struct ceph_osd_client *osdc,
					struct ceph_osd_request *req);
static void __send_request(struct ceph_osd_client *osdc,
			   struct ceph_osd_request *req);
34 | 34 | ||
35 | static int op_has_extent(int op) | 35 | static int op_has_extent(int op) |
36 | { | 36 | { |
37 | return (op == CEPH_OSD_OP_READ || | 37 | return (op == CEPH_OSD_OP_READ || |
38 | op == CEPH_OSD_OP_WRITE); | 38 | op == CEPH_OSD_OP_WRITE); |
39 | } | 39 | } |
40 | 40 | ||
/*
 * Map a file extent (off~*plen) onto a single object of the given
 * layout.  On success *bno holds the object (block) number and *plen
 * may be shortened so the extent does not cross an object boundary.
 * For read/write ops the op's extent (object-relative offset/length
 * and rebased truncate_size) is filled in, and the request's page
 * count/alignment are set.  Returns 0 or a negative error from
 * ceph_calc_file_object_mapping().
 */
int ceph_calc_raw_layout(struct ceph_file_layout *layout,
			 u64 off, u64 *plen, u64 *bno,
			 struct ceph_osd_request *req,
			 struct ceph_osd_req_op *op)
{
	u64 orig_len = *plen;
	u64 objoff, objlen;	/* extent in object */
	int r;

	/* object extent? */
	r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
					  &objoff, &objlen);
	if (r < 0)
		return r;
	if (objlen < orig_len) {
		/* extent crosses an object boundary: clip to this object */
		*plen = objlen;
		dout(" skipping last %llu, final file extent %llu~%llu\n",
		     orig_len - *plen, off, *plen);
	}

	if (op_has_extent(op->op)) {
		u32 osize = le32_to_cpu(layout->fl_object_size);
		op->extent.offset = objoff;
		op->extent.length = objlen;
		/*
		 * Rebase truncate_size from a file offset to an offset
		 * within this object, clamped to [0, object size].
		 */
		if (op->extent.truncate_size <= off - objoff) {
			op->extent.truncate_size = 0;
		} else {
			op->extent.truncate_size -= off - objoff;
			if (op->extent.truncate_size > osize)
				op->extent.truncate_size = osize;
		}
	}
	req->r_num_pages = calc_pages_for(off, *plen);
	req->r_page_alignment = off & ~PAGE_MASK;
	if (op->op == CEPH_OSD_OP_WRITE)
		op->payload_len = *plen;	/* write data travels in the payload */

	dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
	     *bno, objoff, objlen, req->r_num_pages);
	return 0;
}
EXPORT_SYMBOL(ceph_calc_raw_layout);
83 | 83 | ||
84 | /* | 84 | /* |
85 | * Implement client access to distributed object storage cluster. | 85 | * Implement client access to distributed object storage cluster. |
86 | * | 86 | * |
87 | * All data objects are stored within a cluster/cloud of OSDs, or | 87 | * All data objects are stored within a cluster/cloud of OSDs, or |
88 | * "object storage devices." (Note that Ceph OSDs have _nothing_ to | 88 | * "object storage devices." (Note that Ceph OSDs have _nothing_ to |
89 | * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply | 89 | * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply |
90 | * remote daemons serving up and coordinating consistent and safe | 90 | * remote daemons serving up and coordinating consistent and safe |
91 | * access to storage. | 91 | * access to storage. |
92 | * | 92 | * |
93 | * Cluster membership and the mapping of data objects onto storage devices | 93 | * Cluster membership and the mapping of data objects onto storage devices |
94 | * are described by the osd map. | 94 | * are described by the osd map. |
95 | * | 95 | * |
96 | * We keep track of pending OSD requests (read, write), resubmit | 96 | * We keep track of pending OSD requests (read, write), resubmit |
97 | * requests to different OSDs when the cluster topology/data layout | 97 | * requests to different OSDs when the cluster topology/data layout |
98 | * change, or retry the affected requests when the communications | 98 | * change, or retry the affected requests when the communications |
99 | * channel with an OSD is reset. | 99 | * channel with an OSD is reset. |
100 | */ | 100 | */ |
101 | 101 | ||
102 | /* | 102 | /* |
103 | * calculate the mapping of a file extent onto an object, and fill out the | 103 | * calculate the mapping of a file extent onto an object, and fill out the |
104 | * request accordingly. shorten extent as necessary if it crosses an | 104 | * request accordingly. shorten extent as necessary if it crosses an |
105 | * object boundary. | 105 | * object boundary. |
106 | * | 106 | * |
107 | * fill osd op in request message. | 107 | * fill osd op in request message. |
108 | */ | 108 | */ |
static int calc_layout(struct ceph_vino vino,
		       struct ceph_file_layout *layout,
		       u64 off, u64 *plen,
		       struct ceph_osd_request *req,
		       struct ceph_osd_req_op *op)
{
	u64 bno;	/* object (block) number within the file */
	int r;

	r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op);
	if (r < 0)
		return r;

	/* object name is "<inode hex>.<8-digit object number>" */
	snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno);
	req->r_oid_len = strlen(req->r_oid);

	return r;
}
127 | 127 | ||
128 | /* | 128 | /* |
129 | * requests | 129 | * requests |
130 | */ | 130 | */ |
/*
 * Tear down an osd request once its last reference is dropped.  This is
 * the kref release callback invoked via ceph_osdc_put_request().
 */
void ceph_osdc_release_request(struct kref *kref)
{
	struct ceph_osd_request *req = container_of(kref,
						    struct ceph_osd_request,
						    r_kref);

	if (req->r_request)
		ceph_msg_put(req->r_request);
	if (req->r_con_filling_msg) {
		/*
		 * A connection is still filling our reply msg; revoke it
		 * before dropping the connection reference.
		 */
		dout("%s revoking msg %p from con %p\n", __func__,
		     req->r_reply, req->r_con_filling_msg);
		ceph_msg_revoke_incoming(req->r_reply);
		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
		req->r_con_filling_msg = NULL;
	}
	if (req->r_reply)
		ceph_msg_put(req->r_reply);
	if (req->r_own_pages)
		/* pages are owned by the request; release the vector */
		ceph_release_page_vector(req->r_pages,
					 req->r_num_pages);
	ceph_put_snap_context(req->r_snapc);
	ceph_pagelist_release(&req->r_trail);
	/* free via whichever allocator ceph_osdc_alloc_request() used */
	if (req->r_mempool)
		mempool_free(req, req->r_osdc->req_mempool);
	else
		kfree(req);
}
EXPORT_SYMBOL(ceph_osdc_release_request);
159 | 159 | ||
/*
 * Allocate an osd request together with its outgoing request message
 * and a reply message placeholder.
 *
 * @osdc:        osd client the request will belong to
 * @snapc:       snap context; only its snap count is used here, to size
 *               the request message (the context itself is taken later,
 *               in ceph_osdc_build_request())
 * @num_op:      number of ops the request message must have room for
 * @use_mempool: allocate from osdc's mempool/msgpools instead of the
 *               general allocator (guaranteed-forward-progress path)
 * @gfp_flags:   allocation flags for the non-mempool case
 *
 * Returns the request with one reference held, or NULL on allocation
 * failure.  Release with ceph_osdc_put_request().
 */
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
						 struct ceph_snap_context *snapc,
						 unsigned int num_op,
						 bool use_mempool,
						 gfp_t gfp_flags)
{
	struct ceph_osd_request *req;
	struct ceph_msg *msg;
	size_t msg_size = sizeof(struct ceph_osd_request_head);

	msg_size += num_op*sizeof(struct ceph_osd_op);

	if (use_mempool) {
		req = mempool_alloc(osdc->req_mempool, gfp_flags);
		memset(req, 0, sizeof(*req));	/* mempool memory is not zeroed */
	} else {
		req = kzalloc(sizeof(*req), gfp_flags);
	}
	if (req == NULL)
		return NULL;

	req->r_osdc = osdc;
	req->r_mempool = use_mempool;	/* remembered for release_request() */

	kref_init(&req->r_kref);
	init_completion(&req->r_completion);
	init_completion(&req->r_safe_completion);
	RB_CLEAR_NODE(&req->r_node);
	INIT_LIST_HEAD(&req->r_unsafe_item);
	INIT_LIST_HEAD(&req->r_linger_item);
	INIT_LIST_HEAD(&req->r_linger_osd);
	INIT_LIST_HEAD(&req->r_req_lru_item);
	INIT_LIST_HEAD(&req->r_osd_item);

	/* create reply message */
	if (use_mempool)
		msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
	else
		msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
				   OSD_OPREPLY_FRONT_LEN, gfp_flags, true);
	if (!msg) {
		/* put_request() also frees req itself */
		ceph_osdc_put_request(req);
		return NULL;
	}
	req->r_reply = msg;

	ceph_pagelist_init(&req->r_trail);

	/* create request message; allow space for oid */
	msg_size += MAX_OBJ_NAME_SIZE;
	if (snapc)
		msg_size += sizeof(u64) * snapc->num_snaps;
	if (use_mempool)
		msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
	else
		msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true);
	if (!msg) {
		ceph_osdc_put_request(req);
		return NULL;
	}

	memset(msg->front.iov_base, 0, msg->front.iov_len);

	req->r_request = msg;

	return req;
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);
228 | 228 | ||
/*
 * Encode one op from its in-memory form (src) into the wire format
 * (dst), byte-swapping fixed fields as needed.  Variable-length op data
 * (xattr name/value, class call names and input, notify parameters) is
 * appended to the request's trail pagelist.
 */
static void osd_req_encode_op(struct ceph_osd_request *req,
			      struct ceph_osd_op *dst,
			      struct ceph_osd_req_op *src)
{
	dst->op = cpu_to_le16(src->op);

	switch (src->op) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_WRITE:
		dst->extent.offset =
			cpu_to_le64(src->extent.offset);
		dst->extent.length =
			cpu_to_le64(src->extent.length);
		dst->extent.truncate_size =
			cpu_to_le64(src->extent.truncate_size);
		dst->extent.truncate_seq =
			cpu_to_le32(src->extent.truncate_seq);
		break;

	case CEPH_OSD_OP_GETXATTR:
	case CEPH_OSD_OP_SETXATTR:
	case CEPH_OSD_OP_CMPXATTR:
		dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
		dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
		dst->xattr.cmp_op = src->xattr.cmp_op;
		dst->xattr.cmp_mode = src->xattr.cmp_mode;
		/* name and value travel in the trail */
		ceph_pagelist_append(&req->r_trail, src->xattr.name,
				     src->xattr.name_len);
		ceph_pagelist_append(&req->r_trail, src->xattr.val,
				     src->xattr.value_len);
		break;
	case CEPH_OSD_OP_CALL:
		dst->cls.class_len = src->cls.class_len;
		dst->cls.method_len = src->cls.method_len;
		dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);

		ceph_pagelist_append(&req->r_trail, src->cls.class_name,
				     src->cls.class_len);
		ceph_pagelist_append(&req->r_trail, src->cls.method_name,
				     src->cls.method_len);
		ceph_pagelist_append(&req->r_trail, src->cls.indata,
				     src->cls.indata_len);
		break;
	case CEPH_OSD_OP_ROLLBACK:
		dst->snap.snapid = cpu_to_le64(src->snap.snapid);
		break;
	case CEPH_OSD_OP_STARTSYNC:
		break;
	case CEPH_OSD_OP_NOTIFY:
	{
		__le32 prot_ver = cpu_to_le32(src->watch.prot_ver);
		__le32 timeout = cpu_to_le32(src->watch.timeout);

		ceph_pagelist_append(&req->r_trail,
				     &prot_ver, sizeof(prot_ver));
		ceph_pagelist_append(&req->r_trail,
				     &timeout, sizeof(timeout));
	}
	/* fallthrough -- NOTIFY also encodes the watch fields below */
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		dst->watch.cookie = cpu_to_le64(src->watch.cookie);
		dst->watch.ver = cpu_to_le64(src->watch.ver);
		dst->watch.flag = src->watch.flag;
		break;
	default:
		pr_err("unrecognized osd opcode %d\n", dst->op);
		WARN_ON(1);
		break;
	}
	dst->payload_len = cpu_to_le32(src->payload_len);
}
300 | 300 | ||
/*
 * build new request AND message
 *
 * Encode the already-prepared request into its outgoing message:
 * fill in the ceph_osd_request_head, the per-op array, the object
 * name and the snap list, then shrink msg->front to the bytes
 * actually used.  Callers have already allocated a message front
 * large enough for num_op ops (see the msg_size computation below).
 */
void ceph_osdc_build_request(struct ceph_osd_request *req,
			     u64 off, u64 len, unsigned int num_op,
			     struct ceph_osd_req_op *src_ops,
			     struct ceph_snap_context *snapc, u64 snap_id,
			     struct timespec *mtime)
{
	struct ceph_msg *msg = req->r_request;
	struct ceph_osd_request_head *head;
	struct ceph_osd_req_op *src_op;
	struct ceph_osd_op *op;
	void *p;
	size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
	int flags = req->r_flags;
	u64 data_len = 0;
	int i;

	/* every request must be a read and/or a write */
	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);

	head = msg->front.iov_base;
	head->snapid = cpu_to_le64(snap_id);
	/* ops array sits immediately after the head; variable-length
	 * payload (oid, snaps) is appended after the ops via p */
	op = (void *)(head + 1);
	p = (void *)(op + num_op);

	req->r_snapc = ceph_get_snap_context(snapc);

	head->client_inc = cpu_to_le32(1); /* always, for now. */
	head->flags = cpu_to_le32(flags);
	if (flags & CEPH_OSD_FLAG_WRITE)
		ceph_encode_timespec(&head->mtime, mtime);
	/* num_ops is a __le16 on the wire; make sure it fits */
	BUG_ON(num_op > (unsigned int) ((u16) -1));
	head->num_ops = cpu_to_le16(num_op);

	/* fill in oid */
	head->object_len = cpu_to_le32(req->r_oid_len);
	memcpy(p, req->r_oid, req->r_oid_len);
	p += req->r_oid_len;

	/* encode each source op into its wire format */
	src_op = src_ops;
	while (num_op--)
		osd_req_encode_op(req, op++, src_op++);

	/* trailing pagelist data counts toward hdr.data_len */
	data_len += req->r_trail.length;

	if (snapc) {
		head->snap_seq = cpu_to_le64(snapc->seq);
		head->num_snaps = cpu_to_le32(snapc->num_snaps);
		for (i = 0; i < snapc->num_snaps; i++) {
			put_unaligned_le64(snapc->snaps[i], p);
			p += sizeof(u64);
		}
	}

	if (flags & CEPH_OSD_FLAG_WRITE) {
		req->r_request->hdr.data_off = cpu_to_le16(off);
		req->r_request->hdr.data_len = cpu_to_le32(len + data_len);
	} else if (data_len) {
		/* read with trailing data (e.g. class-call payload) */
		req->r_request->hdr.data_off = 0;
		req->r_request->hdr.data_len = cpu_to_le32(data_len);
	}

	req->r_request->page_alignment = req->r_page_alignment;

	/* p must not have run past the allocated front buffer */
	BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
	msg_size = p - msg->front.iov_base;
	msg->front.iov_len = msg_size;
	msg->hdr.front_len = cpu_to_le32(msg_size);
	return;
}
EXPORT_SYMBOL(ceph_osdc_build_request);
374 | 374 | ||
375 | /* | 375 | /* |
376 | * build new request AND message, calculate layout, and adjust file | 376 | * build new request AND message, calculate layout, and adjust file |
377 | * extent as needed. | 377 | * extent as needed. |
378 | * | 378 | * |
379 | * if the file was recently truncated, we include information about its | 379 | * if the file was recently truncated, we include information about its |
380 | * old and new size so that the object can be updated appropriately. (we | 380 | * old and new size so that the object can be updated appropriately. (we |
381 | * avoid synchronously deleting truncated objects because it's slow.) | 381 | * avoid synchronously deleting truncated objects because it's slow.) |
382 | * | 382 | * |
383 | * if @do_sync, include a 'startsync' command so that the osd will flush | 383 | * if @do_sync, include a 'startsync' command so that the osd will flush |
384 | * data quickly. | 384 | * data quickly. |
385 | */ | 385 | */ |
386 | struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | 386 | struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, |
387 | struct ceph_file_layout *layout, | 387 | struct ceph_file_layout *layout, |
388 | struct ceph_vino vino, | 388 | struct ceph_vino vino, |
389 | u64 off, u64 *plen, | 389 | u64 off, u64 *plen, |
390 | int opcode, int flags, | 390 | int opcode, int flags, |
391 | struct ceph_snap_context *snapc, | 391 | struct ceph_snap_context *snapc, |
392 | int do_sync, | 392 | int do_sync, |
393 | u32 truncate_seq, | 393 | u32 truncate_seq, |
394 | u64 truncate_size, | 394 | u64 truncate_size, |
395 | struct timespec *mtime, | 395 | struct timespec *mtime, |
396 | bool use_mempool, int num_reply, | 396 | bool use_mempool, int num_reply, |
397 | int page_align) | 397 | int page_align) |
398 | { | 398 | { |
399 | struct ceph_osd_req_op ops[2]; | 399 | struct ceph_osd_req_op ops[2]; |
400 | struct ceph_osd_request *req; | 400 | struct ceph_osd_request *req; |
401 | unsigned int num_op = 1; | 401 | unsigned int num_op = 1; |
402 | int r; | 402 | int r; |
403 | 403 | ||
404 | memset(&ops, 0, sizeof ops); | 404 | memset(&ops, 0, sizeof ops); |
405 | 405 | ||
406 | ops[0].op = opcode; | 406 | ops[0].op = opcode; |
407 | ops[0].extent.truncate_seq = truncate_seq; | 407 | ops[0].extent.truncate_seq = truncate_seq; |
408 | ops[0].extent.truncate_size = truncate_size; | 408 | ops[0].extent.truncate_size = truncate_size; |
409 | 409 | ||
410 | if (do_sync) { | 410 | if (do_sync) { |
411 | ops[1].op = CEPH_OSD_OP_STARTSYNC; | 411 | ops[1].op = CEPH_OSD_OP_STARTSYNC; |
412 | num_op++; | 412 | num_op++; |
413 | } | 413 | } |
414 | 414 | ||
415 | req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, | 415 | req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, |
416 | GFP_NOFS); | 416 | GFP_NOFS); |
417 | if (!req) | 417 | if (!req) |
418 | return ERR_PTR(-ENOMEM); | 418 | return ERR_PTR(-ENOMEM); |
419 | req->r_flags = flags; | 419 | req->r_flags = flags; |
420 | 420 | ||
421 | /* calculate max write size */ | 421 | /* calculate max write size */ |
422 | r = calc_layout(vino, layout, off, plen, req, ops); | 422 | r = calc_layout(vino, layout, off, plen, req, ops); |
423 | if (r < 0) | 423 | if (r < 0) |
424 | return ERR_PTR(r); | 424 | return ERR_PTR(r); |
425 | req->r_file_layout = *layout; /* keep a copy */ | 425 | req->r_file_layout = *layout; /* keep a copy */ |
426 | 426 | ||
427 | /* in case it differs from natural (file) alignment that | 427 | /* in case it differs from natural (file) alignment that |
428 | calc_layout filled in for us */ | 428 | calc_layout filled in for us */ |
429 | req->r_num_pages = calc_pages_for(page_align, *plen); | 429 | req->r_num_pages = calc_pages_for(page_align, *plen); |
430 | req->r_page_alignment = page_align; | 430 | req->r_page_alignment = page_align; |
431 | 431 | ||
432 | ceph_osdc_build_request(req, off, *plen, num_op, ops, | 432 | ceph_osdc_build_request(req, off, *plen, num_op, ops, |
433 | snapc, vino.snap, mtime); | 433 | snapc, vino.snap, mtime); |
434 | 434 | ||
435 | return req; | 435 | return req; |
436 | } | 436 | } |
437 | EXPORT_SYMBOL(ceph_osdc_new_request); | 437 | EXPORT_SYMBOL(ceph_osdc_new_request); |
438 | 438 | ||
439 | /* | 439 | /* |
440 | * We keep osd requests in an rbtree, sorted by ->r_tid. | 440 | * We keep osd requests in an rbtree, sorted by ->r_tid. |
441 | */ | 441 | */ |
442 | static void __insert_request(struct ceph_osd_client *osdc, | 442 | static void __insert_request(struct ceph_osd_client *osdc, |
443 | struct ceph_osd_request *new) | 443 | struct ceph_osd_request *new) |
444 | { | 444 | { |
445 | struct rb_node **p = &osdc->requests.rb_node; | 445 | struct rb_node **p = &osdc->requests.rb_node; |
446 | struct rb_node *parent = NULL; | 446 | struct rb_node *parent = NULL; |
447 | struct ceph_osd_request *req = NULL; | 447 | struct ceph_osd_request *req = NULL; |
448 | 448 | ||
449 | while (*p) { | 449 | while (*p) { |
450 | parent = *p; | 450 | parent = *p; |
451 | req = rb_entry(parent, struct ceph_osd_request, r_node); | 451 | req = rb_entry(parent, struct ceph_osd_request, r_node); |
452 | if (new->r_tid < req->r_tid) | 452 | if (new->r_tid < req->r_tid) |
453 | p = &(*p)->rb_left; | 453 | p = &(*p)->rb_left; |
454 | else if (new->r_tid > req->r_tid) | 454 | else if (new->r_tid > req->r_tid) |
455 | p = &(*p)->rb_right; | 455 | p = &(*p)->rb_right; |
456 | else | 456 | else |
457 | BUG(); | 457 | BUG(); |
458 | } | 458 | } |
459 | 459 | ||
460 | rb_link_node(&new->r_node, parent, p); | 460 | rb_link_node(&new->r_node, parent, p); |
461 | rb_insert_color(&new->r_node, &osdc->requests); | 461 | rb_insert_color(&new->r_node, &osdc->requests); |
462 | } | 462 | } |
463 | 463 | ||
464 | static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, | 464 | static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, |
465 | u64 tid) | 465 | u64 tid) |
466 | { | 466 | { |
467 | struct ceph_osd_request *req; | 467 | struct ceph_osd_request *req; |
468 | struct rb_node *n = osdc->requests.rb_node; | 468 | struct rb_node *n = osdc->requests.rb_node; |
469 | 469 | ||
470 | while (n) { | 470 | while (n) { |
471 | req = rb_entry(n, struct ceph_osd_request, r_node); | 471 | req = rb_entry(n, struct ceph_osd_request, r_node); |
472 | if (tid < req->r_tid) | 472 | if (tid < req->r_tid) |
473 | n = n->rb_left; | 473 | n = n->rb_left; |
474 | else if (tid > req->r_tid) | 474 | else if (tid > req->r_tid) |
475 | n = n->rb_right; | 475 | n = n->rb_right; |
476 | else | 476 | else |
477 | return req; | 477 | return req; |
478 | } | 478 | } |
479 | return NULL; | 479 | return NULL; |
480 | } | 480 | } |
481 | 481 | ||
482 | static struct ceph_osd_request * | 482 | static struct ceph_osd_request * |
483 | __lookup_request_ge(struct ceph_osd_client *osdc, | 483 | __lookup_request_ge(struct ceph_osd_client *osdc, |
484 | u64 tid) | 484 | u64 tid) |
485 | { | 485 | { |
486 | struct ceph_osd_request *req; | 486 | struct ceph_osd_request *req; |
487 | struct rb_node *n = osdc->requests.rb_node; | 487 | struct rb_node *n = osdc->requests.rb_node; |
488 | 488 | ||
489 | while (n) { | 489 | while (n) { |
490 | req = rb_entry(n, struct ceph_osd_request, r_node); | 490 | req = rb_entry(n, struct ceph_osd_request, r_node); |
491 | if (tid < req->r_tid) { | 491 | if (tid < req->r_tid) { |
492 | if (!n->rb_left) | 492 | if (!n->rb_left) |
493 | return req; | 493 | return req; |
494 | n = n->rb_left; | 494 | n = n->rb_left; |
495 | } else if (tid > req->r_tid) { | 495 | } else if (tid > req->r_tid) { |
496 | n = n->rb_right; | 496 | n = n->rb_right; |
497 | } else { | 497 | } else { |
498 | return req; | 498 | return req; |
499 | } | 499 | } |
500 | } | 500 | } |
501 | return NULL; | 501 | return NULL; |
502 | } | 502 | } |
503 | 503 | ||
/*
 * Resubmit requests pending on the given osd.
 *
 * Caller holds osdc->request_mutex.  Regular requests are moved back
 * to the unsent list; lingering requests are re-registered as normal
 * requests so they get resent.  If __reset_osd() decides the osd is
 * gone (or the messenger will retry on its own), nothing is requeued.
 */
static void __kick_osd_requests(struct ceph_osd_client *osdc,
				struct ceph_osd *osd)
{
	struct ceph_osd_request *req, *nreq;
	int err;

	dout("__kick_osd_requests osd%d\n", osd->o_osd);
	err = __reset_osd(osdc, osd);
	if (err)
		return;

	list_for_each_entry(req, &osd->o_requests, r_osd_item) {
		list_move(&req->r_req_lru_item, &osdc->req_unsent);
		dout("requeued %p tid %llu osd%d\n", req, req->r_tid,
		     osd->o_osd);
		/* lingering requests are resent without RETRY so the
		 * osd treats them as fresh registrations */
		if (!req->r_linger)
			req->r_flags |= CEPH_OSD_FLAG_RETRY;
	}

	list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
				 r_linger_osd) {
		/*
		 * reregister request prior to unregistering linger so
		 * that r_osd is preserved.
		 */
		BUG_ON(!list_empty(&req->r_req_lru_item));
		__register_request(osdc, req);
		list_add(&req->r_req_lru_item, &osdc->req_unsent);
		list_add(&req->r_osd_item, &req->r_osd->o_requests);
		__unregister_linger_request(osdc, req);
		dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid,
		     osd->o_osd);
	}
}
541 | 541 | ||
/*
 * If the osd connection drops, we need to resubmit all requests.
 *
 * Messenger fault callback.  Lock order (map_sem then request_mutex)
 * matches the rest of the osd client; the actual sends happen after
 * request_mutex is dropped, via send_queued().
 */
static void osd_reset(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc;

	if (!osd)
		return;
	dout("osd_reset osd%d\n", osd->o_osd);
	osdc = osd->o_osdc;
	down_read(&osdc->map_sem);
	mutex_lock(&osdc->request_mutex);
	__kick_osd_requests(osdc, osd);
	mutex_unlock(&osdc->request_mutex);
	send_queued(osdc);
	up_read(&osdc->map_sem);
}
561 | 561 | ||
562 | /* | 562 | /* |
563 | * Track open sessions with osds. | 563 | * Track open sessions with osds. |
564 | */ | 564 | */ |
565 | static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) | 565 | static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) |
566 | { | 566 | { |
567 | struct ceph_osd *osd; | 567 | struct ceph_osd *osd; |
568 | 568 | ||
569 | osd = kzalloc(sizeof(*osd), GFP_NOFS); | 569 | osd = kzalloc(sizeof(*osd), GFP_NOFS); |
570 | if (!osd) | 570 | if (!osd) |
571 | return NULL; | 571 | return NULL; |
572 | 572 | ||
573 | atomic_set(&osd->o_ref, 1); | 573 | atomic_set(&osd->o_ref, 1); |
574 | osd->o_osdc = osdc; | 574 | osd->o_osdc = osdc; |
575 | osd->o_osd = onum; | 575 | osd->o_osd = onum; |
576 | RB_CLEAR_NODE(&osd->o_node); | 576 | RB_CLEAR_NODE(&osd->o_node); |
577 | INIT_LIST_HEAD(&osd->o_requests); | 577 | INIT_LIST_HEAD(&osd->o_requests); |
578 | INIT_LIST_HEAD(&osd->o_linger_requests); | 578 | INIT_LIST_HEAD(&osd->o_linger_requests); |
579 | INIT_LIST_HEAD(&osd->o_osd_lru); | 579 | INIT_LIST_HEAD(&osd->o_osd_lru); |
580 | osd->o_incarnation = 1; | 580 | osd->o_incarnation = 1; |
581 | 581 | ||
582 | ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); | 582 | ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); |
583 | 583 | ||
584 | INIT_LIST_HEAD(&osd->o_keepalive_item); | 584 | INIT_LIST_HEAD(&osd->o_keepalive_item); |
585 | return osd; | 585 | return osd; |
586 | } | 586 | } |
587 | 587 | ||
588 | static struct ceph_osd *get_osd(struct ceph_osd *osd) | 588 | static struct ceph_osd *get_osd(struct ceph_osd *osd) |
589 | { | 589 | { |
590 | if (atomic_inc_not_zero(&osd->o_ref)) { | 590 | if (atomic_inc_not_zero(&osd->o_ref)) { |
591 | dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, | 591 | dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, |
592 | atomic_read(&osd->o_ref)); | 592 | atomic_read(&osd->o_ref)); |
593 | return osd; | 593 | return osd; |
594 | } else { | 594 | } else { |
595 | dout("get_osd %p FAIL\n", osd); | 595 | dout("get_osd %p FAIL\n", osd); |
596 | return NULL; | 596 | return NULL; |
597 | } | 597 | } |
598 | } | 598 | } |
599 | 599 | ||
600 | static void put_osd(struct ceph_osd *osd) | 600 | static void put_osd(struct ceph_osd *osd) |
601 | { | 601 | { |
602 | dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), | 602 | dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), |
603 | atomic_read(&osd->o_ref) - 1); | 603 | atomic_read(&osd->o_ref) - 1); |
604 | if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { | 604 | if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { |
605 | struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; | 605 | struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; |
606 | 606 | ||
607 | if (ac->ops && ac->ops->destroy_authorizer) | 607 | if (ac->ops && ac->ops->destroy_authorizer) |
608 | ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); | 608 | ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer); |
609 | kfree(osd); | 609 | kfree(osd); |
610 | } | 610 | } |
611 | } | 611 | } |
612 | 612 | ||
/*
 * remove an osd from our map
 *
 * Caller holds osdc->request_mutex.  The osd must have no pending
 * requests (BUG otherwise); it is unlinked from the rbtree and the
 * lru, its connection is closed, and the map's reference is dropped.
 */
static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
	dout("__remove_osd %p\n", osd);
	BUG_ON(!list_empty(&osd->o_requests));
	rb_erase(&osd->o_node, &osdc->osds);
	list_del_init(&osd->o_osd_lru);
	ceph_con_close(&osd->o_con);
	put_osd(osd);	/* may free osd if this was the last ref */
}
625 | 625 | ||
/*
 * Tear down every osd session, e.g. on client shutdown.
 * Takes request_mutex; repeatedly removes the rbtree's first entry
 * until the tree is empty.
 */
static void remove_all_osds(struct ceph_osd_client *osdc)
{
	dout("%s %p\n", __func__, osdc);
	mutex_lock(&osdc->request_mutex);
	while (!RB_EMPTY_ROOT(&osdc->osds)) {
		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
						struct ceph_osd, o_node);
		__remove_osd(osdc, osd);
	}
	mutex_unlock(&osdc->request_mutex);
}
637 | 637 | ||
/*
 * Put an idle osd on the lru so it can be reaped after osd_idle_ttl
 * seconds.  The osd must not already be on the lru (BUG otherwise).
 * Caller holds osdc->request_mutex.
 */
static void __move_osd_to_lru(struct ceph_osd_client *osdc,
			      struct ceph_osd *osd)
{
	dout("__move_osd_to_lru %p\n", osd);
	BUG_ON(!list_empty(&osd->o_osd_lru));
	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
	osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
}
646 | 646 | ||
/*
 * Take an osd off the idle lru, if it is on it.  Safe to call for an
 * osd that is not currently lru-listed.
 */
static void __remove_osd_from_lru(struct ceph_osd *osd)
{
	dout("__remove_osd_from_lru %p\n", osd);
	if (!list_empty(&osd->o_osd_lru))
		list_del_init(&osd->o_osd_lru);
}
653 | 653 | ||
654 | static void remove_old_osds(struct ceph_osd_client *osdc) | 654 | static void remove_old_osds(struct ceph_osd_client *osdc) |
655 | { | 655 | { |
656 | struct ceph_osd *osd, *nosd; | 656 | struct ceph_osd *osd, *nosd; |
657 | 657 | ||
658 | dout("__remove_old_osds %p\n", osdc); | 658 | dout("__remove_old_osds %p\n", osdc); |
659 | mutex_lock(&osdc->request_mutex); | 659 | mutex_lock(&osdc->request_mutex); |
660 | list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { | 660 | list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { |
661 | if (time_before(jiffies, osd->lru_ttl)) | 661 | if (time_before(jiffies, osd->lru_ttl)) |
662 | break; | 662 | break; |
663 | __remove_osd(osdc, osd); | 663 | __remove_osd(osdc, osd); |
664 | } | 664 | } |
665 | mutex_unlock(&osdc->request_mutex); | 665 | mutex_unlock(&osdc->request_mutex); |
666 | } | 666 | } |
667 | 667 | ||
/*
 * reset osd connect
 *
 * Three outcomes:
 *  -ENODEV: osd had no requests at all and was removed entirely;
 *  -EAGAIN: address unchanged and the connection was never opened,
 *           so let the messenger keep retrying on its own;
 *  0:       connection was closed and reopened to the (possibly new)
 *           address from the current osdmap.
 * Caller holds osdc->request_mutex.
 */
static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
	struct ceph_entity_addr *peer_addr;

	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
	if (list_empty(&osd->o_requests) &&
	    list_empty(&osd->o_linger_requests)) {
		__remove_osd(osdc, osd);

		return -ENODEV;
	}

	peer_addr = &osdc->osdmap->osd_addr[osd->o_osd];
	if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) &&
			!ceph_con_opened(&osd->o_con)) {
		struct ceph_osd_request *req;

		dout(" osd addr hasn't changed and connection never opened,"
		     " letting msgr retry");
		/* touch each r_stamp for handle_timeout()'s benfit */
		list_for_each_entry(req, &osd->o_requests, r_osd_item)
			req->r_stamp = jiffies;

		return -EAGAIN;
	}

	ceph_con_close(&osd->o_con);
	ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr);
	osd->o_incarnation++;

	return 0;
}
703 | 703 | ||
704 | static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) | 704 | static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) |
705 | { | 705 | { |
706 | struct rb_node **p = &osdc->osds.rb_node; | 706 | struct rb_node **p = &osdc->osds.rb_node; |
707 | struct rb_node *parent = NULL; | 707 | struct rb_node *parent = NULL; |
708 | struct ceph_osd *osd = NULL; | 708 | struct ceph_osd *osd = NULL; |
709 | 709 | ||
710 | dout("__insert_osd %p osd%d\n", new, new->o_osd); | 710 | dout("__insert_osd %p osd%d\n", new, new->o_osd); |
711 | while (*p) { | 711 | while (*p) { |
712 | parent = *p; | 712 | parent = *p; |
713 | osd = rb_entry(parent, struct ceph_osd, o_node); | 713 | osd = rb_entry(parent, struct ceph_osd, o_node); |
714 | if (new->o_osd < osd->o_osd) | 714 | if (new->o_osd < osd->o_osd) |
715 | p = &(*p)->rb_left; | 715 | p = &(*p)->rb_left; |
716 | else if (new->o_osd > osd->o_osd) | 716 | else if (new->o_osd > osd->o_osd) |
717 | p = &(*p)->rb_right; | 717 | p = &(*p)->rb_right; |
718 | else | 718 | else |
719 | BUG(); | 719 | BUG(); |
720 | } | 720 | } |
721 | 721 | ||
722 | rb_link_node(&new->o_node, parent, p); | 722 | rb_link_node(&new->o_node, parent, p); |
723 | rb_insert_color(&new->o_node, &osdc->osds); | 723 | rb_insert_color(&new->o_node, &osdc->osds); |
724 | } | 724 | } |
725 | 725 | ||
726 | static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) | 726 | static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) |
727 | { | 727 | { |
728 | struct ceph_osd *osd; | 728 | struct ceph_osd *osd; |
729 | struct rb_node *n = osdc->osds.rb_node; | 729 | struct rb_node *n = osdc->osds.rb_node; |
730 | 730 | ||
731 | while (n) { | 731 | while (n) { |
732 | osd = rb_entry(n, struct ceph_osd, o_node); | 732 | osd = rb_entry(n, struct ceph_osd, o_node); |
733 | if (o < osd->o_osd) | 733 | if (o < osd->o_osd) |
734 | n = n->rb_left; | 734 | n = n->rb_left; |
735 | else if (o > osd->o_osd) | 735 | else if (o > osd->o_osd) |
736 | n = n->rb_right; | 736 | n = n->rb_right; |
737 | else | 737 | else |
738 | return osd; | 738 | return osd; |
739 | } | 739 | } |
740 | return NULL; | 740 | return NULL; |
741 | } | 741 | } |
742 | 742 | ||
/*
 * Arm the periodic request-timeout work, one keepalive interval out.
 */
static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
{
	schedule_delayed_work(&osdc->timeout_work,
			osdc->client->options->osd_keepalive_timeout * HZ);
}
748 | 748 | ||
/*
 * Stop the pending timeout work, if any (non-sync cancel).
 */
static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
{
	cancel_delayed_work(&osdc->timeout_work);
}
753 | 753 | ||
/*
 * Register request, assign tid.  If this is the first request, set up
 * the timeout event.
 *
 * Caller holds osdc->request_mutex.  Takes a reference on @req that
 * is dropped by __unregister_request().
 */
static void __register_request(struct ceph_osd_client *osdc,
			       struct ceph_osd_request *req)
{
	req->r_tid = ++osdc->last_tid;
	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
	dout("__register_request %p tid %lld\n", req, req->r_tid);
	__insert_request(osdc, req);
	ceph_osdc_get_request(req);
	osdc->num_requests++;
	if (osdc->num_requests == 1) {
		dout(" first request, scheduling timeout\n");
		__schedule_osd_timeout(osdc);
	}
}
772 | 772 | ||
/*
 * Locked wrapper around __register_request().
 */
static void register_request(struct ceph_osd_client *osdc,
			     struct ceph_osd_request *req)
{
	mutex_lock(&osdc->request_mutex);
	__register_request(osdc, req);
	mutex_unlock(&osdc->request_mutex);
}
780 | 780 | ||
/*
 * called under osdc->request_mutex
 *
 * Undo __register_request(): drop the request from the tid rbtree,
 * detach it from its osd (idling the osd onto the lru if it has no
 * other work), and release the registration reference.  A request
 * that was never registered (empty r_node) is ignored.
 */
static void __unregister_request(struct ceph_osd_client *osdc,
				 struct ceph_osd_request *req)
{
	if (RB_EMPTY_NODE(&req->r_node)) {
		dout("__unregister_request %p tid %lld not registered\n",
			req, req->r_tid);
		return;
	}

	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
	rb_erase(&req->r_node, &osdc->requests);
	osdc->num_requests--;

	if (req->r_osd) {
		/* make sure the original request isn't in flight. */
		ceph_msg_revoke(req->r_request);

		list_del_init(&req->r_osd_item);
		if (list_empty(&req->r_osd->o_requests) &&
		    list_empty(&req->r_osd->o_linger_requests)) {
			dout("moving osd to %p lru\n", req->r_osd);
			__move_osd_to_lru(osdc, req->r_osd);
		}
		/* keep r_osd while the request still lingers, so the
		 * linger path knows which osd it was bound to */
		if (list_empty(&req->r_linger_item))
			req->r_osd = NULL;
	}

	list_del_init(&req->r_req_lru_item);
	ceph_osdc_put_request(req);

	if (osdc->num_requests == 0) {
		dout(" no requests, canceling timeout\n");
		__cancel_osd_timeout(osdc);
	}
}
819 | 819 | ||
820 | /* | 820 | /* |
821 | * Cancel a previously queued request message | 821 | * Cancel a previously queued request message |
822 | */ | 822 | */ |
823 | static void __cancel_request(struct ceph_osd_request *req) | 823 | static void __cancel_request(struct ceph_osd_request *req) |
824 | { | 824 | { |
825 | if (req->r_sent && req->r_osd) { | 825 | if (req->r_sent && req->r_osd) { |
826 | ceph_msg_revoke(req->r_request); | 826 | ceph_msg_revoke(req->r_request); |
827 | req->r_sent = 0; | 827 | req->r_sent = 0; |
828 | } | 828 | } |
829 | } | 829 | } |
830 | 830 | ||
831 | static void __register_linger_request(struct ceph_osd_client *osdc, | 831 | static void __register_linger_request(struct ceph_osd_client *osdc, |
832 | struct ceph_osd_request *req) | 832 | struct ceph_osd_request *req) |
833 | { | 833 | { |
834 | dout("__register_linger_request %p\n", req); | 834 | dout("__register_linger_request %p\n", req); |
835 | list_add_tail(&req->r_linger_item, &osdc->req_linger); | 835 | list_add_tail(&req->r_linger_item, &osdc->req_linger); |
836 | if (req->r_osd) | 836 | if (req->r_osd) |
837 | list_add_tail(&req->r_linger_osd, | 837 | list_add_tail(&req->r_linger_osd, |
838 | &req->r_osd->o_linger_requests); | 838 | &req->r_osd->o_linger_requests); |
839 | } | 839 | } |
840 | 840 | ||
/*
 * Remove @req from the linger lists.  If its OSD ends up with no work
 * (neither regular nor linger requests), move that OSD to the idle LRU
 * so it can be torn down later.  Mirrors __unregister_request.
 * Caller must hold request_mutex.
 */
static void __unregister_linger_request(struct ceph_osd_client *osdc,
					struct ceph_osd_request *req)
{
	dout("__unregister_linger_request %p\n", req);
	list_del_init(&req->r_linger_item);
	if (req->r_osd) {
		list_del_init(&req->r_linger_osd);

		/* last piece of work on this osd?  park it on the lru */
		if (list_empty(&req->r_osd->o_requests) &&
		    list_empty(&req->r_osd->o_linger_requests)) {
			dout("moving osd to %p lru\n", req->r_osd);
			__move_osd_to_lru(osdc, req->r_osd);
		}
		/*
		 * Only drop r_osd if the request is also off the OSD's
		 * regular request list; otherwise __unregister_request
		 * still needs it.
		 */
		if (list_empty(&req->r_osd_item))
			req->r_osd = NULL;
	}
}
858 | 858 | ||
/*
 * Stop lingering a request: unlink it from the linger lists and drop
 * the reference taken by ceph_osdc_set_request_linger().  Safe to call
 * on a request that was never set to linger (no-op in that case).
 */
void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
					 struct ceph_osd_request *req)
{
	mutex_lock(&osdc->request_mutex);
	if (req->r_linger) {
		__unregister_linger_request(osdc, req);
		ceph_osdc_put_request(req);	/* ref from set_request_linger */
	}
	mutex_unlock(&osdc->request_mutex);
}
EXPORT_SYMBOL(ceph_osdc_unregister_linger_request);
870 | 870 | ||
871 | void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, | 871 | void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, |
872 | struct ceph_osd_request *req) | 872 | struct ceph_osd_request *req) |
873 | { | 873 | { |
874 | if (!req->r_linger) { | 874 | if (!req->r_linger) { |
875 | dout("set_request_linger %p\n", req); | 875 | dout("set_request_linger %p\n", req); |
876 | req->r_linger = 1; | 876 | req->r_linger = 1; |
877 | /* | 877 | /* |
878 | * caller is now responsible for calling | 878 | * caller is now responsible for calling |
879 | * unregister_linger_request | 879 | * unregister_linger_request |
880 | */ | 880 | */ |
881 | ceph_osdc_get_request(req); | 881 | ceph_osdc_get_request(req); |
882 | } | 882 | } |
883 | } | 883 | } |
884 | EXPORT_SYMBOL(ceph_osdc_set_request_linger); | 884 | EXPORT_SYMBOL(ceph_osdc_set_request_linger); |
885 | 885 | ||
886 | /* | 886 | /* |
887 | * Pick an osd (the first 'up' osd in the pg), allocate the osd struct | 887 | * Pick an osd (the first 'up' osd in the pg), allocate the osd struct |
888 | * (as needed), and set the request r_osd appropriately. If there is | 888 | * (as needed), and set the request r_osd appropriately. If there is |
889 | * no up osd, set r_osd to NULL. Move the request to the appropriate list | 889 | * no up osd, set r_osd to NULL. Move the request to the appropriate list |
890 | * (unsent, homeless) or leave on in-flight lru. | 890 | * (unsent, homeless) or leave on in-flight lru. |
891 | * | 891 | * |
892 | * Return 0 if unchanged, 1 if changed, or negative on error. | 892 | * Return 0 if unchanged, 1 if changed, or negative on error. |
893 | * | 893 | * |
894 | * Caller should hold map_sem for read and request_mutex. | 894 | * Caller should hold map_sem for read and request_mutex. |
895 | */ | 895 | */ |
static int __map_request(struct ceph_osd_client *osdc,
			 struct ceph_osd_request *req, int force_resend)
{
	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
	struct ceph_pg pgid;
	int acting[CEPH_PG_MAX_SIZE];
	int o = -1, num = 0;	/* primary osd id and acting-set size */
	int err;

	dout("map_request %p tid %lld\n", req, req->r_tid);
	/* object name + layout -> placement group */
	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
				      &req->r_file_layout, osdc->osdmap);
	if (err) {
		list_move(&req->r_req_lru_item, &osdc->req_notarget);
		return err;
	}
	pgid = reqhead->layout.ol_pgid;
	req->r_pgid = pgid;

	/* pg -> acting osd set; on success err is the set size */
	err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
	if (err > 0) {
		o = acting[0];
		num = err;
	}

	/*
	 * Nothing to do if the mapping is unchanged (same primary, same
	 * acting set, already sent to the current osd incarnation) or if
	 * the request was and remains homeless.
	 */
	if ((!force_resend &&
	     req->r_osd && req->r_osd->o_osd == o &&
	     req->r_sent >= req->r_osd->o_incarnation &&
	     req->r_num_pg_osds == num &&
	     memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
	    (req->r_osd == NULL && o == -1))
		return 0;  /* no change */

	dout("map_request tid %llu pgid %d.%x osd%d (was osd%d)\n",
	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
	     req->r_osd ? req->r_osd->o_osd : -1);

	/* record full pg acting set */
	memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
	req->r_num_pg_osds = num;

	/* detach from the old osd, revoking any in-flight message */
	if (req->r_osd) {
		__cancel_request(req);
		list_del_init(&req->r_osd_item);
		req->r_osd = NULL;
	}

	/* find or create the struct for the new primary osd */
	req->r_osd = __lookup_osd(osdc, o);
	if (!req->r_osd && o >= 0) {
		err = -ENOMEM;
		req->r_osd = create_osd(osdc, o);
		if (!req->r_osd) {
			list_move(&req->r_req_lru_item, &osdc->req_notarget);
			goto out;
		}

		dout("map_request osd %p is osd%d\n", req->r_osd, o);
		__insert_osd(osdc, req->r_osd);

		ceph_con_open(&req->r_osd->o_con,
			      CEPH_ENTITY_TYPE_OSD, o,
			      &osdc->osdmap->osd_addr[o]);
	}

	/* queue for (re)send, or park on the no-target list */
	if (req->r_osd) {
		__remove_osd_from_lru(req->r_osd);
		list_add(&req->r_osd_item, &req->r_osd->o_requests);
		list_move(&req->r_req_lru_item, &osdc->req_unsent);
	} else {
		list_move(&req->r_req_lru_item, &osdc->req_notarget);
	}
	err = 1;  /* osd or pg changed */

out:
	return err;
}
972 | 972 | ||
973 | /* | 973 | /* |
974 | * caller should hold map_sem (for read) and request_mutex | 974 | * caller should hold map_sem (for read) and request_mutex |
975 | */ | 975 | */ |
static void __send_request(struct ceph_osd_client *osdc,
			   struct ceph_osd_request *req)
{
	struct ceph_osd_request_head *reqhead;

	dout("send_request %p tid %llu to osd%d flags %d\n",
	     req, req->r_tid, req->r_osd->o_osd, req->r_flags);

	/* refresh the header with the current map epoch and flags */
	reqhead = req->r_request->front.iov_base;
	reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
	reqhead->flags |= cpu_to_le32(req->r_flags);  /* e.g., RETRY */
	reqhead->reassert_version = req->r_reassert_version;

	/* restart the keepalive clock and move to the in-flight lru tail */
	req->r_stamp = jiffies;
	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);

	ceph_msg_get(req->r_request); /* send consumes a ref */
	ceph_con_send(&req->r_osd->o_con, req->r_request);
	/* remember which incarnation of the osd we sent to */
	req->r_sent = req->r_osd->o_incarnation;
}
996 | 996 | ||
997 | /* | 997 | /* |
998 | * Send any requests in the queue (req_unsent). | 998 | * Send any requests in the queue (req_unsent). |
999 | */ | 999 | */ |
static void send_queued(struct ceph_osd_client *osdc)
{
	struct ceph_osd_request *req, *tmp;

	dout("send_queued\n");
	mutex_lock(&osdc->request_mutex);
	/*
	 * _safe variant: __send_request moves each entry off req_unsent
	 * (onto req_lru) while we walk the list.
	 */
	list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) {
		__send_request(osdc, req);
	}
	mutex_unlock(&osdc->request_mutex);
}
1011 | 1011 | ||
1012 | /* | 1012 | /* |
1013 | * Timeout callback, called every N seconds when 1 or more osd | 1013 | * Timeout callback, called every N seconds when 1 or more osd |
1014 | * requests has been active for more than N seconds. When this | 1014 | * requests has been active for more than N seconds. When this |
1015 | * happens, we ping all OSDs with requests who have timed out to | 1015 | * happens, we ping all OSDs with requests who have timed out to |
1016 | * ensure any communications channel reset is detected. Reset the | 1016 | * ensure any communications channel reset is detected. Reset the |
1017 | * request timeouts another N seconds in the future as we go. | 1017 | * request timeouts another N seconds in the future as we go. |
1018 | * Reschedule the timeout event another N seconds in future (unless | 1018 | * Reschedule the timeout event another N seconds in future (unless |
1019 | * there are no open requests). | 1019 | * there are no open requests). |
1020 | */ | 1020 | */ |
static void handle_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client, timeout_work.work);
	struct ceph_osd_request *req;
	struct ceph_osd *osd;
	unsigned long keepalive =
		osdc->client->options->osd_keepalive_timeout * HZ;
	struct list_head slow_osds;
	dout("timeout\n");
	down_read(&osdc->map_sem);

	/* a newer map may explain why requests are stuck */
	ceph_monc_request_next_osdmap(&osdc->client->monc);

	mutex_lock(&osdc->request_mutex);

	/*
	 * ping osds that are a bit slow. this ensures that if there
	 * is a break in the TCP connection we will notice, and reopen
	 * a connection with that osd (from the fault callback).
	 */
	INIT_LIST_HEAD(&slow_osds);
	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
		/* req_lru is ordered by r_stamp; stop at the first young one */
		if (time_before(jiffies, req->r_stamp + keepalive))
			break;

		osd = req->r_osd;
		BUG_ON(!osd);
		dout(" tid %llu is slow, will send keepalive on osd%d\n",
		     req->r_tid, osd->o_osd);
		/* collect each slow osd once; duplicates just re-move it */
		list_move_tail(&osd->o_keepalive_item, &slow_osds);
	}
	while (!list_empty(&slow_osds)) {
		osd = list_entry(slow_osds.next, struct ceph_osd,
				 o_keepalive_item);
		list_del_init(&osd->o_keepalive_item);
		ceph_con_keepalive(&osd->o_con);
	}

	/* re-arm the timer before dropping the mutex */
	__schedule_osd_timeout(osdc);
	mutex_unlock(&osdc->request_mutex);
	send_queued(osdc);
	up_read(&osdc->map_sem);
}
1065 | 1065 | ||
/*
 * Periodic work: drop osd structs that have been idle on the LRU for
 * too long, then reschedule ourselves at a fraction of the idle TTL.
 */
static void handle_osds_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client,
			     osds_timeout_work.work);
	/* poll at a quarter of the configured idle ttl */
	unsigned long delay =
		osdc->client->options->osd_idle_ttl * HZ >> 2;

	dout("osds timeout\n");
	down_read(&osdc->map_sem);
	remove_old_osds(osdc);
	up_read(&osdc->map_sem);

	schedule_delayed_work(&osdc->osds_timeout_work,
			      round_jiffies_relative(delay));
}
1082 | 1082 | ||
/*
 * Run the "safe" (committed) completion for a request: invoke the
 * optional r_safe_callback and wake anyone blocked on
 * r_safe_completion.
 */
static void complete_request(struct ceph_osd_request *req)
{
	if (req->r_safe_callback)
		req->r_safe_callback(req, NULL);
	complete_all(&req->r_safe_completion);  /* fsync waiter */
}
1089 | 1089 | ||
1090 | /* | 1090 | /* |
1091 | * handle osd op reply. either call the callback if it is specified, | 1091 | * handle osd op reply. either call the callback if it is specified, |
1092 | * or do the completion to wake up the waiting thread. | 1092 | * or do the completion to wake up the waiting thread. |
1093 | */ | 1093 | */ |
static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
			 struct ceph_connection *con)
{
	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
	struct ceph_osd_request *req;
	u64 tid;
	int numops, object_len, flags;
	s32 result;

	/* validate the reply header before trusting any of its fields */
	tid = le64_to_cpu(msg->hdr.tid);
	if (msg->front.iov_len < sizeof(*rhead))
		goto bad;
	numops = le32_to_cpu(rhead->num_ops);
	object_len = le32_to_cpu(rhead->object_len);
	result = le32_to_cpu(rhead->result);
	if (msg->front.iov_len != sizeof(*rhead) + object_len +
	    numops * sizeof(struct ceph_osd_op))
		goto bad;
	dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
	/* lookup */
	mutex_lock(&osdc->request_mutex);
	req = __lookup_request(osdc, tid);
	if (req == NULL) {
		/* request was already completed/canceled; nothing to do */
		dout("handle_reply tid %llu dne\n", tid);
		mutex_unlock(&osdc->request_mutex);
		return;
	}
	/* hold a ref across the unlocked callback section below */
	ceph_osdc_get_request(req);
	flags = le32_to_cpu(rhead->flags);

	/*
	 * if this connection filled our message, drop our reference now, to
	 * avoid a (safe but slower) revoke later.
	 */
	if (req->r_con_filling_msg == con && req->r_reply == msg) {
		dout(" dropping con_filling_msg ref %p\n", con);
		req->r_con_filling_msg = NULL;
		con->ops->put(con);
	}

	if (!req->r_got_reply) {
		/* first reply for this request: record its result */
		unsigned int bytes;

		req->r_result = le32_to_cpu(rhead->result);
		bytes = le32_to_cpu(msg->hdr.data_len);
		dout("handle_reply result %d bytes %d\n", req->r_result,
		     bytes);
		/* success is reported as the number of bytes transferred */
		if (req->r_result == 0)
			req->r_result = bytes;

		/* in case this is a write and we need to replay, */
		req->r_reassert_version = rhead->reassert_version;

		req->r_got_reply = 1;
	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
		/* second ack without ONDISK: nothing new, ignore it */
		dout("handle_reply tid %llu dup ack\n", tid);
		mutex_unlock(&osdc->request_mutex);
		goto done;
	}

	dout("handle_reply tid %llu flags %d\n", tid, flags);

	if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK))
		__register_linger_request(osdc, req);

	/* either this is a read, or we got the safe response */
	if (result < 0 ||
	    (flags & CEPH_OSD_FLAG_ONDISK) ||
	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
		__unregister_request(osdc, req);

	mutex_unlock(&osdc->request_mutex);

	/* completion callbacks run without request_mutex held */
	if (req->r_callback)
		req->r_callback(req, msg);
	else
		complete_all(&req->r_completion);

	if (flags & CEPH_OSD_FLAG_ONDISK)
		complete_request(req);

done:
	dout("req=%p req->r_linger=%d\n", req, req->r_linger);
	ceph_osdc_put_request(req);
	return;

bad:
	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
	       (int)sizeof(*rhead));
	ceph_msg_dump(msg);
}
1186 | 1186 | ||
1187 | static void reset_changed_osds(struct ceph_osd_client *osdc) | 1187 | static void reset_changed_osds(struct ceph_osd_client *osdc) |
1188 | { | 1188 | { |
1189 | struct rb_node *p, *n; | 1189 | struct rb_node *p, *n; |
1190 | 1190 | ||
1191 | for (p = rb_first(&osdc->osds); p; p = n) { | 1191 | for (p = rb_first(&osdc->osds); p; p = n) { |
1192 | struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); | 1192 | struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); |
1193 | 1193 | ||
1194 | n = rb_next(p); | 1194 | n = rb_next(p); |
1195 | if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || | 1195 | if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || |
1196 | memcmp(&osd->o_con.peer_addr, | 1196 | memcmp(&osd->o_con.peer_addr, |
1197 | ceph_osd_addr(osdc->osdmap, | 1197 | ceph_osd_addr(osdc->osdmap, |
1198 | osd->o_osd), | 1198 | osd->o_osd), |
1199 | sizeof(struct ceph_entity_addr)) != 0) | 1199 | sizeof(struct ceph_entity_addr)) != 0) |
1200 | __reset_osd(osdc, osd); | 1200 | __reset_osd(osdc, osd); |
1201 | } | 1201 | } |
1202 | } | 1202 | } |
1203 | 1203 | ||
1204 | /* | 1204 | /* |
1205 | * Requeue requests whose mapping to an OSD has changed. If requests map to | 1205 | * Requeue requests whose mapping to an OSD has changed. If requests map to |
1206 | * no osd, request a new map. | 1206 | * no osd, request a new map. |
1207 | * | 1207 | * |
1208 | * Caller should hold map_sem for read. | 1208 | * Caller should hold map_sem for read. |
1209 | */ | 1209 | */ |
static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
{
	struct ceph_osd_request *req, *nreq;
	struct rb_node *p;
	int needmap = 0;	/* count of requests with no usable osd */
	int err;

	dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
	mutex_lock(&osdc->request_mutex);
	/* pass 1: remap every registered (in-flight) request */
	for (p = rb_first(&osdc->requests); p; ) {
		req = rb_entry(p, struct ceph_osd_request, r_node);
		p = rb_next(p);

		/*
		 * For linger requests that have not yet been
		 * registered, move them to the linger list; they'll
		 * be sent to the osd in the loop below. Unregister
		 * the request before re-registering it as a linger
		 * request to ensure the __map_request() below
		 * will decide it needs to be sent.
		 */
		if (req->r_linger && list_empty(&req->r_linger_item)) {
			dout("%p tid %llu restart on osd%d\n",
			     req, req->r_tid,
			     req->r_osd ? req->r_osd->o_osd : -1);
			__unregister_request(osdc, req);
			__register_linger_request(osdc, req);
			continue;
		}

		err = __map_request(osdc, req, force_resend);
		if (err < 0)
			continue;  /* error */
		if (req->r_osd == NULL) {
			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
			needmap++;  /* request a newer map */
		} else if (err > 0) {
			/* mapping changed: flag as a retry for the osd */
			if (!req->r_linger) {
				dout("%p tid %llu requeued on osd%d\n", req,
				     req->r_tid,
				     req->r_osd ? req->r_osd->o_osd : -1);
				req->r_flags |= CEPH_OSD_FLAG_RETRY;
			}
		}
	}

	/* pass 2: remap linger requests, resubmitting any that moved */
	list_for_each_entry_safe(req, nreq, &osdc->req_linger,
				 r_linger_item) {
		dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);

		err = __map_request(osdc, req, force_resend);
		dout("__map_request returned %d\n", err);
		if (err == 0)
			continue;  /* no change and no osd was specified */
		if (err < 0)
			continue;  /* hrm! */
		if (req->r_osd == NULL) {
			dout("tid %llu maps to no valid osd\n", req->r_tid);
			needmap++;  /* request a newer map */
			continue;
		}

		/* moved to a new osd: requeue as a regular request */
		dout("kicking lingering %p tid %llu osd%d\n", req, req->r_tid,
		     req->r_osd ? req->r_osd->o_osd : -1);
		__register_request(osdc, req);
		__unregister_linger_request(osdc, req);
	}
	mutex_unlock(&osdc->request_mutex);

	if (needmap) {
		dout("%d requests for down osds, need new map\n", needmap);
		ceph_monc_request_next_osdmap(&osdc->client->monc);
	}
	reset_changed_osds(osdc);
}
1285 | 1285 | ||
1286 | 1286 | ||
/*
 * Process updated osd map.
 *
 * The message contains any number of incremental and full maps, normally
 * indicating some sort of topology change in the cluster.  Kick requests
 * off to different OSDs as needed.
 */
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
	void *p, *end, *next;
	u32 nr_maps, maplen;
	u32 epoch;
	struct ceph_osdmap *newmap = NULL, *oldmap;
	int err;
	struct ceph_fsid fsid;

	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	/* verify fsid: drop messages that are not for our cluster */
	ceph_decode_need(&p, end, sizeof(fsid), bad);
	ceph_decode_copy(&p, &fsid, sizeof(fsid));
	if (ceph_check_fsid(osdc->client, &fsid) < 0)
		return;

	/* exclusive while we swap in new maps; downgraded at done: */
	down_write(&osdc->map_sem);

	/* incremental maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d inc maps\n", nr_maps);
	while (nr_maps > 0) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		next = p + maplen;
		/* only apply an incremental that follows our current epoch */
		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
			dout("applying incremental map %u len %d\n",
			     epoch, maplen);
			newmap = osdmap_apply_incremental(&p, next,
							  osdc->osdmap,
							  &osdc->client->msgr);
			if (IS_ERR(newmap)) {
				err = PTR_ERR(newmap);
				goto bad;
			}
			BUG_ON(!newmap);
			if (newmap != osdc->osdmap) {
				ceph_osdmap_destroy(osdc->osdmap);
				osdc->osdmap = newmap;
			}
			/* no epochs skipped -> no forced resend */
			kick_requests(osdc, 0);
		} else {
			dout("ignoring incremental map %u len %d\n",
			     epoch, maplen);
		}
		p = next;
		nr_maps--;
	}
	/* an incremental applied; ignore any full maps that follow */
	if (newmap)
		goto done;

	/* full maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d full maps\n", nr_maps);
	while (nr_maps) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		if (nr_maps > 1) {
			/* decode but discard everything except the last map */
			dout("skipping non-latest full map %u len %d\n",
			     epoch, maplen);
		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
			dout("skipping full map %u len %d, "
			     "older than our %u\n", epoch, maplen,
			     osdc->osdmap->epoch);
		} else {
			int skipped_map = 0;

			dout("taking full map %u len %d\n", epoch, maplen);
			newmap = osdmap_decode(&p, p+maplen);
			if (IS_ERR(newmap)) {
				err = PTR_ERR(newmap);
				goto bad;
			}
			BUG_ON(!newmap);
			oldmap = osdc->osdmap;
			osdc->osdmap = newmap;
			if (oldmap) {
				/*
				 * If we jumped over one or more epochs, the
				 * pg mappings for the missed epochs are
				 * unknown, so force a resend of everything.
				 */
				if (oldmap->epoch + 1 < newmap->epoch)
					skipped_map = 1;
				ceph_osdmap_destroy(oldmap);
			}
			kick_requests(osdc, skipped_map);
		}
		p += maplen;
		nr_maps--;
	}

done:
	/* keep the map locked for reading while we ack and resend */
	downgrade_write(&osdc->map_sem);
	/*
	 * NOTE(review): this dereferences osdc->osdmap unconditionally; a
	 * map message carrying no usable maps while osdc->osdmap is still
	 * NULL would oops here -- confirm that the monitor never sends one.
	 */
	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);

	/*
	 * subscribe to subsequent osdmap updates if full to ensure
	 * we find out when we are no longer full and stop returning
	 * ENOSPC.
	 */
	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
		ceph_monc_request_next_osdmap(&osdc->client->monc);

	send_queued(osdc);
	up_read(&osdc->map_sem);
	wake_up_all(&osdc->client->auth_wq);
	return;

bad:
	pr_err("osdc handle_map corrupt msg\n");
	ceph_msg_dump(msg);
	up_write(&osdc->map_sem);
	return;
}
1411 | 1411 | ||
1412 | /* | 1412 | /* |
1413 | * watch/notify callback event infrastructure | 1413 | * watch/notify callback event infrastructure |
1414 | * | 1414 | * |
1415 | * These callbacks are used both for watch and notify operations. | 1415 | * These callbacks are used both for watch and notify operations. |
1416 | */ | 1416 | */ |
1417 | static void __release_event(struct kref *kref) | 1417 | static void __release_event(struct kref *kref) |
1418 | { | 1418 | { |
1419 | struct ceph_osd_event *event = | 1419 | struct ceph_osd_event *event = |
1420 | container_of(kref, struct ceph_osd_event, kref); | 1420 | container_of(kref, struct ceph_osd_event, kref); |
1421 | 1421 | ||
1422 | dout("__release_event %p\n", event); | 1422 | dout("__release_event %p\n", event); |
1423 | kfree(event); | 1423 | kfree(event); |
1424 | } | 1424 | } |
1425 | 1425 | ||
/* Take an additional reference on an event. */
static void get_event(struct ceph_osd_event *event)
{
	kref_get(&event->kref);
}
1430 | 1430 | ||
/* Drop a reference; the event is freed when the last reference goes away. */
void ceph_osdc_put_event(struct ceph_osd_event *event)
{
	kref_put(&event->kref, __release_event);
}
EXPORT_SYMBOL(ceph_osdc_put_event);
1436 | 1436 | ||
1437 | static void __insert_event(struct ceph_osd_client *osdc, | 1437 | static void __insert_event(struct ceph_osd_client *osdc, |
1438 | struct ceph_osd_event *new) | 1438 | struct ceph_osd_event *new) |
1439 | { | 1439 | { |
1440 | struct rb_node **p = &osdc->event_tree.rb_node; | 1440 | struct rb_node **p = &osdc->event_tree.rb_node; |
1441 | struct rb_node *parent = NULL; | 1441 | struct rb_node *parent = NULL; |
1442 | struct ceph_osd_event *event = NULL; | 1442 | struct ceph_osd_event *event = NULL; |
1443 | 1443 | ||
1444 | while (*p) { | 1444 | while (*p) { |
1445 | parent = *p; | 1445 | parent = *p; |
1446 | event = rb_entry(parent, struct ceph_osd_event, node); | 1446 | event = rb_entry(parent, struct ceph_osd_event, node); |
1447 | if (new->cookie < event->cookie) | 1447 | if (new->cookie < event->cookie) |
1448 | p = &(*p)->rb_left; | 1448 | p = &(*p)->rb_left; |
1449 | else if (new->cookie > event->cookie) | 1449 | else if (new->cookie > event->cookie) |
1450 | p = &(*p)->rb_right; | 1450 | p = &(*p)->rb_right; |
1451 | else | 1451 | else |
1452 | BUG(); | 1452 | BUG(); |
1453 | } | 1453 | } |
1454 | 1454 | ||
1455 | rb_link_node(&new->node, parent, p); | 1455 | rb_link_node(&new->node, parent, p); |
1456 | rb_insert_color(&new->node, &osdc->event_tree); | 1456 | rb_insert_color(&new->node, &osdc->event_tree); |
1457 | } | 1457 | } |
1458 | 1458 | ||
1459 | static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, | 1459 | static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, |
1460 | u64 cookie) | 1460 | u64 cookie) |
1461 | { | 1461 | { |
1462 | struct rb_node **p = &osdc->event_tree.rb_node; | 1462 | struct rb_node **p = &osdc->event_tree.rb_node; |
1463 | struct rb_node *parent = NULL; | 1463 | struct rb_node *parent = NULL; |
1464 | struct ceph_osd_event *event = NULL; | 1464 | struct ceph_osd_event *event = NULL; |
1465 | 1465 | ||
1466 | while (*p) { | 1466 | while (*p) { |
1467 | parent = *p; | 1467 | parent = *p; |
1468 | event = rb_entry(parent, struct ceph_osd_event, node); | 1468 | event = rb_entry(parent, struct ceph_osd_event, node); |
1469 | if (cookie < event->cookie) | 1469 | if (cookie < event->cookie) |
1470 | p = &(*p)->rb_left; | 1470 | p = &(*p)->rb_left; |
1471 | else if (cookie > event->cookie) | 1471 | else if (cookie > event->cookie) |
1472 | p = &(*p)->rb_right; | 1472 | p = &(*p)->rb_right; |
1473 | else | 1473 | else |
1474 | return event; | 1474 | return event; |
1475 | } | 1475 | } |
1476 | return NULL; | 1476 | return NULL; |
1477 | } | 1477 | } |
1478 | 1478 | ||
1479 | static void __remove_event(struct ceph_osd_event *event) | 1479 | static void __remove_event(struct ceph_osd_event *event) |
1480 | { | 1480 | { |
1481 | struct ceph_osd_client *osdc = event->osdc; | 1481 | struct ceph_osd_client *osdc = event->osdc; |
1482 | 1482 | ||
1483 | if (!RB_EMPTY_NODE(&event->node)) { | 1483 | if (!RB_EMPTY_NODE(&event->node)) { |
1484 | dout("__remove_event removed %p\n", event); | 1484 | dout("__remove_event removed %p\n", event); |
1485 | rb_erase(&event->node, &osdc->event_tree); | 1485 | rb_erase(&event->node, &osdc->event_tree); |
1486 | ceph_osdc_put_event(event); | 1486 | ceph_osdc_put_event(event); |
1487 | } else { | 1487 | } else { |
1488 | dout("__remove_event didn't remove %p\n", event); | 1488 | dout("__remove_event didn't remove %p\n", event); |
1489 | } | 1489 | } |
1490 | } | 1490 | } |
1491 | 1491 | ||
1492 | int ceph_osdc_create_event(struct ceph_osd_client *osdc, | 1492 | int ceph_osdc_create_event(struct ceph_osd_client *osdc, |
1493 | void (*event_cb)(u64, u64, u8, void *), | 1493 | void (*event_cb)(u64, u64, u8, void *), |
1494 | int one_shot, void *data, | 1494 | int one_shot, void *data, |
1495 | struct ceph_osd_event **pevent) | 1495 | struct ceph_osd_event **pevent) |
1496 | { | 1496 | { |
1497 | struct ceph_osd_event *event; | 1497 | struct ceph_osd_event *event; |
1498 | 1498 | ||
1499 | event = kmalloc(sizeof(*event), GFP_NOIO); | 1499 | event = kmalloc(sizeof(*event), GFP_NOIO); |
1500 | if (!event) | 1500 | if (!event) |
1501 | return -ENOMEM; | 1501 | return -ENOMEM; |
1502 | 1502 | ||
1503 | dout("create_event %p\n", event); | 1503 | dout("create_event %p\n", event); |
1504 | event->cb = event_cb; | 1504 | event->cb = event_cb; |
1505 | event->one_shot = one_shot; | 1505 | event->one_shot = one_shot; |
1506 | event->data = data; | 1506 | event->data = data; |
1507 | event->osdc = osdc; | 1507 | event->osdc = osdc; |
1508 | INIT_LIST_HEAD(&event->osd_node); | 1508 | INIT_LIST_HEAD(&event->osd_node); |
1509 | RB_CLEAR_NODE(&event->node); | 1509 | RB_CLEAR_NODE(&event->node); |
1510 | kref_init(&event->kref); /* one ref for us */ | 1510 | kref_init(&event->kref); /* one ref for us */ |
1511 | kref_get(&event->kref); /* one ref for the caller */ | 1511 | kref_get(&event->kref); /* one ref for the caller */ |
1512 | init_completion(&event->completion); | 1512 | init_completion(&event->completion); |
1513 | 1513 | ||
1514 | spin_lock(&osdc->event_lock); | 1514 | spin_lock(&osdc->event_lock); |
1515 | event->cookie = ++osdc->event_count; | 1515 | event->cookie = ++osdc->event_count; |
1516 | __insert_event(osdc, event); | 1516 | __insert_event(osdc, event); |
1517 | spin_unlock(&osdc->event_lock); | 1517 | spin_unlock(&osdc->event_lock); |
1518 | 1518 | ||
1519 | *pevent = event; | 1519 | *pevent = event; |
1520 | return 0; | 1520 | return 0; |
1521 | } | 1521 | } |
1522 | EXPORT_SYMBOL(ceph_osdc_create_event); | 1522 | EXPORT_SYMBOL(ceph_osdc_create_event); |
1523 | 1523 | ||
1524 | void ceph_osdc_cancel_event(struct ceph_osd_event *event) | 1524 | void ceph_osdc_cancel_event(struct ceph_osd_event *event) |
1525 | { | 1525 | { |
1526 | struct ceph_osd_client *osdc = event->osdc; | 1526 | struct ceph_osd_client *osdc = event->osdc; |
1527 | 1527 | ||
1528 | dout("cancel_event %p\n", event); | 1528 | dout("cancel_event %p\n", event); |
1529 | spin_lock(&osdc->event_lock); | 1529 | spin_lock(&osdc->event_lock); |
1530 | __remove_event(event); | 1530 | __remove_event(event); |
1531 | spin_unlock(&osdc->event_lock); | 1531 | spin_unlock(&osdc->event_lock); |
1532 | ceph_osdc_put_event(event); /* caller's */ | 1532 | ceph_osdc_put_event(event); /* caller's */ |
1533 | } | 1533 | } |
1534 | EXPORT_SYMBOL(ceph_osdc_cancel_event); | 1534 | EXPORT_SYMBOL(ceph_osdc_cancel_event); |
1535 | 1535 | ||
1536 | 1536 | ||
1537 | static void do_event_work(struct work_struct *work) | 1537 | static void do_event_work(struct work_struct *work) |
1538 | { | 1538 | { |
1539 | struct ceph_osd_event_work *event_work = | 1539 | struct ceph_osd_event_work *event_work = |
1540 | container_of(work, struct ceph_osd_event_work, work); | 1540 | container_of(work, struct ceph_osd_event_work, work); |
1541 | struct ceph_osd_event *event = event_work->event; | 1541 | struct ceph_osd_event *event = event_work->event; |
1542 | u64 ver = event_work->ver; | 1542 | u64 ver = event_work->ver; |
1543 | u64 notify_id = event_work->notify_id; | 1543 | u64 notify_id = event_work->notify_id; |
1544 | u8 opcode = event_work->opcode; | 1544 | u8 opcode = event_work->opcode; |
1545 | 1545 | ||
1546 | dout("do_event_work completing %p\n", event); | 1546 | dout("do_event_work completing %p\n", event); |
1547 | event->cb(ver, notify_id, opcode, event->data); | 1547 | event->cb(ver, notify_id, opcode, event->data); |
1548 | complete(&event->completion); | 1548 | complete(&event->completion); |
1549 | dout("do_event_work completed %p\n", event); | 1549 | dout("do_event_work completed %p\n", event); |
1550 | ceph_osdc_put_event(event); | 1550 | ceph_osdc_put_event(event); |
1551 | kfree(event_work); | 1551 | kfree(event_work); |
1552 | } | 1552 | } |
1553 | 1553 | ||
1554 | 1554 | ||
/*
 * Process osd watch notifications
 */
void handle_watch_notify(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
	void *p, *end;
	u8 proto_ver;
	u64 cookie, ver, notify_id;
	u8 opcode;
	struct ceph_osd_event *event;
	struct ceph_osd_event_work *event_work;

	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	/* decode the notification header; bail on a truncated message */
	ceph_decode_8_safe(&p, end, proto_ver, bad);
	ceph_decode_8_safe(&p, end, opcode, bad);
	ceph_decode_64_safe(&p, end, cookie, bad);
	ceph_decode_64_safe(&p, end, ver, bad);
	ceph_decode_64_safe(&p, end, notify_id, bad);

	/*
	 * Look up the registered event for this cookie and take a
	 * reference for the work item; a one-shot event is removed
	 * from the tree now so it cannot fire again.
	 */
	spin_lock(&osdc->event_lock);
	event = __find_event(osdc, cookie);
	if (event) {
		get_event(event);
		if (event->one_shot)
			__remove_event(event);
	}
	spin_unlock(&osdc->event_lock);
	dout("handle_watch_notify cookie %lld ver %lld event %p\n",
	     cookie, ver, event);
	if (event) {
		/* hand the callback off to the notify workqueue */
		event_work = kmalloc(sizeof(*event_work), GFP_NOIO);
		if (!event_work) {
			dout("ERROR: could not allocate event_work\n");
			goto done_err;
		}
		INIT_WORK(&event_work->work, do_event_work);
		event_work->event = event;
		event_work->ver = ver;
		event_work->notify_id = notify_id;
		event_work->opcode = opcode;
		if (!queue_work(osdc->notify_wq, &event_work->work)) {
			dout("WARNING: failed to queue notify event work\n");
			goto done_err;
		}
	}

	return;

done_err:
	/*
	 * Only reachable with a valid event (both gotos are inside the
	 * if (event) block above).  Wake any waiter and drop the
	 * reference taken for the work item we failed to queue.
	 * NOTE(review): event_work is not freed on the queue_work()
	 * failure path -- looks like a leak; confirm.
	 */
	complete(&event->completion);
	ceph_osdc_put_event(event);
	return;

bad:
	pr_err("osdc handle_watch_notify corrupt msg\n");
	return;
}
1614 | 1614 | ||
1615 | int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) | 1615 | int ceph_osdc_wait_event(struct ceph_osd_event *event, unsigned long timeout) |
1616 | { | 1616 | { |
1617 | int err; | 1617 | int err; |
1618 | 1618 | ||
1619 | dout("wait_event %p\n", event); | 1619 | dout("wait_event %p\n", event); |
1620 | err = wait_for_completion_interruptible_timeout(&event->completion, | 1620 | err = wait_for_completion_interruptible_timeout(&event->completion, |
1621 | timeout * HZ); | 1621 | timeout * HZ); |
1622 | ceph_osdc_put_event(event); | 1622 | ceph_osdc_put_event(event); |
1623 | if (err > 0) | 1623 | if (err > 0) |
1624 | err = 0; | 1624 | err = 0; |
1625 | dout("wait_event %p returns %d\n", event, err); | 1625 | dout("wait_event %p returns %d\n", event, err); |
1626 | return err; | 1626 | return err; |
1627 | } | 1627 | } |
1628 | EXPORT_SYMBOL(ceph_osdc_wait_event); | 1628 | EXPORT_SYMBOL(ceph_osdc_wait_event); |
1629 | 1629 | ||
/*
 * Register request, send initial attempt.
 *
 * @nofail: if true, a failure to map the request to an osd is not
 *	reported to the caller; the request stays registered and will
 *	be retried when a new osdmap arrives.
 */
int ceph_osdc_start_request(struct ceph_osd_client *osdc,
			    struct ceph_osd_request *req,
			    bool nofail)
{
	int rc = 0;

	/* attach the request's data payload to the outgoing message */
	req->r_request->pages = req->r_pages;
	req->r_request->nr_pages = req->r_num_pages;
#ifdef CONFIG_BLOCK
	req->r_request->bio = req->r_bio;
#endif
	req->r_request->trail = &req->r_trail;

	register_request(osdc, req);

	down_read(&osdc->map_sem);
	mutex_lock(&osdc->request_mutex);
	/*
	 * a racing kick_requests() may have sent the message for us
	 * while we dropped request_mutex above, so only send now if
	 * the request still hasn't been touched yet.
	 */
	if (req->r_sent == 0) {
		rc = __map_request(osdc, req, 0);
		if (rc < 0) {
			if (nofail) {
				/* swallow the error; a later map will retry */
				dout("osdc_start_request failed map, "
				     " will retry %lld\n", req->r_tid);
				rc = 0;
			}
			goto out_unlock;
		}
		if (req->r_osd == NULL) {
			/* no target yet; ask for a newer map and wait */
			dout("send_request %p no up osds in pg\n", req);
			ceph_monc_request_next_osdmap(&osdc->client->monc);
		} else {
			__send_request(osdc, req);
		}
		rc = 0;
	}

out_unlock:
	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);
	return rc;
}
EXPORT_SYMBOL(ceph_osdc_start_request);
1680 | 1680 | ||
/*
 * wait for a request to complete
 *
 * Returns the request's result, or a negative error if the wait was
 * interrupted (in which case the request is canceled and unregistered).
 */
int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
			   struct ceph_osd_request *req)
{
	int rc;

	rc = wait_for_completion_interruptible(&req->r_completion);
	if (rc < 0) {
		/* interrupted: tear the request down before returning */
		mutex_lock(&osdc->request_mutex);
		__cancel_request(req);
		__unregister_request(osdc, req);
		mutex_unlock(&osdc->request_mutex);
		complete_request(req);
		dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
		return rc;
	}

	dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
	return req->r_result;
}
EXPORT_SYMBOL(ceph_osdc_wait_request);
1704 | 1704 | ||
/*
 * sync - wait for all in-flight requests to flush.  avoid starvation.
 *
 * Only waits for write requests registered at the time of the call
 * (tids <= last_tid sampled up front), so new writes submitted while
 * we sleep cannot starve us.
 */
void ceph_osdc_sync(struct ceph_osd_client *osdc)
{
	struct ceph_osd_request *req;
	u64 last_tid, next_tid = 0;

	mutex_lock(&osdc->request_mutex);
	last_tid = osdc->last_tid;
	while (1) {
		/* walk registered requests in tid order */
		req = __lookup_request_ge(osdc, next_tid);
		if (!req)
			break;
		if (req->r_tid > last_tid)
			break;

		next_tid = req->r_tid + 1;
		/* reads don't need to be flushed */
		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
			continue;

		/* hold a ref across the unlocked wait so req can't go away */
		ceph_osdc_get_request(req);
		mutex_unlock(&osdc->request_mutex);
		dout("sync waiting on tid %llu (last is %llu)\n",
		     req->r_tid, last_tid);
		wait_for_completion(&req->r_safe_completion);
		mutex_lock(&osdc->request_mutex);
		ceph_osdc_put_request(req);
	}
	mutex_unlock(&osdc->request_mutex);
	dout("sync done (thru tid %llu)\n", last_tid);
}
EXPORT_SYMBOL(ceph_osdc_sync);
1738 | 1738 | ||
1739 | /* | 1739 | /* |
1740 | * init, shutdown | 1740 | * init, shutdown |
1741 | */ | 1741 | */ |
1742 | int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | 1742 | int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) |
1743 | { | 1743 | { |
1744 | int err; | 1744 | int err; |
1745 | 1745 | ||
1746 | dout("init\n"); | 1746 | dout("init\n"); |
1747 | osdc->client = client; | 1747 | osdc->client = client; |
1748 | osdc->osdmap = NULL; | 1748 | osdc->osdmap = NULL; |
1749 | init_rwsem(&osdc->map_sem); | 1749 | init_rwsem(&osdc->map_sem); |
1750 | init_completion(&osdc->map_waiters); | 1750 | init_completion(&osdc->map_waiters); |
1751 | osdc->last_requested_map = 0; | 1751 | osdc->last_requested_map = 0; |
1752 | mutex_init(&osdc->request_mutex); | 1752 | mutex_init(&osdc->request_mutex); |
1753 | osdc->last_tid = 0; | 1753 | osdc->last_tid = 0; |
1754 | osdc->osds = RB_ROOT; | 1754 | osdc->osds = RB_ROOT; |
1755 | INIT_LIST_HEAD(&osdc->osd_lru); | 1755 | INIT_LIST_HEAD(&osdc->osd_lru); |
1756 | osdc->requests = RB_ROOT; | 1756 | osdc->requests = RB_ROOT; |
1757 | INIT_LIST_HEAD(&osdc->req_lru); | 1757 | INIT_LIST_HEAD(&osdc->req_lru); |
1758 | INIT_LIST_HEAD(&osdc->req_unsent); | 1758 | INIT_LIST_HEAD(&osdc->req_unsent); |
1759 | INIT_LIST_HEAD(&osdc->req_notarget); | 1759 | INIT_LIST_HEAD(&osdc->req_notarget); |
1760 | INIT_LIST_HEAD(&osdc->req_linger); | 1760 | INIT_LIST_HEAD(&osdc->req_linger); |
1761 | osdc->num_requests = 0; | 1761 | osdc->num_requests = 0; |
1762 | INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); | 1762 | INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); |
1763 | INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); | 1763 | INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); |
1764 | spin_lock_init(&osdc->event_lock); | 1764 | spin_lock_init(&osdc->event_lock); |
1765 | osdc->event_tree = RB_ROOT; | 1765 | osdc->event_tree = RB_ROOT; |
1766 | osdc->event_count = 0; | 1766 | osdc->event_count = 0; |
1767 | 1767 | ||
1768 | schedule_delayed_work(&osdc->osds_timeout_work, | 1768 | schedule_delayed_work(&osdc->osds_timeout_work, |
1769 | round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); | 1769 | round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); |
1770 | 1770 | ||
1771 | err = -ENOMEM; | 1771 | err = -ENOMEM; |
1772 | osdc->req_mempool = mempool_create_kmalloc_pool(10, | 1772 | osdc->req_mempool = mempool_create_kmalloc_pool(10, |
1773 | sizeof(struct ceph_osd_request)); | 1773 | sizeof(struct ceph_osd_request)); |
1774 | if (!osdc->req_mempool) | 1774 | if (!osdc->req_mempool) |
1775 | goto out; | 1775 | goto out; |
1776 | 1776 | ||
1777 | err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, | 1777 | err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, |
1778 | OSD_OP_FRONT_LEN, 10, true, | 1778 | OSD_OP_FRONT_LEN, 10, true, |
1779 | "osd_op"); | 1779 | "osd_op"); |
1780 | if (err < 0) | 1780 | if (err < 0) |
1781 | goto out_mempool; | 1781 | goto out_mempool; |
1782 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, | 1782 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, |
1783 | OSD_OPREPLY_FRONT_LEN, 10, true, | 1783 | OSD_OPREPLY_FRONT_LEN, 10, true, |
1784 | "osd_op_reply"); | 1784 | "osd_op_reply"); |
1785 | if (err < 0) | 1785 | if (err < 0) |
1786 | goto out_msgpool; | 1786 | goto out_msgpool; |
1787 | 1787 | ||
1788 | osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); | 1788 | osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); |
1789 | if (IS_ERR(osdc->notify_wq)) { | 1789 | if (IS_ERR(osdc->notify_wq)) { |
1790 | err = PTR_ERR(osdc->notify_wq); | 1790 | err = PTR_ERR(osdc->notify_wq); |
1791 | osdc->notify_wq = NULL; | 1791 | osdc->notify_wq = NULL; |
1792 | goto out_msgpool; | 1792 | goto out_msgpool; |
1793 | } | 1793 | } |
1794 | return 0; | 1794 | return 0; |
1795 | 1795 | ||
1796 | out_msgpool: | 1796 | out_msgpool: |
1797 | ceph_msgpool_destroy(&osdc->msgpool_op); | 1797 | ceph_msgpool_destroy(&osdc->msgpool_op); |
1798 | out_mempool: | 1798 | out_mempool: |
1799 | mempool_destroy(osdc->req_mempool); | 1799 | mempool_destroy(osdc->req_mempool); |
1800 | out: | 1800 | out: |
1801 | return err; | 1801 | return err; |
1802 | } | 1802 | } |
1803 | EXPORT_SYMBOL(ceph_osdc_init); | 1803 | EXPORT_SYMBOL(ceph_osdc_init); |
1804 | 1804 | ||
1805 | void ceph_osdc_stop(struct ceph_osd_client *osdc) | 1805 | void ceph_osdc_stop(struct ceph_osd_client *osdc) |
1806 | { | 1806 | { |
1807 | flush_workqueue(osdc->notify_wq); | 1807 | flush_workqueue(osdc->notify_wq); |
1808 | destroy_workqueue(osdc->notify_wq); | 1808 | destroy_workqueue(osdc->notify_wq); |
1809 | cancel_delayed_work_sync(&osdc->timeout_work); | 1809 | cancel_delayed_work_sync(&osdc->timeout_work); |
1810 | cancel_delayed_work_sync(&osdc->osds_timeout_work); | 1810 | cancel_delayed_work_sync(&osdc->osds_timeout_work); |
1811 | if (osdc->osdmap) { | 1811 | if (osdc->osdmap) { |
1812 | ceph_osdmap_destroy(osdc->osdmap); | 1812 | ceph_osdmap_destroy(osdc->osdmap); |
1813 | osdc->osdmap = NULL; | 1813 | osdc->osdmap = NULL; |
1814 | } | 1814 | } |
1815 | remove_all_osds(osdc); | 1815 | remove_all_osds(osdc); |
1816 | mempool_destroy(osdc->req_mempool); | 1816 | mempool_destroy(osdc->req_mempool); |
1817 | ceph_msgpool_destroy(&osdc->msgpool_op); | 1817 | ceph_msgpool_destroy(&osdc->msgpool_op); |
1818 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); | 1818 | ceph_msgpool_destroy(&osdc->msgpool_op_reply); |
1819 | } | 1819 | } |
1820 | EXPORT_SYMBOL(ceph_osdc_stop); | 1820 | EXPORT_SYMBOL(ceph_osdc_stop); |
1821 | 1821 | ||
1822 | /* | 1822 | /* |
1823 | * Read some contiguous pages. If we cross a stripe boundary, shorten | 1823 | * Read some contiguous pages. If we cross a stripe boundary, shorten |
1824 | * *plen. Return number of bytes read, or error. | 1824 | * *plen. Return number of bytes read, or error. |
1825 | */ | 1825 | */ |
1826 | int ceph_osdc_readpages(struct ceph_osd_client *osdc, | 1826 | int ceph_osdc_readpages(struct ceph_osd_client *osdc, |
1827 | struct ceph_vino vino, struct ceph_file_layout *layout, | 1827 | struct ceph_vino vino, struct ceph_file_layout *layout, |
1828 | u64 off, u64 *plen, | 1828 | u64 off, u64 *plen, |
1829 | u32 truncate_seq, u64 truncate_size, | 1829 | u32 truncate_seq, u64 truncate_size, |
1830 | struct page **pages, int num_pages, int page_align) | 1830 | struct page **pages, int num_pages, int page_align) |
1831 | { | 1831 | { |
1832 | struct ceph_osd_request *req; | 1832 | struct ceph_osd_request *req; |
1833 | int rc = 0; | 1833 | int rc = 0; |
1834 | 1834 | ||
1835 | dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, | 1835 | dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, |
1836 | vino.snap, off, *plen); | 1836 | vino.snap, off, *plen); |
1837 | req = ceph_osdc_new_request(osdc, layout, vino, off, plen, | 1837 | req = ceph_osdc_new_request(osdc, layout, vino, off, plen, |
1838 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, | 1838 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, |
1839 | NULL, 0, truncate_seq, truncate_size, NULL, | 1839 | NULL, 0, truncate_seq, truncate_size, NULL, |
1840 | false, 1, page_align); | 1840 | false, 1, page_align); |
1841 | if (IS_ERR(req)) | 1841 | if (IS_ERR(req)) |
1842 | return PTR_ERR(req); | 1842 | return PTR_ERR(req); |
1843 | 1843 | ||
1844 | /* it may be a short read due to an object boundary */ | 1844 | /* it may be a short read due to an object boundary */ |
1845 | req->r_pages = pages; | 1845 | req->r_pages = pages; |
1846 | 1846 | ||
1847 | dout("readpages final extent is %llu~%llu (%d pages align %d)\n", | 1847 | dout("readpages final extent is %llu~%llu (%d pages align %d)\n", |
1848 | off, *plen, req->r_num_pages, page_align); | 1848 | off, *plen, req->r_num_pages, page_align); |
1849 | 1849 | ||
1850 | rc = ceph_osdc_start_request(osdc, req, false); | 1850 | rc = ceph_osdc_start_request(osdc, req, false); |
1851 | if (!rc) | 1851 | if (!rc) |
1852 | rc = ceph_osdc_wait_request(osdc, req); | 1852 | rc = ceph_osdc_wait_request(osdc, req); |
1853 | 1853 | ||
1854 | ceph_osdc_put_request(req); | 1854 | ceph_osdc_put_request(req); |
1855 | dout("readpages result %d\n", rc); | 1855 | dout("readpages result %d\n", rc); |
1856 | return rc; | 1856 | return rc; |
1857 | } | 1857 | } |
1858 | EXPORT_SYMBOL(ceph_osdc_readpages); | 1858 | EXPORT_SYMBOL(ceph_osdc_readpages); |
1859 | 1859 | ||
1860 | /* | 1860 | /* |
1861 | * do a synchronous write on N pages | 1861 | * do a synchronous write on N pages |
1862 | */ | 1862 | */ |
1863 | int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | 1863 | int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, |
1864 | struct ceph_file_layout *layout, | 1864 | struct ceph_file_layout *layout, |
1865 | struct ceph_snap_context *snapc, | 1865 | struct ceph_snap_context *snapc, |
1866 | u64 off, u64 len, | 1866 | u64 off, u64 len, |
1867 | u32 truncate_seq, u64 truncate_size, | 1867 | u32 truncate_seq, u64 truncate_size, |
1868 | struct timespec *mtime, | 1868 | struct timespec *mtime, |
1869 | struct page **pages, int num_pages, | 1869 | struct page **pages, int num_pages, |
1870 | int flags, int do_sync, bool nofail) | 1870 | int flags, int do_sync) |
1871 | { | 1871 | { |
1872 | struct ceph_osd_request *req; | 1872 | struct ceph_osd_request *req; |
1873 | int rc = 0; | 1873 | int rc = 0; |
1874 | int page_align = off & ~PAGE_MASK; | 1874 | int page_align = off & ~PAGE_MASK; |
1875 | 1875 | ||
1876 | BUG_ON(vino.snap != CEPH_NOSNAP); | 1876 | BUG_ON(vino.snap != CEPH_NOSNAP); |
1877 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, | 1877 | req = ceph_osdc_new_request(osdc, layout, vino, off, &len, |
1878 | CEPH_OSD_OP_WRITE, | 1878 | CEPH_OSD_OP_WRITE, |
1879 | flags | CEPH_OSD_FLAG_ONDISK | | 1879 | flags | CEPH_OSD_FLAG_ONDISK | |
1880 | CEPH_OSD_FLAG_WRITE, | 1880 | CEPH_OSD_FLAG_WRITE, |
1881 | snapc, do_sync, | 1881 | snapc, do_sync, |
1882 | truncate_seq, truncate_size, mtime, | 1882 | truncate_seq, truncate_size, mtime, |
1883 | nofail, 1, page_align); | 1883 | true, 1, page_align); |
1884 | if (IS_ERR(req)) | 1884 | if (IS_ERR(req)) |
1885 | return PTR_ERR(req); | 1885 | return PTR_ERR(req); |
1886 | 1886 | ||
1887 | /* it may be a short write due to an object boundary */ | 1887 | /* it may be a short write due to an object boundary */ |
1888 | req->r_pages = pages; | 1888 | req->r_pages = pages; |
1889 | dout("writepages %llu~%llu (%d pages)\n", off, len, | 1889 | dout("writepages %llu~%llu (%d pages)\n", off, len, |
1890 | req->r_num_pages); | 1890 | req->r_num_pages); |
1891 | 1891 | ||
1892 | rc = ceph_osdc_start_request(osdc, req, nofail); | 1892 | rc = ceph_osdc_start_request(osdc, req, true); |
1893 | if (!rc) | 1893 | if (!rc) |
1894 | rc = ceph_osdc_wait_request(osdc, req); | 1894 | rc = ceph_osdc_wait_request(osdc, req); |
1895 | 1895 | ||
1896 | ceph_osdc_put_request(req); | 1896 | ceph_osdc_put_request(req); |
1897 | if (rc == 0) | 1897 | if (rc == 0) |
1898 | rc = len; | 1898 | rc = len; |
1899 | dout("writepages result %d\n", rc); | 1899 | dout("writepages result %d\n", rc); |
1900 | return rc; | 1900 | return rc; |
1901 | } | 1901 | } |
1902 | EXPORT_SYMBOL(ceph_osdc_writepages); | 1902 | EXPORT_SYMBOL(ceph_osdc_writepages); |
1903 | 1903 | ||
1904 | /* | 1904 | /* |
1905 | * handle incoming message | 1905 | * handle incoming message |
1906 | */ | 1906 | */ |
1907 | static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) | 1907 | static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) |
1908 | { | 1908 | { |
1909 | struct ceph_osd *osd = con->private; | 1909 | struct ceph_osd *osd = con->private; |
1910 | struct ceph_osd_client *osdc; | 1910 | struct ceph_osd_client *osdc; |
1911 | int type = le16_to_cpu(msg->hdr.type); | 1911 | int type = le16_to_cpu(msg->hdr.type); |
1912 | 1912 | ||
1913 | if (!osd) | 1913 | if (!osd) |
1914 | goto out; | 1914 | goto out; |
1915 | osdc = osd->o_osdc; | 1915 | osdc = osd->o_osdc; |
1916 | 1916 | ||
1917 | switch (type) { | 1917 | switch (type) { |
1918 | case CEPH_MSG_OSD_MAP: | 1918 | case CEPH_MSG_OSD_MAP: |
1919 | ceph_osdc_handle_map(osdc, msg); | 1919 | ceph_osdc_handle_map(osdc, msg); |
1920 | break; | 1920 | break; |
1921 | case CEPH_MSG_OSD_OPREPLY: | 1921 | case CEPH_MSG_OSD_OPREPLY: |
1922 | handle_reply(osdc, msg, con); | 1922 | handle_reply(osdc, msg, con); |
1923 | break; | 1923 | break; |
1924 | case CEPH_MSG_WATCH_NOTIFY: | 1924 | case CEPH_MSG_WATCH_NOTIFY: |
1925 | handle_watch_notify(osdc, msg); | 1925 | handle_watch_notify(osdc, msg); |
1926 | break; | 1926 | break; |
1927 | 1927 | ||
1928 | default: | 1928 | default: |
1929 | pr_err("received unknown message type %d %s\n", type, | 1929 | pr_err("received unknown message type %d %s\n", type, |
1930 | ceph_msg_type_name(type)); | 1930 | ceph_msg_type_name(type)); |
1931 | } | 1931 | } |
1932 | out: | 1932 | out: |
1933 | ceph_msg_put(msg); | 1933 | ceph_msg_put(msg); |
1934 | } | 1934 | } |
1935 | 1935 | ||
/*
 * lookup and return message for incoming reply.  set up reply message
 * pages.
 *
 * Looks up the pending request by the tid in the header (under
 * request_mutex), hands back a referenced reply message for the
 * messenger to fill in, and records which connection is filling it.
 * Sets *skip when the reply should be discarded (unknown tid, or
 * fewer pages ready than the reply needs).
 */
static struct ceph_msg *get_reply(struct ceph_connection *con,
				  struct ceph_msg_header *hdr,
				  int *skip)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct ceph_msg *m;
	struct ceph_osd_request *req;
	int front = le32_to_cpu(hdr->front_len);
	int data_len = le32_to_cpu(hdr->data_len);
	u64 tid;

	tid = le64_to_cpu(hdr->tid);
	mutex_lock(&osdc->request_mutex);
	req = __lookup_request(osdc, tid);
	if (!req) {
		/* no pending request with this tid: drop the reply */
		*skip = 1;
		m = NULL;
		dout("get_reply unknown tid %llu from osd%d\n", tid,
		     osd->o_osd);
		goto out;
	}

	if (req->r_con_filling_msg) {
		/* an older connection claimed r_reply; revoke it first */
		dout("%s revoking msg %p from old con %p\n", __func__,
		     req->r_reply, req->r_con_filling_msg);
		ceph_msg_revoke_incoming(req->r_reply);
		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
		req->r_con_filling_msg = NULL;
	}

	if (front > req->r_reply->front.iov_len) {
		/* preallocated front too small; swap in a bigger message */
		pr_warning("get_reply front %d > preallocated %d\n",
			   front, (int)req->r_reply->front.iov_len);
		m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS, false);
		if (!m)
			goto out;
		ceph_msg_put(req->r_reply);
		req->r_reply = m;
	}
	m = ceph_msg_get(req->r_reply);

	if (data_len > 0) {
		int want = calc_pages_for(req->r_page_alignment, data_len);

		if (req->r_pages && unlikely(req->r_num_pages < want)) {
			/* reply carries more data than we have pages for */
			pr_warning("tid %lld reply has %d bytes %d pages, we"
				   " had only %d pages ready\n", tid, data_len,
				   want, req->r_num_pages);
			*skip = 1;
			ceph_msg_put(m);
			m = NULL;
			goto out;
		}
		/* point the incoming message at the request's page vector */
		m->pages = req->r_pages;
		m->nr_pages = req->r_num_pages;
		m->page_alignment = req->r_page_alignment;
#ifdef CONFIG_BLOCK
		m->bio = req->r_bio;
#endif
	}
	*skip = 0;
	/* remember which connection is filling this reply (holds a con ref) */
	req->r_con_filling_msg = con->ops->get(con);
	dout("get_reply tid %lld %p\n", tid, m);

out:
	mutex_unlock(&osdc->request_mutex);
	return m;

}
2010 | 2010 | ||
2011 | static struct ceph_msg *alloc_msg(struct ceph_connection *con, | 2011 | static struct ceph_msg *alloc_msg(struct ceph_connection *con, |
2012 | struct ceph_msg_header *hdr, | 2012 | struct ceph_msg_header *hdr, |
2013 | int *skip) | 2013 | int *skip) |
2014 | { | 2014 | { |
2015 | struct ceph_osd *osd = con->private; | 2015 | struct ceph_osd *osd = con->private; |
2016 | int type = le16_to_cpu(hdr->type); | 2016 | int type = le16_to_cpu(hdr->type); |
2017 | int front = le32_to_cpu(hdr->front_len); | 2017 | int front = le32_to_cpu(hdr->front_len); |
2018 | 2018 | ||
2019 | *skip = 0; | 2019 | *skip = 0; |
2020 | switch (type) { | 2020 | switch (type) { |
2021 | case CEPH_MSG_OSD_MAP: | 2021 | case CEPH_MSG_OSD_MAP: |
2022 | case CEPH_MSG_WATCH_NOTIFY: | 2022 | case CEPH_MSG_WATCH_NOTIFY: |
2023 | return ceph_msg_new(type, front, GFP_NOFS, false); | 2023 | return ceph_msg_new(type, front, GFP_NOFS, false); |
2024 | case CEPH_MSG_OSD_OPREPLY: | 2024 | case CEPH_MSG_OSD_OPREPLY: |
2025 | return get_reply(con, hdr, skip); | 2025 | return get_reply(con, hdr, skip); |
2026 | default: | 2026 | default: |
2027 | pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, | 2027 | pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, |
2028 | osd->o_osd); | 2028 | osd->o_osd); |
2029 | *skip = 1; | 2029 | *skip = 1; |
2030 | return NULL; | 2030 | return NULL; |
2031 | } | 2031 | } |
2032 | } | 2032 | } |
2033 | 2033 | ||
2034 | /* | 2034 | /* |
2035 | * Wrappers to refcount containing ceph_osd struct | 2035 | * Wrappers to refcount containing ceph_osd struct |
2036 | */ | 2036 | */ |
2037 | static struct ceph_connection *get_osd_con(struct ceph_connection *con) | 2037 | static struct ceph_connection *get_osd_con(struct ceph_connection *con) |
2038 | { | 2038 | { |
2039 | struct ceph_osd *osd = con->private; | 2039 | struct ceph_osd *osd = con->private; |
2040 | if (get_osd(osd)) | 2040 | if (get_osd(osd)) |
2041 | return con; | 2041 | return con; |
2042 | return NULL; | 2042 | return NULL; |
2043 | } | 2043 | } |
2044 | 2044 | ||
2045 | static void put_osd_con(struct ceph_connection *con) | 2045 | static void put_osd_con(struct ceph_connection *con) |
2046 | { | 2046 | { |
2047 | struct ceph_osd *osd = con->private; | 2047 | struct ceph_osd *osd = con->private; |
2048 | put_osd(osd); | 2048 | put_osd(osd); |
2049 | } | 2049 | } |
2050 | 2050 | ||
2051 | /* | 2051 | /* |
2052 | * authentication | 2052 | * authentication |
2053 | */ | 2053 | */ |
2054 | /* | 2054 | /* |
2055 | * Note: returned pointer is the address of a structure that's | 2055 | * Note: returned pointer is the address of a structure that's |
2056 | * managed separately. Caller must *not* attempt to free it. | 2056 | * managed separately. Caller must *not* attempt to free it. |
2057 | */ | 2057 | */ |
2058 | static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, | 2058 | static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, |
2059 | int *proto, int force_new) | 2059 | int *proto, int force_new) |
2060 | { | 2060 | { |
2061 | struct ceph_osd *o = con->private; | 2061 | struct ceph_osd *o = con->private; |
2062 | struct ceph_osd_client *osdc = o->o_osdc; | 2062 | struct ceph_osd_client *osdc = o->o_osdc; |
2063 | struct ceph_auth_client *ac = osdc->client->monc.auth; | 2063 | struct ceph_auth_client *ac = osdc->client->monc.auth; |
2064 | struct ceph_auth_handshake *auth = &o->o_auth; | 2064 | struct ceph_auth_handshake *auth = &o->o_auth; |
2065 | 2065 | ||
2066 | if (force_new && auth->authorizer) { | 2066 | if (force_new && auth->authorizer) { |
2067 | if (ac->ops && ac->ops->destroy_authorizer) | 2067 | if (ac->ops && ac->ops->destroy_authorizer) |
2068 | ac->ops->destroy_authorizer(ac, auth->authorizer); | 2068 | ac->ops->destroy_authorizer(ac, auth->authorizer); |
2069 | auth->authorizer = NULL; | 2069 | auth->authorizer = NULL; |
2070 | } | 2070 | } |
2071 | if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { | 2071 | if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { |
2072 | int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, | 2072 | int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, |
2073 | auth); | 2073 | auth); |
2074 | if (ret) | 2074 | if (ret) |
2075 | return ERR_PTR(ret); | 2075 | return ERR_PTR(ret); |
2076 | } | 2076 | } |
2077 | *proto = ac->protocol; | 2077 | *proto = ac->protocol; |
2078 | 2078 | ||
2079 | return auth; | 2079 | return auth; |
2080 | } | 2080 | } |
2081 | 2081 | ||
2082 | 2082 | ||
2083 | static int verify_authorizer_reply(struct ceph_connection *con, int len) | 2083 | static int verify_authorizer_reply(struct ceph_connection *con, int len) |
2084 | { | 2084 | { |
2085 | struct ceph_osd *o = con->private; | 2085 | struct ceph_osd *o = con->private; |
2086 | struct ceph_osd_client *osdc = o->o_osdc; | 2086 | struct ceph_osd_client *osdc = o->o_osdc; |
2087 | struct ceph_auth_client *ac = osdc->client->monc.auth; | 2087 | struct ceph_auth_client *ac = osdc->client->monc.auth; |
2088 | 2088 | ||
2089 | /* | 2089 | /* |
2090 | * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, | 2090 | * XXX If ac->ops or ac->ops->verify_authorizer_reply is null, |
2091 | * XXX which do we do: succeed or fail? | 2091 | * XXX which do we do: succeed or fail? |
2092 | */ | 2092 | */ |
2093 | return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); | 2093 | return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len); |
2094 | } | 2094 | } |
2095 | 2095 | ||
2096 | static int invalidate_authorizer(struct ceph_connection *con) | 2096 | static int invalidate_authorizer(struct ceph_connection *con) |
2097 | { | 2097 | { |
2098 | struct ceph_osd *o = con->private; | 2098 | struct ceph_osd *o = con->private; |
2099 | struct ceph_osd_client *osdc = o->o_osdc; | 2099 | struct ceph_osd_client *osdc = o->o_osdc; |
2100 | struct ceph_auth_client *ac = osdc->client->monc.auth; | 2100 | struct ceph_auth_client *ac = osdc->client->monc.auth; |
2101 | 2101 | ||
2102 | if (ac->ops && ac->ops->invalidate_authorizer) | 2102 | if (ac->ops && ac->ops->invalidate_authorizer) |
2103 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); | 2103 | ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); |
2104 | 2104 | ||
2105 | return ceph_monc_validate_auth(&osdc->client->monc); | 2105 | return ceph_monc_validate_auth(&osdc->client->monc); |
2106 | } | 2106 | } |
2107 | 2107 | ||
/* Messenger callbacks for connections to OSDs. */
static const struct ceph_connection_operations osd_con_ops = {
	.get = get_osd_con,		/* take a ref on the owning ceph_osd */
	.put = put_osd_con,		/* drop that ref */
	.dispatch = dispatch,		/* route incoming messages by type */
	.get_authorizer = get_authorizer,
	.verify_authorizer_reply = verify_authorizer_reply,
	.invalidate_authorizer = invalidate_authorizer,
	.alloc_msg = alloc_msg,		/* supply buffers for incoming msgs */
	.fault = osd_reset,		/* connection error -> reset osd */
};
2118 | 2118 |