Commit 20ebb345282d9d90603b021ced113b73e9cdb6a1
Exists in
ti-lsk-linux-4.1.y
and in
10 other branches
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull two Ceph fixes from Sage Weil: "These are both pretty trivial: a sparse warning fix and a size_t printk thing." * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: libceph: fix sparse endianness warnings; ceph: use %zu for len in ceph_fill_inline_data()
Showing 4 changed files Inline Diff
fs/ceph/addr.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/backing-dev.h> | 3 | #include <linux/backing-dev.h> |
4 | #include <linux/fs.h> | 4 | #include <linux/fs.h> |
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/pagemap.h> | 6 | #include <linux/pagemap.h> |
7 | #include <linux/writeback.h> /* generic_writepages */ | 7 | #include <linux/writeback.h> /* generic_writepages */ |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/pagevec.h> | 9 | #include <linux/pagevec.h> |
10 | #include <linux/task_io_accounting_ops.h> | 10 | #include <linux/task_io_accounting_ops.h> |
11 | 11 | ||
12 | #include "super.h" | 12 | #include "super.h" |
13 | #include "mds_client.h" | 13 | #include "mds_client.h" |
14 | #include "cache.h" | 14 | #include "cache.h" |
15 | #include <linux/ceph/osd_client.h> | 15 | #include <linux/ceph/osd_client.h> |
16 | 16 | ||
17 | /* | 17 | /* |
18 | * Ceph address space ops. | 18 | * Ceph address space ops. |
19 | * | 19 | * |
20 | * There are a few funny things going on here. | 20 | * There are a few funny things going on here. |
21 | * | 21 | * |
22 | * The page->private field is used to reference a struct | 22 | * The page->private field is used to reference a struct |
23 | * ceph_snap_context for _every_ dirty page. This indicates which | 23 | * ceph_snap_context for _every_ dirty page. This indicates which |
24 | * snapshot the page was logically dirtied in, and thus which snap | 24 | * snapshot the page was logically dirtied in, and thus which snap |
25 | * context needs to be associated with the osd write during writeback. | 25 | * context needs to be associated with the osd write during writeback. |
26 | * | 26 | * |
27 | * Similarly, struct ceph_inode_info maintains a set of counters to | 27 | * Similarly, struct ceph_inode_info maintains a set of counters to |
28 | * count dirty pages on the inode. In the absence of snapshots, | 28 | * count dirty pages on the inode. In the absence of snapshots, |
29 | * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. | 29 | * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count. |
30 | * | 30 | * |
31 | * When a snapshot is taken (that is, when the client receives | 31 | * When a snapshot is taken (that is, when the client receives |
32 | * notification that a snapshot was taken), each inode with caps and | 32 | * notification that a snapshot was taken), each inode with caps and |
33 | * with dirty pages (dirty pages implies there is a cap) gets a new | 33 | * with dirty pages (dirty pages implies there is a cap) gets a new |
34 | * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending | 34 | * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending |
35 | * order, new snaps go to the tail). The i_wrbuffer_ref_head count is | 35 | * order, new snaps go to the tail). The i_wrbuffer_ref_head count is |
36 | * moved to capsnap->dirty. (Unless a sync write is currently in | 36 | * moved to capsnap->dirty. (Unless a sync write is currently in |
37 | * progress. In that case, the capsnap is said to be "pending", new | 37 | * progress. In that case, the capsnap is said to be "pending", new |
38 | * writes cannot start, and the capsnap isn't "finalized" until the | 38 | * writes cannot start, and the capsnap isn't "finalized" until the |
39 | * write completes (or fails) and a final size/mtime for the inode for | 39 | * write completes (or fails) and a final size/mtime for the inode for |
40 | * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. | 40 | * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0. |
41 | * | 41 | * |
42 | * On writeback, we must submit writes to the osd IN SNAP ORDER. So, | 42 | * On writeback, we must submit writes to the osd IN SNAP ORDER. So, |
43 | * we look for the first capsnap in i_cap_snaps and write out pages in | 43 | * we look for the first capsnap in i_cap_snaps and write out pages in |
44 | * that snap context _only_. Then we move on to the next capsnap, | 44 | * that snap context _only_. Then we move on to the next capsnap, |
45 | * eventually reaching the "live" or "head" context (i.e., pages that | 45 | * eventually reaching the "live" or "head" context (i.e., pages that |
46 | * are not yet snapped) and are writing the most recently dirtied | 46 | * are not yet snapped) and are writing the most recently dirtied |
47 | * pages. | 47 | * pages. |
48 | * | 48 | * |
49 | * Invalidate and so forth must take care to ensure the dirty page | 49 | * Invalidate and so forth must take care to ensure the dirty page |
50 | * accounting is preserved. | 50 | * accounting is preserved. |
51 | */ | 51 | */ |
52 | 52 | ||
/*
 * Writeback congestion thresholds, in pages, derived from the
 * congestion_kb mount option (given in KB; PAGE_SHIFT-10 converts
 * KB to pages).  Writeback is marked congested once the dirty count
 * passes CONGESTION_ON_THRESH, and uncongested again only after it
 * falls below CONGESTION_OFF_THRESH (75% of the on threshold), to
 * avoid flapping around a single boundary.
 */
#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
#define CONGESTION_OFF_THRESH(congestion_kb)				\
	(CONGESTION_ON_THRESH(congestion_kb) -				\
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
57 | 57 | ||
58 | static inline struct ceph_snap_context *page_snap_context(struct page *page) | 58 | static inline struct ceph_snap_context *page_snap_context(struct page *page) |
59 | { | 59 | { |
60 | if (PagePrivate(page)) | 60 | if (PagePrivate(page)) |
61 | return (void *)page->private; | 61 | return (void *)page->private; |
62 | return NULL; | 62 | return NULL; |
63 | } | 63 | } |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * Dirty a page. Optimistically adjust accounting, on the assumption | 66 | * Dirty a page. Optimistically adjust accounting, on the assumption |
67 | * that we won't race with invalidate. If we do, readjust. | 67 | * that we won't race with invalidate. If we do, readjust. |
68 | */ | 68 | */ |
/*
 * Dirty a page.  Optimistically adjust accounting, on the assumption
 * that we won't race with invalidate.  If we do, readjust.
 *
 * Returns nonzero if the page was newly dirtied (same contract as
 * __set_page_dirty_nobuffers).
 */
static int ceph_set_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc;
	int ret;

	/* anonymous page (no mapping): nothing to account, just set the bit */
	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	if (PageDirty(page)) {
		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
		     mapping->host, page, page->index);
		/* a dirty page must already hold a snapc in page->private */
		BUG_ON(!PagePrivate(page));
		return 0;
	}

	inode = mapping->host;
	ci = ceph_inode(inode);

	/*
	 * Note that we're grabbing a snapc ref here without holding
	 * any locks!
	 */
	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);

	/* dirty the head */
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_head_snapc == NULL)
		ci->i_head_snapc = ceph_get_snap_context(snapc);
	++ci->i_wrbuffer_ref_head;
	/* the first dirty page on the inode pins the inode in memory */
	if (ci->i_wrbuffer_ref == 0)
		ihold(inode);
	++ci->i_wrbuffer_ref;
	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
	     "snapc %p seq %lld (%d snaps)\n",
	     mapping->host, page, page->index,
	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
	     snapc, snapc->seq, snapc->num_snaps);
	spin_unlock(&ci->i_ceph_lock);

	/*
	 * Reference snap context in page->private.  Also set
	 * PagePrivate so that we get invalidatepage callback.
	 */
	BUG_ON(PagePrivate(page));
	page->private = (unsigned long)snapc;
	SetPagePrivate(page);

	ret = __set_page_dirty_nobuffers(page);
	WARN_ON(!PageLocked(page));
	WARN_ON(!page->mapping);

	return ret;
}
126 | 126 | ||
127 | /* | 127 | /* |
128 | * If we are truncating the full page (i.e. offset == 0), adjust the | 128 | * If we are truncating the full page (i.e. offset == 0), adjust the |
129 | * dirty page counters appropriately. Only called if there is private | 129 | * dirty page counters appropriately. Only called if there is private |
130 | * data on the page. | 130 | * data on the page. |
131 | */ | 131 | */ |
/*
 * If we are truncating the full page (i.e. offset == 0), adjust the
 * dirty page counters appropriately.  Only called if there is private
 * data on the page.
 */
static void ceph_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc = page_snap_context(page);

	inode = page->mapping->host;
	ci = ceph_inode(inode);

	/* partial-page invalidation: leave the dirty accounting alone */
	if (offset != 0 || length != PAGE_CACHE_SIZE) {
		dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
		     inode, page, page->index, offset, length);
		return;
	}

	ceph_invalidate_fscache_page(inode, page);

	if (!PagePrivate(page))
		return;

	/*
	 * We can get non-dirty pages here due to races between
	 * set_page_dirty and truncate_complete_page; just spit out a
	 * warning, in case we end up with accounting problems later.
	 */
	if (!PageDirty(page))
		pr_err("%p invalidatepage %p page not dirty\n", inode, page);

	ClearPageChecked(page);

	dout("%p invalidatepage %p idx %lu full dirty page\n",
	     inode, page, page->index);

	/* drop the dirty-page cap ref and the snapc ref taken at dirty time */
	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);
	page->private = 0;
	ClearPagePrivate(page);
}
171 | 171 | ||
172 | static int ceph_releasepage(struct page *page, gfp_t g) | 172 | static int ceph_releasepage(struct page *page, gfp_t g) |
173 | { | 173 | { |
174 | struct inode *inode = page->mapping ? page->mapping->host : NULL; | 174 | struct inode *inode = page->mapping ? page->mapping->host : NULL; |
175 | dout("%p releasepage %p idx %lu\n", inode, page, page->index); | 175 | dout("%p releasepage %p idx %lu\n", inode, page, page->index); |
176 | WARN_ON(PageDirty(page)); | 176 | WARN_ON(PageDirty(page)); |
177 | 177 | ||
178 | /* Can we release the page from the cache? */ | 178 | /* Can we release the page from the cache? */ |
179 | if (!ceph_release_fscache_page(page, g)) | 179 | if (!ceph_release_fscache_page(page, g)) |
180 | return 0; | 180 | return 0; |
181 | 181 | ||
182 | return !PagePrivate(page); | 182 | return !PagePrivate(page); |
183 | } | 183 | } |
184 | 184 | ||
185 | /* | 185 | /* |
186 | * read a single page, without unlocking it. | 186 | * read a single page, without unlocking it. |
187 | */ | 187 | */ |
188 | static int readpage_nounlock(struct file *filp, struct page *page) | 188 | static int readpage_nounlock(struct file *filp, struct page *page) |
189 | { | 189 | { |
190 | struct inode *inode = file_inode(filp); | 190 | struct inode *inode = file_inode(filp); |
191 | struct ceph_inode_info *ci = ceph_inode(inode); | 191 | struct ceph_inode_info *ci = ceph_inode(inode); |
192 | struct ceph_osd_client *osdc = | 192 | struct ceph_osd_client *osdc = |
193 | &ceph_inode_to_client(inode)->client->osdc; | 193 | &ceph_inode_to_client(inode)->client->osdc; |
194 | int err = 0; | 194 | int err = 0; |
195 | u64 off = page_offset(page); | 195 | u64 off = page_offset(page); |
196 | u64 len = PAGE_CACHE_SIZE; | 196 | u64 len = PAGE_CACHE_SIZE; |
197 | 197 | ||
198 | if (off >= i_size_read(inode)) { | 198 | if (off >= i_size_read(inode)) { |
199 | zero_user_segment(page, err, PAGE_CACHE_SIZE); | 199 | zero_user_segment(page, err, PAGE_CACHE_SIZE); |
200 | SetPageUptodate(page); | 200 | SetPageUptodate(page); |
201 | return 0; | 201 | return 0; |
202 | } | 202 | } |
203 | 203 | ||
204 | /* | 204 | /* |
205 | * Uptodate inline data should have been added into page cache | 205 | * Uptodate inline data should have been added into page cache |
206 | * while getting Fcr caps. | 206 | * while getting Fcr caps. |
207 | */ | 207 | */ |
208 | if (ci->i_inline_version != CEPH_INLINE_NONE) | 208 | if (ci->i_inline_version != CEPH_INLINE_NONE) |
209 | return -EINVAL; | 209 | return -EINVAL; |
210 | 210 | ||
211 | err = ceph_readpage_from_fscache(inode, page); | 211 | err = ceph_readpage_from_fscache(inode, page); |
212 | if (err == 0) | 212 | if (err == 0) |
213 | goto out; | 213 | goto out; |
214 | 214 | ||
215 | dout("readpage inode %p file %p page %p index %lu\n", | 215 | dout("readpage inode %p file %p page %p index %lu\n", |
216 | inode, filp, page, page->index); | 216 | inode, filp, page, page->index); |
217 | err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, | 217 | err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, |
218 | off, &len, | 218 | off, &len, |
219 | ci->i_truncate_seq, ci->i_truncate_size, | 219 | ci->i_truncate_seq, ci->i_truncate_size, |
220 | &page, 1, 0); | 220 | &page, 1, 0); |
221 | if (err == -ENOENT) | 221 | if (err == -ENOENT) |
222 | err = 0; | 222 | err = 0; |
223 | if (err < 0) { | 223 | if (err < 0) { |
224 | SetPageError(page); | 224 | SetPageError(page); |
225 | ceph_fscache_readpage_cancel(inode, page); | 225 | ceph_fscache_readpage_cancel(inode, page); |
226 | goto out; | 226 | goto out; |
227 | } | 227 | } |
228 | if (err < PAGE_CACHE_SIZE) | 228 | if (err < PAGE_CACHE_SIZE) |
229 | /* zero fill remainder of page */ | 229 | /* zero fill remainder of page */ |
230 | zero_user_segment(page, err, PAGE_CACHE_SIZE); | 230 | zero_user_segment(page, err, PAGE_CACHE_SIZE); |
231 | else | 231 | else |
232 | flush_dcache_page(page); | 232 | flush_dcache_page(page); |
233 | 233 | ||
234 | SetPageUptodate(page); | 234 | SetPageUptodate(page); |
235 | ceph_readpage_to_fscache(inode, page); | 235 | ceph_readpage_to_fscache(inode, page); |
236 | 236 | ||
237 | out: | 237 | out: |
238 | return err < 0 ? err : 0; | 238 | return err < 0 ? err : 0; |
239 | } | 239 | } |
240 | 240 | ||
/*
 * VM readpage entry point: read the page, then unlock it.
 */
static int ceph_readpage(struct file *filp, struct page *page)
{
	int ret;

	ret = readpage_nounlock(filp, page);
	unlock_page(page);
	return ret;
}
247 | 247 | ||
248 | /* | 248 | /* |
249 | * Finish an async read(ahead) op. | 249 | * Finish an async read(ahead) op. |
250 | */ | 250 | */ |
/*
 * Finish an async read(ahead) op.
 *
 * OSD completion callback for the request built by start_read().
 * Zero-fills any tail of the page vector the reply didn't cover,
 * marks the filled pages uptodate, and unlocks/releases every page.
 */
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct inode *inode = req->r_inode;
	struct ceph_osd_data *osd_data;
	int rc = req->r_result;
	int bytes = le32_to_cpu(msg->hdr.data_len);
	int num_pages;
	int i;

	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);

	/* unlock all pages, zeroing any data we didn't read */
	osd_data = osd_req_op_extent_osd_data(req, 0);
	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
	num_pages = calc_pages_for((u64)osd_data->alignment,
					(u64)osd_data->length);
	for (i = 0; i < num_pages; i++) {
		struct page *page = osd_data->pages[i];

		/* on error, don't mark uptodate; still unlock + release */
		if (rc < 0)
			goto unlock;
		if (bytes < (int)PAGE_CACHE_SIZE) {
			/* zero (remainder of) page */
			int s = bytes < 0 ? 0 : bytes;
			zero_user_segment(page, s, PAGE_CACHE_SIZE);
		}
 		dout("finish_read %p uptodate %p idx %lu\n", inode, page,
 		     page->index);
		flush_dcache_page(page);
		SetPageUptodate(page);
		ceph_readpage_to_fscache(inode, page);
unlock:
		unlock_page(page);
		page_cache_release(page);
		/* may go negative for trailing pages; clamped to 0 above */
		bytes -= PAGE_CACHE_SIZE;
	}
	/* page-pointer array was kmalloc'ed in start_read() */
	kfree(osd_data->pages);
}
289 | 289 | ||
/*
 * Unlock each page in the vector.
 */
static void ceph_unlock_page_vector(struct page **pages, int num_pages)
{
	int idx;

	for (idx = 0; idx < num_pages; idx++)
		unlock_page(pages[idx]);
}
297 | 297 | ||
298 | /* | 298 | /* |
299 | * start an async read(ahead) operation. return nr_pages we submitted | 299 | * start an async read(ahead) operation. return nr_pages we submitted |
300 | * a read for on success, or negative error code. | 300 | * a read for on success, or negative error code. |
301 | */ | 301 | */ |
/*
 * start an async read(ahead) operation.  return nr_pages we submitted
 * a read for on success, or negative error code.
 *
 * Pages successfully submitted are removed from page_list and handed
 * to finish_read() (which unlocks/releases them and frees the pages
 * array); on failure we unlock and release whatever we had added.
 */
static int start_read(struct inode *inode, struct list_head *page_list, int max)
{
	struct ceph_osd_client *osdc =
		&ceph_inode_to_client(inode)->client->osdc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct page *page = list_entry(page_list->prev, struct page, lru);
	struct ceph_vino vino;
	struct ceph_osd_request *req;
	u64 off;
	u64 len;
	int i;
	struct page **pages;
	pgoff_t next_index;
	int nr_pages = 0;
	int ret;

	off = (u64) page_offset(page);

	/* count pages: take only the run of consecutive indices at the
	 * tail of the list, capped at 'max' (0 means no cap) */
	next_index = page->index;
	list_for_each_entry_reverse(page, page_list, lru) {
		if (page->index != next_index)
			break;
		nr_pages++;
		next_index++;
		if (max && nr_pages == max)
			break;
	}
	len = nr_pages << PAGE_CACHE_SHIFT;
	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
	     off, len);
	vino = ceph_vino(inode);
	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
				    0, 1, CEPH_OSD_OP_READ,
				    CEPH_OSD_FLAG_READ, NULL,
				    ci->i_truncate_seq, ci->i_truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* build page vector; len may have been trimmed to the object
	 * boundary by ceph_osdc_new_request(), so recompute nr_pages */
	nr_pages = calc_pages_for(0, len);
	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
	ret = -ENOMEM;
	if (!pages)
		goto out;
	for (i = 0; i < nr_pages; ++i) {
		page = list_entry(page_list->prev, struct page, lru);
		BUG_ON(PageLocked(page));
		list_del(&page->lru);

 		dout("start_read %p adding %p idx %lu\n", inode, page,
 		     page->index);
		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
					  GFP_NOFS)) {
			ceph_fscache_uncache_page(inode, page);
			page_cache_release(page);
 			dout("start_read %p add_to_page_cache failed %p\n",
 			     inode, page);
			/* only the first i pages are in the vector */
			nr_pages = i;
			goto out_pages;
		}
		pages[i] = page;
	}
	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
	req->r_callback = finish_read;
	req->r_inode = inode;

	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);

	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto out_pages;
	/* finish_read() now owns the pages and the pages array */
	ceph_osdc_put_request(req);
	return nr_pages;

out_pages:
	ceph_unlock_page_vector(pages, nr_pages);
	ceph_release_page_vector(pages, nr_pages);
out:
	ceph_osdc_put_request(req);
	return ret;
}
386 | 386 | ||
387 | 387 | ||
388 | /* | 388 | /* |
389 | * Read multiple pages. Leave pages we don't read + unlock in page_list; | 389 | * Read multiple pages. Leave pages we don't read + unlock in page_list; |
390 | * the caller (VM) cleans them up. | 390 | * the caller (VM) cleans them up. |
391 | */ | 391 | */ |
/*
 * Read multiple pages.  Leave pages we don't read + unlock in page_list;
 * the caller (VM) cleans them up.
 */
static int ceph_readpages(struct file *file, struct address_space *mapping,
			  struct list_head *page_list, unsigned nr_pages)
{
	struct inode *inode = file_inode(file);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	int rc = 0;
	int max = 0;

	/* inline-data inodes are populated via the caps path, not here */
	if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
		return -EINVAL;

	rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
					 &nr_pages);

	/* 0 means fscache satisfied (or is handling) everything */
	if (rc == 0)
		goto out;

	/* cap each OSD read at the rsize mount option (rounded up to pages) */
	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
			>> PAGE_SHIFT;

	dout("readpages %p file %p nr_pages %d max %d\n", inode,
	     file, nr_pages,
	     max);
	/* issue one read per run of consecutive pages until the list drains */
	while (!list_empty(page_list)) {
		rc = start_read(inode, page_list, max);
		if (rc < 0)
			goto out;
		BUG_ON(rc == 0);
	}
out:
	ceph_fscache_readpages_cancel(inode, page_list);

	dout("readpages %p file %p ret %d\n", inode, file, rc);
	return rc;
}
428 | 428 | ||
429 | /* | 429 | /* |
430 | * Get ref for the oldest snapc for an inode with dirty data... that is, the | 430 | * Get ref for the oldest snapc for an inode with dirty data... that is, the |
431 | * only snap context we are allowed to write back. | 431 | * only snap context we are allowed to write back. |
432 | */ | 432 | */ |
/*
 * Get ref for the oldest snapc for an inode with dirty data... that is, the
 * only snap context we are allowed to write back.
 *
 * If the oldest dirty data belongs to a cap snap, *snap_size (when
 * non-NULL) is set to the file size at snapshot time.  Returns NULL
 * when the inode has no dirty data at all.
 */
static struct ceph_snap_context *get_oldest_context(struct inode *inode,
						    u64 *snap_size)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc = NULL;
	struct ceph_cap_snap *capsnap = NULL;

	spin_lock(&ci->i_ceph_lock);
	/* i_cap_snaps is ordered oldest-first; take the first with dirty pages */
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
		     capsnap->context, capsnap->dirty_pages);
		if (capsnap->dirty_pages) {
			snapc = ceph_get_snap_context(capsnap->context);
			if (snap_size)
				*snap_size = capsnap->size;
			break;
		}
	}
	/* no dirty capsnaps: fall back to the live "head" context */
	if (!snapc && ci->i_wrbuffer_ref_head) {
		snapc = ceph_get_snap_context(ci->i_head_snapc);
		dout(" head snapc %p has %d dirty pages\n",
		     snapc, ci->i_wrbuffer_ref_head);
	}
	spin_unlock(&ci->i_ceph_lock);
	return snapc;
}
459 | 459 | ||
/*
 * Write a single page, but leave the page locked.
 *
 * If we get a write error, set the page error bit, but still adjust the
 * dirty page accounting (i.e., page is no longer dirty).
 */
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_fs_client *fsc;
	struct ceph_osd_client *osdc;
	struct ceph_snap_context *snapc, *oldest;
	loff_t page_off = page_offset(page);
	long writeback_stat;
	u64 truncate_size, snap_size = 0;
	u32 truncate_seq;
	int err = 0, len = PAGE_CACHE_SIZE;

	dout("writepage %p idx %lu\n", page, page->index);

	if (!page->mapping || !page->mapping->host) {
		dout("writepage %p - no mapping\n", page);
		return -EFAULT;
	}
	inode = page->mapping->host;
	ci = ceph_inode(inode);
	fsc = ceph_inode_to_client(inode);
	osdc = &fsc->client->osdc;

	/* verify this is a writeable snap context */
	snapc = page_snap_context(page);
	if (snapc == NULL) {
		dout("writepage %p page %p not dirty?\n", inode, page);
		goto out;
	}
	/*
	 * Only the oldest dirty snap context may be written back;
	 * newer ones must wait until older snapped data is flushed.
	 * If a capsnap is pending, snap_size is set to its frozen size.
	 */
	oldest = get_oldest_context(inode, &snap_size);
	if (snapc->seq > oldest->seq) {
		dout("writepage %p page %p snapc %p not writeable - noop\n",
		     inode, page, snapc);
		/* we should only noop if called by kswapd */
		WARN_ON((current->flags & PF_MEMALLOC) == 0);
		ceph_put_snap_context(oldest);
		goto out;
	}
	ceph_put_snap_context(oldest);

	/* sample truncate state under i_ceph_lock so seq/size agree */
	spin_lock(&ci->i_ceph_lock);
	truncate_seq = ci->i_truncate_seq;
	truncate_size = ci->i_truncate_size;
	if (!snap_size)
		snap_size = i_size_read(inode);
	spin_unlock(&ci->i_ceph_lock);

	/* is this a partial page at end of file? */
	if (page_off >= snap_size) {
		dout("%p page eof %llu\n", page, snap_size);
		goto out;
	}
	if (snap_size < page_off + len)
		len = snap_size - page_off;

	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
	     inode, page, page->index, page_off, len, snapc);

	/* throttle writeback: mark the bdi congested past the threshold */
	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
	if (writeback_stat >
	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);

	ceph_readpage_to_fscache(inode, page);

	/* synchronous OSD write of this single page */
	set_page_writeback(page);
	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
				   &ci->i_layout, snapc,
				   page_off, len,
				   truncate_seq, truncate_size,
				   &inode->i_mtime, &page, 1);
	if (err < 0) {
		dout("writepage setting page/mapping error %d %p\n", err, page);
		SetPageError(page);
		mapping_set_error(&inode->i_data, err);
		if (wbc)
			wbc->pages_skipped++;
	} else {
		dout("writepage cleaned page %p\n", page);
		err = 0;  /* vfs expects us to return 0 */
	}
	/* drop the page's snap context reference and writeback state;
	 * the page stays locked for the caller to unlock */
	page->private = 0;
	ClearPagePrivate(page);
	end_page_writeback(page);
	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);	/* page's reference */
out:
	return err;
}
556 | 556 | ||
557 | static int ceph_writepage(struct page *page, struct writeback_control *wbc) | 557 | static int ceph_writepage(struct page *page, struct writeback_control *wbc) |
558 | { | 558 | { |
559 | int err; | 559 | int err; |
560 | struct inode *inode = page->mapping->host; | 560 | struct inode *inode = page->mapping->host; |
561 | BUG_ON(!inode); | 561 | BUG_ON(!inode); |
562 | ihold(inode); | 562 | ihold(inode); |
563 | err = writepage_nounlock(page, wbc); | 563 | err = writepage_nounlock(page, wbc); |
564 | unlock_page(page); | 564 | unlock_page(page); |
565 | iput(inode); | 565 | iput(inode); |
566 | return err; | 566 | return err; |
567 | } | 567 | } |
568 | 568 | ||
569 | 569 | ||
570 | /* | 570 | /* |
571 | * lame release_pages helper. release_pages() isn't exported to | 571 | * lame release_pages helper. release_pages() isn't exported to |
572 | * modules. | 572 | * modules. |
573 | */ | 573 | */ |
574 | static void ceph_release_pages(struct page **pages, int num) | 574 | static void ceph_release_pages(struct page **pages, int num) |
575 | { | 575 | { |
576 | struct pagevec pvec; | 576 | struct pagevec pvec; |
577 | int i; | 577 | int i; |
578 | 578 | ||
579 | pagevec_init(&pvec, 0); | 579 | pagevec_init(&pvec, 0); |
580 | for (i = 0; i < num; i++) { | 580 | for (i = 0; i < num; i++) { |
581 | if (pagevec_add(&pvec, pages[i]) == 0) | 581 | if (pagevec_add(&pvec, pages[i]) == 0) |
582 | pagevec_release(&pvec); | 582 | pagevec_release(&pvec); |
583 | } | 583 | } |
584 | pagevec_release(&pvec); | 584 | pagevec_release(&pvec); |
585 | } | 585 | } |
586 | 586 | ||
/*
 * async writeback completion handler.
 *
 * If we get an error, set the mapping error bit, but not the individual
 * page error bits.
 */
static void writepages_finish(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	struct inode *inode = req->r_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_data *osd_data;
	unsigned wrote;
	struct page *page;
	int num_pages;
	int i;
	struct ceph_snap_context *snapc = req->r_snapc;
	struct address_space *mapping = inode->i_mapping;
	int rc = req->r_result;
	u64 bytes = req->r_ops[0].extent.length;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	long writeback_stat;
	unsigned issued = ceph_caps_issued(ci);

	/* recover the page array we attached to op 0 at submit time */
	osd_data = osd_req_op_extent_osd_data(req, 0);
	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
	num_pages = calc_pages_for((u64)osd_data->alignment,
					(u64)osd_data->length);
	if (rc >= 0) {
		/*
		 * Assume we wrote the pages we originally sent. The
		 * osd might reply with fewer pages if our writeback
		 * raced with a truncation and was adjusted at the osd,
		 * so don't believe the reply.
		 */
		wrote = num_pages;
	} else {
		wrote = 0;
		mapping_set_error(mapping, rc);
	}
	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
	     inode, rc, bytes, wrote);

	/* clean all pages */
	for (i = 0; i < num_pages; i++) {
		page = osd_data->pages[i];
		BUG_ON(!page);
		WARN_ON(!PageUptodate(page));

		/* undo the congestion accounting done at submit time */
		writeback_stat =
			atomic_long_dec_return(&fsc->writeback_count);
		if (writeback_stat <
		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
			clear_bdi_congested(&fsc->backing_dev_info,
					    BLK_RW_ASYNC);

		/* drop the page's reference on its snap context */
		ceph_put_snap_context(page_snap_context(page));
		page->private = 0;
		ClearPagePrivate(page);
		dout("unlocking %d %p\n", i, page);
		end_page_writeback(page);

		/*
		 * We lost the cache cap, need to truncate the page before
		 * it is unlocked, otherwise we'd truncate it later in the
		 * page truncation thread, possibly losing some data that
		 * raced its way in
		 */
		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
			generic_error_remove_page(inode->i_mapping, page);

		unlock_page(page);
	}
	dout("%p wrote+cleaned %d pages\n", inode, wrote);
	/* release the wrbuffer cap refs taken when the pages were dirtied */
	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);

	/* free the page array: it came from either kmalloc or the
	 * wb_pagevec mempool, as recorded in pages_from_pool */
	ceph_release_pages(osd_data->pages, num_pages);
	if (osd_data->pages_from_pool)
		mempool_free(osd_data->pages,
			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
	else
		kfree(osd_data->pages);
	ceph_osdc_put_request(req);
}
671 | 671 | ||
672 | /* | 672 | /* |
673 | * initiate async writeback | 673 | * initiate async writeback |
674 | */ | 674 | */ |
675 | static int ceph_writepages_start(struct address_space *mapping, | 675 | static int ceph_writepages_start(struct address_space *mapping, |
676 | struct writeback_control *wbc) | 676 | struct writeback_control *wbc) |
677 | { | 677 | { |
678 | struct inode *inode = mapping->host; | 678 | struct inode *inode = mapping->host; |
679 | struct ceph_inode_info *ci = ceph_inode(inode); | 679 | struct ceph_inode_info *ci = ceph_inode(inode); |
680 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 680 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
681 | struct ceph_vino vino = ceph_vino(inode); | 681 | struct ceph_vino vino = ceph_vino(inode); |
682 | pgoff_t index, start, end; | 682 | pgoff_t index, start, end; |
683 | int range_whole = 0; | 683 | int range_whole = 0; |
684 | int should_loop = 1; | 684 | int should_loop = 1; |
685 | pgoff_t max_pages = 0, max_pages_ever = 0; | 685 | pgoff_t max_pages = 0, max_pages_ever = 0; |
686 | struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; | 686 | struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; |
687 | struct pagevec pvec; | 687 | struct pagevec pvec; |
688 | int done = 0; | 688 | int done = 0; |
689 | int rc = 0; | 689 | int rc = 0; |
690 | unsigned wsize = 1 << inode->i_blkbits; | 690 | unsigned wsize = 1 << inode->i_blkbits; |
691 | struct ceph_osd_request *req = NULL; | 691 | struct ceph_osd_request *req = NULL; |
692 | int do_sync = 0; | 692 | int do_sync = 0; |
693 | u64 truncate_size, snap_size; | 693 | u64 truncate_size, snap_size; |
694 | u32 truncate_seq; | 694 | u32 truncate_seq; |
695 | 695 | ||
696 | /* | 696 | /* |
697 | * Include a 'sync' in the OSD request if this is a data | 697 | * Include a 'sync' in the OSD request if this is a data |
698 | * integrity write (e.g., O_SYNC write or fsync()), or if our | 698 | * integrity write (e.g., O_SYNC write or fsync()), or if our |
699 | * cap is being revoked. | 699 | * cap is being revoked. |
700 | */ | 700 | */ |
701 | if ((wbc->sync_mode == WB_SYNC_ALL) || | 701 | if ((wbc->sync_mode == WB_SYNC_ALL) || |
702 | ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) | 702 | ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) |
703 | do_sync = 1; | 703 | do_sync = 1; |
704 | dout("writepages_start %p dosync=%d (mode=%s)\n", | 704 | dout("writepages_start %p dosync=%d (mode=%s)\n", |
705 | inode, do_sync, | 705 | inode, do_sync, |
706 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : | 706 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : |
707 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); | 707 | (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); |
708 | 708 | ||
709 | if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { | 709 | if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { |
710 | pr_warn("writepage_start %p on forced umount\n", inode); | 710 | pr_warn("writepage_start %p on forced umount\n", inode); |
711 | return -EIO; /* we're in a forced umount, don't write! */ | 711 | return -EIO; /* we're in a forced umount, don't write! */ |
712 | } | 712 | } |
713 | if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) | 713 | if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) |
714 | wsize = fsc->mount_options->wsize; | 714 | wsize = fsc->mount_options->wsize; |
715 | if (wsize < PAGE_CACHE_SIZE) | 715 | if (wsize < PAGE_CACHE_SIZE) |
716 | wsize = PAGE_CACHE_SIZE; | 716 | wsize = PAGE_CACHE_SIZE; |
717 | max_pages_ever = wsize >> PAGE_CACHE_SHIFT; | 717 | max_pages_ever = wsize >> PAGE_CACHE_SHIFT; |
718 | 718 | ||
719 | pagevec_init(&pvec, 0); | 719 | pagevec_init(&pvec, 0); |
720 | 720 | ||
721 | /* where to start/end? */ | 721 | /* where to start/end? */ |
722 | if (wbc->range_cyclic) { | 722 | if (wbc->range_cyclic) { |
723 | start = mapping->writeback_index; /* Start from prev offset */ | 723 | start = mapping->writeback_index; /* Start from prev offset */ |
724 | end = -1; | 724 | end = -1; |
725 | dout(" cyclic, start at %lu\n", start); | 725 | dout(" cyclic, start at %lu\n", start); |
726 | } else { | 726 | } else { |
727 | start = wbc->range_start >> PAGE_CACHE_SHIFT; | 727 | start = wbc->range_start >> PAGE_CACHE_SHIFT; |
728 | end = wbc->range_end >> PAGE_CACHE_SHIFT; | 728 | end = wbc->range_end >> PAGE_CACHE_SHIFT; |
729 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) | 729 | if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) |
730 | range_whole = 1; | 730 | range_whole = 1; |
731 | should_loop = 0; | 731 | should_loop = 0; |
732 | dout(" not cyclic, %lu to %lu\n", start, end); | 732 | dout(" not cyclic, %lu to %lu\n", start, end); |
733 | } | 733 | } |
734 | index = start; | 734 | index = start; |
735 | 735 | ||
736 | retry: | 736 | retry: |
737 | /* find oldest snap context with dirty data */ | 737 | /* find oldest snap context with dirty data */ |
738 | ceph_put_snap_context(snapc); | 738 | ceph_put_snap_context(snapc); |
739 | snap_size = 0; | 739 | snap_size = 0; |
740 | snapc = get_oldest_context(inode, &snap_size); | 740 | snapc = get_oldest_context(inode, &snap_size); |
741 | if (!snapc) { | 741 | if (!snapc) { |
742 | /* hmm, why does writepages get called when there | 742 | /* hmm, why does writepages get called when there |
743 | is no dirty data? */ | 743 | is no dirty data? */ |
744 | dout(" no snap context with dirty data?\n"); | 744 | dout(" no snap context with dirty data?\n"); |
745 | goto out; | 745 | goto out; |
746 | } | 746 | } |
747 | if (snap_size == 0) | 747 | if (snap_size == 0) |
748 | snap_size = i_size_read(inode); | 748 | snap_size = i_size_read(inode); |
749 | dout(" oldest snapc is %p seq %lld (%d snaps)\n", | 749 | dout(" oldest snapc is %p seq %lld (%d snaps)\n", |
750 | snapc, snapc->seq, snapc->num_snaps); | 750 | snapc, snapc->seq, snapc->num_snaps); |
751 | 751 | ||
752 | spin_lock(&ci->i_ceph_lock); | 752 | spin_lock(&ci->i_ceph_lock); |
753 | truncate_seq = ci->i_truncate_seq; | 753 | truncate_seq = ci->i_truncate_seq; |
754 | truncate_size = ci->i_truncate_size; | 754 | truncate_size = ci->i_truncate_size; |
755 | if (!snap_size) | 755 | if (!snap_size) |
756 | snap_size = i_size_read(inode); | 756 | snap_size = i_size_read(inode); |
757 | spin_unlock(&ci->i_ceph_lock); | 757 | spin_unlock(&ci->i_ceph_lock); |
758 | 758 | ||
759 | if (last_snapc && snapc != last_snapc) { | 759 | if (last_snapc && snapc != last_snapc) { |
760 | /* if we switched to a newer snapc, restart our scan at the | 760 | /* if we switched to a newer snapc, restart our scan at the |
761 | * start of the original file range. */ | 761 | * start of the original file range. */ |
762 | dout(" snapc differs from last pass, restarting at %lu\n", | 762 | dout(" snapc differs from last pass, restarting at %lu\n", |
763 | index); | 763 | index); |
764 | index = start; | 764 | index = start; |
765 | } | 765 | } |
766 | last_snapc = snapc; | 766 | last_snapc = snapc; |
767 | 767 | ||
768 | while (!done && index <= end) { | 768 | while (!done && index <= end) { |
769 | unsigned i; | 769 | unsigned i; |
770 | int first; | 770 | int first; |
771 | pgoff_t next; | 771 | pgoff_t next; |
772 | int pvec_pages, locked_pages; | 772 | int pvec_pages, locked_pages; |
773 | struct page **pages = NULL; | 773 | struct page **pages = NULL; |
774 | mempool_t *pool = NULL; /* Becomes non-null if mempool used */ | 774 | mempool_t *pool = NULL; /* Becomes non-null if mempool used */ |
775 | struct page *page; | 775 | struct page *page; |
776 | int want; | 776 | int want; |
777 | u64 offset, len; | 777 | u64 offset, len; |
778 | long writeback_stat; | 778 | long writeback_stat; |
779 | 779 | ||
780 | next = 0; | 780 | next = 0; |
781 | locked_pages = 0; | 781 | locked_pages = 0; |
782 | max_pages = max_pages_ever; | 782 | max_pages = max_pages_ever; |
783 | 783 | ||
784 | get_more_pages: | 784 | get_more_pages: |
785 | first = -1; | 785 | first = -1; |
786 | want = min(end - index, | 786 | want = min(end - index, |
787 | min((pgoff_t)PAGEVEC_SIZE, | 787 | min((pgoff_t)PAGEVEC_SIZE, |
788 | max_pages - (pgoff_t)locked_pages) - 1) | 788 | max_pages - (pgoff_t)locked_pages) - 1) |
789 | + 1; | 789 | + 1; |
790 | pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 790 | pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
791 | PAGECACHE_TAG_DIRTY, | 791 | PAGECACHE_TAG_DIRTY, |
792 | want); | 792 | want); |
793 | dout("pagevec_lookup_tag got %d\n", pvec_pages); | 793 | dout("pagevec_lookup_tag got %d\n", pvec_pages); |
794 | if (!pvec_pages && !locked_pages) | 794 | if (!pvec_pages && !locked_pages) |
795 | break; | 795 | break; |
796 | for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { | 796 | for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { |
797 | page = pvec.pages[i]; | 797 | page = pvec.pages[i]; |
798 | dout("? %p idx %lu\n", page, page->index); | 798 | dout("? %p idx %lu\n", page, page->index); |
799 | if (locked_pages == 0) | 799 | if (locked_pages == 0) |
800 | lock_page(page); /* first page */ | 800 | lock_page(page); /* first page */ |
801 | else if (!trylock_page(page)) | 801 | else if (!trylock_page(page)) |
802 | break; | 802 | break; |
803 | 803 | ||
804 | /* only dirty pages, or our accounting breaks */ | 804 | /* only dirty pages, or our accounting breaks */ |
805 | if (unlikely(!PageDirty(page)) || | 805 | if (unlikely(!PageDirty(page)) || |
806 | unlikely(page->mapping != mapping)) { | 806 | unlikely(page->mapping != mapping)) { |
807 | dout("!dirty or !mapping %p\n", page); | 807 | dout("!dirty or !mapping %p\n", page); |
808 | unlock_page(page); | 808 | unlock_page(page); |
809 | break; | 809 | break; |
810 | } | 810 | } |
811 | if (!wbc->range_cyclic && page->index > end) { | 811 | if (!wbc->range_cyclic && page->index > end) { |
812 | dout("end of range %p\n", page); | 812 | dout("end of range %p\n", page); |
813 | done = 1; | 813 | done = 1; |
814 | unlock_page(page); | 814 | unlock_page(page); |
815 | break; | 815 | break; |
816 | } | 816 | } |
817 | if (next && (page->index != next)) { | 817 | if (next && (page->index != next)) { |
818 | dout("not consecutive %p\n", page); | 818 | dout("not consecutive %p\n", page); |
819 | unlock_page(page); | 819 | unlock_page(page); |
820 | break; | 820 | break; |
821 | } | 821 | } |
822 | if (wbc->sync_mode != WB_SYNC_NONE) { | 822 | if (wbc->sync_mode != WB_SYNC_NONE) { |
823 | dout("waiting on writeback %p\n", page); | 823 | dout("waiting on writeback %p\n", page); |
824 | wait_on_page_writeback(page); | 824 | wait_on_page_writeback(page); |
825 | } | 825 | } |
826 | if (page_offset(page) >= snap_size) { | 826 | if (page_offset(page) >= snap_size) { |
827 | dout("%p page eof %llu\n", page, snap_size); | 827 | dout("%p page eof %llu\n", page, snap_size); |
828 | done = 1; | 828 | done = 1; |
829 | unlock_page(page); | 829 | unlock_page(page); |
830 | break; | 830 | break; |
831 | } | 831 | } |
832 | if (PageWriteback(page)) { | 832 | if (PageWriteback(page)) { |
833 | dout("%p under writeback\n", page); | 833 | dout("%p under writeback\n", page); |
834 | unlock_page(page); | 834 | unlock_page(page); |
835 | break; | 835 | break; |
836 | } | 836 | } |
837 | 837 | ||
838 | /* only if matching snap context */ | 838 | /* only if matching snap context */ |
839 | pgsnapc = page_snap_context(page); | 839 | pgsnapc = page_snap_context(page); |
840 | if (pgsnapc->seq > snapc->seq) { | 840 | if (pgsnapc->seq > snapc->seq) { |
841 | dout("page snapc %p %lld > oldest %p %lld\n", | 841 | dout("page snapc %p %lld > oldest %p %lld\n", |
842 | pgsnapc, pgsnapc->seq, snapc, snapc->seq); | 842 | pgsnapc, pgsnapc->seq, snapc, snapc->seq); |
843 | unlock_page(page); | 843 | unlock_page(page); |
844 | if (!locked_pages) | 844 | if (!locked_pages) |
845 | continue; /* keep looking for snap */ | 845 | continue; /* keep looking for snap */ |
846 | break; | 846 | break; |
847 | } | 847 | } |
848 | 848 | ||
849 | if (!clear_page_dirty_for_io(page)) { | 849 | if (!clear_page_dirty_for_io(page)) { |
850 | dout("%p !clear_page_dirty_for_io\n", page); | 850 | dout("%p !clear_page_dirty_for_io\n", page); |
851 | unlock_page(page); | 851 | unlock_page(page); |
852 | break; | 852 | break; |
853 | } | 853 | } |
854 | 854 | ||
855 | /* | 855 | /* |
856 | * We have something to write. If this is | 856 | * We have something to write. If this is |
857 | * the first locked page this time through, | 857 | * the first locked page this time through, |
858 | * allocate an osd request and a page array | 858 | * allocate an osd request and a page array |
859 | * that it will use. | 859 | * that it will use. |
860 | */ | 860 | */ |
861 | if (locked_pages == 0) { | 861 | if (locked_pages == 0) { |
862 | BUG_ON(pages); | 862 | BUG_ON(pages); |
863 | /* prepare async write request */ | 863 | /* prepare async write request */ |
864 | offset = (u64)page_offset(page); | 864 | offset = (u64)page_offset(page); |
865 | len = wsize; | 865 | len = wsize; |
866 | req = ceph_osdc_new_request(&fsc->client->osdc, | 866 | req = ceph_osdc_new_request(&fsc->client->osdc, |
867 | &ci->i_layout, vino, | 867 | &ci->i_layout, vino, |
868 | offset, &len, 0, | 868 | offset, &len, 0, |
869 | do_sync ? 2 : 1, | 869 | do_sync ? 2 : 1, |
870 | CEPH_OSD_OP_WRITE, | 870 | CEPH_OSD_OP_WRITE, |
871 | CEPH_OSD_FLAG_WRITE | | 871 | CEPH_OSD_FLAG_WRITE | |
872 | CEPH_OSD_FLAG_ONDISK, | 872 | CEPH_OSD_FLAG_ONDISK, |
873 | snapc, truncate_seq, | 873 | snapc, truncate_seq, |
874 | truncate_size, true); | 874 | truncate_size, true); |
875 | if (IS_ERR(req)) { | 875 | if (IS_ERR(req)) { |
876 | rc = PTR_ERR(req); | 876 | rc = PTR_ERR(req); |
877 | unlock_page(page); | 877 | unlock_page(page); |
878 | break; | 878 | break; |
879 | } | 879 | } |
880 | 880 | ||
881 | if (do_sync) | 881 | if (do_sync) |
882 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); | 882 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); |
883 | 883 | ||
884 | req->r_callback = writepages_finish; | 884 | req->r_callback = writepages_finish; |
885 | req->r_inode = inode; | 885 | req->r_inode = inode; |
886 | 886 | ||
887 | max_pages = calc_pages_for(0, (u64)len); | 887 | max_pages = calc_pages_for(0, (u64)len); |
888 | pages = kmalloc(max_pages * sizeof (*pages), | 888 | pages = kmalloc(max_pages * sizeof (*pages), |
889 | GFP_NOFS); | 889 | GFP_NOFS); |
890 | if (!pages) { | 890 | if (!pages) { |
891 | pool = fsc->wb_pagevec_pool; | 891 | pool = fsc->wb_pagevec_pool; |
892 | pages = mempool_alloc(pool, GFP_NOFS); | 892 | pages = mempool_alloc(pool, GFP_NOFS); |
893 | BUG_ON(!pages); | 893 | BUG_ON(!pages); |
894 | } | 894 | } |
895 | } | 895 | } |
896 | 896 | ||
897 | /* note position of first page in pvec */ | 897 | /* note position of first page in pvec */ |
898 | if (first < 0) | 898 | if (first < 0) |
899 | first = i; | 899 | first = i; |
900 | dout("%p will write page %p idx %lu\n", | 900 | dout("%p will write page %p idx %lu\n", |
901 | inode, page, page->index); | 901 | inode, page, page->index); |
902 | 902 | ||
903 | writeback_stat = | 903 | writeback_stat = |
904 | atomic_long_inc_return(&fsc->writeback_count); | 904 | atomic_long_inc_return(&fsc->writeback_count); |
905 | if (writeback_stat > CONGESTION_ON_THRESH( | 905 | if (writeback_stat > CONGESTION_ON_THRESH( |
906 | fsc->mount_options->congestion_kb)) { | 906 | fsc->mount_options->congestion_kb)) { |
907 | set_bdi_congested(&fsc->backing_dev_info, | 907 | set_bdi_congested(&fsc->backing_dev_info, |
908 | BLK_RW_ASYNC); | 908 | BLK_RW_ASYNC); |
909 | } | 909 | } |
910 | 910 | ||
911 | set_page_writeback(page); | 911 | set_page_writeback(page); |
912 | pages[locked_pages] = page; | 912 | pages[locked_pages] = page; |
913 | locked_pages++; | 913 | locked_pages++; |
914 | next = page->index + 1; | 914 | next = page->index + 1; |
915 | } | 915 | } |
916 | 916 | ||
917 | /* did we get anything? */ | 917 | /* did we get anything? */ |
918 | if (!locked_pages) | 918 | if (!locked_pages) |
919 | goto release_pvec_pages; | 919 | goto release_pvec_pages; |
920 | if (i) { | 920 | if (i) { |
921 | int j; | 921 | int j; |
922 | BUG_ON(!locked_pages || first < 0); | 922 | BUG_ON(!locked_pages || first < 0); |
923 | 923 | ||
924 | if (pvec_pages && i == pvec_pages && | 924 | if (pvec_pages && i == pvec_pages && |
925 | locked_pages < max_pages) { | 925 | locked_pages < max_pages) { |
926 | dout("reached end pvec, trying for more\n"); | 926 | dout("reached end pvec, trying for more\n"); |
927 | pagevec_reinit(&pvec); | 927 | pagevec_reinit(&pvec); |
928 | goto get_more_pages; | 928 | goto get_more_pages; |
929 | } | 929 | } |
930 | 930 | ||
931 | /* shift unused pages over in the pvec... we | 931 | /* shift unused pages over in the pvec... we |
932 | * will need to release them below. */ | 932 | * will need to release them below. */ |
933 | for (j = i; j < pvec_pages; j++) { | 933 | for (j = i; j < pvec_pages; j++) { |
934 | dout(" pvec leftover page %p\n", | 934 | dout(" pvec leftover page %p\n", |
935 | pvec.pages[j]); | 935 | pvec.pages[j]); |
936 | pvec.pages[j-i+first] = pvec.pages[j]; | 936 | pvec.pages[j-i+first] = pvec.pages[j]; |
937 | } | 937 | } |
938 | pvec.nr -= i-first; | 938 | pvec.nr -= i-first; |
939 | } | 939 | } |
940 | 940 | ||
941 | /* Format the osd request message and submit the write */ | 941 | /* Format the osd request message and submit the write */ |
942 | 942 | ||
943 | offset = page_offset(pages[0]); | 943 | offset = page_offset(pages[0]); |
944 | len = min(snap_size - offset, | 944 | len = min(snap_size - offset, |
945 | (u64)locked_pages << PAGE_CACHE_SHIFT); | 945 | (u64)locked_pages << PAGE_CACHE_SHIFT); |
946 | dout("writepages got %d pages at %llu~%llu\n", | 946 | dout("writepages got %d pages at %llu~%llu\n", |
947 | locked_pages, offset, len); | 947 | locked_pages, offset, len); |
948 | 948 | ||
949 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, | 949 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, |
950 | !!pool, false); | 950 | !!pool, false); |
951 | 951 | ||
952 | pages = NULL; /* request message now owns the pages array */ | 952 | pages = NULL; /* request message now owns the pages array */ |
953 | pool = NULL; | 953 | pool = NULL; |
954 | 954 | ||
955 | /* Update the write op length in case we changed it */ | 955 | /* Update the write op length in case we changed it */ |
956 | 956 | ||
957 | osd_req_op_extent_update(req, 0, len); | 957 | osd_req_op_extent_update(req, 0, len); |
958 | 958 | ||
959 | vino = ceph_vino(inode); | 959 | vino = ceph_vino(inode); |
960 | ceph_osdc_build_request(req, offset, snapc, vino.snap, | 960 | ceph_osdc_build_request(req, offset, snapc, vino.snap, |
961 | &inode->i_mtime); | 961 | &inode->i_mtime); |
962 | 962 | ||
963 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); | 963 | rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); |
964 | BUG_ON(rc); | 964 | BUG_ON(rc); |
965 | req = NULL; | 965 | req = NULL; |
966 | 966 | ||
967 | /* continue? */ | 967 | /* continue? */ |
968 | index = next; | 968 | index = next; |
969 | wbc->nr_to_write -= locked_pages; | 969 | wbc->nr_to_write -= locked_pages; |
970 | if (wbc->nr_to_write <= 0) | 970 | if (wbc->nr_to_write <= 0) |
971 | done = 1; | 971 | done = 1; |
972 | 972 | ||
973 | release_pvec_pages: | 973 | release_pvec_pages: |
974 | dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, | 974 | dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, |
975 | pvec.nr ? pvec.pages[0] : NULL); | 975 | pvec.nr ? pvec.pages[0] : NULL); |
976 | pagevec_release(&pvec); | 976 | pagevec_release(&pvec); |
977 | 977 | ||
978 | if (locked_pages && !done) | 978 | if (locked_pages && !done) |
979 | goto retry; | 979 | goto retry; |
980 | } | 980 | } |
981 | 981 | ||
982 | if (should_loop && !done) { | 982 | if (should_loop && !done) { |
983 | /* more to do; loop back to beginning of file */ | 983 | /* more to do; loop back to beginning of file */ |
984 | dout("writepages looping back to beginning of file\n"); | 984 | dout("writepages looping back to beginning of file\n"); |
985 | should_loop = 0; | 985 | should_loop = 0; |
986 | index = 0; | 986 | index = 0; |
987 | goto retry; | 987 | goto retry; |
988 | } | 988 | } |
989 | 989 | ||
990 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 990 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
991 | mapping->writeback_index = index; | 991 | mapping->writeback_index = index; |
992 | 992 | ||
993 | out: | 993 | out: |
994 | if (req) | 994 | if (req) |
995 | ceph_osdc_put_request(req); | 995 | ceph_osdc_put_request(req); |
996 | ceph_put_snap_context(snapc); | 996 | ceph_put_snap_context(snapc); |
997 | dout("writepages done, rc = %d\n", rc); | 997 | dout("writepages done, rc = %d\n", rc); |
998 | return rc; | 998 | return rc; |
999 | } | 999 | } |
1000 | 1000 | ||
1001 | 1001 | ||
1002 | 1002 | ||
1003 | /* | 1003 | /* |
1004 | * See if a given @snapc is either writeable, or already written. | 1004 | * See if a given @snapc is either writeable, or already written. |
1005 | */ | 1005 | */ |
1006 | static int context_is_writeable_or_written(struct inode *inode, | 1006 | static int context_is_writeable_or_written(struct inode *inode, |
1007 | struct ceph_snap_context *snapc) | 1007 | struct ceph_snap_context *snapc) |
1008 | { | 1008 | { |
1009 | struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); | 1009 | struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); |
1010 | int ret = !oldest || snapc->seq <= oldest->seq; | 1010 | int ret = !oldest || snapc->seq <= oldest->seq; |
1011 | 1011 | ||
1012 | ceph_put_snap_context(oldest); | 1012 | ceph_put_snap_context(oldest); |
1013 | return ret; | 1013 | return ret; |
1014 | } | 1014 | } |
1015 | 1015 | ||
/*
 * We are only allowed to write into/dirty the page if the page is
 * clean, or already dirty within the same snap context.
 *
 * called with page locked.
 * return success with page locked AND mdsc->snap_rwsem held for read
 * (the caller drops it, e.g. in ceph_write_end()),
 * or any failure (incl -EAGAIN) with page unlocked and snap_rwsem
 * released.
 */
static int ceph_update_writeable_page(struct file *file,
			    loff_t pos, unsigned len,
			    struct page *page)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	loff_t page_off = pos & PAGE_CACHE_MASK;	/* page-aligned file offset */
	int pos_in_page = pos & ~PAGE_CACHE_MASK;	/* write start within the page */
	int end_in_page = pos_in_page + len;		/* write end within the page */
	loff_t i_size;
	int r;
	struct ceph_snap_context *snapc, *oldest;

retry_locked:
	/* writepages currently holds page lock, but if we change that later, */
	wait_on_page_writeback(page);

	/* check snap context */
	BUG_ON(!ci->i_snap_realm);
	down_read(&mdsc->snap_rwsem);
	BUG_ON(!ci->i_snap_realm->cached_context);
	snapc = page_snap_context(page);
	if (snapc && snapc != ci->i_head_snapc) {
		/*
		 * this page is already dirty in another (older) snap
		 * context!  is it writeable now?
		 */
		oldest = get_oldest_context(inode, NULL);
		up_read(&mdsc->snap_rwsem);

		if (snapc->seq > oldest->seq) {
			ceph_put_snap_context(oldest);
			dout(" page %p snapc %p not current or oldest\n",
			     page, snapc);
			/*
			 * queue for writeback, and wait for snapc to
			 * be writeable or written
			 */
			snapc = ceph_get_snap_context(snapc);
			unlock_page(page);
			ceph_queue_writeback(inode);
			r = wait_event_interruptible(ci->i_cap_wq,
			       context_is_writeable_or_written(inode, snapc));
			ceph_put_snap_context(snapc);
			if (r == -ERESTARTSYS)
				return r;
			/* tell the caller to retry from scratch */
			return -EAGAIN;
		}
		ceph_put_snap_context(oldest);

		/* yay, writeable, do it now (without dropping page lock) */
		dout(" page %p snapc %p not current, but oldest\n",
		     page, snapc);
		if (!clear_page_dirty_for_io(page))
			goto retry_locked;	/* lost a race; re-check state */
		r = writepage_nounlock(page, NULL);
		if (r < 0)
			goto fail_nosnap;
		goto retry_locked;
	}

	if (PageUptodate(page)) {
		dout(" page %p already uptodate\n", page);
		return 0;
	}

	/* full page? */
	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
		return 0;

	/* past end of file? */
	i_size = inode->i_size;   /* caller holds i_mutex */

	if (page_off >= i_size ||
	    (pos_in_page == 0 && (pos+len) >= i_size &&
	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
		/*
		 * Page starts past EOF, or the write begins at the page
		 * start and extends to (or past) EOF: no read needed;
		 * just zero the parts of the page outside the write.
		 */
		dout(" zeroing %p 0 - %d and %d - %d\n",
		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
		zero_user_segments(page,
				   0, pos_in_page,
				   end_in_page, PAGE_CACHE_SIZE);
		return 0;
	}

	/* we need to read it. */
	up_read(&mdsc->snap_rwsem);
	r = readpage_nounlock(file, page);
	if (r < 0)
		goto fail_nosnap;
	goto retry_locked;
fail_nosnap:
	/* snap_rwsem was already released on both paths leading here */
	unlock_page(page);
	return r;
}
1119 | 1119 | ||
1120 | /* | 1120 | /* |
1121 | * We are only allowed to write into/dirty the page if the page is | 1121 | * We are only allowed to write into/dirty the page if the page is |
1122 | * clean, or already dirty within the same snap context. | 1122 | * clean, or already dirty within the same snap context. |
1123 | */ | 1123 | */ |
1124 | static int ceph_write_begin(struct file *file, struct address_space *mapping, | 1124 | static int ceph_write_begin(struct file *file, struct address_space *mapping, |
1125 | loff_t pos, unsigned len, unsigned flags, | 1125 | loff_t pos, unsigned len, unsigned flags, |
1126 | struct page **pagep, void **fsdata) | 1126 | struct page **pagep, void **fsdata) |
1127 | { | 1127 | { |
1128 | struct inode *inode = file_inode(file); | 1128 | struct inode *inode = file_inode(file); |
1129 | struct page *page; | 1129 | struct page *page; |
1130 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1130 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1131 | int r; | 1131 | int r; |
1132 | 1132 | ||
1133 | do { | 1133 | do { |
1134 | /* get a page */ | 1134 | /* get a page */ |
1135 | page = grab_cache_page_write_begin(mapping, index, 0); | 1135 | page = grab_cache_page_write_begin(mapping, index, 0); |
1136 | if (!page) | 1136 | if (!page) |
1137 | return -ENOMEM; | 1137 | return -ENOMEM; |
1138 | *pagep = page; | 1138 | *pagep = page; |
1139 | 1139 | ||
1140 | dout("write_begin file %p inode %p page %p %d~%d\n", file, | 1140 | dout("write_begin file %p inode %p page %p %d~%d\n", file, |
1141 | inode, page, (int)pos, (int)len); | 1141 | inode, page, (int)pos, (int)len); |
1142 | 1142 | ||
1143 | r = ceph_update_writeable_page(file, pos, len, page); | 1143 | r = ceph_update_writeable_page(file, pos, len, page); |
1144 | } while (r == -EAGAIN); | 1144 | } while (r == -EAGAIN); |
1145 | 1145 | ||
1146 | return r; | 1146 | return r; |
1147 | } | 1147 | } |
1148 | 1148 | ||
1149 | /* | 1149 | /* |
1150 | * we don't do anything in here that simple_write_end doesn't do | 1150 | * we don't do anything in here that simple_write_end doesn't do |
1151 | * except adjust dirty page accounting and drop read lock on | 1151 | * except adjust dirty page accounting and drop read lock on |
1152 | * mdsc->snap_rwsem. | 1152 | * mdsc->snap_rwsem. |
1153 | */ | 1153 | */ |
1154 | static int ceph_write_end(struct file *file, struct address_space *mapping, | 1154 | static int ceph_write_end(struct file *file, struct address_space *mapping, |
1155 | loff_t pos, unsigned len, unsigned copied, | 1155 | loff_t pos, unsigned len, unsigned copied, |
1156 | struct page *page, void *fsdata) | 1156 | struct page *page, void *fsdata) |
1157 | { | 1157 | { |
1158 | struct inode *inode = file_inode(file); | 1158 | struct inode *inode = file_inode(file); |
1159 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 1159 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
1160 | struct ceph_mds_client *mdsc = fsc->mdsc; | 1160 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1161 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | 1161 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); |
1162 | int check_cap = 0; | 1162 | int check_cap = 0; |
1163 | 1163 | ||
1164 | dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, | 1164 | dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, |
1165 | inode, page, (int)pos, (int)copied, (int)len); | 1165 | inode, page, (int)pos, (int)copied, (int)len); |
1166 | 1166 | ||
1167 | /* zero the stale part of the page if we did a short copy */ | 1167 | /* zero the stale part of the page if we did a short copy */ |
1168 | if (copied < len) | 1168 | if (copied < len) |
1169 | zero_user_segment(page, from+copied, len); | 1169 | zero_user_segment(page, from+copied, len); |
1170 | 1170 | ||
1171 | /* did file size increase? */ | 1171 | /* did file size increase? */ |
1172 | /* (no need for i_size_read(); we caller holds i_mutex */ | 1172 | /* (no need for i_size_read(); we caller holds i_mutex */ |
1173 | if (pos+copied > inode->i_size) | 1173 | if (pos+copied > inode->i_size) |
1174 | check_cap = ceph_inode_set_size(inode, pos+copied); | 1174 | check_cap = ceph_inode_set_size(inode, pos+copied); |
1175 | 1175 | ||
1176 | if (!PageUptodate(page)) | 1176 | if (!PageUptodate(page)) |
1177 | SetPageUptodate(page); | 1177 | SetPageUptodate(page); |
1178 | 1178 | ||
1179 | set_page_dirty(page); | 1179 | set_page_dirty(page); |
1180 | 1180 | ||
1181 | unlock_page(page); | 1181 | unlock_page(page); |
1182 | up_read(&mdsc->snap_rwsem); | 1182 | up_read(&mdsc->snap_rwsem); |
1183 | page_cache_release(page); | 1183 | page_cache_release(page); |
1184 | 1184 | ||
1185 | if (check_cap) | 1185 | if (check_cap) |
1186 | ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); | 1186 | ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); |
1187 | 1187 | ||
1188 | return copied; | 1188 | return copied; |
1189 | } | 1189 | } |
1190 | 1190 | ||
/*
 * we set .direct_IO to indicate direct io is supported, but since we
 * intercept O_DIRECT reads and writes early, this function should
 * never get called.
 */
static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
			      struct iov_iter *iter,
			      loff_t pos)
{
	WARN_ON(1);	/* reaching here means the O_DIRECT interception failed */
	return -EINVAL;
}
1203 | 1203 | ||
/* address_space operations for regular ceph files */
const struct address_space_operations ceph_aops = {
	.readpage = ceph_readpage,
	.readpages = ceph_readpages,
	.writepage = ceph_writepage,
	.writepages = ceph_writepages_start,
	.write_begin = ceph_write_begin,
	.write_end = ceph_write_end,
	.set_page_dirty = ceph_set_page_dirty,
	.invalidatepage = ceph_invalidatepage,
	.releasepage = ceph_releasepage,
	.direct_IO = ceph_direct_io,	/* stub; should never be reached */
};
1216 | 1216 | ||
1217 | 1217 | ||
1218 | /* | 1218 | /* |
1219 | * vm ops | 1219 | * vm ops |
1220 | */ | 1220 | */ |
/*
 * Fault handler: take FILE_RD caps, then satisfy the fault from the
 * page cache via filemap_fault().  If the data is still inline on the
 * MDS (and we lack cache caps), fetch it with getattr and hand back a
 * locally filled page instead.
 */
static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_info *fi = vma->vm_file->private_data;
	struct page *pinned_page = NULL;
	loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
	int want, got, ret;

	dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
	     inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
	else
		want = CEPH_CAP_FILE_CACHE;
	/* loop until we hold FILE_RD caps; only -ERESTARTSYS is retried */
	while (1) {
		got = 0;
		ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
				    -1, &got, &pinned_page);
		if (ret == 0)
			break;
		if (ret != -ERESTARTSYS) {
			WARN_ON(1);
			return VM_FAULT_SIGBUS;
		}
	}
	dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
	     inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));

	if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
	    ci->i_inline_version == CEPH_INLINE_NONE)
		ret = filemap_fault(vma, vmf);
	else
		ret = -EAGAIN;	/* inline data: handled below */

	dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
	     inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
	if (pinned_page)
		page_cache_release(pinned_page);
	ceph_put_cap_refs(ci, got);

	if (ret != -EAGAIN)
		return ret;

	/* read inline data */
	if (off >= PAGE_CACHE_SIZE) {
		/* does not support inline data > PAGE_SIZE */
		ret = VM_FAULT_SIGBUS;
	} else {
		int ret1;
		struct address_space *mapping = inode->i_mapping;
		struct page *page = find_or_create_page(mapping, 0,
						mapping_gfp_mask(mapping) &
						~__GFP_FS);
		if (!page) {
			ret = VM_FAULT_OOM;
			goto out;
		}
		/* fetch the inline data from the MDS into page 0 */
		ret1 = __ceph_do_getattr(inode, page,
					 CEPH_STAT_CAP_INLINE_DATA, true);
		if (ret1 < 0 || off >= i_size_read(inode)) {
			unlock_page(page);
			page_cache_release(page);
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
		/* zero the tail beyond the inline data we received */
		if (ret1 < PAGE_CACHE_SIZE)
			zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
		else
			flush_dcache_page(page);
		SetPageUptodate(page);
		vmf->page = page;	/* hand the still-locked page to the VM */
		ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
	}
out:
	dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
	     inode, off, (size_t)PAGE_CACHE_SIZE, ret);
	return ret;
}
1300 | 1300 | ||
/*
 * Reuse write_begin here for simplicity.
 *
 * Make a page writeable: uninline any inline data first, take
 * FILE_WR caps, then prepare the page via
 * ceph_update_writeable_page() and leave it locked and dirty.
 */
static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_info *fi = vma->vm_file->private_data;
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	struct page *page = vmf->page;
	loff_t off = page_offset(page);
	loff_t size = i_size_read(inode);
	size_t len;
	int want, got, ret;

	/* move any inline data out before dirtying pages */
	if (ci->i_inline_version != CEPH_INLINE_NONE) {
		struct page *locked_page = NULL;
		if (off == 0) {
			lock_page(page);
			locked_page = page;
		}
		ret = ceph_uninline_data(vma->vm_file, locked_page);
		if (locked_page)
			unlock_page(locked_page);
		if (ret < 0)
			return VM_FAULT_SIGBUS;
	}

	/* writeable length: a full page, or only up to EOF on the last page */
	if (off + PAGE_CACHE_SIZE <= size)
		len = PAGE_CACHE_SIZE;
	else
		len = size & ~PAGE_CACHE_MASK;

	dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
	     inode, ceph_vinop(inode), off, len, size);
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
	else
		want = CEPH_CAP_FILE_BUFFER;
	/* loop until we hold FILE_WR caps; only -ERESTARTSYS is retried */
	while (1) {
		got = 0;
		ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
				    &got, NULL);
		if (ret == 0)
			break;
		if (ret != -ERESTARTSYS) {
			WARN_ON(1);
			return VM_FAULT_SIGBUS;
		}
	}
	dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
	     inode, off, len, ceph_cap_string(got));

	/* Update time before taking page lock */
	file_update_time(vma->vm_file);

	lock_page(page);

	ret = VM_FAULT_NOPAGE;
	if ((off > size) ||
	    (page->mapping != inode->i_mapping))	/* page left this mapping */
		goto out;

	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
	if (ret == 0) {
		/* success.  we'll keep the page locked. */
		set_page_dirty(page);
		/* drop the snap_rwsem that update_writeable_page took */
		up_read(&mdsc->snap_rwsem);
		ret = VM_FAULT_LOCKED;
	} else {
		if (ret == -ENOMEM)
			ret = VM_FAULT_OOM;
		else
			ret = VM_FAULT_SIGBUS;
	}
out:
	if (ret != VM_FAULT_LOCKED)
		unlock_page(page);
	if (ret == VM_FAULT_LOCKED ||
	    ci->i_inline_version != CEPH_INLINE_NONE) {
		int dirty;
		spin_lock(&ci->i_ceph_lock);
		/* inline data (if there was any) was uninlined above */
		ci->i_inline_version = CEPH_INLINE_NONE;
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
		spin_unlock(&ci->i_ceph_lock);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}

	dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
	     inode, off, len, ceph_cap_string(got), ret);
	ceph_put_cap_refs(ci, got);

	return ret;
}
1396 | 1396 | ||
/*
 * Copy the supplied inline @data (of @len bytes) into page 0 of the
 * inode's mapping.  If @locked_page is non-NULL it is used directly
 * (presumably already locked by the caller — caller retains ownership);
 * otherwise page 0 is looked up or created, filled, marked uptodate,
 * unlocked and released here.
 */
void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
			   char *data, size_t len)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;

	if (locked_page) {
		page = locked_page;
	} else {
		/* empty file: nothing to fill in */
		if (i_size_read(inode) == 0)
			return;
		page = find_or_create_page(mapping, 0,
					   mapping_gfp_mask(mapping) & ~__GFP_FS);
		if (!page)
			return;
		if (PageUptodate(page)) {
			/* already populated; don't clobber it */
			unlock_page(page);
			page_cache_release(page);
			return;
		}
	}

	dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
	     inode, ceph_vinop(inode), len, locked_page);

	if (len > 0) {
		void *kaddr = kmap_atomic(page);
		memcpy(kaddr, data, len);
		kunmap_atomic(kaddr);
	}

	if (page != locked_page) {
		/* zero the tail beyond the inline data */
		if (len < PAGE_CACHE_SIZE)
			zero_user_segment(page, len, PAGE_CACHE_SIZE);
		else
			flush_dcache_page(page);

		SetPageUptodate(page);
		unlock_page(page);
		page_cache_release(page);
	}
}
1439 | 1439 | ||
/*
 * Convert an inode's inline data to a normal RADOS object.
 *
 * The inline content (at most one page) is obtained either from
 * @locked_page, from the page cache (when we hold FILE_CACHE/LAZYIO
 * caps), or by fetching it from the MDS via getattr.  It is then
 * written out with two OSD requests: a CREATE, followed by a WRITE
 * guarded by a cmpxattr on "inline_version" so that a racing newer
 * writer wins.  Returns 0 on success or a negative errno.
 */
int ceph_uninline_data(struct file *filp, struct page *locked_page)
{
	struct inode *inode = file_inode(filp);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_osd_request *req;
	struct page *page = NULL;
	u64 len, inline_version;
	int err = 0;
	bool from_pagecache = false;

	spin_lock(&ci->i_ceph_lock);
	inline_version = ci->i_inline_version;
	spin_unlock(&ci->i_ceph_lock);

	dout("uninline_data %p %llx.%llx inline_version %llu\n",
	     inode, ceph_vinop(inode), inline_version);

	if (inline_version == 1 || /* initial version, no data */
	    inline_version == CEPH_INLINE_NONE)
		goto out;

	if (locked_page) {
		page = locked_page;
		WARN_ON(!PageUptodate(page));
	} else if (ceph_caps_issued(ci) &
		   (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
		/* with cache caps, a cached uptodate page 0 is authoritative */
		page = find_get_page(inode->i_mapping, 0);
		if (page) {
			if (PageUptodate(page)) {
				from_pagecache = true;
				lock_page(page);
			} else {
				page_cache_release(page);
				page = NULL;
			}
		}
	}

	if (page) {
		/* inline data never exceeds one page */
		len = i_size_read(inode);
		if (len > PAGE_CACHE_SIZE)
			len = PAGE_CACHE_SIZE;
	} else {
		/* no usable cached copy: fetch inline data from the MDS */
		page = __page_cache_alloc(GFP_NOFS);
		if (!page) {
			err = -ENOMEM;
			goto out;
		}
		err = __ceph_do_getattr(inode, page,
					CEPH_STAT_CAP_INLINE_DATA, true);
		if (err < 0) {
			/* no inline data */
			if (err == -ENODATA)
				err = 0;
			goto out;
		}
		/* on success, getattr returns the inline data length */
		len = err;
	}

	/* step 1: make sure the object exists */
	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				    ceph_vino(inode), 0, &len, 0, 1,
				    CEPH_OSD_OP_CREATE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    ci->i_snap_realm->cached_context,
				    0, 0, false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
	if (!err)
		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
	ceph_osdc_put_request(req);
	if (err < 0)
		goto out;

	/* step 2: cmpxattr(inline_version GT) + write + setxattr, atomically */
	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				    ceph_vino(inode), 0, &len, 1, 3,
				    CEPH_OSD_OP_WRITE,
				    CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
				    ci->i_snap_realm->cached_context,
				    ci->i_truncate_seq, ci->i_truncate_size,
				    false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);

	/* only write if our inline_version is newer than what's on disk */
	err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
				    "inline_version", &inline_version,
				    sizeof(inline_version),
				    CEPH_OSD_CMPXATTR_OP_GT,
				    CEPH_OSD_CMPXATTR_MODE_U64);
	if (err)
		goto out_put;

	err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
				    "inline_version", &inline_version,
				    sizeof(inline_version), 0, 0);
	if (err)
		goto out_put;

	ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
	err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
	if (!err)
		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
out_put:
	ceph_osdc_put_request(req);
	/* -ECANCELED: cmpxattr guard failed, i.e. someone else uninlined */
	if (err == -ECANCELED)
		err = 0;
out:
	if (page && page != locked_page) {
		/* pagecache pages were locked+referenced; ours were bare allocs */
		if (from_pagecache) {
			unlock_page(page);
			page_cache_release(page);
		} else
			__free_pages(page, 0);
	}

	dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
	     inode, ceph_vinop(inode), inline_version, err);
	return err;
}
1568 | 1568 | ||
/* vm_operations for ceph file mappings, installed by ceph_mmap() */
static struct vm_operations_struct ceph_vmops = {
	.fault = ceph_filemap_fault,
	.page_mkwrite = ceph_page_mkwrite,
	.remap_pages = generic_file_remap_pages,
};
1574 | 1574 | ||
1575 | int ceph_mmap(struct file *file, struct vm_area_struct *vma) | 1575 | int ceph_mmap(struct file *file, struct vm_area_struct *vma) |
1576 | { | 1576 | { |
1577 | struct address_space *mapping = file->f_mapping; | 1577 | struct address_space *mapping = file->f_mapping; |
1578 | 1578 | ||
1579 | if (!mapping->a_ops->readpage) | 1579 | if (!mapping->a_ops->readpage) |
1580 | return -ENOEXEC; | 1580 | return -ENOEXEC; |
1581 | file_accessed(file); | 1581 | file_accessed(file); |
1582 | vma->vm_ops = &ceph_vmops; | 1582 | vma->vm_ops = &ceph_vmops; |
1583 | return 0; | 1583 | return 0; |
1584 | } | 1584 | } |
1585 | 1585 |
include/linux/ceph/osd_client.h
1 | #ifndef _FS_CEPH_OSD_CLIENT_H | 1 | #ifndef _FS_CEPH_OSD_CLIENT_H |
2 | #define _FS_CEPH_OSD_CLIENT_H | 2 | #define _FS_CEPH_OSD_CLIENT_H |
3 | 3 | ||
4 | #include <linux/completion.h> | 4 | #include <linux/completion.h> |
5 | #include <linux/kref.h> | 5 | #include <linux/kref.h> |
6 | #include <linux/mempool.h> | 6 | #include <linux/mempool.h> |
7 | #include <linux/rbtree.h> | 7 | #include <linux/rbtree.h> |
8 | 8 | ||
9 | #include <linux/ceph/types.h> | 9 | #include <linux/ceph/types.h> |
10 | #include <linux/ceph/osdmap.h> | 10 | #include <linux/ceph/osdmap.h> |
11 | #include <linux/ceph/messenger.h> | 11 | #include <linux/ceph/messenger.h> |
12 | #include <linux/ceph/auth.h> | 12 | #include <linux/ceph/auth.h> |
13 | #include <linux/ceph/pagelist.h> | 13 | #include <linux/ceph/pagelist.h> |
14 | 14 | ||
/* forward declarations; full definitions live in their own headers */
struct ceph_msg;
struct ceph_snap_context;
struct ceph_osd_request;
struct ceph_osd_client;
struct ceph_authorizer;

/*
 * completion callback for async writepages
 */
typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
				     struct ceph_msg *);
/* called on ack/commit of an unsafe (not-yet-committed) request */
typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
28 | /* a given osd we're communicating with */ | 28 | /* a given osd we're communicating with */ |
/* a given osd we're communicating with */
struct ceph_osd {
	atomic_t o_ref;			/* refcount on this session */
	struct ceph_osd_client *o_osdc;	/* owning client */
	int o_osd;			/* osd id */
	int o_incarnation;
	struct rb_node o_node;		/* node in osdc->osds */
	struct ceph_connection o_con;	/* messenger connection */
	struct list_head o_requests;		/* in-flight requests */
	struct list_head o_linger_requests;	/* lingering (watch) requests */
	struct list_head o_osd_lru;		/* idle-osd LRU linkage */
	struct ceph_auth_handshake o_auth;
	unsigned long lru_ttl;
	int o_marked_for_keepalive;
	struct list_head o_keepalive_item;
};
44 | 44 | ||
45 | 45 | ||
46 | #define CEPH_OSD_MAX_OP 3 | 46 | #define CEPH_OSD_MAX_OP 3 |
47 | 47 | ||
/* how the payload of an osd op is carried (see struct ceph_osd_data) */
enum ceph_osd_data_type {
	CEPH_OSD_DATA_TYPE_NONE = 0,
	CEPH_OSD_DATA_TYPE_PAGES,
	CEPH_OSD_DATA_TYPE_PAGELIST,
#ifdef CONFIG_BLOCK
	CEPH_OSD_DATA_TYPE_BIO,
#endif /* CONFIG_BLOCK */
};
56 | 56 | ||
/*
 * Data payload for an osd op: a page array, a pagelist, or (with
 * CONFIG_BLOCK) a bio chain; type selects the active union member.
 */
struct ceph_osd_data {
	enum ceph_osd_data_type type;
	union {
		struct {
			struct page **pages;
			u64 length;		/* total bytes */
			u32 alignment;		/* offset into first page */
			bool pages_from_pool;
			bool own_pages;		/* free pages on request teardown */
		};
		struct ceph_pagelist *pagelist;
#ifdef CONFIG_BLOCK
		struct {
			struct bio *bio;	/* list of bios */
			size_t bio_length;	/* total in list */
		};
#endif /* CONFIG_BLOCK */
	};
};
76 | 76 | ||
77 | struct ceph_osd_req_op { | 77 | struct ceph_osd_req_op { |
78 | u16 op; /* CEPH_OSD_OP_* */ | 78 | u16 op; /* CEPH_OSD_OP_* */ |
79 | u32 flags; /* CEPH_OSD_OP_FLAG_* */ | 79 | u32 flags; /* CEPH_OSD_OP_FLAG_* */ |
80 | u32 payload_len; | 80 | u32 payload_len; |
81 | union { | 81 | union { |
82 | struct ceph_osd_data raw_data_in; | 82 | struct ceph_osd_data raw_data_in; |
83 | struct { | 83 | struct { |
84 | u64 offset, length; | 84 | u64 offset, length; |
85 | u64 truncate_size; | 85 | u64 truncate_size; |
86 | u32 truncate_seq; | 86 | u32 truncate_seq; |
87 | struct ceph_osd_data osd_data; | 87 | struct ceph_osd_data osd_data; |
88 | } extent; | 88 | } extent; |
89 | struct { | 89 | struct { |
90 | __le32 name_len; | 90 | u32 name_len; |
91 | __le32 value_len; | 91 | u32 value_len; |
92 | __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ | 92 | __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ |
93 | __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ | 93 | __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ |
94 | struct ceph_osd_data osd_data; | 94 | struct ceph_osd_data osd_data; |
95 | } xattr; | 95 | } xattr; |
96 | struct { | 96 | struct { |
97 | const char *class_name; | 97 | const char *class_name; |
98 | const char *method_name; | 98 | const char *method_name; |
99 | struct ceph_osd_data request_info; | 99 | struct ceph_osd_data request_info; |
100 | struct ceph_osd_data request_data; | 100 | struct ceph_osd_data request_data; |
101 | struct ceph_osd_data response_data; | 101 | struct ceph_osd_data response_data; |
102 | __u8 class_len; | 102 | __u8 class_len; |
103 | __u8 method_len; | 103 | __u8 method_len; |
104 | __u8 argc; | 104 | __u8 argc; |
105 | } cls; | 105 | } cls; |
106 | struct { | 106 | struct { |
107 | u64 cookie; | 107 | u64 cookie; |
108 | u64 ver; | 108 | u64 ver; |
109 | u32 prot_ver; | 109 | u32 prot_ver; |
110 | u32 timeout; | 110 | u32 timeout; |
111 | __u8 flag; | 111 | __u8 flag; |
112 | } watch; | 112 | } watch; |
113 | struct { | 113 | struct { |
114 | u64 expected_object_size; | 114 | u64 expected_object_size; |
115 | u64 expected_write_size; | 115 | u64 expected_write_size; |
116 | } alloc_hint; | 116 | } alloc_hint; |
117 | }; | 117 | }; |
118 | }; | 118 | }; |
119 | 119 | ||
120 | /* an in-flight request */ | 120 | /* an in-flight request */ |
/* an in-flight request */
struct ceph_osd_request {
	u64 r_tid;              /* unique for this client */
	struct rb_node r_node;			/* node in osdc->requests */
	struct list_head r_req_lru_item;
	struct list_head r_osd_item;
	struct list_head r_linger_item;
	struct list_head r_linger_osd_item;
	struct ceph_osd *r_osd;			/* target osd session */
	struct ceph_pg r_pgid;			/* placement group */
	int r_pg_osds[CEPH_PG_MAX_SIZE];
	int r_num_pg_osds;

	struct ceph_msg *r_request, *r_reply;
	int r_flags;     /* any additional flags for the osd */
	u32 r_sent;      /* >0 if r_request is sending/sent */

	/* request osd ops array  */
	unsigned int		r_num_ops;
	struct ceph_osd_req_op	r_ops[CEPH_OSD_MAX_OP];

	/* these are updated on each send */
	__le32           *r_request_osdmap_epoch;
	__le32           *r_request_flags;
	__le64           *r_request_pool;
	void             *r_request_pgid;
	__le32           *r_request_attempts;
	bool              r_paused;
	struct ceph_eversion *r_request_reassert_version;

	int               r_result;
	int               r_reply_op_len[CEPH_OSD_MAX_OP];
	s32               r_reply_op_result[CEPH_OSD_MAX_OP];
	int               r_got_reply;
	int		  r_linger;

	struct ceph_osd_client *r_osdc;
	struct kref       r_kref;
	bool              r_mempool;	/* allocated from osdc->req_mempool */
	struct completion r_completion, r_safe_completion;
	ceph_osdc_callback_t r_callback;
	ceph_osdc_unsafe_callback_t r_unsafe_callback;
	struct ceph_eversion r_reassert_version;
	struct list_head  r_unsafe_item;

	struct inode *r_inode;         	      /* for use by callbacks */
	void *r_priv;			      /* ditto */

	struct ceph_object_locator r_base_oloc;
	struct ceph_object_id r_base_oid;
	struct ceph_object_locator r_target_oloc;
	struct ceph_object_id r_target_oid;

	u64               r_snapid;
	unsigned long     r_stamp;            /* send OR check time */

	struct ceph_snap_context *r_snapc;    /* snap context for writes */
};
178 | 178 | ||
/* replacement object locator sent by the osd in a redirect reply */
struct ceph_request_redirect {
	struct ceph_object_locator oloc;
};
182 | 182 | ||
/* a registered watch/notify event */
struct ceph_osd_event {
	u64 cookie;			/* identifies this watch */
	int one_shot;			/* auto-unregister after first notify */
	struct ceph_osd_client *osdc;
	void (*cb)(u64, u64, u8, void *);	/* notify callback */
	void *data;			/* opaque arg for cb */
	struct rb_node node;		/* node in osdc->event_tree */
	struct list_head osd_node;
	struct kref kref;
};
193 | 193 | ||
/* deferred delivery of an event notification via osdc->notify_wq */
struct ceph_osd_event_work {
	struct work_struct work;
	struct ceph_osd_event *event;
	u64 ver;
	u64 notify_id;
	u8 opcode;
};
201 | 201 | ||
/* per-mount osd client: tracks the osdmap, osd sessions and requests */
struct ceph_osd_client {
	struct ceph_client     *client;

	struct ceph_osdmap     *osdmap;       /* current map */
	struct rw_semaphore    map_sem;
	struct completion      map_waiters;
	u64                    last_requested_map;

	struct mutex           request_mutex;	/* protects the state below */
	struct rb_root         osds;          /* osds */
	struct list_head       osd_lru;       /* idle osds */
	u64                    timeout_tid;   /* tid of timeout triggering rq */
	u64                    last_tid;      /* tid of last request */
	struct rb_root         requests;      /* pending requests */
	struct list_head       req_lru;	      /* in-flight lru */
	struct list_head       req_unsent;    /* unsent/need-resend queue */
	struct list_head       req_notarget;  /* map to no osd */
	struct list_head       req_linger;    /* lingering requests */
	int                    num_requests;
	struct delayed_work    timeout_work;
	struct delayed_work    osds_timeout_work;
#ifdef CONFIG_DEBUG_FS
	struct dentry 	       *debugfs_file;
#endif

	mempool_t              *req_mempool;

	struct ceph_msgpool	msgpool_op;
	struct ceph_msgpool	msgpool_op_reply;

	spinlock_t		event_lock;
	struct rb_root		event_tree;	/* registered watch events */
	u64			event_count;

	struct workqueue_struct	*notify_wq;	/* watch/notify delivery */
};
238 | 238 | ||
/* module-wide setup/teardown (request slab caches etc.) */
extern int ceph_osdc_setup(void);
extern void ceph_osdc_cleanup(void);

/* per-client lifecycle */
extern int ceph_osdc_init(struct ceph_osd_client *osdc,
			  struct ceph_client *client);
extern void ceph_osdc_stop(struct ceph_osd_client *osdc);

/* incoming message dispatch */
extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
				   struct ceph_msg *msg);
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
				 struct ceph_msg *msg);

/* initialize op slot `which` of an osd request */
extern void osd_req_op_init(struct ceph_osd_request *osd_req,
			    unsigned int which, u16 opcode);

extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
					unsigned int which,
					struct page **pages, u64 length,
					u32 alignment, bool pages_from_pool,
					bool own_pages);

extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
					unsigned int which, u16 opcode,
					u64 offset, u64 length,
					u64 truncate_size, u32 truncate_seq);
extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
					unsigned int which, u64 length);

/* accessors for an op's data descriptor */
extern struct ceph_osd_data *osd_req_op_extent_osd_data(
					struct ceph_osd_request *osd_req,
					unsigned int which);
extern struct ceph_osd_data *osd_req_op_cls_response_data(
					struct ceph_osd_request *osd_req,
					unsigned int which);

/* attach payload data (pages / pagelist / bio) to an extent op */
extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
					unsigned int which,
					struct page **pages, u64 length,
					u32 alignment, bool pages_from_pool,
					bool own_pages);
extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *,
					unsigned int which,
					struct ceph_pagelist *pagelist);
#ifdef CONFIG_BLOCK
extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *,
					unsigned int which,
					struct bio *bio, size_t bio_length);
#endif /* CONFIG_BLOCK */

/* attach request/response data to a cls (object class) op */
extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
					unsigned int which,
					struct ceph_pagelist *pagelist);
extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
					unsigned int which,
					struct page **pages, u64 length,
					u32 alignment, bool pages_from_pool,
					bool own_pages);
extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
					unsigned int which,
					struct page **pages, u64 length,
					u32 alignment, bool pages_from_pool,
					bool own_pages);

extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
					unsigned int which, u16 opcode,
					const char *class, const char *method);
extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
				 u16 opcode, const char *name, const void *value,
				 size_t size, u8 cmp_op, u8 cmp_mode);
extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
					unsigned int which, u16 opcode,
					u64 cookie, u64 version, int flag);
extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
				       unsigned int which,
				       u64 expected_object_size,
				       u64 expected_write_size);

/* request allocation, encoding and submission */
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
					       struct ceph_snap_context *snapc,
					       unsigned int num_ops,
					       bool use_mempool,
					       gfp_t gfp_flags);

extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
				    struct ceph_snap_context *snapc,
				    u64 snap_id,
				    struct timespec *mtime);

extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
				      struct ceph_file_layout *layout,
				      struct ceph_vino vino,
				      u64 offset, u64 *len,
				      unsigned int which, int num_ops,
				      int opcode, int flags,
				      struct ceph_snap_context *snapc,
				      u32 truncate_seq, u64 truncate_size,
				      bool use_mempool);

extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
					 struct ceph_osd_request *req);

/* reference counting on requests */
extern void ceph_osdc_get_request(struct ceph_osd_request *req);
extern void ceph_osdc_put_request(struct ceph_osd_request *req);

extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
				   struct ceph_osd_request *req,
				   bool nofail);
extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
				  struct ceph_osd_request *req);
extern void ceph_osdc_sync(struct ceph_osd_client *osdc);

extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
352 | 352 | ||
353 | extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, | 353 | extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, |
354 | struct ceph_vino vino, | 354 | struct ceph_vino vino, |
355 | struct ceph_file_layout *layout, | 355 | struct ceph_file_layout *layout, |
356 | u64 off, u64 *plen, | 356 | u64 off, u64 *plen, |
357 | u32 truncate_seq, u64 truncate_size, | 357 | u32 truncate_seq, u64 truncate_size, |
358 | struct page **pages, int nr_pages, | 358 | struct page **pages, int nr_pages, |
359 | int page_align); | 359 | int page_align); |
360 | 360 | ||
361 | extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, | 361 | extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, |
362 | struct ceph_vino vino, | 362 | struct ceph_vino vino, |
363 | struct ceph_file_layout *layout, | 363 | struct ceph_file_layout *layout, |
364 | struct ceph_snap_context *sc, | 364 | struct ceph_snap_context *sc, |
365 | u64 off, u64 len, | 365 | u64 off, u64 len, |
366 | u32 truncate_seq, u64 truncate_size, | 366 | u32 truncate_seq, u64 truncate_size, |
367 | struct timespec *mtime, | 367 | struct timespec *mtime, |
368 | struct page **pages, int nr_pages); | 368 | struct page **pages, int nr_pages); |
369 | 369 | ||
370 | /* watch/notify events */ | 370 | /* watch/notify events */ |
371 | extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, | 371 | extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, |
372 | void (*event_cb)(u64, u64, u8, void *), | 372 | void (*event_cb)(u64, u64, u8, void *), |
373 | void *data, struct ceph_osd_event **pevent); | 373 | void *data, struct ceph_osd_event **pevent); |
374 | extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); | 374 | extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); |
375 | extern void ceph_osdc_put_event(struct ceph_osd_event *event); | 375 | extern void ceph_osdc_put_event(struct ceph_osd_event *event); |
376 | #endif | 376 | #endif |
377 | 377 | ||
378 | 378 |
net/ceph/auth_x.c
1 | 1 | ||
2 | #include <linux/ceph/ceph_debug.h> | 2 | #include <linux/ceph/ceph_debug.h> |
3 | 3 | ||
4 | #include <linux/err.h> | 4 | #include <linux/err.h> |
5 | #include <linux/module.h> | 5 | #include <linux/module.h> |
6 | #include <linux/random.h> | 6 | #include <linux/random.h> |
7 | #include <linux/slab.h> | 7 | #include <linux/slab.h> |
8 | 8 | ||
9 | #include <linux/ceph/decode.h> | 9 | #include <linux/ceph/decode.h> |
10 | #include <linux/ceph/auth.h> | 10 | #include <linux/ceph/auth.h> |
11 | #include <linux/ceph/messenger.h> | 11 | #include <linux/ceph/messenger.h> |
12 | 12 | ||
13 | #include "crypto.h" | 13 | #include "crypto.h" |
14 | #include "auth_x.h" | 14 | #include "auth_x.h" |
15 | #include "auth_x_protocol.h" | 15 | #include "auth_x_protocol.h" |
16 | 16 | ||
17 | static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); | 17 | static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); |
18 | 18 | ||
19 | static int ceph_x_is_authenticated(struct ceph_auth_client *ac) | 19 | static int ceph_x_is_authenticated(struct ceph_auth_client *ac) |
20 | { | 20 | { |
21 | struct ceph_x_info *xi = ac->private; | 21 | struct ceph_x_info *xi = ac->private; |
22 | int need; | 22 | int need; |
23 | 23 | ||
24 | ceph_x_validate_tickets(ac, &need); | 24 | ceph_x_validate_tickets(ac, &need); |
25 | dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", | 25 | dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", |
26 | ac->want_keys, need, xi->have_keys); | 26 | ac->want_keys, need, xi->have_keys); |
27 | return (ac->want_keys & xi->have_keys) == ac->want_keys; | 27 | return (ac->want_keys & xi->have_keys) == ac->want_keys; |
28 | } | 28 | } |
29 | 29 | ||
30 | static int ceph_x_should_authenticate(struct ceph_auth_client *ac) | 30 | static int ceph_x_should_authenticate(struct ceph_auth_client *ac) |
31 | { | 31 | { |
32 | struct ceph_x_info *xi = ac->private; | 32 | struct ceph_x_info *xi = ac->private; |
33 | int need; | 33 | int need; |
34 | 34 | ||
35 | ceph_x_validate_tickets(ac, &need); | 35 | ceph_x_validate_tickets(ac, &need); |
36 | dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", | 36 | dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", |
37 | ac->want_keys, need, xi->have_keys); | 37 | ac->want_keys, need, xi->have_keys); |
38 | return need != 0; | 38 | return need != 0; |
39 | } | 39 | } |
40 | 40 | ||
41 | static int ceph_x_encrypt_buflen(int ilen) | 41 | static int ceph_x_encrypt_buflen(int ilen) |
42 | { | 42 | { |
43 | return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + | 43 | return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + |
44 | sizeof(u32); | 44 | sizeof(u32); |
45 | } | 45 | } |
46 | 46 | ||
47 | static int ceph_x_encrypt(struct ceph_crypto_key *secret, | 47 | static int ceph_x_encrypt(struct ceph_crypto_key *secret, |
48 | void *ibuf, int ilen, void *obuf, size_t olen) | 48 | void *ibuf, int ilen, void *obuf, size_t olen) |
49 | { | 49 | { |
50 | struct ceph_x_encrypt_header head = { | 50 | struct ceph_x_encrypt_header head = { |
51 | .struct_v = 1, | 51 | .struct_v = 1, |
52 | .magic = cpu_to_le64(CEPHX_ENC_MAGIC) | 52 | .magic = cpu_to_le64(CEPHX_ENC_MAGIC) |
53 | }; | 53 | }; |
54 | size_t len = olen - sizeof(u32); | 54 | size_t len = olen - sizeof(u32); |
55 | int ret; | 55 | int ret; |
56 | 56 | ||
57 | ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, | 57 | ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, |
58 | &head, sizeof(head), ibuf, ilen); | 58 | &head, sizeof(head), ibuf, ilen); |
59 | if (ret) | 59 | if (ret) |
60 | return ret; | 60 | return ret; |
61 | ceph_encode_32(&obuf, len); | 61 | ceph_encode_32(&obuf, len); |
62 | return len + sizeof(u32); | 62 | return len + sizeof(u32); |
63 | } | 63 | } |
64 | 64 | ||
/*
 * Decrypt a u32-length-prefixed cephx blob at *p (bounded by @end)
 * with @secret.  On success advances *p past the ciphertext and
 * returns the plaintext length; on failure returns a negative errno.
 *
 * If *obuf is NULL a buffer is allocated here and ownership passes to
 * the caller, who must kfree() it even if a later step fails.
 */
static int ceph_x_decrypt(struct ceph_crypto_key *secret,
			  void **p, void *end, void **obuf, size_t olen)
{
	struct ceph_x_encrypt_header head;
	size_t head_len = sizeof(head);
	int len, ret;

	len = ceph_decode_32(p);
	if (*p + len > end)
		return -EINVAL;

	dout("ceph_x_decrypt len %d\n", len);
	if (*obuf == NULL) {
		/* plaintext cannot exceed the ciphertext length */
		*obuf = kmalloc(len, GFP_NOFS);
		if (!*obuf)
			return -ENOMEM;
		olen = len;
	}

	/* header is split off into @head; payload goes to *obuf */
	ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len);
	if (ret)
		return ret;
	/* reject blobs that decrypt but don't carry the cephx magic */
	if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
		return -EPERM;
	*p += len;
	return olen;
}
92 | 92 | ||
93 | /* | 93 | /* |
94 | * get existing (or insert new) ticket handler | 94 | * get existing (or insert new) ticket handler |
95 | */ | 95 | */ |
96 | static struct ceph_x_ticket_handler * | 96 | static struct ceph_x_ticket_handler * |
97 | get_ticket_handler(struct ceph_auth_client *ac, int service) | 97 | get_ticket_handler(struct ceph_auth_client *ac, int service) |
98 | { | 98 | { |
99 | struct ceph_x_ticket_handler *th; | 99 | struct ceph_x_ticket_handler *th; |
100 | struct ceph_x_info *xi = ac->private; | 100 | struct ceph_x_info *xi = ac->private; |
101 | struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node; | 101 | struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node; |
102 | 102 | ||
103 | while (*p) { | 103 | while (*p) { |
104 | parent = *p; | 104 | parent = *p; |
105 | th = rb_entry(parent, struct ceph_x_ticket_handler, node); | 105 | th = rb_entry(parent, struct ceph_x_ticket_handler, node); |
106 | if (service < th->service) | 106 | if (service < th->service) |
107 | p = &(*p)->rb_left; | 107 | p = &(*p)->rb_left; |
108 | else if (service > th->service) | 108 | else if (service > th->service) |
109 | p = &(*p)->rb_right; | 109 | p = &(*p)->rb_right; |
110 | else | 110 | else |
111 | return th; | 111 | return th; |
112 | } | 112 | } |
113 | 113 | ||
114 | /* add it */ | 114 | /* add it */ |
115 | th = kzalloc(sizeof(*th), GFP_NOFS); | 115 | th = kzalloc(sizeof(*th), GFP_NOFS); |
116 | if (!th) | 116 | if (!th) |
117 | return ERR_PTR(-ENOMEM); | 117 | return ERR_PTR(-ENOMEM); |
118 | th->service = service; | 118 | th->service = service; |
119 | rb_link_node(&th->node, parent, p); | 119 | rb_link_node(&th->node, parent, p); |
120 | rb_insert_color(&th->node, &xi->ticket_handlers); | 120 | rb_insert_color(&th->node, &xi->ticket_handlers); |
121 | return th; | 121 | return th; |
122 | } | 122 | } |
123 | 123 | ||
/*
 * Unlink @th from the handler tree and release everything it owns:
 * its session key, its ticket blob reference, and the handler itself.
 */
static void remove_ticket_handler(struct ceph_auth_client *ac,
				  struct ceph_x_ticket_handler *th)
{
	struct ceph_x_info *xi = ac->private;

	dout("remove_ticket_handler %p %d\n", th, th->service);
	rb_erase(&th->node, &xi->ticket_handlers);
	ceph_crypto_key_destroy(&th->session_key);
	if (th->ticket_blob)
		ceph_buffer_put(th->ticket_blob);
	kfree(th);
}
136 | 136 | ||
/*
 * Decode and install one service ticket from a mon auth reply.
 *
 * Wire layout: u32 service type, u8 struct_v (must be 1), a blob
 * encrypted with @secret (session key + validity), u8 is_enc flag,
 * then the service ticket blob, optionally encrypted with the
 * previous session key.  On success the matching ticket handler is
 * updated in place and *p is advanced past this ticket.  Returns 0
 * or a negative errno.
 */
static int process_one_ticket(struct ceph_auth_client *ac,
			      struct ceph_crypto_key *secret,
			      void **p, void *end)
{
	struct ceph_x_info *xi = ac->private;
	int type;
	u8 tkt_struct_v, blob_struct_v;
	struct ceph_x_ticket_handler *th;
	void *dbuf = NULL;
	void *dp, *dend;
	int dlen;
	char is_enc;
	struct timespec validity;
	struct ceph_crypto_key old_key;
	void *ticket_buf = NULL;
	void *tp, *tpend;
	void **ptp;
	struct ceph_timespec new_validity;
	struct ceph_crypto_key new_session_key;
	struct ceph_buffer *new_ticket_blob;
	unsigned long new_expires, new_renew_after;
	u64 new_secret_id;
	int ret;

	ceph_decode_need(p, end, sizeof(u32) + 1, bad);

	type = ceph_decode_32(p);
	dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));

	tkt_struct_v = ceph_decode_8(p);
	if (tkt_struct_v != 1)
		goto bad;

	/* find (or create) the handler for this service type */
	th = get_ticket_handler(ac, type);
	if (IS_ERR(th)) {
		ret = PTR_ERR(th);
		goto out;
	}

	/* blob for me: new session key + validity, under @secret */
	dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0);
	if (dlen <= 0) {
		ret = dlen;
		goto out;
	}
	dout(" decrypted %d bytes\n", dlen);
	dp = dbuf;
	dend = dp + dlen;

	tkt_struct_v = ceph_decode_8(&dp);
	if (tkt_struct_v != 1)
		goto bad;

	/* the ticket blob below may be encrypted with the OLD key */
	memcpy(&old_key, &th->session_key, sizeof(old_key));
	ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
	if (ret)
		goto out;

	/* NOTE(review): no explicit bound check against dend before this
	 * copy -- presumably the decrypted blob is always large enough;
	 * verify against the cephx wire format. */
	ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
	ceph_decode_timespec(&validity, &new_validity);
	new_expires = get_seconds() + validity.tv_sec;
	/* schedule renewal at 3/4 of the validity period */
	new_renew_after = new_expires - (validity.tv_sec / 4);
	dout(" expires=%lu renew_after=%lu\n", new_expires,
	     new_renew_after);

	/* ticket blob for service */
	ceph_decode_8_safe(p, end, is_enc, bad);
	if (is_enc) {
		/* encrypted with the previous session key */
		dout(" encrypted ticket\n");
		dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0);
		if (dlen < 0) {
			ret = dlen;
			goto out;
		}
		tp = ticket_buf;
		ptp = &tp;
		tpend = *ptp + dlen;
	} else {
		/* unencrypted: decode straight out of the reply buffer */
		ptp = p;
		tpend = end;
	}
	ceph_decode_32_safe(ptp, tpend, dlen, bad);
	dout(" ticket blob is %d bytes\n", dlen);
	ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad);
	blob_struct_v = ceph_decode_8(ptp);
	new_secret_id = ceph_decode_64(ptp);
	ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend);
	if (ret)
		goto out;

	/* all is well, update our ticket */
	ceph_crypto_key_destroy(&th->session_key);
	if (th->ticket_blob)
		ceph_buffer_put(th->ticket_blob);
	th->session_key = new_session_key;
	th->ticket_blob = new_ticket_blob;
	th->validity = new_validity;
	th->secret_id = new_secret_id;
	th->expires = new_expires;
	th->renew_after = new_renew_after;
	dout(" got ticket service %d (%s) secret_id %lld len %d\n",
	     type, ceph_entity_type_name(type), th->secret_id,
	     (int)th->ticket_blob->vec.iov_len);
	xi->have_keys |= th->service;

out:
	/* both buffers are ours regardless of outcome; free(NULL) is ok */
	kfree(ticket_buf);
	kfree(dbuf);
	return ret;

bad:
	ret = -EINVAL;
	goto out;
}
253 | 253 | ||
254 | static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, | 254 | static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, |
255 | struct ceph_crypto_key *secret, | 255 | struct ceph_crypto_key *secret, |
256 | void *buf, void *end) | 256 | void *buf, void *end) |
257 | { | 257 | { |
258 | void *p = buf; | 258 | void *p = buf; |
259 | u8 reply_struct_v; | 259 | u8 reply_struct_v; |
260 | u32 num; | 260 | u32 num; |
261 | int ret; | 261 | int ret; |
262 | 262 | ||
263 | ceph_decode_8_safe(&p, end, reply_struct_v, bad); | 263 | ceph_decode_8_safe(&p, end, reply_struct_v, bad); |
264 | if (reply_struct_v != 1) | 264 | if (reply_struct_v != 1) |
265 | return -EINVAL; | 265 | return -EINVAL; |
266 | 266 | ||
267 | ceph_decode_32_safe(&p, end, num, bad); | 267 | ceph_decode_32_safe(&p, end, num, bad); |
268 | dout("%d tickets\n", num); | 268 | dout("%d tickets\n", num); |
269 | 269 | ||
270 | while (num--) { | 270 | while (num--) { |
271 | ret = process_one_ticket(ac, secret, &p, end); | 271 | ret = process_one_ticket(ac, secret, &p, end); |
272 | if (ret) | 272 | if (ret) |
273 | return ret; | 273 | return ret; |
274 | } | 274 | } |
275 | 275 | ||
276 | return 0; | 276 | return 0; |
277 | 277 | ||
278 | bad: | 278 | bad: |
279 | return -EINVAL; | 279 | return -EINVAL; |
280 | } | 280 | } |
281 | 281 | ||
/*
 * Build (or rebuild) authorizer @au for the service handled by @th.
 *
 * Wire format: ceph_x_authorize_a (global_id, service_id, ticket
 * blob) followed by ceph_x_authorize_b (a fresh random nonce)
 * encrypted with the session key.  Returns 0 on success or a
 * negative errno; on encryption failure au->buf is released.
 */
static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
				   struct ceph_x_ticket_handler *th,
				   struct ceph_x_authorizer *au)
{
	int maxlen;
	struct ceph_x_authorize_a *msg_a;
	struct ceph_x_authorize_b msg_b;
	void *p, *end;
	int ret;
	int ticket_blob_len =
		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);

	dout("build_authorizer for %s %p\n",
	     ceph_entity_type_name(th->service), au);

	/* take our own copy of the (possibly refreshed) session key */
	ceph_crypto_key_destroy(&au->session_key);
	ret = ceph_crypto_key_clone(&au->session_key, &th->session_key);
	if (ret)
		return ret;

	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
		ceph_x_encrypt_buflen(ticket_blob_len);
	dout(" need len %d\n", maxlen);
	/* reuse the existing buffer only if it is large enough */
	if (au->buf && au->buf->alloc_len < maxlen) {
		ceph_buffer_put(au->buf);
		au->buf = NULL;
	}
	if (!au->buf) {
		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
		if (!au->buf) {
			/* undo the key clone so au stays consistent */
			ceph_crypto_key_destroy(&au->session_key);
			return -ENOMEM;
		}
	}
	au->service = th->service;
	au->secret_id = th->secret_id;

	msg_a = au->buf->vec.iov_base;
	msg_a->struct_v = 1;
	msg_a->global_id = cpu_to_le64(ac->global_id);
	msg_a->service_id = cpu_to_le32(th->service);
	msg_a->ticket_blob.struct_v = 1;
	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
	if (ticket_blob_len) {
		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
		       th->ticket_blob->vec.iov_len);
	}
	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
	     le64_to_cpu(msg_a->ticket_blob.secret_id));

	/* msg_b goes right after msg_a's variable-length ticket blob */
	p = msg_a + 1;
	p += ticket_blob_len;
	end = au->buf->vec.iov_base + au->buf->vec.iov_len;

	get_random_bytes(&au->nonce, sizeof(au->nonce));
	msg_b.struct_v = 1;
	msg_b.nonce = cpu_to_le64(au->nonce);
	ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
			     p, end - p);
	if (ret < 0)
		goto out_buf;
	p += ret;
	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
	dout(" built authorizer nonce %llx len %d\n", au->nonce,
	     (int)au->buf->vec.iov_len);
	BUG_ON(au->buf->vec.iov_len > maxlen);
	return 0;

out_buf:
	ceph_buffer_put(au->buf);
	au->buf = NULL;
	return ret;
}
356 | 356 | ||
/*
 * Encode @th's current ticket (u8 struct version, u64 secret_id,
 * length-prefixed blob -- zero length if we hold no ticket yet) into
 * *p, bounded by @end.  Returns 0, or -ERANGE if it doesn't fit.
 */
static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
				void **p, void *end)
{
	/* despite the name, this is just a remaining-space check here */
	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
	ceph_encode_8(p, 1);
	ceph_encode_64(p, th->secret_id);
	if (th->ticket_blob) {
		const char *buf = th->ticket_blob->vec.iov_base;
		u32 len = th->ticket_blob->vec.iov_len;

		ceph_encode_32_safe(p, end, len, bad);
		ceph_encode_copy_safe(p, end, buf, len, bad);
	} else {
		/* no ticket yet: encode an empty blob */
		ceph_encode_32_safe(p, end, 0, bad);
	}

	return 0;
bad:
	return -ERANGE;
}
377 | 377 | ||
378 | static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) | 378 | static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) |
379 | { | 379 | { |
380 | int want = ac->want_keys; | 380 | int want = ac->want_keys; |
381 | struct ceph_x_info *xi = ac->private; | 381 | struct ceph_x_info *xi = ac->private; |
382 | int service; | 382 | int service; |
383 | 383 | ||
384 | *pneed = ac->want_keys & ~(xi->have_keys); | 384 | *pneed = ac->want_keys & ~(xi->have_keys); |
385 | 385 | ||
386 | for (service = 1; service <= want; service <<= 1) { | 386 | for (service = 1; service <= want; service <<= 1) { |
387 | struct ceph_x_ticket_handler *th; | 387 | struct ceph_x_ticket_handler *th; |
388 | 388 | ||
389 | if (!(ac->want_keys & service)) | 389 | if (!(ac->want_keys & service)) |
390 | continue; | 390 | continue; |
391 | 391 | ||
392 | if (*pneed & service) | 392 | if (*pneed & service) |
393 | continue; | 393 | continue; |
394 | 394 | ||
395 | th = get_ticket_handler(ac, service); | 395 | th = get_ticket_handler(ac, service); |
396 | 396 | ||
397 | if (IS_ERR(th)) { | 397 | if (IS_ERR(th)) { |
398 | *pneed |= service; | 398 | *pneed |= service; |
399 | continue; | 399 | continue; |
400 | } | 400 | } |
401 | 401 | ||
402 | if (get_seconds() >= th->renew_after) | 402 | if (get_seconds() >= th->renew_after) |
403 | *pneed |= service; | 403 | *pneed |= service; |
404 | if (get_seconds() >= th->expires) | 404 | if (get_seconds() >= th->expires) |
405 | xi->have_keys &= ~service; | 405 | xi->have_keys &= ~service; |
406 | } | 406 | } |
407 | } | 407 | } |
408 | 408 | ||
409 | 409 | ||
/*
 * Build the next cephx request into [buf, end).
 *
 * If the AUTH session key itself is still needed, emit a
 * GET_AUTH_SESSION_KEY request (challenge proof + any previously held
 * ticket).  Otherwise, if service keys are needed, emit a
 * GET_PRINCIPAL_SESSION_KEY request carrying an authorizer for the
 * auth service.  Returns the number of bytes written, 0 if nothing is
 * needed, or a negative errno.
 */
static int ceph_x_build_request(struct ceph_auth_client *ac,
				void *buf, void *end)
{
	struct ceph_x_info *xi = ac->private;
	int need;
	struct ceph_x_request_header *head = buf;
	int ret;
	struct ceph_x_ticket_handler *th =
		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);

	if (IS_ERR(th))
		return PTR_ERR(th);

	ceph_x_validate_tickets(ac, &need);

	dout("build_request want %x have %x need %x\n",
	     ac->want_keys, xi->have_keys, need);

	if (need & CEPH_ENTITY_TYPE_AUTH) {
		struct ceph_x_authenticate *auth = (void *)(head + 1);
		void *p = auth + 1;
		struct ceph_x_challenge_blob tmp;
		char tmp_enc[40];
		u64 *u;

		if (p > end)
			return -ERANGE;

		dout(" get_auth_session_key\n");
		head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);

		/* encrypt and hash */
		get_random_bytes(&auth->client_challenge, sizeof(u64));
		tmp.client_challenge = auth->client_challenge;
		tmp.server_challenge = cpu_to_le64(xi->server_challenge);
		ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
				     tmp_enc, sizeof(tmp_enc));
		if (ret < 0)
			return ret;

		auth->struct_v = 1;
		auth->key = 0;
		/* fold the ciphertext, 8 bytes at a time, into the
		 * 64-bit proof-of-key value */
		for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
			auth->key ^= *(__le64 *)u;
		dout(" server_challenge %llx client_challenge %llx key %llx\n",
		     xi->server_challenge, le64_to_cpu(auth->client_challenge),
		     le64_to_cpu(auth->key));

		/* now encode the old ticket if exists */
		ret = ceph_x_encode_ticket(th, &p, end);
		if (ret < 0)
			return ret;

		return p - buf;
	}

	if (need) {
		void *p = head + 1;
		struct ceph_x_service_ticket_request *req;

		if (p > end)
			return -ERANGE;
		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);

		/* authorizer proves we hold the auth session key */
		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
		if (ret)
			return ret;
		ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
				 xi->auth_authorizer.buf->vec.iov_len);

		req = p;
		req->keys = cpu_to_le32(need);
		p += sizeof(*req);
		return p - buf;
	}

	return 0;
}
488 | 488 | ||
/*
 * Handle an auth reply from the monitor.
 *
 * Returns 0 once every key in ac->want_keys is held, -EAGAIN when a
 * further request/reply round trip is required (the caller rebuilds and
 * resends), or a negative errno on failure.
 */
static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
			       void *buf, void *end)
{
	struct ceph_x_info *xi = ac->private;
	struct ceph_x_reply_header *head = buf;
	struct ceph_x_ticket_handler *th;
	int len = end - buf;
	int op;
	int ret;

	if (result)
		return result;	/* XXX hmm? */

	if (xi->starting) {
		/* it's a hello: first reply carries the server challenge */
		struct ceph_x_server_challenge *sc = buf;

		if (len != sizeof(*sc))
			return -EINVAL;
		xi->server_challenge = le64_to_cpu(sc->server_challenge);
		dout("handle_reply got server challenge %llx\n",
		     xi->server_challenge);
		xi->starting = false;
		/* drop the AUTH key so the next request re-authenticates */
		xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
		return -EAGAIN;
	}

	op = le16_to_cpu(head->op);
	result = le32_to_cpu(head->result);
	dout("handle_reply op %d result %d\n", op, result);
	switch (op) {
	case CEPHX_GET_AUTH_SESSION_KEY:
		/* verify auth key: ticket is encrypted with our secret */
		ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
					       buf + sizeof(*head), end);
		break;

	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
		if (IS_ERR(th))
			return PTR_ERR(th);
		/* service tickets are encrypted with the auth session key */
		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
					       buf + sizeof(*head), end);
		break;

	default:
		return -EINVAL;
	}
	if (ret)
		return ret;
	if (ac->want_keys == xi->have_keys)
		return 0;
	return -EAGAIN;
}
543 | 543 | ||
544 | static int ceph_x_create_authorizer( | 544 | static int ceph_x_create_authorizer( |
545 | struct ceph_auth_client *ac, int peer_type, | 545 | struct ceph_auth_client *ac, int peer_type, |
546 | struct ceph_auth_handshake *auth) | 546 | struct ceph_auth_handshake *auth) |
547 | { | 547 | { |
548 | struct ceph_x_authorizer *au; | 548 | struct ceph_x_authorizer *au; |
549 | struct ceph_x_ticket_handler *th; | 549 | struct ceph_x_ticket_handler *th; |
550 | int ret; | 550 | int ret; |
551 | 551 | ||
552 | th = get_ticket_handler(ac, peer_type); | 552 | th = get_ticket_handler(ac, peer_type); |
553 | if (IS_ERR(th)) | 553 | if (IS_ERR(th)) |
554 | return PTR_ERR(th); | 554 | return PTR_ERR(th); |
555 | 555 | ||
556 | au = kzalloc(sizeof(*au), GFP_NOFS); | 556 | au = kzalloc(sizeof(*au), GFP_NOFS); |
557 | if (!au) | 557 | if (!au) |
558 | return -ENOMEM; | 558 | return -ENOMEM; |
559 | 559 | ||
560 | ret = ceph_x_build_authorizer(ac, th, au); | 560 | ret = ceph_x_build_authorizer(ac, th, au); |
561 | if (ret) { | 561 | if (ret) { |
562 | kfree(au); | 562 | kfree(au); |
563 | return ret; | 563 | return ret; |
564 | } | 564 | } |
565 | 565 | ||
566 | auth->authorizer = (struct ceph_authorizer *) au; | 566 | auth->authorizer = (struct ceph_authorizer *) au; |
567 | auth->authorizer_buf = au->buf->vec.iov_base; | 567 | auth->authorizer_buf = au->buf->vec.iov_base; |
568 | auth->authorizer_buf_len = au->buf->vec.iov_len; | 568 | auth->authorizer_buf_len = au->buf->vec.iov_len; |
569 | auth->authorizer_reply_buf = au->reply_buf; | 569 | auth->authorizer_reply_buf = au->reply_buf; |
570 | auth->authorizer_reply_buf_len = sizeof (au->reply_buf); | 570 | auth->authorizer_reply_buf_len = sizeof (au->reply_buf); |
571 | auth->sign_message = ac->ops->sign_message; | 571 | auth->sign_message = ac->ops->sign_message; |
572 | auth->check_message_signature = ac->ops->check_message_signature; | 572 | auth->check_message_signature = ac->ops->check_message_signature; |
573 | 573 | ||
574 | return 0; | 574 | return 0; |
575 | } | 575 | } |
576 | 576 | ||
577 | static int ceph_x_update_authorizer( | 577 | static int ceph_x_update_authorizer( |
578 | struct ceph_auth_client *ac, int peer_type, | 578 | struct ceph_auth_client *ac, int peer_type, |
579 | struct ceph_auth_handshake *auth) | 579 | struct ceph_auth_handshake *auth) |
580 | { | 580 | { |
581 | struct ceph_x_authorizer *au; | 581 | struct ceph_x_authorizer *au; |
582 | struct ceph_x_ticket_handler *th; | 582 | struct ceph_x_ticket_handler *th; |
583 | 583 | ||
584 | th = get_ticket_handler(ac, peer_type); | 584 | th = get_ticket_handler(ac, peer_type); |
585 | if (IS_ERR(th)) | 585 | if (IS_ERR(th)) |
586 | return PTR_ERR(th); | 586 | return PTR_ERR(th); |
587 | 587 | ||
588 | au = (struct ceph_x_authorizer *)auth->authorizer; | 588 | au = (struct ceph_x_authorizer *)auth->authorizer; |
589 | if (au->secret_id < th->secret_id) { | 589 | if (au->secret_id < th->secret_id) { |
590 | dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", | 590 | dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", |
591 | au->service, au->secret_id, th->secret_id); | 591 | au->service, au->secret_id, th->secret_id); |
592 | return ceph_x_build_authorizer(ac, th, au); | 592 | return ceph_x_build_authorizer(ac, th, au); |
593 | } | 593 | } |
594 | return 0; | 594 | return 0; |
595 | } | 595 | } |
596 | 596 | ||
597 | static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, | 597 | static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, |
598 | struct ceph_authorizer *a, size_t len) | 598 | struct ceph_authorizer *a, size_t len) |
599 | { | 599 | { |
600 | struct ceph_x_authorizer *au = (void *)a; | 600 | struct ceph_x_authorizer *au = (void *)a; |
601 | int ret = 0; | 601 | int ret = 0; |
602 | struct ceph_x_authorize_reply reply; | 602 | struct ceph_x_authorize_reply reply; |
603 | void *preply = &reply; | 603 | void *preply = &reply; |
604 | void *p = au->reply_buf; | 604 | void *p = au->reply_buf; |
605 | void *end = p + sizeof(au->reply_buf); | 605 | void *end = p + sizeof(au->reply_buf); |
606 | 606 | ||
607 | ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply)); | 607 | ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply)); |
608 | if (ret < 0) | 608 | if (ret < 0) |
609 | return ret; | 609 | return ret; |
610 | if (ret != sizeof(reply)) | 610 | if (ret != sizeof(reply)) |
611 | return -EPERM; | 611 | return -EPERM; |
612 | 612 | ||
613 | if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) | 613 | if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) |
614 | ret = -EPERM; | 614 | ret = -EPERM; |
615 | else | 615 | else |
616 | ret = 0; | 616 | ret = 0; |
617 | dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", | 617 | dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", |
618 | au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); | 618 | au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); |
619 | return ret; | 619 | return ret; |
620 | } | 620 | } |
621 | 621 | ||
622 | static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, | 622 | static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, |
623 | struct ceph_authorizer *a) | 623 | struct ceph_authorizer *a) |
624 | { | 624 | { |
625 | struct ceph_x_authorizer *au = (void *)a; | 625 | struct ceph_x_authorizer *au = (void *)a; |
626 | 626 | ||
627 | ceph_crypto_key_destroy(&au->session_key); | 627 | ceph_crypto_key_destroy(&au->session_key); |
628 | ceph_buffer_put(au->buf); | 628 | ceph_buffer_put(au->buf); |
629 | kfree(au); | 629 | kfree(au); |
630 | } | 630 | } |
631 | 631 | ||
632 | 632 | ||
633 | static void ceph_x_reset(struct ceph_auth_client *ac) | 633 | static void ceph_x_reset(struct ceph_auth_client *ac) |
634 | { | 634 | { |
635 | struct ceph_x_info *xi = ac->private; | 635 | struct ceph_x_info *xi = ac->private; |
636 | 636 | ||
637 | dout("reset\n"); | 637 | dout("reset\n"); |
638 | xi->starting = true; | 638 | xi->starting = true; |
639 | xi->server_challenge = 0; | 639 | xi->server_challenge = 0; |
640 | } | 640 | } |
641 | 641 | ||
642 | static void ceph_x_destroy(struct ceph_auth_client *ac) | 642 | static void ceph_x_destroy(struct ceph_auth_client *ac) |
643 | { | 643 | { |
644 | struct ceph_x_info *xi = ac->private; | 644 | struct ceph_x_info *xi = ac->private; |
645 | struct rb_node *p; | 645 | struct rb_node *p; |
646 | 646 | ||
647 | dout("ceph_x_destroy %p\n", ac); | 647 | dout("ceph_x_destroy %p\n", ac); |
648 | ceph_crypto_key_destroy(&xi->secret); | 648 | ceph_crypto_key_destroy(&xi->secret); |
649 | 649 | ||
650 | while ((p = rb_first(&xi->ticket_handlers)) != NULL) { | 650 | while ((p = rb_first(&xi->ticket_handlers)) != NULL) { |
651 | struct ceph_x_ticket_handler *th = | 651 | struct ceph_x_ticket_handler *th = |
652 | rb_entry(p, struct ceph_x_ticket_handler, node); | 652 | rb_entry(p, struct ceph_x_ticket_handler, node); |
653 | remove_ticket_handler(ac, th); | 653 | remove_ticket_handler(ac, th); |
654 | } | 654 | } |
655 | 655 | ||
656 | if (xi->auth_authorizer.buf) | 656 | if (xi->auth_authorizer.buf) |
657 | ceph_buffer_put(xi->auth_authorizer.buf); | 657 | ceph_buffer_put(xi->auth_authorizer.buf); |
658 | 658 | ||
659 | kfree(ac->private); | 659 | kfree(ac->private); |
660 | ac->private = NULL; | 660 | ac->private = NULL; |
661 | } | 661 | } |
662 | 662 | ||
663 | static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, | 663 | static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, |
664 | int peer_type) | 664 | int peer_type) |
665 | { | 665 | { |
666 | struct ceph_x_ticket_handler *th; | 666 | struct ceph_x_ticket_handler *th; |
667 | 667 | ||
668 | th = get_ticket_handler(ac, peer_type); | 668 | th = get_ticket_handler(ac, peer_type); |
669 | if (!IS_ERR(th)) | 669 | if (!IS_ERR(th)) |
670 | memset(&th->validity, 0, sizeof(th->validity)); | 670 | memset(&th->validity, 0, sizeof(th->validity)); |
671 | } | 671 | } |
672 | 672 | ||
/*
 * Compute a message signature: encrypt the message CRCs (header, front,
 * middle, data) with the authorizer's session key and take 64 bits of
 * the resulting ciphertext.  All on-wire values are little-endian.
 */
static int calcu_signature(struct ceph_x_authorizer *au,
			   struct ceph_msg *msg, __le64 *sig)
{
	int ret;
	char tmp_enc[40];
	__le32 tmp[5] = {
		cpu_to_le32(16), msg->hdr.crc, msg->footer.front_crc,
		msg->footer.middle_crc, msg->footer.data_crc,
	};
	ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp),
			     tmp_enc, sizeof(tmp_enc));
	if (ret < 0)
		return ret;
	/* take 8 bytes at offset 4 of the ciphertext (presumably past a
	 * length/header prefix — matches the server's computation) */
	*sig = *(__le64*)(tmp_enc + 4);
	return 0;
}
689 | 689 | ||
690 | static int ceph_x_sign_message(struct ceph_auth_handshake *auth, | 690 | static int ceph_x_sign_message(struct ceph_auth_handshake *auth, |
691 | struct ceph_msg *msg) | 691 | struct ceph_msg *msg) |
692 | { | 692 | { |
693 | int ret; | 693 | int ret; |
694 | if (!auth->authorizer) | 694 | if (!auth->authorizer) |
695 | return 0; | 695 | return 0; |
696 | ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, | 696 | ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, |
697 | msg, &msg->footer.sig); | 697 | msg, &msg->footer.sig); |
698 | if (ret < 0) | 698 | if (ret < 0) |
699 | return ret; | 699 | return ret; |
700 | msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED; | 700 | msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED; |
701 | return 0; | 701 | return 0; |
702 | } | 702 | } |
703 | 703 | ||
704 | static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, | 704 | static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, |
705 | struct ceph_msg *msg) | 705 | struct ceph_msg *msg) |
706 | { | 706 | { |
707 | __le64 sig_check; | 707 | __le64 sig_check; |
708 | int ret; | 708 | int ret; |
709 | 709 | ||
710 | if (!auth->authorizer) | 710 | if (!auth->authorizer) |
711 | return 0; | 711 | return 0; |
712 | ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, | 712 | ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, |
713 | msg, &sig_check); | 713 | msg, &sig_check); |
714 | if (ret < 0) | 714 | if (ret < 0) |
715 | return ret; | 715 | return ret; |
716 | if (sig_check == msg->footer.sig) | 716 | if (sig_check == msg->footer.sig) |
717 | return 0; | 717 | return 0; |
718 | if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED) | 718 | if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED) |
719 | dout("ceph_x_check_message_signature %p has signature %llx " | 719 | dout("ceph_x_check_message_signature %p has signature %llx " |
720 | "expect %llx\n", msg, msg->footer.sig, sig_check); | 720 | "expect %llx\n", msg, msg->footer.sig, sig_check); |
721 | else | 721 | else |
722 | dout("ceph_x_check_message_signature %p sender did not set " | 722 | dout("ceph_x_check_message_signature %p sender did not set " |
723 | "CEPH_MSG_FOOTER_SIGNED\n", msg); | 723 | "CEPH_MSG_FOOTER_SIGNED\n", msg); |
724 | return -EBADMSG; | 724 | return -EBADMSG; |
725 | } | 725 | } |
726 | 726 | ||
/*
 * cephx implementation of the generic auth client operations, wired up
 * by ceph_x_init().
 */
static const struct ceph_auth_client_ops ceph_x_ops = {
	.name = "x",
	.is_authenticated = ceph_x_is_authenticated,
	.should_authenticate = ceph_x_should_authenticate,
	.build_request = ceph_x_build_request,
	.handle_reply = ceph_x_handle_reply,
	.create_authorizer = ceph_x_create_authorizer,
	.update_authorizer = ceph_x_update_authorizer,
	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
	.destroy_authorizer = ceph_x_destroy_authorizer,
	.invalidate_authorizer = ceph_x_invalidate_authorizer,
	.reset = ceph_x_reset,
	.destroy = ceph_x_destroy,
	.sign_message = ceph_x_sign_message,
	.check_message_signature = ceph_x_check_message_signature,
};
743 | 743 | ||
744 | 744 | ||
745 | int ceph_x_init(struct ceph_auth_client *ac) | 745 | int ceph_x_init(struct ceph_auth_client *ac) |
746 | { | 746 | { |
747 | struct ceph_x_info *xi; | 747 | struct ceph_x_info *xi; |
748 | int ret; | 748 | int ret; |
749 | 749 | ||
750 | dout("ceph_x_init %p\n", ac); | 750 | dout("ceph_x_init %p\n", ac); |
751 | ret = -ENOMEM; | 751 | ret = -ENOMEM; |
752 | xi = kzalloc(sizeof(*xi), GFP_NOFS); | 752 | xi = kzalloc(sizeof(*xi), GFP_NOFS); |
753 | if (!xi) | 753 | if (!xi) |
754 | goto out; | 754 | goto out; |
755 | 755 | ||
756 | ret = -EINVAL; | 756 | ret = -EINVAL; |
757 | if (!ac->key) { | 757 | if (!ac->key) { |
758 | pr_err("no secret set (for auth_x protocol)\n"); | 758 | pr_err("no secret set (for auth_x protocol)\n"); |
759 | goto out_nomem; | 759 | goto out_nomem; |
760 | } | 760 | } |
761 | 761 | ||
762 | ret = ceph_crypto_key_clone(&xi->secret, ac->key); | 762 | ret = ceph_crypto_key_clone(&xi->secret, ac->key); |
763 | if (ret < 0) { | 763 | if (ret < 0) { |
764 | pr_err("cannot clone key: %d\n", ret); | 764 | pr_err("cannot clone key: %d\n", ret); |
765 | goto out_nomem; | 765 | goto out_nomem; |
766 | } | 766 | } |
767 | 767 | ||
768 | xi->starting = true; | 768 | xi->starting = true; |
769 | xi->ticket_handlers = RB_ROOT; | 769 | xi->ticket_handlers = RB_ROOT; |
770 | 770 | ||
771 | ac->protocol = CEPH_AUTH_CEPHX; | 771 | ac->protocol = CEPH_AUTH_CEPHX; |
772 | ac->private = xi; | 772 | ac->private = xi; |
773 | ac->ops = &ceph_x_ops; | 773 | ac->ops = &ceph_x_ops; |
774 | return 0; | 774 | return 0; |
775 | 775 | ||
776 | out_nomem: | 776 | out_nomem: |
777 | kfree(xi); | 777 | kfree(xi); |
778 | out: | 778 | out: |
779 | return ret; | 779 | return ret; |
780 | } | 780 | } |
781 | 781 | ||
782 | 782 | ||
783 | 783 |
net/ceph/mon_client.c
1 | #include <linux/ceph/ceph_debug.h> | 1 | #include <linux/ceph/ceph_debug.h> |
2 | 2 | ||
3 | #include <linux/module.h> | 3 | #include <linux/module.h> |
4 | #include <linux/types.h> | 4 | #include <linux/types.h> |
5 | #include <linux/slab.h> | 5 | #include <linux/slab.h> |
6 | #include <linux/random.h> | 6 | #include <linux/random.h> |
7 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
8 | 8 | ||
9 | #include <linux/ceph/mon_client.h> | 9 | #include <linux/ceph/mon_client.h> |
10 | #include <linux/ceph/libceph.h> | 10 | #include <linux/ceph/libceph.h> |
11 | #include <linux/ceph/debugfs.h> | 11 | #include <linux/ceph/debugfs.h> |
12 | #include <linux/ceph/decode.h> | 12 | #include <linux/ceph/decode.h> |
13 | #include <linux/ceph/auth.h> | 13 | #include <linux/ceph/auth.h> |
14 | 14 | ||
15 | /* | 15 | /* |
16 | * Interact with Ceph monitor cluster. Handle requests for new map | 16 | * Interact with Ceph monitor cluster. Handle requests for new map |
17 | * versions, and periodically resend as needed. Also implement | 17 | * versions, and periodically resend as needed. Also implement |
18 | * statfs() and umount(). | 18 | * statfs() and umount(). |
19 | * | 19 | * |
20 | * A small cluster of Ceph "monitors" are responsible for managing critical | 20 | * A small cluster of Ceph "monitors" are responsible for managing critical |
21 | * cluster configuration and state information. An odd number (e.g., 3, 5) | 21 | * cluster configuration and state information. An odd number (e.g., 3, 5) |
22 | * of cmon daemons use a modified version of the Paxos part-time parliament | 22 | * of cmon daemons use a modified version of the Paxos part-time parliament |
23 | * algorithm to manage the MDS map (mds cluster membership), OSD map, and | 23 | * algorithm to manage the MDS map (mds cluster membership), OSD map, and |
24 | * list of clients who have mounted the file system. | 24 | * list of clients who have mounted the file system. |
25 | * | 25 | * |
26 | * We maintain an open, active session with a monitor at all times in order to | 26 | * We maintain an open, active session with a monitor at all times in order to |
27 | * receive timely MDSMap updates. We periodically send a keepalive byte on the | 27 | * receive timely MDSMap updates. We periodically send a keepalive byte on the |
28 | * TCP socket to ensure we detect a failure. If the connection does break, we | 28 | * TCP socket to ensure we detect a failure. If the connection does break, we |
29 | * randomly hunt for a new monitor. Once the connection is reestablished, we | 29 | * randomly hunt for a new monitor. Once the connection is reestablished, we |
30 | * resend any outstanding requests. | 30 | * resend any outstanding requests. |
31 | */ | 31 | */ |
32 | 32 | ||
33 | static const struct ceph_connection_operations mon_con_ops; | 33 | static const struct ceph_connection_operations mon_con_ops; |
34 | 34 | ||
35 | static int __validate_auth(struct ceph_mon_client *monc); | 35 | static int __validate_auth(struct ceph_mon_client *monc); |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * Decode a monmap blob (e.g., during mount). | 38 | * Decode a monmap blob (e.g., during mount). |
39 | */ | 39 | */ |
40 | struct ceph_monmap *ceph_monmap_decode(void *p, void *end) | 40 | struct ceph_monmap *ceph_monmap_decode(void *p, void *end) |
41 | { | 41 | { |
42 | struct ceph_monmap *m = NULL; | 42 | struct ceph_monmap *m = NULL; |
43 | int i, err = -EINVAL; | 43 | int i, err = -EINVAL; |
44 | struct ceph_fsid fsid; | 44 | struct ceph_fsid fsid; |
45 | u32 epoch, num_mon; | 45 | u32 epoch, num_mon; |
46 | u16 version; | 46 | u16 version; |
47 | u32 len; | 47 | u32 len; |
48 | 48 | ||
49 | ceph_decode_32_safe(&p, end, len, bad); | 49 | ceph_decode_32_safe(&p, end, len, bad); |
50 | ceph_decode_need(&p, end, len, bad); | 50 | ceph_decode_need(&p, end, len, bad); |
51 | 51 | ||
52 | dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); | 52 | dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); |
53 | 53 | ||
54 | ceph_decode_16_safe(&p, end, version, bad); | 54 | ceph_decode_16_safe(&p, end, version, bad); |
55 | 55 | ||
56 | ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); | 56 | ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); |
57 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); | 57 | ceph_decode_copy(&p, &fsid, sizeof(fsid)); |
58 | epoch = ceph_decode_32(&p); | 58 | epoch = ceph_decode_32(&p); |
59 | 59 | ||
60 | num_mon = ceph_decode_32(&p); | 60 | num_mon = ceph_decode_32(&p); |
61 | ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); | 61 | ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); |
62 | 62 | ||
63 | if (num_mon >= CEPH_MAX_MON) | 63 | if (num_mon >= CEPH_MAX_MON) |
64 | goto bad; | 64 | goto bad; |
65 | m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); | 65 | m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); |
66 | if (m == NULL) | 66 | if (m == NULL) |
67 | return ERR_PTR(-ENOMEM); | 67 | return ERR_PTR(-ENOMEM); |
68 | m->fsid = fsid; | 68 | m->fsid = fsid; |
69 | m->epoch = epoch; | 69 | m->epoch = epoch; |
70 | m->num_mon = num_mon; | 70 | m->num_mon = num_mon; |
71 | ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); | 71 | ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); |
72 | for (i = 0; i < num_mon; i++) | 72 | for (i = 0; i < num_mon; i++) |
73 | ceph_decode_addr(&m->mon_inst[i].addr); | 73 | ceph_decode_addr(&m->mon_inst[i].addr); |
74 | 74 | ||
75 | dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, | 75 | dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, |
76 | m->num_mon); | 76 | m->num_mon); |
77 | for (i = 0; i < m->num_mon; i++) | 77 | for (i = 0; i < m->num_mon; i++) |
78 | dout("monmap_decode mon%d is %s\n", i, | 78 | dout("monmap_decode mon%d is %s\n", i, |
79 | ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); | 79 | ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); |
80 | return m; | 80 | return m; |
81 | 81 | ||
82 | bad: | 82 | bad: |
83 | dout("monmap_decode failed with %d\n", err); | 83 | dout("monmap_decode failed with %d\n", err); |
84 | kfree(m); | 84 | kfree(m); |
85 | return ERR_PTR(err); | 85 | return ERR_PTR(err); |
86 | } | 86 | } |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * return true if *addr is included in the monmap. | 89 | * return true if *addr is included in the monmap. |
90 | */ | 90 | */ |
91 | int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) | 91 | int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) |
92 | { | 92 | { |
93 | int i; | 93 | int i; |
94 | 94 | ||
95 | for (i = 0; i < m->num_mon; i++) | 95 | for (i = 0; i < m->num_mon; i++) |
96 | if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) | 96 | if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) |
97 | return 1; | 97 | return 1; |
98 | return 0; | 98 | return 0; |
99 | } | 99 | } |
100 | 100 | ||
101 | /* | 101 | /* |
102 | * Send an auth request. | 102 | * Send an auth request. |
103 | */ | 103 | */ |
104 | static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) | 104 | static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) |
105 | { | 105 | { |
106 | monc->pending_auth = 1; | 106 | monc->pending_auth = 1; |
107 | monc->m_auth->front.iov_len = len; | 107 | monc->m_auth->front.iov_len = len; |
108 | monc->m_auth->hdr.front_len = cpu_to_le32(len); | 108 | monc->m_auth->hdr.front_len = cpu_to_le32(len); |
109 | ceph_msg_revoke(monc->m_auth); | 109 | ceph_msg_revoke(monc->m_auth); |
110 | ceph_msg_get(monc->m_auth); /* keep our ref */ | 110 | ceph_msg_get(monc->m_auth); /* keep our ref */ |
111 | ceph_con_send(&monc->con, monc->m_auth); | 111 | ceph_con_send(&monc->con, monc->m_auth); |
112 | } | 112 | } |
113 | 113 | ||
114 | /* | 114 | /* |
115 | * Close monitor session, if any. | 115 | * Close monitor session, if any. |
116 | */ | 116 | */ |
/*
 * Close monitor session, if any.
 *
 * Revokes all in-flight outgoing/incoming auth and subscribe messages
 * before dropping the connection, then resets the auth state so the
 * next __open_session() starts a fresh handshake.
 */
static void __close_session(struct ceph_mon_client *monc)
{
	dout("__close_session closing mon%d\n", monc->cur_mon);
	ceph_msg_revoke(monc->m_auth);
	ceph_msg_revoke_incoming(monc->m_auth_reply);
	ceph_msg_revoke(monc->m_subscribe);
	ceph_msg_revoke_incoming(monc->m_subscribe_ack);
	ceph_con_close(&monc->con);
	monc->cur_mon = -1;	/* no current monitor */
	monc->pending_auth = 0;
	ceph_auth_reset(monc->auth);
}
129 | 129 | ||
130 | /* | 130 | /* |
131 | * Open a session with a (new) monitor. | 131 | * Open a session with a (new) monitor. |
132 | */ | 132 | */ |
133 | static int __open_session(struct ceph_mon_client *monc) | 133 | static int __open_session(struct ceph_mon_client *monc) |
134 | { | 134 | { |
135 | char r; | 135 | char r; |
136 | int ret; | 136 | int ret; |
137 | 137 | ||
138 | if (monc->cur_mon < 0) { | 138 | if (monc->cur_mon < 0) { |
139 | get_random_bytes(&r, 1); | 139 | get_random_bytes(&r, 1); |
140 | monc->cur_mon = r % monc->monmap->num_mon; | 140 | monc->cur_mon = r % monc->monmap->num_mon; |
141 | dout("open_session num=%d r=%d -> mon%d\n", | 141 | dout("open_session num=%d r=%d -> mon%d\n", |
142 | monc->monmap->num_mon, r, monc->cur_mon); | 142 | monc->monmap->num_mon, r, monc->cur_mon); |
143 | monc->sub_sent = 0; | 143 | monc->sub_sent = 0; |
144 | monc->sub_renew_after = jiffies; /* i.e., expired */ | 144 | monc->sub_renew_after = jiffies; /* i.e., expired */ |
145 | monc->want_next_osdmap = !!monc->want_next_osdmap; | 145 | monc->want_next_osdmap = !!monc->want_next_osdmap; |
146 | 146 | ||
147 | dout("open_session mon%d opening\n", monc->cur_mon); | 147 | dout("open_session mon%d opening\n", monc->cur_mon); |
148 | ceph_con_open(&monc->con, | 148 | ceph_con_open(&monc->con, |
149 | CEPH_ENTITY_TYPE_MON, monc->cur_mon, | 149 | CEPH_ENTITY_TYPE_MON, monc->cur_mon, |
150 | &monc->monmap->mon_inst[monc->cur_mon].addr); | 150 | &monc->monmap->mon_inst[monc->cur_mon].addr); |
151 | 151 | ||
152 | /* initiatiate authentication handshake */ | 152 | /* initiatiate authentication handshake */ |
153 | ret = ceph_auth_build_hello(monc->auth, | 153 | ret = ceph_auth_build_hello(monc->auth, |
154 | monc->m_auth->front.iov_base, | 154 | monc->m_auth->front.iov_base, |
155 | monc->m_auth->front_alloc_len); | 155 | monc->m_auth->front_alloc_len); |
156 | __send_prepared_auth_request(monc, ret); | 156 | __send_prepared_auth_request(monc, ret); |
157 | } else { | 157 | } else { |
158 | dout("open_session mon%d already open\n", monc->cur_mon); | 158 | dout("open_session mon%d already open\n", monc->cur_mon); |
159 | } | 159 | } |
160 | return 0; | 160 | return 0; |
161 | } | 161 | } |
162 | 162 | ||
/* true once the subscription renewal deadline has passed */
static bool __sub_expired(struct ceph_mon_client *monc)
{
	return time_after_eq(jiffies, monc->sub_renew_after);
}
167 | 167 | ||
168 | /* | 168 | /* |
169 | * Reschedule delayed work timer. | 169 | * Reschedule delayed work timer. |
170 | */ | 170 | */ |
171 | static void __schedule_delayed(struct ceph_mon_client *monc) | 171 | static void __schedule_delayed(struct ceph_mon_client *monc) |
172 | { | 172 | { |
173 | unsigned int delay; | 173 | unsigned int delay; |
174 | 174 | ||
175 | if (monc->cur_mon < 0 || __sub_expired(monc)) | 175 | if (monc->cur_mon < 0 || __sub_expired(monc)) |
176 | delay = 10 * HZ; | 176 | delay = 10 * HZ; |
177 | else | 177 | else |
178 | delay = 20 * HZ; | 178 | delay = 20 * HZ; |
179 | dout("__schedule_delayed after %u\n", delay); | 179 | dout("__schedule_delayed after %u\n", delay); |
180 | schedule_delayed_work(&monc->delayed_work, delay); | 180 | schedule_delayed_work(&monc->delayed_work, delay); |
181 | } | 181 | } |
182 | 182 | ||
/*
 * Send subscribe request for mdsmap and/or osdmap.
 *
 * Builds the subscribe message in place (count, then one string +
 * ceph_mon_subscribe_item pair per map) and hands it to the monitor
 * connection.  A new request is only sent when the previous
 * subscription has expired with none outstanding, or when a fresh
 * osdmap was just requested (want_next_osdmap == 1).
 * Caller must hold monc->mutex (all callers in this file do).
 */
static void __send_subscribe(struct ceph_mon_client *monc)
{
	dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
	     (unsigned int)monc->sub_sent, __sub_expired(monc),
	     monc->want_next_osdmap);
	if ((__sub_expired(monc) && !monc->sub_sent) ||
	    monc->want_next_osdmap == 1) {
		struct ceph_msg *msg = monc->m_subscribe;
		struct ceph_mon_subscribe_item *i;
		void *p, *end;
		int num;

		p = msg->front.iov_base;
		end = p + msg->front_alloc_len;

		/* always subscribe to monmap, plus optional osdmap/mdsmap */
		num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
		ceph_encode_32(&p, num);

		if (monc->want_next_osdmap) {
			dout("__send_subscribe to 'osdmap' %u\n",
			     (unsigned int)monc->have_osdmap);
			ceph_encode_string(&p, end, "osdmap", 6);
			i = p;
			i->have = cpu_to_le64(monc->have_osdmap);
			i->onetime = 1;	/* just the next map, not a stream */
			p += sizeof(*i);
			monc->want_next_osdmap = 2; /* requested */
		}
		if (monc->want_mdsmap) {
			dout("__send_subscribe to 'mdsmap' %u+\n",
			     (unsigned int)monc->have_mdsmap);
			ceph_encode_string(&p, end, "mdsmap", 6);
			i = p;
			i->have = cpu_to_le64(monc->have_mdsmap);
			i->onetime = 0;
			p += sizeof(*i);
		}
		ceph_encode_string(&p, end, "monmap", 6);
		i = p;
		i->have = 0;
		i->onetime = 0;
		p += sizeof(*i);

		/* fix up the front length now that we know the real size */
		msg->front.iov_len = p - msg->front.iov_base;
		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
		/* revoke any previous in-flight copy before resending */
		ceph_msg_revoke(msg);
		ceph_con_send(&monc->con, ceph_msg_get(msg));

		monc->sub_sent = jiffies | 1; /* never 0 */
	}
}
237 | 237 | ||
/*
 * Handle a subscribe-ack from the monitor: the session is now known
 * good (ends hunting) and the ack tells us how long the subscription
 * lasts.  A malformed message is logged and dumped, nothing more.
 */
static void handle_subscribe_ack(struct ceph_mon_client *monc,
				 struct ceph_msg *msg)
{
	unsigned int seconds;
	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;

	if (msg->front.iov_len < sizeof(*h))
		goto bad;
	seconds = le32_to_cpu(h->duration);

	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		pr_info("mon%d %s session established\n",
			monc->cur_mon,
			ceph_pr_addr(&monc->con.peer_addr.in_addr));
		monc->hunting = false;
	}
	dout("handle_subscribe_ack after %d seconds\n", seconds);
	/* renew at half the granted duration to leave slack for resend */
	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
	monc->sub_sent = 0;	/* 0 == no subscribe outstanding */
	mutex_unlock(&monc->mutex);
	return;
bad:
	pr_err("got corrupt subscribe-ack msg\n");
	ceph_msg_dump(msg);
}
264 | 264 | ||
/*
 * Keep track of which maps we have
 */
/*
 * Record the mdsmap epoch we now hold; __send_subscribe() uses it to
 * fill the "have" field of future subscribe requests.  Always returns 0.
 */
int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
{
	mutex_lock(&monc->mutex);
	monc->have_mdsmap = got;
	mutex_unlock(&monc->mutex);
	return 0;
}
EXPORT_SYMBOL(ceph_monc_got_mdsmap);
276 | 276 | ||
/*
 * Record the osdmap epoch we now hold and clear any pending request
 * for the next osdmap.  Always returns 0.
 */
int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
{
	mutex_lock(&monc->mutex);
	monc->have_osdmap = got;
	monc->want_next_osdmap = 0;	/* request satisfied */
	mutex_unlock(&monc->mutex);
	return 0;
}
285 | 285 | ||
/*
 * Register interest in the next osdmap
 */
void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
{
	dout("request_next_osdmap have %u\n", monc->have_osdmap);
	mutex_lock(&monc->mutex);
	/*
	 * want_next_osdmap states: 0 = not wanted, 1 = wanted but not
	 * yet requested, 2 = subscribe already sent (__send_subscribe
	 * sets it).  Only (re)send while we have not asked yet.
	 */
	if (!monc->want_next_osdmap)
		monc->want_next_osdmap = 1;
	if (monc->want_next_osdmap < 2)
		__send_subscribe(monc);
	mutex_unlock(&monc->mutex);
}
EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
300 | 300 | ||
/*
 * Wait until we hold an osdmap with epoch >= @epoch, or until @timeout
 * jiffies have elapsed (timeout == 0 disables the overall deadline).
 *
 * Returns 0 on success, -ETIMEDOUT on deadline, or a negative error
 * if the interruptible wait was interrupted.
 */
int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
			  unsigned long timeout)
{
	unsigned long started = jiffies;	/* overall deadline base */
	int ret;

	mutex_lock(&monc->mutex);
	while (monc->have_osdmap < epoch) {
		mutex_unlock(&monc->mutex);

		if (timeout != 0 && time_after_eq(jiffies, started + timeout))
			return -ETIMEDOUT;

		/*
		 * NOTE(review): the wait condition reads have_osdmap
		 * without the mutex; presumably a stale read is benign
		 * because the value is re-checked under the lock after
		 * waking -- confirm.
		 */
		ret = wait_event_interruptible_timeout(monc->client->auth_wq,
					monc->have_osdmap >= epoch, timeout);
		if (ret < 0)
			return ret;

		mutex_lock(&monc->mutex);
	}

	mutex_unlock(&monc->mutex);
	return 0;
}
EXPORT_SYMBOL(ceph_monc_wait_osdmap);
326 | 326 | ||
/*
 * Open a session with a monitor and arm the periodic delayed work
 * that keeps it alive.  Always returns 0.
 */
int ceph_monc_open_session(struct ceph_mon_client *monc)
{
	mutex_lock(&monc->mutex);
	__open_session(monc);
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
	return 0;
}
EXPORT_SYMBOL(ceph_monc_open_session);
339 | 339 | ||
340 | /* | 340 | /* |
341 | * We require the fsid and global_id in order to initialize our | 341 | * We require the fsid and global_id in order to initialize our |
342 | * debugfs dir. | 342 | * debugfs dir. |
343 | */ | 343 | */ |
344 | static bool have_debugfs_info(struct ceph_mon_client *monc) | 344 | static bool have_debugfs_info(struct ceph_mon_client *monc) |
345 | { | 345 | { |
346 | dout("have_debugfs_info fsid %d globalid %lld\n", | 346 | dout("have_debugfs_info fsid %d globalid %lld\n", |
347 | (int)monc->client->have_fsid, monc->auth->global_id); | 347 | (int)monc->client->have_fsid, monc->auth->global_id); |
348 | return monc->client->have_fsid && monc->auth->global_id > 0; | 348 | return monc->client->have_fsid && monc->auth->global_id > 0; |
349 | } | 349 | } |
350 | 350 | ||
/*
 * The monitor responds with mount ack indicate mount success.  The
 * included client ticket allows the client to talk to MDSs and OSDs.
 *
 * Decode the new monmap, verify its fsid, swap it in for the old one,
 * and (on first success) record the fsid and create the client's
 * debugfs entries.  Waiters on auth_wq are always woken at the end.
 */
static void ceph_monc_handle_map(struct ceph_mon_client *monc,
				 struct ceph_msg *msg)
{
	struct ceph_client *client = monc->client;
	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
	void *p, *end;
	int had_debugfs_info, init_debugfs = 0;

	mutex_lock(&monc->mutex);

	/* sample before this map possibly completes fsid/global_id */
	had_debugfs_info = have_debugfs_info(monc);

	dout("handle_monmap\n");
	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	monmap = ceph_monmap_decode(p, end);
	if (IS_ERR(monmap)) {
		pr_err("problem decoding monmap, %d\n",
		       (int)PTR_ERR(monmap));
		goto out;
	}

	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
		kfree(monmap);	/* fsid mismatch: drop the new map */
		goto out;
	}

	/* install the new map and free the one it replaces */
	client->monc.monmap = monmap;
	kfree(old);

	if (!client->have_fsid) {
		client->have_fsid = true;
		if (!had_debugfs_info && have_debugfs_info(monc)) {
			pr_info("client%lld fsid %pU\n",
				ceph_client_id(monc->client),
				&monc->client->fsid);
			init_debugfs = 1;
		}
		mutex_unlock(&monc->mutex);

		if (init_debugfs) {
			/*
			 * do debugfs initialization without mutex to avoid
			 * creating a locking dependency
			 */
			ceph_debugfs_client_init(monc->client);
		}

		goto out_unlocked;
	}
out:
	mutex_unlock(&monc->mutex);
out_unlocked:
	wake_up_all(&client->auth_wq);
}
411 | 411 | ||
412 | /* | 412 | /* |
413 | * generic requests (e.g., statfs, poolop) | 413 | * generic requests (e.g., statfs, poolop) |
414 | */ | 414 | */ |
415 | static struct ceph_mon_generic_request *__lookup_generic_req( | 415 | static struct ceph_mon_generic_request *__lookup_generic_req( |
416 | struct ceph_mon_client *monc, u64 tid) | 416 | struct ceph_mon_client *monc, u64 tid) |
417 | { | 417 | { |
418 | struct ceph_mon_generic_request *req; | 418 | struct ceph_mon_generic_request *req; |
419 | struct rb_node *n = monc->generic_request_tree.rb_node; | 419 | struct rb_node *n = monc->generic_request_tree.rb_node; |
420 | 420 | ||
421 | while (n) { | 421 | while (n) { |
422 | req = rb_entry(n, struct ceph_mon_generic_request, node); | 422 | req = rb_entry(n, struct ceph_mon_generic_request, node); |
423 | if (tid < req->tid) | 423 | if (tid < req->tid) |
424 | n = n->rb_left; | 424 | n = n->rb_left; |
425 | else if (tid > req->tid) | 425 | else if (tid > req->tid) |
426 | n = n->rb_right; | 426 | n = n->rb_right; |
427 | else | 427 | else |
428 | return req; | 428 | return req; |
429 | } | 429 | } |
430 | return NULL; | 430 | return NULL; |
431 | } | 431 | } |
432 | 432 | ||
433 | static void __insert_generic_request(struct ceph_mon_client *monc, | 433 | static void __insert_generic_request(struct ceph_mon_client *monc, |
434 | struct ceph_mon_generic_request *new) | 434 | struct ceph_mon_generic_request *new) |
435 | { | 435 | { |
436 | struct rb_node **p = &monc->generic_request_tree.rb_node; | 436 | struct rb_node **p = &monc->generic_request_tree.rb_node; |
437 | struct rb_node *parent = NULL; | 437 | struct rb_node *parent = NULL; |
438 | struct ceph_mon_generic_request *req = NULL; | 438 | struct ceph_mon_generic_request *req = NULL; |
439 | 439 | ||
440 | while (*p) { | 440 | while (*p) { |
441 | parent = *p; | 441 | parent = *p; |
442 | req = rb_entry(parent, struct ceph_mon_generic_request, node); | 442 | req = rb_entry(parent, struct ceph_mon_generic_request, node); |
443 | if (new->tid < req->tid) | 443 | if (new->tid < req->tid) |
444 | p = &(*p)->rb_left; | 444 | p = &(*p)->rb_left; |
445 | else if (new->tid > req->tid) | 445 | else if (new->tid > req->tid) |
446 | p = &(*p)->rb_right; | 446 | p = &(*p)->rb_right; |
447 | else | 447 | else |
448 | BUG(); | 448 | BUG(); |
449 | } | 449 | } |
450 | 450 | ||
451 | rb_link_node(&new->node, parent, p); | 451 | rb_link_node(&new->node, parent, p); |
452 | rb_insert_color(&new->node, &monc->generic_request_tree); | 452 | rb_insert_color(&new->node, &monc->generic_request_tree); |
453 | } | 453 | } |
454 | 454 | ||
455 | static void release_generic_request(struct kref *kref) | 455 | static void release_generic_request(struct kref *kref) |
456 | { | 456 | { |
457 | struct ceph_mon_generic_request *req = | 457 | struct ceph_mon_generic_request *req = |
458 | container_of(kref, struct ceph_mon_generic_request, kref); | 458 | container_of(kref, struct ceph_mon_generic_request, kref); |
459 | 459 | ||
460 | if (req->reply) | 460 | if (req->reply) |
461 | ceph_msg_put(req->reply); | 461 | ceph_msg_put(req->reply); |
462 | if (req->request) | 462 | if (req->request) |
463 | ceph_msg_put(req->request); | 463 | ceph_msg_put(req->request); |
464 | 464 | ||
465 | kfree(req); | 465 | kfree(req); |
466 | } | 466 | } |
467 | 467 | ||
/* Drop a ref; frees the request via release_generic_request() on last put. */
static void put_generic_request(struct ceph_mon_generic_request *req)
{
	kref_put(&req->kref, release_generic_request);
}
472 | 472 | ||
/* Take an extra ref on @req (e.g. to keep it alive across an unlock). */
static void get_generic_request(struct ceph_mon_generic_request *req)
{
	kref_get(&req->kref);
}
477 | 477 | ||
478 | static struct ceph_msg *get_generic_reply(struct ceph_connection *con, | 478 | static struct ceph_msg *get_generic_reply(struct ceph_connection *con, |
479 | struct ceph_msg_header *hdr, | 479 | struct ceph_msg_header *hdr, |
480 | int *skip) | 480 | int *skip) |
481 | { | 481 | { |
482 | struct ceph_mon_client *monc = con->private; | 482 | struct ceph_mon_client *monc = con->private; |
483 | struct ceph_mon_generic_request *req; | 483 | struct ceph_mon_generic_request *req; |
484 | u64 tid = le64_to_cpu(hdr->tid); | 484 | u64 tid = le64_to_cpu(hdr->tid); |
485 | struct ceph_msg *m; | 485 | struct ceph_msg *m; |
486 | 486 | ||
487 | mutex_lock(&monc->mutex); | 487 | mutex_lock(&monc->mutex); |
488 | req = __lookup_generic_req(monc, tid); | 488 | req = __lookup_generic_req(monc, tid); |
489 | if (!req) { | 489 | if (!req) { |
490 | dout("get_generic_reply %lld dne\n", tid); | 490 | dout("get_generic_reply %lld dne\n", tid); |
491 | *skip = 1; | 491 | *skip = 1; |
492 | m = NULL; | 492 | m = NULL; |
493 | } else { | 493 | } else { |
494 | dout("get_generic_reply %lld got %p\n", tid, req->reply); | 494 | dout("get_generic_reply %lld got %p\n", tid, req->reply); |
495 | *skip = 0; | 495 | *skip = 0; |
496 | m = ceph_msg_get(req->reply); | 496 | m = ceph_msg_get(req->reply); |
497 | /* | 497 | /* |
498 | * we don't need to track the connection reading into | 498 | * we don't need to track the connection reading into |
499 | * this reply because we only have one open connection | 499 | * this reply because we only have one open connection |
500 | * at a time, ever. | 500 | * at a time, ever. |
501 | */ | 501 | */ |
502 | } | 502 | } |
503 | mutex_unlock(&monc->mutex); | 503 | mutex_unlock(&monc->mutex); |
504 | return m; | 504 | return m; |
505 | } | 505 | } |
506 | 506 | ||
/*
 * Register @req (under @tid, or a fresh tid when 0), send it, and wait
 * for the reply.  Called with monc->mutex held; the mutex is dropped
 * while sleeping and reacquired before returning (held again on exit).
 * Returns req->result, or a negative error if the wait was interrupted.
 */
static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
				struct ceph_mon_generic_request *req)
{
	int err;

	/* register request */
	req->tid = tid != 0 ? tid : ++monc->last_tid;
	req->request->hdr.tid = cpu_to_le64(req->tid);
	__insert_generic_request(monc, req);
	monc->num_generic_requests++;
	ceph_con_send(&monc->con, ceph_msg_get(req->request));
	mutex_unlock(&monc->mutex);

	/* reply handler fills req->result and completes this */
	err = wait_for_completion_interruptible(&req->completion);

	mutex_lock(&monc->mutex);
	rb_erase(&req->node, &monc->generic_request_tree);
	monc->num_generic_requests--;

	if (!err)
		err = req->result;
	return err;
}
530 | 530 | ||
/* Convenience wrapper: run a generic request with a fresh tid, taking
 * and releasing monc->mutex around __do_generic_request(). */
static int do_generic_request(struct ceph_mon_client *monc,
			      struct ceph_mon_generic_request *req)
{
	int err;

	mutex_lock(&monc->mutex);
	err = __do_generic_request(monc, 0, req);
	mutex_unlock(&monc->mutex);

	return err;
}
542 | 542 | ||
/*
 * statfs
 */
/*
 * Handle a statfs reply: copy the stats into the waiting request's
 * buffer and wake the waiter.  An unknown tid is silently ignored;
 * a size mismatch is logged and dumped.
 */
static void handle_statfs_reply(struct ceph_mon_client *monc,
				struct ceph_msg *msg)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
	u64 tid = le64_to_cpu(msg->hdr.tid);

	if (msg->front.iov_len != sizeof(*reply))
		goto bad;
	dout("handle_statfs_reply %p tid %llu\n", msg, tid);

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (req) {
		*(struct ceph_statfs *)req->buf = reply->st;
		req->result = 0;
		get_generic_request(req);	/* keep req alive past unlock */
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		/* complete outside the mutex, then drop our extra ref */
		complete_all(&req->completion);
		put_generic_request(req);
	}
	return;

bad:
	pr_err("corrupt generic reply, tid %llu\n", tid);
	ceph_msg_dump(msg);
}
575 | 575 | ||
/*
 * Do a synchronous statfs().
 *
 * Allocates a generic request plus its request/reply messages, fills
 * in the mon request header, and blocks until handle_statfs_reply()
 * stores the result in @buf.  Returns 0 on success or a negative
 * errno; the request (and its messages) is freed via the final
 * kref_put regardless of outcome.
 */
int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_statfs *h;
	int err;

	req = kzalloc(sizeof(*req), GFP_NOFS);
	if (!req)
		return -ENOMEM;

	kref_init(&req->kref);
	req->buf = buf;		/* reply handler writes the stats here */
	req->buf_len = sizeof(*buf);
	init_completion(&req->completion);

	err = -ENOMEM;
	req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
				    true);
	if (!req->request)
		goto out;
	req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
				  true);
	if (!req->reply)
		goto out;

	/* fill out request */
	h = req->request->front.iov_base;
	h->monhdr.have_version = 0;
	h->monhdr.session_mon = cpu_to_le16(-1);	/* any monitor */
	h->monhdr.session_mon_tid = 0;
	h->fsid = monc->monmap->fsid;

	err = do_generic_request(monc, req);

out:
	/* drops the last ref; release_generic_request frees messages too */
	kref_put(&req->kref, release_generic_request);
	return err;
}
EXPORT_SYMBOL(ceph_monc_do_statfs);
618 | 618 | ||
/*
 * Handle a mon_get_version reply: decode the handle (which we use as
 * the tid) and the version, store the version into the waiting
 * request's buffer, and wake the waiter.  Corrupt replies are logged
 * and dumped.
 */
static void handle_get_version_reply(struct ceph_mon_client *monc,
				     struct ceph_msg *msg)
{
	struct ceph_mon_generic_request *req;
	u64 tid = le64_to_cpu(msg->hdr.tid);
	void *p = msg->front.iov_base;
	void *end = p + msg->front_alloc_len;
	u64 handle;

	dout("%s %p tid %llu\n", __func__, msg, tid);

	/* need at least handle + version */
	ceph_decode_need(&p, end, 2*sizeof(u64), bad);
	handle = ceph_decode_64(&p);
	if (tid != 0 && tid != handle)
		goto bad;

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, handle);
	if (req) {
		*(u64 *)req->buf = ceph_decode_64(&p);
		req->result = 0;
		get_generic_request(req);	/* keep req alive past unlock */
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		/* complete outside the mutex, then drop our extra ref */
		complete_all(&req->completion);
		put_generic_request(req);
	}

	return;
bad:
	pr_err("corrupt mon_get_version reply\n");
	ceph_msg_dump(msg);
}
653 | 653 | ||
/*
 * Send MMonGetVersion and wait for the reply.
 *
 * @what: one of "mdsmap", "osdmap" or "monmap"
 *
 * On success the newest known version is stored in *@newest and 0 is
 * returned; otherwise a negative errno.  The request and its messages
 * are freed via the final kref_put on all paths.
 */
int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
			     u64 *newest)
{
	struct ceph_mon_generic_request *req;
	void *p, *end;
	u64 tid;
	int err;

	req = kzalloc(sizeof(*req), GFP_NOFS);
	if (!req)
		return -ENOMEM;

	kref_init(&req->kref);
	req->buf = newest;	/* reply handler writes the version here */
	req->buf_len = sizeof(*newest);
	init_completion(&req->completion);

	/* front = 64-bit handle + length-prefixed map name */
	req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
				    sizeof(u64) + sizeof(u32) + strlen(what),
				    GFP_NOFS, true);
	if (!req->request) {
		err = -ENOMEM;
		goto out;
	}

	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
				  GFP_NOFS, true);
	if (!req->reply) {
		err = -ENOMEM;
		goto out;
	}

	p = req->request->front.iov_base;
	end = p + req->request->front_alloc_len;

	/* fill out request */
	mutex_lock(&monc->mutex);
	tid = ++monc->last_tid;
	ceph_encode_64(&p, tid); /* handle */
	ceph_encode_string(&p, end, what, strlen(what));

	err = __do_generic_request(monc, tid, req);

	mutex_unlock(&monc->mutex);
out:
	/* drops the last ref; release_generic_request frees messages too */
	kref_put(&req->kref, release_generic_request);
	return err;
}
EXPORT_SYMBOL(ceph_monc_do_get_version);
708 | 708 | ||
709 | /* | 709 | /* |
710 | * pool ops | 710 | * pool ops |
711 | */ | 711 | */ |
712 | static int get_poolop_reply_buf(const char *src, size_t src_len, | 712 | static int get_poolop_reply_buf(const char *src, size_t src_len, |
713 | char *dst, size_t dst_len) | 713 | char *dst, size_t dst_len) |
714 | { | 714 | { |
715 | u32 buf_len; | 715 | u32 buf_len; |
716 | 716 | ||
717 | if (src_len != sizeof(u32) + dst_len) | 717 | if (src_len != sizeof(u32) + dst_len) |
718 | return -EINVAL; | 718 | return -EINVAL; |
719 | 719 | ||
720 | buf_len = le32_to_cpu(*(u32 *)src); | 720 | buf_len = le32_to_cpu(*(__le32 *)src); |
721 | if (buf_len != dst_len) | 721 | if (buf_len != dst_len) |
722 | return -EINVAL; | 722 | return -EINVAL; |
723 | 723 | ||
724 | memcpy(dst, src + sizeof(u32), dst_len); | 724 | memcpy(dst, src + sizeof(u32), dst_len); |
725 | return 0; | 725 | return 0; |
726 | } | 726 | } |
727 | 727 | ||
/*
 * Handle a CEPH_MSG_POOLOP_REPLY: look up the pending generic request
 * by tid, copy out the reply payload (if one was requested), record
 * the result code and wake the waiter in do_generic_request().
 */
static void handle_poolop_reply(struct ceph_mon_client *monc,
				struct ceph_msg *msg)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
	u64 tid = le64_to_cpu(msg->hdr.tid);

	if (msg->front.iov_len < sizeof(*reply))
		goto bad;
	dout("handle_poolop_reply %p tid %llu\n", msg, tid);

	mutex_lock(&monc->mutex);
	req = __lookup_generic_req(monc, tid);
	if (req) {
		/*
		 * Copy the payload while still holding the mutex so
		 * req->buf cannot go away underneath us.
		 */
		if (req->buf_len &&
		    get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
				     msg->front.iov_len - sizeof(*reply),
				     req->buf, req->buf_len) < 0) {
			mutex_unlock(&monc->mutex);
			goto bad;
		}
		req->result = le32_to_cpu(reply->reply_code);
		/* take a ref so req survives dropping the mutex */
		get_generic_request(req);
	}
	mutex_unlock(&monc->mutex);
	if (req) {
		/* signal the waiter outside the mutex */
		complete(&req->completion);
		put_generic_request(req);
	}
	return;

bad:
	pr_err("corrupt generic reply, tid %llu\n", tid);
	ceph_msg_dump(msg);
}
763 | 763 | ||
/*
 * Do a synchronous pool op.
 *
 * Builds a CEPH_MSG_POOLOP request for @op on @pool (with snapshot
 * argument @snapid), sends it to the monitor and blocks until the
 * reply arrives.  @buf/@len, if non-NULL, receive the reply payload
 * (see handle_poolop_reply()).
 *
 * Returns the op result, or a negative errno on failure.
 */
static int do_poolop(struct ceph_mon_client *monc, u32 op,
		     u32 pool, u64 snapid,
		     char *buf, int len)
{
	struct ceph_mon_generic_request *req;
	struct ceph_mon_poolop *h;
	int err;

	req = kzalloc(sizeof(*req), GFP_NOFS);
	if (!req)
		return -ENOMEM;

	kref_init(&req->kref);
	req->buf = buf;
	req->buf_len = len;
	init_completion(&req->completion);

	err = -ENOMEM;
	req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS,
				    true);
	if (!req->request)
		goto out;
	req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS,
				  true);
	if (!req->reply)
		goto out;

	/* fill out request */
	req->request->hdr.version = cpu_to_le16(2);
	h = req->request->front.iov_base;
	h->monhdr.have_version = 0;
	h->monhdr.session_mon = cpu_to_le16(-1);
	h->monhdr.session_mon_tid = 0;
	h->fsid = monc->monmap->fsid;
	h->pool = cpu_to_le32(pool);
	h->op = cpu_to_le32(op);
	h->auid = 0;
	h->snapid = cpu_to_le64(snapid);
	h->name_len = 0;

	err = do_generic_request(monc, req);

out:
	/* drop our ref; release_generic_request frees req and its msgs */
	kref_put(&req->kref, release_generic_request);
	return err;
}
813 | 813 | ||
/*
 * Create a new unmanaged snapshot in @pool.  On success the new
 * snapshot id is written back through @snapid (filled in from the
 * poolop reply payload).
 */
int ceph_monc_create_snapid(struct ceph_mon_client *monc,
			    u32 pool, u64 *snapid)
{
	return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
			 pool, 0, (char *)snapid, sizeof(*snapid));

}
EXPORT_SYMBOL(ceph_monc_create_snapid);
822 | 822 | ||
823 | int ceph_monc_delete_snapid(struct ceph_mon_client *monc, | 823 | int ceph_monc_delete_snapid(struct ceph_mon_client *monc, |
824 | u32 pool, u64 snapid) | 824 | u32 pool, u64 snapid) |
825 | { | 825 | { |
826 | return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, | 826 | return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, |
827 | pool, snapid, NULL, 0); | 827 | pool, snapid, NULL, 0); |
828 | 828 | ||
829 | } | 829 | } |
830 | 830 | ||
/*
 * Resend pending generic requests.
 *
 * Walks the generic_request_tree and re-queues each request on the
 * current mon connection.  Caller holds monc->mutex (see
 * handle_auth_reply()).
 */
static void __resend_generic_request(struct ceph_mon_client *monc)
{
	struct ceph_mon_generic_request *req;
	struct rb_node *p;

	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
		req = rb_entry(p, struct ceph_mon_generic_request, node);
		/* revoke any copy still queued on the old session */
		ceph_msg_revoke(req->request);
		ceph_msg_revoke_incoming(req->reply);
		/* ceph_con_send consumes a ref, hence ceph_msg_get */
		ceph_con_send(&monc->con, ceph_msg_get(req->request));
	}
}
846 | 846 | ||
/*
 * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
 * renew/retry subscription as needed (in case it is timing out, or we
 * got an ENOMEM).  And keep the monitor connection alive.
 */
static void delayed_work(struct work_struct *work)
{
	struct ceph_mon_client *monc =
		container_of(work, struct ceph_mon_client, delayed_work.work);

	dout("monc delayed_work\n");
	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		/* still looking for a monitor: try the next one */
		__close_session(monc);
		__open_session(monc);	/* continue hunting */
	} else {
		ceph_con_keepalive(&monc->con);

		__validate_auth(monc);

		/* only (re)subscribe once authentication has completed */
		if (ceph_auth_is_authenticated(monc->auth))
			__send_subscribe(monc);
	}
	/* re-arm ourselves */
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
}
873 | 873 | ||
/*
 * On startup, we build a temporary monmap populated with the IPs
 * provided by mount(2).
 *
 * Allocates monc->monmap (freed by ceph_monc_stop() or the init error
 * path).  Returns 0 or -ENOMEM.
 */
static int build_initial_monmap(struct ceph_mon_client *monc)
{
	struct ceph_options *opt = monc->client->options;
	struct ceph_entity_addr *mon_addr = opt->mon_addr;
	int num_mon = opt->num_mon;
	int i;

	/* build initial monmap: header plus one mon_inst per address */
	monc->monmap = kzalloc(sizeof(*monc->monmap) +
			       num_mon*sizeof(monc->monmap->mon_inst[0]),
			       GFP_KERNEL);
	if (!monc->monmap)
		return -ENOMEM;
	for (i = 0; i < num_mon; i++) {
		monc->monmap->mon_inst[i].addr = mon_addr[i];
		/* nonce is only meaningful for a live session */
		monc->monmap->mon_inst[i].addr.nonce = 0;
		monc->monmap->mon_inst[i].name.type =
			CEPH_ENTITY_TYPE_MON;
		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
	}
	monc->monmap->num_mon = num_mon;
	return 0;
}
901 | 901 | ||
/*
 * Initialize a ceph_mon_client: build the initial monmap from the
 * mount options, set up the auth handle, preallocate the messages
 * exchanged with the monitor, and initialize the embedded connection.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * allocated so far is unwound via the goto chain below.
 */
int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
{
	int err = 0;

	dout("init\n");
	memset(monc, 0, sizeof(*monc));
	monc->client = cl;
	monc->monmap = NULL;
	mutex_init(&monc->mutex);

	err = build_initial_monmap(monc);
	if (err)
		goto out;

	/* connection */
	/* authentication */
	monc->auth = ceph_auth_init(cl->options->name,
				    cl->options->key);
	if (IS_ERR(monc->auth)) {
		err = PTR_ERR(monc->auth);
		goto out_monmap;
	}
	monc->auth->want_keys =
		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;

	/* msgs: preallocated for the life of the mon session */
	err = -ENOMEM;
	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
				     sizeof(struct ceph_mon_subscribe_ack),
				     GFP_NOFS, true);
	if (!monc->m_subscribe_ack)
		goto out_auth;

	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
					 true);
	if (!monc->m_subscribe)
		goto out_subscribe_ack;

	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
					  true);
	if (!monc->m_auth_reply)
		goto out_subscribe;

	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
	monc->pending_auth = 0;
	if (!monc->m_auth)
		goto out_auth_reply;

	ceph_con_init(&monc->con, monc, &mon_con_ops,
		      &monc->client->msgr);

	/* no monitor chosen yet; hunting starts on first open */
	monc->cur_mon = -1;
	monc->hunting = true;
	monc->sub_renew_after = jiffies;
	monc->sub_sent = 0;

	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
	monc->generic_request_tree = RB_ROOT;
	monc->num_generic_requests = 0;
	monc->last_tid = 0;

	monc->have_mdsmap = 0;
	monc->have_osdmap = 0;
	monc->want_next_osdmap = 1;
	return 0;

out_auth_reply:
	ceph_msg_put(monc->m_auth_reply);
out_subscribe:
	ceph_msg_put(monc->m_subscribe);
out_subscribe_ack:
	ceph_msg_put(monc->m_subscribe_ack);
out_auth:
	ceph_auth_destroy(monc->auth);
out_monmap:
	kfree(monc->monmap);
out:
	return err;
}
EXPORT_SYMBOL(ceph_monc_init);
983 | 983 | ||
/*
 * Tear down a ceph_mon_client: stop the delayed work, close the
 * session, then release auth state, preallocated messages and the
 * monmap.  Counterpart to ceph_monc_init().
 */
void ceph_monc_stop(struct ceph_mon_client *monc)
{
	dout("stop\n");
	cancel_delayed_work_sync(&monc->delayed_work);

	mutex_lock(&monc->mutex);
	__close_session(monc);

	mutex_unlock(&monc->mutex);

	/*
	 * flush msgr queue before we destroy ourselves to ensure that:
	 *  - any work that references our embedded con is finished.
	 *  - any osd_client or other work that may reference an authorizer
	 *    finishes before we shut down the auth subsystem.
	 */
	ceph_msgr_flush();

	ceph_auth_destroy(monc->auth);

	ceph_msg_put(monc->m_auth);
	ceph_msg_put(monc->m_auth_reply);
	ceph_msg_put(monc->m_subscribe);
	ceph_msg_put(monc->m_subscribe_ack);

	kfree(monc->monmap);
}
EXPORT_SYMBOL(ceph_monc_stop);
1012 | 1012 | ||
/*
 * Handle a CEPH_MSG_AUTH_REPLY from the monitor: feed it to the auth
 * subsystem and either report the error, send the next auth request,
 * or -- on a fresh successful authentication -- start the session by
 * subscribing and resending any pending generic requests.
 */
static void handle_auth_reply(struct ceph_mon_client *monc,
			      struct ceph_msg *msg)
{
	int ret;
	int was_auth = 0;
	int had_debugfs_info, init_debugfs = 0;

	mutex_lock(&monc->mutex);
	had_debugfs_info = have_debugfs_info(monc);
	was_auth = ceph_auth_is_authenticated(monc->auth);
	monc->pending_auth = 0;
	/*
	 * ret < 0: error; ret > 0: length of a follow-up auth request
	 * built into m_auth; ret == 0: handshake complete (or no-op).
	 */
	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
				     msg->front.iov_len,
				     monc->m_auth->front.iov_base,
				     monc->m_auth->front_alloc_len);
	if (ret < 0) {
		monc->client->auth_err = ret;
		wake_up_all(&monc->client->auth_wq);
	} else if (ret > 0) {
		__send_prepared_auth_request(monc, ret);
	} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
		dout("authenticated, starting session\n");

		/* we now know our global_id; adopt it as our entity name */
		monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
		monc->client->msgr.inst.name.num =
					cpu_to_le64(monc->auth->global_id);

		__send_subscribe(monc);
		__resend_generic_request(monc);
	}

	if (!had_debugfs_info && have_debugfs_info(monc)) {
		pr_info("client%lld fsid %pU\n",
			ceph_client_id(monc->client),
			&monc->client->fsid);
		init_debugfs = 1;
	}
	mutex_unlock(&monc->mutex);

	if (init_debugfs) {
		/*
		 * do debugfs initialization without mutex to avoid
		 * creating a locking dependency
		 */
		ceph_debugfs_client_init(monc->client);
	}
}
1060 | 1060 | ||
/*
 * If the auth subsystem wants to (re)authenticate, build and send the
 * request.  Caller holds monc->mutex.  Returns 0 if nothing to do or
 * a request was sent, or a negative errno from ceph_build_auth().
 */
static int __validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	/* an auth exchange is already in flight; don't start another */
	if (monc->pending_auth)
		return 0;

	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
			      monc->m_auth->front_alloc_len);
	if (ret <= 0)
		return ret; /* either an error, or no need to authenticate */
	__send_prepared_auth_request(monc, ret);
	return 0;
}
1075 | 1075 | ||
/*
 * Locked wrapper around __validate_auth() for external callers.
 */
int ceph_monc_validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	mutex_lock(&monc->mutex);
	ret = __validate_auth(monc);
	mutex_unlock(&monc->mutex);
	return ret;
}
EXPORT_SYMBOL(ceph_monc_validate_auth);
1086 | 1086 | ||
/*
 * handle incoming message
 *
 * Routes each monitor message to its handler by type; unknown types
 * are offered to the client's chained extra_mon_dispatch hook first.
 * Consumes the msg reference in all cases.
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(msg->hdr.type);

	if (!monc)
		return;

	switch (type) {
	case CEPH_MSG_AUTH_REPLY:
		handle_auth_reply(monc, msg);
		break;

	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		handle_subscribe_ack(monc, msg);
		break;

	case CEPH_MSG_STATFS_REPLY:
		handle_statfs_reply(monc, msg);
		break;

	case CEPH_MSG_MON_GET_VERSION_REPLY:
		handle_get_version_reply(monc, msg);
		break;

	case CEPH_MSG_POOLOP_REPLY:
		handle_poolop_reply(monc, msg);
		break;

	case CEPH_MSG_MON_MAP:
		ceph_monc_handle_map(monc, msg);
		break;

	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(&monc->client->osdc, msg);
		break;

	default:
		/* can the chained handler handle it? */
		if (monc->client->extra_mon_dispatch &&
		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
			break;

		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}
	ceph_msg_put(msg);
}
1138 | 1138 | ||
/*
 * Allocate memory for incoming message
 *
 * Returns a msg to receive into, NULL with *skip=1 to drop the
 * message, or NULL with *skip=0 on ENOMEM.  Session messages reuse
 * the buffers preallocated in ceph_monc_init(); map messages are
 * allocated fresh per the advertised front length.
 */
static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
				      struct ceph_msg_header *hdr,
				      int *skip)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(hdr->type);
	int front_len = le32_to_cpu(hdr->front_len);
	struct ceph_msg *m = NULL;

	*skip = 0;

	switch (type) {
	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		m = ceph_msg_get(monc->m_subscribe_ack);
		break;
	case CEPH_MSG_POOLOP_REPLY:
	case CEPH_MSG_STATFS_REPLY:
		return get_generic_reply(con, hdr, skip);
	case CEPH_MSG_AUTH_REPLY:
		m = ceph_msg_get(monc->m_auth_reply);
		break;
	case CEPH_MSG_MON_GET_VERSION_REPLY:
		if (le64_to_cpu(hdr->tid) != 0)
			return get_generic_reply(con, hdr, skip);

		/*
		 * Older OSDs don't set reply tid even if the orignal
		 * request had a non-zero tid.  Workaround this weirdness
		 * by falling through to the allocate case.
		 */
		/* fall through */
	case CEPH_MSG_MON_MAP:
	case CEPH_MSG_MDS_MAP:
	case CEPH_MSG_OSD_MAP:
		m = ceph_msg_new(type, front_len, GFP_NOFS, false);
		if (!m)
			return NULL;	/* ENOMEM--return skip == 0 */
		break;
	}

	if (!m) {
		/* unknown type: tell the messenger to skip the payload */
		pr_info("alloc_msg unknown type %d\n", type);
		*skip = 1;
	} else if (front_len > m->front_alloc_len) {
		/* preallocated buffer too small; replace with a bigger one */
		pr_warn("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n",
			front_len, m->front_alloc_len,
			(unsigned int)con->peer_name.type,
			le64_to_cpu(con->peer_name.num));
		ceph_msg_put(m);
		m = ceph_msg_new(type, front_len, GFP_NOFS, false);
	}

	return m;
}
1195 | 1195 | ||
/*
 * If the monitor connection resets, pick a new monitor and resubmit
 * any pending requests.
 */
static void mon_fault(struct ceph_connection *con)
{
	struct ceph_mon_client *monc = con->private;

	if (!monc)
		return;

	dout("mon_fault\n");
	mutex_lock(&monc->mutex);
	if (!con->private)
		goto out;	/* raced with teardown */

	if (!monc->hunting)
		pr_info("mon%d %s session lost, "
			"hunting for new mon\n", monc->cur_mon,
			ceph_pr_addr(&monc->con.peer_addr.in_addr));

	__close_session(monc);
	if (!monc->hunting) {
		/* start hunting */
		monc->hunting = true;
		__open_session(monc);
	} else {
		/* already hunting, let's wait a bit */
		__schedule_delayed(monc);
	}
out:
	mutex_unlock(&monc->mutex);
}
1229 | 1229 | ||
/*
 * We can ignore refcounting on the connection struct, as all references
 * will come from the messenger workqueue, which is drained prior to
 * mon_client destruction.
 */
static struct ceph_connection *con_get(struct ceph_connection *con)
{
	return con;
}
1239 | 1239 | ||
/* no-op: see the refcounting note above con_get() in this file */
static void con_put(struct ceph_connection *con)
{
}
1243 | 1243 | ||
/* messenger callbacks for the embedded monitor connection */
static const struct ceph_connection_operations mon_con_ops = {
	.get = con_get,
	.put = con_put,
	.dispatch = dispatch,
	.fault = mon_fault,
	.alloc_msg = mon_alloc_msg,
};
1251 | 1251 |