Commit 20ebb345282d9d90603b021ced113b73e9cdb6a1

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull two Ceph fixes from Sage Weil:
 "These are both pretty trivial: a sparse warning fix and size_t printk
  thing"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  libceph: fix sparse endianness warnings
  ceph: use %zu for len in ceph_fill_inline_data()

Showing 4 changed files (inline diff). The excerpt below is from fs/ceph/addr.c.

#include <linux/ceph/ceph_debug.h>

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>  /* generic_writepages */
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include <linux/ceph/osd_client.h>

/*
 * Ceph address space ops.
 *
 * There are a few funny things going on here.
 *
 * The page->private field is used to reference a struct
 * ceph_snap_context for _every_ dirty page. This indicates which
 * snapshot the page was logically dirtied in, and thus which snap
 * context needs to be associated with the osd write during writeback.
 *
 * Similarly, struct ceph_inode_info maintains a set of counters to
 * count dirty pages on the inode. In the absence of snapshots,
 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
 *
 * When a snapshot is taken (that is, when the client receives
 * notification that a snapshot was taken), each inode with caps and
 * with dirty pages (dirty pages implies there is a cap) gets a new
 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
 * moved to capsnap->dirty. (Unless a sync write is currently in
 * progress. In that case, the capsnap is said to be "pending", new
 * writes cannot start, and the capsnap isn't "finalized" until the
 * write completes (or fails) and a final size/mtime for the inode for
 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
 *
 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
 * we look for the first capsnap in i_cap_snaps and write out pages in
 * that snap context _only_. Then we move on to the next capsnap,
 * eventually reaching the "live" or "head" context (i.e., pages that
 * are not yet snapped) and are writing the most recently dirtied
 * pages.
 *
 * Invalidate and so forth must take care to ensure the dirty page
 * accounting is preserved.
 */
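A worked example of the counter dance described above, with hypothetical values (the moved head count lands in capsnap->dirty_pages, which is what get_oldest_context() below reads):

/*
 * Hypothetical sequence, assuming no sync write is in progress:
 *
 *   dirty 3 pages         i_wrbuffer_ref = 3, i_wrbuffer_ref_head = 3
 *   snapshot taken        capsnap->dirty_pages = 3, i_wrbuffer_ref_head = 0
 *   dirty 2 more pages    i_wrbuffer_ref = 5, i_wrbuffer_ref_head = 2
 *   capsnap written back  i_wrbuffer_ref = 2, i_wrbuffer_ref_head = 2
 *
 * At every step i_wrbuffer_ref equals i_wrbuffer_ref_head plus the sum
 * of dirty_pages over the entries of i_cap_snaps.
 */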

#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
#define CONGESTION_OFF_THRESH(congestion_kb)				\
	(CONGESTION_ON_THRESH(congestion_kb) -				\
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))

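To make the thresholds concrete, a sketch assuming 4 KiB pages (PAGE_SHIFT == 12) and a congestion_kb mount option of 8192:

/*
 * Assumed values only:
 *
 *   CONGESTION_ON_THRESH(8192)  = 8192 >> (12 - 10)  = 2048 pages
 *   CONGESTION_OFF_THRESH(8192) = 2048 - (2048 >> 2) = 1536 pages
 *
 * i.e. the bdi is marked congested once 2048 pages are under writeback
 * and uncongested again when the count drops below 1536, giving 25%
 * hysteresis between the two thresholds.
 */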
static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
	if (PagePrivate(page))
		return (void *)page->private;
	return NULL;
}

/*
 * Dirty a page. Optimistically adjust accounting, on the assumption
 * that we won't race with invalidate. If we do, readjust.
 */
static int ceph_set_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc;
	int ret;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	if (PageDirty(page)) {
		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
		     mapping->host, page, page->index);
		BUG_ON(!PagePrivate(page));
		return 0;
	}

	inode = mapping->host;
	ci = ceph_inode(inode);

	/*
	 * Note that we're grabbing a snapc ref here without holding
	 * any locks!
	 */
	snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);

	/* dirty the head */
	spin_lock(&ci->i_ceph_lock);
	if (ci->i_head_snapc == NULL)
		ci->i_head_snapc = ceph_get_snap_context(snapc);
	++ci->i_wrbuffer_ref_head;
	if (ci->i_wrbuffer_ref == 0)
		ihold(inode);
	++ci->i_wrbuffer_ref;
	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
	     "snapc %p seq %lld (%d snaps)\n",
	     mapping->host, page, page->index,
	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
	     snapc, snapc->seq, snapc->num_snaps);
	spin_unlock(&ci->i_ceph_lock);

	/*
	 * Reference snap context in page->private. Also set
	 * PagePrivate so that we get invalidatepage callback.
	 */
	BUG_ON(PagePrivate(page));
	page->private = (unsigned long)snapc;
	SetPagePrivate(page);

	ret = __set_page_dirty_nobuffers(page);
	WARN_ON(!PageLocked(page));
	WARN_ON(!page->mapping);

	return ret;
}
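A minimal sketch, illustrative only, of how the reference taken here is paired with its release elsewhere in this file:

/*
 * page->private = (unsigned long)snapc;   stored above, PagePrivate set
 * snapc = page_snap_context(page);        read back at writeback or
 *                                         invalidate time
 * ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 * ceph_put_snap_context(snapc);           dropped by ceph_invalidatepage(),
 *                                         writepage_nounlock() or
 *                                         writepages_finish()
 */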

/*
 * If we are truncating the full page (i.e. offset == 0), adjust the
 * dirty page counters appropriately. Only called if there is private
 * data on the page.
 */
static void ceph_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc = page_snap_context(page);

	inode = page->mapping->host;
	ci = ceph_inode(inode);

	if (offset != 0 || length != PAGE_CACHE_SIZE) {
		dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
		     inode, page, page->index, offset, length);
		return;
	}

	ceph_invalidate_fscache_page(inode, page);

	if (!PagePrivate(page))
		return;

	/*
	 * We can get non-dirty pages here due to races between
	 * set_page_dirty and truncate_complete_page; just spit out a
	 * warning, in case we end up with accounting problems later.
	 */
	if (!PageDirty(page))
		pr_err("%p invalidatepage %p page not dirty\n", inode, page);

	ClearPageChecked(page);

	dout("%p invalidatepage %p idx %lu full dirty page\n",
	     inode, page, page->index);

	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);
	page->private = 0;
	ClearPagePrivate(page);
}

static int ceph_releasepage(struct page *page, gfp_t g)
{
	struct inode *inode = page->mapping ? page->mapping->host : NULL;
	dout("%p releasepage %p idx %lu\n", inode, page, page->index);
	WARN_ON(PageDirty(page));

	/* Can we release the page from the cache? */
	if (!ceph_release_fscache_page(page, g))
		return 0;

	return !PagePrivate(page);
}

/*
 * read a single page, without unlocking it.
 */
static int readpage_nounlock(struct file *filp, struct page *page)
{
	struct inode *inode = file_inode(filp);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_client *osdc =
		&ceph_inode_to_client(inode)->client->osdc;
	int err = 0;
	u64 off = page_offset(page);
	u64 len = PAGE_CACHE_SIZE;

	if (off >= i_size_read(inode)) {
		zero_user_segment(page, err, PAGE_CACHE_SIZE);
		SetPageUptodate(page);
		return 0;
	}

	/*
	 * Uptodate inline data should have been added into page cache
	 * while getting Fcr caps.
	 */
	if (ci->i_inline_version != CEPH_INLINE_NONE)
		return -EINVAL;

	err = ceph_readpage_from_fscache(inode, page);
	if (err == 0)
		goto out;

	dout("readpage inode %p file %p page %p index %lu\n",
	     inode, filp, page, page->index);
	err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
				  off, &len,
				  ci->i_truncate_seq, ci->i_truncate_size,
				  &page, 1, 0);
	if (err == -ENOENT)
		err = 0;
	if (err < 0) {
		SetPageError(page);
		ceph_fscache_readpage_cancel(inode, page);
		goto out;
	}
	if (err < PAGE_CACHE_SIZE)
		/* zero fill remainder of page */
		zero_user_segment(page, err, PAGE_CACHE_SIZE);
	else
		flush_dcache_page(page);

	SetPageUptodate(page);
	ceph_readpage_to_fscache(inode, page);

out:
	return err < 0 ? err : 0;
}
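A short worked example of the short-read path, with assumed sizes:

/*
 * Assumed example: a 4 KiB page whose backing object holds only 2048
 * bytes. ceph_osdc_readpages() returns err = 2048, so
 * zero_user_segment(page, 2048, 4096) clears the stale tail before the
 * page is marked uptodate; a full 4096-byte read instead skips the
 * zeroing and only flushes the dcache.
 */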

static int ceph_readpage(struct file *filp, struct page *page)
{
	int r = readpage_nounlock(filp, page);
	unlock_page(page);
	return r;
}

/*
 * Finish an async read(ahead) op.
 */
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct inode *inode = req->r_inode;
	struct ceph_osd_data *osd_data;
	int rc = req->r_result;
	int bytes = le32_to_cpu(msg->hdr.data_len);
	int num_pages;
	int i;

	dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);

	/* unlock all pages, zeroing any data we didn't read */
	osd_data = osd_req_op_extent_osd_data(req, 0);
	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
	num_pages = calc_pages_for((u64)osd_data->alignment,
					(u64)osd_data->length);
	for (i = 0; i < num_pages; i++) {
		struct page *page = osd_data->pages[i];

		if (rc < 0)
			goto unlock;
		if (bytes < (int)PAGE_CACHE_SIZE) {
			/* zero (remainder of) page */
			int s = bytes < 0 ? 0 : bytes;
			zero_user_segment(page, s, PAGE_CACHE_SIZE);
		}
		dout("finish_read %p uptodate %p idx %lu\n", inode, page,
		     page->index);
		flush_dcache_page(page);
		SetPageUptodate(page);
		ceph_readpage_to_fscache(inode, page);
unlock:
		unlock_page(page);
		page_cache_release(page);
		bytes -= PAGE_CACHE_SIZE;
	}
	kfree(osd_data->pages);
}
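The bytes bookkeeping in the loop above is easiest to see with assumed numbers:

/*
 * Assumed example: a two-page (8 KiB) readahead for which the OSD
 * returns 6000 bytes. Page 0: bytes = 6000 >= 4096, data kept as
 * received; the loop then decrements bytes to 1904, so page 1 is
 * zero-filled from offset 1904 to 4096. A negative rc short-circuits
 * to unlock:, releasing every page without marking any uptodate.
 */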

static void ceph_unlock_page_vector(struct page **pages, int num_pages)
{
	int i;

	for (i = 0; i < num_pages; i++)
		unlock_page(pages[i]);
}

/*
 * start an async read(ahead) operation. return nr_pages we submitted
 * a read for on success, or negative error code.
 */
static int start_read(struct inode *inode, struct list_head *page_list, int max)
{
	struct ceph_osd_client *osdc =
		&ceph_inode_to_client(inode)->client->osdc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct page *page = list_entry(page_list->prev, struct page, lru);
	struct ceph_vino vino;
	struct ceph_osd_request *req;
	u64 off;
	u64 len;
	int i;
	struct page **pages;
	pgoff_t next_index;
	int nr_pages = 0;
	int ret;

	off = (u64) page_offset(page);

	/* count pages */
	next_index = page->index;
	list_for_each_entry_reverse(page, page_list, lru) {
		if (page->index != next_index)
			break;
		nr_pages++;
		next_index++;
		if (max && nr_pages == max)
			break;
	}
	len = nr_pages << PAGE_CACHE_SHIFT;
	dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages,
	     off, len);
	vino = ceph_vino(inode);
	req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len,
				    0, 1, CEPH_OSD_OP_READ,
				    CEPH_OSD_FLAG_READ, NULL,
				    ci->i_truncate_seq, ci->i_truncate_size,
				    false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* build page vector */
	nr_pages = calc_pages_for(0, len);
	pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS);
	ret = -ENOMEM;
	if (!pages)
		goto out;
	for (i = 0; i < nr_pages; ++i) {
		page = list_entry(page_list->prev, struct page, lru);
		BUG_ON(PageLocked(page));
		list_del(&page->lru);

		dout("start_read %p adding %p idx %lu\n", inode, page,
		     page->index);
		if (add_to_page_cache_lru(page, &inode->i_data, page->index,
					  GFP_NOFS)) {
			ceph_fscache_uncache_page(inode, page);
			page_cache_release(page);
			dout("start_read %p add_to_page_cache failed %p\n",
			     inode, page);
			nr_pages = i;
			goto out_pages;
		}
		pages[i] = page;
	}
	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
	req->r_callback = finish_read;
	req->r_inode = inode;

	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);

	dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len);
	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto out_pages;
	ceph_osdc_put_request(req);
	return nr_pages;

out_pages:
	ceph_unlock_page_vector(pages, nr_pages);
	ceph_release_page_vector(pages, nr_pages);
out:
	ceph_osdc_put_request(req);
	return ret;
}
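A walk-through of the batching with an assumed page_list:

/*
 * Assumed example, 4 KiB pages: page_list holds indices 7, 8, 9 and 12
 * (the VM supplies the list so that the lowest index sits at the tail,
 * hence the reverse walk from page_list->prev). The counting loop
 * stops at the gap after 9, so this call submits one 12 KiB OSD read
 * for indices 7-9 and returns 3; ceph_readpages() below then calls
 * start_read() again for the remaining index 12.
 */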


/*
 * Read multiple pages. Leave pages we don't read + unlock in page_list;
 * the caller (VM) cleans them up.
 */
static int ceph_readpages(struct file *file, struct address_space *mapping,
			  struct list_head *page_list, unsigned nr_pages)
{
	struct inode *inode = file_inode(file);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	int rc = 0;
	int max = 0;

	if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
		return -EINVAL;

	rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
					 &nr_pages);

	if (rc == 0)
		goto out;

	if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
		max = (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
			>> PAGE_SHIFT;

	dout("readpages %p file %p nr_pages %d max %d\n", inode,
		file, nr_pages,
		max);
	while (!list_empty(page_list)) {
		rc = start_read(inode, page_list, max);
		if (rc < 0)
			goto out;
		BUG_ON(rc == 0);
	}
out:
	ceph_fscache_readpages_cancel(inode, page_list);

	dout("readpages %p file %p ret %d\n", inode, file, rc);
	return rc;
}
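The max computation, with assumed mount options:

/*
 * Assumed example: rsize = 65536 (64 KiB) and 4 KiB pages give
 * max = (65536 + 4096 - 1) >> 12 = 16, capping each start_read() batch
 * at 16 pages. With rsize = 0, max stays 0, which start_read() treats
 * as "no cap beyond the contiguous run".
 */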

/*
 * Get ref for the oldest snapc for an inode with dirty data... that is, the
 * only snap context we are allowed to write back.
 */
static struct ceph_snap_context *get_oldest_context(struct inode *inode,
						    u64 *snap_size)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc = NULL;
	struct ceph_cap_snap *capsnap = NULL;

	spin_lock(&ci->i_ceph_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
		     capsnap->context, capsnap->dirty_pages);
		if (capsnap->dirty_pages) {
			snapc = ceph_get_snap_context(capsnap->context);
			if (snap_size)
				*snap_size = capsnap->size;
			break;
		}
	}
	if (!snapc && ci->i_wrbuffer_ref_head) {
		snapc = ceph_get_snap_context(ci->i_head_snapc);
		dout(" head snapc %p has %d dirty pages\n",
		     snapc, ci->i_wrbuffer_ref_head);
	}
	spin_unlock(&ci->i_ceph_lock);
	return snapc;
}
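A sketch of the selection order with assumed state:

/*
 * Assumed state: i_cap_snaps = [capsnapA (dirty_pages = 2), capsnapB
 * (dirty_pages = 5)] and i_wrbuffer_ref_head = 4. This returns
 * capsnapA's context (and its frozen size via *snap_size). Only once
 * capsnapA and then capsnapB drain to zero dirty pages is the head
 * context returned, enforcing the IN SNAP ORDER rule from the comment
 * at the top of this file.
 */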

/*
 * Write a single page, but leave the page locked.
 *
 * If we get a write error, set the page error bit, but still adjust the
 * dirty page accounting (i.e., page is no longer dirty).
 */
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_fs_client *fsc;
	struct ceph_osd_client *osdc;
	struct ceph_snap_context *snapc, *oldest;
	loff_t page_off = page_offset(page);
	long writeback_stat;
	u64 truncate_size, snap_size = 0;
	u32 truncate_seq;
	int err = 0, len = PAGE_CACHE_SIZE;

	dout("writepage %p idx %lu\n", page, page->index);

	if (!page->mapping || !page->mapping->host) {
		dout("writepage %p - no mapping\n", page);
		return -EFAULT;
	}
	inode = page->mapping->host;
	ci = ceph_inode(inode);
	fsc = ceph_inode_to_client(inode);
	osdc = &fsc->client->osdc;

	/* verify this is a writeable snap context */
	snapc = page_snap_context(page);
	if (snapc == NULL) {
		dout("writepage %p page %p not dirty?\n", inode, page);
		goto out;
	}
	oldest = get_oldest_context(inode, &snap_size);
	if (snapc->seq > oldest->seq) {
		dout("writepage %p page %p snapc %p not writeable - noop\n",
		     inode, page, snapc);
		/* we should only noop if called by kswapd */
		WARN_ON((current->flags & PF_MEMALLOC) == 0);
		ceph_put_snap_context(oldest);
		goto out;
	}
	ceph_put_snap_context(oldest);

	spin_lock(&ci->i_ceph_lock);
	truncate_seq = ci->i_truncate_seq;
	truncate_size = ci->i_truncate_size;
	if (!snap_size)
		snap_size = i_size_read(inode);
	spin_unlock(&ci->i_ceph_lock);

	/* is this a partial page at end of file? */
	if (page_off >= snap_size) {
		dout("%p page eof %llu\n", page, snap_size);
		goto out;
	}
	if (snap_size < page_off + len)
		len = snap_size - page_off;

	dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
	     inode, page, page->index, page_off, len, snapc);

	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
	if (writeback_stat >
	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
		set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);

	ceph_readpage_to_fscache(inode, page);

	set_page_writeback(page);
	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
				   &ci->i_layout, snapc,
				   page_off, len,
				   truncate_seq, truncate_size,
				   &inode->i_mtime, &page, 1);
	if (err < 0) {
		dout("writepage setting page/mapping error %d %p\n", err, page);
		SetPageError(page);
		mapping_set_error(&inode->i_data, err);
		if (wbc)
			wbc->pages_skipped++;
	} else {
		dout("writepage cleaned page %p\n", page);
		err = 0;  /* vfs expects us to return 0 */
	}
	page->private = 0;
	ClearPagePrivate(page);
	end_page_writeback(page);
	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);  /* page's reference */
out:
	return err;
}
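The EOF clamp above, with assumed sizes:

/*
 * Assumed example: 4 KiB pages and snap_size = 10240 bytes. The page
 * at index 2 has page_off = 8192, so len shrinks from 4096 to
 * 10240 - 8192 = 2048 and only the valid tail is written. The page at
 * index 3 (page_off = 12288 >= 10240) is skipped as past EOF.
 */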

static int ceph_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;
	struct inode *inode = page->mapping->host;
	BUG_ON(!inode);
	ihold(inode);
	err = writepage_nounlock(page, wbc);
	unlock_page(page);
	iput(inode);
	return err;
}


/*
 * lame release_pages helper. release_pages() isn't exported to
 * modules.
 */
static void ceph_release_pages(struct page **pages, int num)
{
	struct pagevec pvec;
	int i;

	pagevec_init(&pvec, 0);
	for (i = 0; i < num; i++) {
		if (pagevec_add(&pvec, pages[i]) == 0)
			pagevec_release(&pvec);
	}
	pagevec_release(&pvec);
}

/*
 * async writeback completion handler.
 *
 * If we get an error, set the mapping error bit, but not the individual
 * page error bits.
 */
static void writepages_finish(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	struct inode *inode = req->r_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_data *osd_data;
	unsigned wrote;
	struct page *page;
	int num_pages;
	int i;
	struct ceph_snap_context *snapc = req->r_snapc;
	struct address_space *mapping = inode->i_mapping;
	int rc = req->r_result;
	u64 bytes = req->r_ops[0].extent.length;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	long writeback_stat;
	unsigned issued = ceph_caps_issued(ci);

	osd_data = osd_req_op_extent_osd_data(req, 0);
	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
	num_pages = calc_pages_for((u64)osd_data->alignment,
					(u64)osd_data->length);
	if (rc >= 0) {
		/*
		 * Assume we wrote the pages we originally sent. The
		 * osd might reply with fewer pages if our writeback
		 * raced with a truncation and was adjusted at the osd,
		 * so don't believe the reply.
		 */
		wrote = num_pages;
	} else {
		wrote = 0;
		mapping_set_error(mapping, rc);
	}
	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
	     inode, rc, bytes, wrote);

	/* clean all pages */
	for (i = 0; i < num_pages; i++) {
		page = osd_data->pages[i];
		BUG_ON(!page);
		WARN_ON(!PageUptodate(page));

		writeback_stat =
			atomic_long_dec_return(&fsc->writeback_count);
		if (writeback_stat <
		    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
			clear_bdi_congested(&fsc->backing_dev_info,
					    BLK_RW_ASYNC);

		ceph_put_snap_context(page_snap_context(page));
		page->private = 0;
		ClearPagePrivate(page);
		dout("unlocking %d %p\n", i, page);
		end_page_writeback(page);

		/*
		 * We lost the cache cap, need to truncate the page before
		 * it is unlocked, otherwise we'd truncate it later in the
		 * page truncation thread, possibly losing some data that
		 * raced its way in
		 */
		if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
			generic_error_remove_page(inode->i_mapping, page);

		unlock_page(page);
	}
	dout("%p wrote+cleaned %d pages\n", inode, wrote);
	ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc);

	ceph_release_pages(osd_data->pages, num_pages);
	if (osd_data->pages_from_pool)
		mempool_free(osd_data->pages,
			     ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
	else
		kfree(osd_data->pages);
	ceph_osdc_put_request(req);
}

/*
 * initiate async writeback
 */
static int ceph_writepages_start(struct address_space *mapping,
				 struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_vino vino = ceph_vino(inode);
	pgoff_t index, start, end;
	int range_whole = 0;
	int should_loop = 1;
	pgoff_t max_pages = 0, max_pages_ever = 0;
	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
	struct pagevec pvec;
	int done = 0;
	int rc = 0;
	unsigned wsize = 1 << inode->i_blkbits;
	struct ceph_osd_request *req = NULL;
	int do_sync = 0;
	u64 truncate_size, snap_size;
	u32 truncate_seq;

	/*
	 * Include a 'sync' in the OSD request if this is a data
	 * integrity write (e.g., O_SYNC write or fsync()), or if our
	 * cap is being revoked.
	 */
	if ((wbc->sync_mode == WB_SYNC_ALL) ||
	    ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
		do_sync = 1;
	dout("writepages_start %p dosync=%d (mode=%s)\n",
	     inode, do_sync,
	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));

	if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
		pr_warn("writepage_start %p on forced umount\n", inode);
		return -EIO; /* we're in a forced umount, don't write! */
	}
	if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
		wsize = fsc->mount_options->wsize;
	if (wsize < PAGE_CACHE_SIZE)
		wsize = PAGE_CACHE_SIZE;
	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;

	pagevec_init(&pvec, 0);

	/* where to start/end? */
	if (wbc->range_cyclic) {
		start = mapping->writeback_index; /* Start from prev offset */
		end = -1;
		dout(" cyclic, start at %lu\n", start);
	} else {
		start = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		should_loop = 0;
		dout(" not cyclic, %lu to %lu\n", start, end);
	}
	index = start;

retry:
	/* find oldest snap context with dirty data */
	ceph_put_snap_context(snapc);
	snap_size = 0;
	snapc = get_oldest_context(inode, &snap_size);
	if (!snapc) {
		/* hmm, why does writepages get called when there
		   is no dirty data? */
		dout(" no snap context with dirty data?\n");
		goto out;
	}
	if (snap_size == 0)
		snap_size = i_size_read(inode);
	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
	     snapc, snapc->seq, snapc->num_snaps);

	spin_lock(&ci->i_ceph_lock);
	truncate_seq = ci->i_truncate_seq;
	truncate_size = ci->i_truncate_size;
	if (!snap_size)
		snap_size = i_size_read(inode);
	spin_unlock(&ci->i_ceph_lock);

	if (last_snapc && snapc != last_snapc) {
		/* if we switched to a newer snapc, restart our scan at the
		 * start of the original file range. */
		dout(" snapc differs from last pass, restarting at %lu\n",
		     index);
		index = start;
	}
	last_snapc = snapc;

	while (!done && index <= end) {
		unsigned i;
		int first;
		pgoff_t next;
		int pvec_pages, locked_pages;
		struct page **pages = NULL;
		mempool_t *pool = NULL;	/* Becomes non-null if mempool used */
		struct page *page;
		int want;
		u64 offset, len;
		long writeback_stat;

		next = 0;
		locked_pages = 0;
		max_pages = max_pages_ever;

get_more_pages:
		first = -1;
		want = min(end - index,
			   min((pgoff_t)PAGEVEC_SIZE,
			       max_pages - (pgoff_t)locked_pages) - 1)
			+ 1;
		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
						PAGECACHE_TAG_DIRTY,
						want);
		dout("pagevec_lookup_tag got %d\n", pvec_pages);
		if (!pvec_pages && !locked_pages)
			break;
		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
			page = pvec.pages[i];
			dout("? %p idx %lu\n", page, page->index);
			if (locked_pages == 0)
				lock_page(page);  /* first page */
			else if (!trylock_page(page))
				break;

			/* only dirty pages, or our accounting breaks */
			if (unlikely(!PageDirty(page)) ||
			    unlikely(page->mapping != mapping)) {
				dout("!dirty or !mapping %p\n", page);
				unlock_page(page);
				break;
			}
			if (!wbc->range_cyclic && page->index > end) {
				dout("end of range %p\n", page);
				done = 1;
				unlock_page(page);
				break;
			}
			if (next && (page->index != next)) {
				dout("not consecutive %p\n", page);
				unlock_page(page);
				break;
			}
			if (wbc->sync_mode != WB_SYNC_NONE) {
				dout("waiting on writeback %p\n", page);
				wait_on_page_writeback(page);
			}
			if (page_offset(page) >= snap_size) {
				dout("%p page eof %llu\n", page, snap_size);
				done = 1;
				unlock_page(page);
				break;
			}
			if (PageWriteback(page)) {
				dout("%p under writeback\n", page);
				unlock_page(page);
				break;
			}

			/* only if matching snap context */
			pgsnapc = page_snap_context(page);
			if (pgsnapc->seq > snapc->seq) {
				dout("page snapc %p %lld > oldest %p %lld\n",
				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
				unlock_page(page);
				if (!locked_pages)
					continue; /* keep looking for snap */
				break;
			}

			if (!clear_page_dirty_for_io(page)) {
				dout("%p !clear_page_dirty_for_io\n", page);
				unlock_page(page);
				break;
			}

			/*
			 * We have something to write. If this is
			 * the first locked page this time through,
			 * allocate an osd request and a page array
			 * that it will use.
			 */
			if (locked_pages == 0) {
				BUG_ON(pages);
				/* prepare async write request */
				offset = (u64)page_offset(page);
				len = wsize;
				req = ceph_osdc_new_request(&fsc->client->osdc,
							&ci->i_layout, vino,
							offset, &len, 0,
							do_sync ? 2 : 1,
							CEPH_OSD_OP_WRITE,
							CEPH_OSD_FLAG_WRITE |
							CEPH_OSD_FLAG_ONDISK,
							snapc, truncate_seq,
							truncate_size, true);
				if (IS_ERR(req)) {
					rc = PTR_ERR(req);
					unlock_page(page);
					break;
				}

				if (do_sync)
					osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);

				req->r_callback = writepages_finish;
				req->r_inode = inode;

				max_pages = calc_pages_for(0, (u64)len);
				pages = kmalloc(max_pages * sizeof (*pages),
						GFP_NOFS);
				if (!pages) {
					pool = fsc->wb_pagevec_pool;
					pages = mempool_alloc(pool, GFP_NOFS);
					BUG_ON(!pages);
				}
			}

			/* note position of first page in pvec */
			if (first < 0)
				first = i;
			dout("%p will write page %p idx %lu\n",
			     inode, page, page->index);

			writeback_stat =
				atomic_long_inc_return(&fsc->writeback_count);
			if (writeback_stat > CONGESTION_ON_THRESH(
				    fsc->mount_options->congestion_kb)) {
				set_bdi_congested(&fsc->backing_dev_info,
						  BLK_RW_ASYNC);
			}

			set_page_writeback(page);
			pages[locked_pages] = page;
			locked_pages++;
			next = page->index + 1;
		}

		/* did we get anything? */
		if (!locked_pages)
			goto release_pvec_pages;
		if (i) {
			int j;
			BUG_ON(!locked_pages || first < 0);

			if (pvec_pages && i == pvec_pages &&
			    locked_pages < max_pages) {
				dout("reached end pvec, trying for more\n");
				pagevec_reinit(&pvec);
				goto get_more_pages;
			}

			/* shift unused pages over in the pvec... we
			 * will need to release them below. */
			for (j = i; j < pvec_pages; j++) {
				dout(" pvec leftover page %p\n",
				     pvec.pages[j]);
				pvec.pages[j-i+first] = pvec.pages[j];
			}
			pvec.nr -= i-first;
		}

		/* Format the osd request message and submit the write */

		offset = page_offset(pages[0]);
		len = min(snap_size - offset,
			  (u64)locked_pages << PAGE_CACHE_SHIFT);
		dout("writepages got %d pages at %llu~%llu\n",
		     locked_pages, offset, len);

		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
							!!pool, false);

		pages = NULL;	/* request message now owns the pages array */
		pool = NULL;

		/* Update the write op length in case we changed it */

		osd_req_op_extent_update(req, 0, len);

		vino = ceph_vino(inode);
		ceph_osdc_build_request(req, offset, snapc, vino.snap,
					&inode->i_mtime);

		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
		BUG_ON(rc);
		req = NULL;

		/* continue? */
		index = next;
		wbc->nr_to_write -= locked_pages;
		if (wbc->nr_to_write <= 0)
			done = 1;

release_pvec_pages:
		dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
		     pvec.nr ? pvec.pages[0] : NULL);
		pagevec_release(&pvec);

		if (locked_pages && !done)
			goto retry;
	}

	if (should_loop && !done) {
		/* more to do; loop back to beginning of file */
		dout("writepages looping back to beginning of file\n");
		should_loop = 0;
		index = 0;
		goto retry;
	}

	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = index;

out:
	if (req)
		ceph_osdc_put_request(req);
	ceph_put_snap_context(snapc);
	dout("writepages done, rc = %d\n", rc);
	return rc;
}
1000 1000
1001 1001
1002 1002
1003 /* 1003 /*
1004 * See if a given @snapc is either writeable, or already written. 1004 * See if a given @snapc is either writeable, or already written.
1005 */ 1005 */
1006 static int context_is_writeable_or_written(struct inode *inode, 1006 static int context_is_writeable_or_written(struct inode *inode,
1007 struct ceph_snap_context *snapc) 1007 struct ceph_snap_context *snapc)
1008 { 1008 {
1009 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); 1009 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
1010 int ret = !oldest || snapc->seq <= oldest->seq; 1010 int ret = !oldest || snapc->seq <= oldest->seq;
1011 1011
1012 ceph_put_snap_context(oldest); 1012 ceph_put_snap_context(oldest);
1013 return ret; 1013 return ret;
1014 } 1014 }
1015 1015
1016 /* 1016 /*
1017 * We are only allowed to write into/dirty the page if the page is 1017 * We are only allowed to write into/dirty the page if the page is
1018 * clean, or already dirty within the same snap context. 1018 * clean, or already dirty within the same snap context.
1019 * 1019 *
1020 * called with page locked. 1020 * called with page locked.
1021 * return success with page locked, 1021 * return success with page locked,
1022 * or any failure (incl -EAGAIN) with page unlocked. 1022 * or any failure (incl -EAGAIN) with page unlocked.
1023 */ 1023 */
1024 static int ceph_update_writeable_page(struct file *file, 1024 static int ceph_update_writeable_page(struct file *file,
1025 loff_t pos, unsigned len, 1025 loff_t pos, unsigned len,
1026 struct page *page) 1026 struct page *page)
1027 { 1027 {
1028 struct inode *inode = file_inode(file); 1028 struct inode *inode = file_inode(file);
1029 struct ceph_inode_info *ci = ceph_inode(inode); 1029 struct ceph_inode_info *ci = ceph_inode(inode);
1030 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1030 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1031 loff_t page_off = pos & PAGE_CACHE_MASK; 1031 loff_t page_off = pos & PAGE_CACHE_MASK;
1032 int pos_in_page = pos & ~PAGE_CACHE_MASK; 1032 int pos_in_page = pos & ~PAGE_CACHE_MASK;
1033 int end_in_page = pos_in_page + len; 1033 int end_in_page = pos_in_page + len;
1034 loff_t i_size; 1034 loff_t i_size;
1035 int r; 1035 int r;
1036 struct ceph_snap_context *snapc, *oldest; 1036 struct ceph_snap_context *snapc, *oldest;
1037 1037
1038 retry_locked: 1038 retry_locked:
1039 /* writepages currently holds the page lock during writeback, so this wait is a no-op for now; if that locking changes later, we will really need to wait here */ 1039 /* writepages currently holds the page lock during writeback, so this wait is a no-op for now; if that locking changes later, we will really need to wait here */
1040 wait_on_page_writeback(page); 1040 wait_on_page_writeback(page);
1041 1041
1042 /* check snap context */ 1042 /* check snap context */
1043 BUG_ON(!ci->i_snap_realm); 1043 BUG_ON(!ci->i_snap_realm);
1044 down_read(&mdsc->snap_rwsem); 1044 down_read(&mdsc->snap_rwsem);
1045 BUG_ON(!ci->i_snap_realm->cached_context); 1045 BUG_ON(!ci->i_snap_realm->cached_context);
1046 snapc = page_snap_context(page); 1046 snapc = page_snap_context(page);
1047 if (snapc && snapc != ci->i_head_snapc) { 1047 if (snapc && snapc != ci->i_head_snapc) {
1048 /* 1048 /*
1049 * this page is already dirty in another (older) snap 1049 * this page is already dirty in another (older) snap
1050 * context! is it writeable now? 1050 * context! is it writeable now?
1051 */ 1051 */
1052 oldest = get_oldest_context(inode, NULL); 1052 oldest = get_oldest_context(inode, NULL);
1053 up_read(&mdsc->snap_rwsem); 1053 up_read(&mdsc->snap_rwsem);
1054 1054
1055 if (snapc->seq > oldest->seq) { 1055 if (snapc->seq > oldest->seq) {
1056 ceph_put_snap_context(oldest); 1056 ceph_put_snap_context(oldest);
1057 dout(" page %p snapc %p not current or oldest\n", 1057 dout(" page %p snapc %p not current or oldest\n",
1058 page, snapc); 1058 page, snapc);
1059 /* 1059 /*
1060 * queue for writeback, and wait for snapc to 1060 * queue for writeback, and wait for snapc to
1061 * be writeable or written 1061 * be writeable or written
1062 */ 1062 */
1063 snapc = ceph_get_snap_context(snapc); 1063 snapc = ceph_get_snap_context(snapc);
1064 unlock_page(page); 1064 unlock_page(page);
1065 ceph_queue_writeback(inode); 1065 ceph_queue_writeback(inode);
1066 r = wait_event_interruptible(ci->i_cap_wq, 1066 r = wait_event_interruptible(ci->i_cap_wq,
1067 context_is_writeable_or_written(inode, snapc)); 1067 context_is_writeable_or_written(inode, snapc));
1068 ceph_put_snap_context(snapc); 1068 ceph_put_snap_context(snapc);
1069 if (r == -ERESTARTSYS) 1069 if (r == -ERESTARTSYS)
1070 return r; 1070 return r;
1071 return -EAGAIN; 1071 return -EAGAIN;
1072 } 1072 }
1073 ceph_put_snap_context(oldest); 1073 ceph_put_snap_context(oldest);
1074 1074
1075 /* yay, writeable, do it now (without dropping page lock) */ 1075 /* yay, writeable, do it now (without dropping page lock) */
1076 dout(" page %p snapc %p not current, but oldest\n", 1076 dout(" page %p snapc %p not current, but oldest\n",
1077 page, snapc); 1077 page, snapc);
1078 if (!clear_page_dirty_for_io(page)) 1078 if (!clear_page_dirty_for_io(page))
1079 goto retry_locked; 1079 goto retry_locked;
1080 r = writepage_nounlock(page, NULL); 1080 r = writepage_nounlock(page, NULL);
1081 if (r < 0) 1081 if (r < 0)
1082 goto fail_nosnap; 1082 goto fail_nosnap;
1083 goto retry_locked; 1083 goto retry_locked;
1084 } 1084 }
1085 1085
1086 if (PageUptodate(page)) { 1086 if (PageUptodate(page)) {
1087 dout(" page %p already uptodate\n", page); 1087 dout(" page %p already uptodate\n", page);
1088 return 0; 1088 return 0;
1089 } 1089 }
1090 1090
1091 /* full page? */ 1091 /* full page? */
1092 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE) 1092 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
1093 return 0; 1093 return 0;
1094 1094
1095 /* past end of file? */ 1095 /* past end of file? */
1096 i_size = inode->i_size; /* caller holds i_mutex */ 1096 i_size = inode->i_size; /* caller holds i_mutex */
1097 1097
1098 if (page_off >= i_size || 1098 if (page_off >= i_size ||
1099 (pos_in_page == 0 && (pos+len) >= i_size && 1099 (pos_in_page == 0 && (pos+len) >= i_size &&
1100 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { 1100 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1101 dout(" zeroing %p 0 - %d and %d - %d\n", 1101 dout(" zeroing %p 0 - %d and %d - %d\n",
1102 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE); 1102 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1103 zero_user_segments(page, 1103 zero_user_segments(page,
1104 0, pos_in_page, 1104 0, pos_in_page,
1105 end_in_page, PAGE_CACHE_SIZE); 1105 end_in_page, PAGE_CACHE_SIZE);
1106 return 0; 1106 return 0;
1107 } 1107 }
1108 1108
1109 /* we need to read it. */ 1109 /* we need to read it. */
1110 up_read(&mdsc->snap_rwsem); 1110 up_read(&mdsc->snap_rwsem);
1111 r = readpage_nounlock(file, page); 1111 r = readpage_nounlock(file, page);
1112 if (r < 0) 1112 if (r < 0)
1113 goto fail_nosnap; 1113 goto fail_nosnap;
1114 goto retry_locked; 1114 goto retry_locked;
1115 fail_nosnap: 1115 fail_nosnap:
1116 unlock_page(page); 1116 unlock_page(page);
1117 return r; 1117 return r;
1118 } 1118 }
1119 1119
1120 /* 1120 /*
1121 * We are only allowed to write into/dirty the page if the page is 1121 * We are only allowed to write into/dirty the page if the page is
1122 * clean, or already dirty within the same snap context. 1122 * clean, or already dirty within the same snap context.
1123 */ 1123 */
1124 static int ceph_write_begin(struct file *file, struct address_space *mapping, 1124 static int ceph_write_begin(struct file *file, struct address_space *mapping,
1125 loff_t pos, unsigned len, unsigned flags, 1125 loff_t pos, unsigned len, unsigned flags,
1126 struct page **pagep, void **fsdata) 1126 struct page **pagep, void **fsdata)
1127 { 1127 {
1128 struct inode *inode = file_inode(file); 1128 struct inode *inode = file_inode(file);
1129 struct page *page; 1129 struct page *page;
1130 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 1130 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1131 int r; 1131 int r;
1132 1132
1133 do { 1133 do {
1134 /* get a page */ 1134 /* get a page */
1135 page = grab_cache_page_write_begin(mapping, index, 0); 1135 page = grab_cache_page_write_begin(mapping, index, 0);
1136 if (!page) 1136 if (!page)
1137 return -ENOMEM; 1137 return -ENOMEM;
1138 *pagep = page; 1138 *pagep = page;
1139 1139
1140 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1140 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1141 inode, page, (int)pos, (int)len); 1141 inode, page, (int)pos, (int)len);
1142 1142
1143 r = ceph_update_writeable_page(file, pos, len, page); 1143 r = ceph_update_writeable_page(file, pos, len, page);
1144 } while (r == -EAGAIN); 1144 } while (r == -EAGAIN);
1145 1145
1146 return r; 1146 return r;
1147 } 1147 }
1148 1148
1149 /* 1149 /*
1150 * we don't do anything in here that simple_write_end doesn't do 1150 * we don't do anything in here that simple_write_end doesn't do
1151 * except adjust dirty page accounting and drop the read lock on 1151 * except adjust dirty page accounting and drop the read lock on
1152 * mdsc->snap_rwsem. 1152 * mdsc->snap_rwsem.
1153 */ 1153 */
1154 static int ceph_write_end(struct file *file, struct address_space *mapping, 1154 static int ceph_write_end(struct file *file, struct address_space *mapping,
1155 loff_t pos, unsigned len, unsigned copied, 1155 loff_t pos, unsigned len, unsigned copied,
1156 struct page *page, void *fsdata) 1156 struct page *page, void *fsdata)
1157 { 1157 {
1158 struct inode *inode = file_inode(file); 1158 struct inode *inode = file_inode(file);
1159 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1159 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1160 struct ceph_mds_client *mdsc = fsc->mdsc; 1160 struct ceph_mds_client *mdsc = fsc->mdsc;
1161 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1161 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1162 int check_cap = 0; 1162 int check_cap = 0;
1163 1163
1164 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, 1164 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1165 inode, page, (int)pos, (int)copied, (int)len); 1165 inode, page, (int)pos, (int)copied, (int)len);
1166 1166
1167 /* zero the stale part of the page if we did a short copy */ 1167 /* zero the stale part of the page if we did a short copy */
1168 if (copied < len) 1168 if (copied < len)
1169 zero_user_segment(page, from+copied, from+len); 1169 zero_user_segment(page, from+copied, from+len);
1170 1170
1171 /* did file size increase? */ 1171 /* did file size increase? */
1172 /* (no need for i_size_read(); the caller holds i_mutex) */ 1172 /* (no need for i_size_read(); the caller holds i_mutex) */
1173 if (pos+copied > inode->i_size) 1173 if (pos+copied > inode->i_size)
1174 check_cap = ceph_inode_set_size(inode, pos+copied); 1174 check_cap = ceph_inode_set_size(inode, pos+copied);
1175 1175
1176 if (!PageUptodate(page)) 1176 if (!PageUptodate(page))
1177 SetPageUptodate(page); 1177 SetPageUptodate(page);
1178 1178
1179 set_page_dirty(page); 1179 set_page_dirty(page);
1180 1180
1181 unlock_page(page); 1181 unlock_page(page);
1182 up_read(&mdsc->snap_rwsem); 1182 up_read(&mdsc->snap_rwsem);
1183 page_cache_release(page); 1183 page_cache_release(page);
1184 1184
1185 if (check_cap) 1185 if (check_cap)
1186 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); 1186 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1187 1187
1188 return copied; 1188 return copied;
1189 } 1189 }
1190 1190
1191 /* 1191 /*
1192 * we set .direct_IO to indicate direct io is supported, but since we 1192 * we set .direct_IO to indicate direct io is supported, but since we
1193 * intercept O_DIRECT reads and writes early, this function should 1193 * intercept O_DIRECT reads and writes early, this function should
1194 * never get called. 1194 * never get called.
1195 */ 1195 */
1196 static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, 1196 static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1197 struct iov_iter *iter, 1197 struct iov_iter *iter,
1198 loff_t pos) 1198 loff_t pos)
1199 { 1199 {
1200 WARN_ON(1); 1200 WARN_ON(1);
1201 return -EINVAL; 1201 return -EINVAL;
1202 } 1202 }
1203 1203
1204 const struct address_space_operations ceph_aops = { 1204 const struct address_space_operations ceph_aops = {
1205 .readpage = ceph_readpage, 1205 .readpage = ceph_readpage,
1206 .readpages = ceph_readpages, 1206 .readpages = ceph_readpages,
1207 .writepage = ceph_writepage, 1207 .writepage = ceph_writepage,
1208 .writepages = ceph_writepages_start, 1208 .writepages = ceph_writepages_start,
1209 .write_begin = ceph_write_begin, 1209 .write_begin = ceph_write_begin,
1210 .write_end = ceph_write_end, 1210 .write_end = ceph_write_end,
1211 .set_page_dirty = ceph_set_page_dirty, 1211 .set_page_dirty = ceph_set_page_dirty,
1212 .invalidatepage = ceph_invalidatepage, 1212 .invalidatepage = ceph_invalidatepage,
1213 .releasepage = ceph_releasepage, 1213 .releasepage = ceph_releasepage,
1214 .direct_IO = ceph_direct_io, 1214 .direct_IO = ceph_direct_io,
1215 }; 1215 };
1216 1216
1217 1217
1218 /* 1218 /*
1219 * vm ops 1219 * vm ops
1220 */ 1220 */
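/*
 * Fault handler: take CEPH_CAP_FILE_RD cap refs (plus LAZYIO when the
 * file was opened LAZY), then satisfy the fault from the page cache
 * via filemap_fault().  For files with inline data we instead fetch
 * the inline data into page 0 of the mapping with __ceph_do_getattr()
 * and return that page locked.
 */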
1221 static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1221 static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1222 { 1222 {
1223 struct inode *inode = file_inode(vma->vm_file); 1223 struct inode *inode = file_inode(vma->vm_file);
1224 struct ceph_inode_info *ci = ceph_inode(inode); 1224 struct ceph_inode_info *ci = ceph_inode(inode);
1225 struct ceph_file_info *fi = vma->vm_file->private_data; 1225 struct ceph_file_info *fi = vma->vm_file->private_data;
1226 struct page *pinned_page = NULL; 1226 struct page *pinned_page = NULL;
1227 loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; 1227 loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT;
1228 int want, got, ret; 1228 int want, got, ret;
1229 1229
1230 dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", 1230 dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n",
1231 inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE); 1231 inode, ceph_vinop(inode), off, (size_t)PAGE_CACHE_SIZE);
1232 if (fi->fmode & CEPH_FILE_MODE_LAZY) 1232 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1233 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1233 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
1234 else 1234 else
1235 want = CEPH_CAP_FILE_CACHE; 1235 want = CEPH_CAP_FILE_CACHE;
1236 while (1) { 1236 while (1) {
1237 got = 0; 1237 got = 0;
1238 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, 1238 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
1239 -1, &got, &pinned_page); 1239 -1, &got, &pinned_page);
1240 if (ret == 0) 1240 if (ret == 0)
1241 break; 1241 break;
1242 if (ret != -ERESTARTSYS) { 1242 if (ret != -ERESTARTSYS) {
1243 WARN_ON(1); 1243 WARN_ON(1);
1244 return VM_FAULT_SIGBUS; 1244 return VM_FAULT_SIGBUS;
1245 } 1245 }
1246 } 1246 }
1247 dout("filemap_fault %p %llu~%zd got cap refs on %s\n", 1247 dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
1248 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); 1248 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
1249 1249
1250 if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || 1250 if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
1251 ci->i_inline_version == CEPH_INLINE_NONE) 1251 ci->i_inline_version == CEPH_INLINE_NONE)
1252 ret = filemap_fault(vma, vmf); 1252 ret = filemap_fault(vma, vmf);
1253 else 1253 else
1254 ret = -EAGAIN; 1254 ret = -EAGAIN;
1255 1255
1256 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", 1256 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
1257 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); 1257 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
1258 if (pinned_page) 1258 if (pinned_page)
1259 page_cache_release(pinned_page); 1259 page_cache_release(pinned_page);
1260 ceph_put_cap_refs(ci, got); 1260 ceph_put_cap_refs(ci, got);
1261 1261
1262 if (ret != -EAGAIN) 1262 if (ret != -EAGAIN)
1263 return ret; 1263 return ret;
1264 1264
1265 /* read inline data */ 1265 /* read inline data */
1266 if (off >= PAGE_CACHE_SIZE) { 1266 if (off >= PAGE_CACHE_SIZE) {
1267 /* does not support inline data > PAGE_SIZE */ 1267 /* does not support inline data > PAGE_SIZE */
1268 ret = VM_FAULT_SIGBUS; 1268 ret = VM_FAULT_SIGBUS;
1269 } else { 1269 } else {
1270 int ret1; 1270 int ret1;
1271 struct address_space *mapping = inode->i_mapping; 1271 struct address_space *mapping = inode->i_mapping;
1272 struct page *page = find_or_create_page(mapping, 0, 1272 struct page *page = find_or_create_page(mapping, 0,
1273 mapping_gfp_mask(mapping) & 1273 mapping_gfp_mask(mapping) &
1274 ~__GFP_FS); 1274 ~__GFP_FS);
1275 if (!page) { 1275 if (!page) {
1276 ret = VM_FAULT_OOM; 1276 ret = VM_FAULT_OOM;
1277 goto out; 1277 goto out;
1278 } 1278 }
1279 ret1 = __ceph_do_getattr(inode, page, 1279 ret1 = __ceph_do_getattr(inode, page,
1280 CEPH_STAT_CAP_INLINE_DATA, true); 1280 CEPH_STAT_CAP_INLINE_DATA, true);
1281 if (ret1 < 0 || off >= i_size_read(inode)) { 1281 if (ret1 < 0 || off >= i_size_read(inode)) {
1282 unlock_page(page); 1282 unlock_page(page);
1283 page_cache_release(page); 1283 page_cache_release(page);
1284 ret = VM_FAULT_SIGBUS; 1284 ret = VM_FAULT_SIGBUS;
1285 goto out; 1285 goto out;
1286 } 1286 }
1287 if (ret1 < PAGE_CACHE_SIZE) 1287 if (ret1 < PAGE_CACHE_SIZE)
1288 zero_user_segment(page, ret1, PAGE_CACHE_SIZE); 1288 zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
1289 else 1289 else
1290 flush_dcache_page(page); 1290 flush_dcache_page(page);
1291 SetPageUptodate(page); 1291 SetPageUptodate(page);
1292 vmf->page = page; 1292 vmf->page = page;
1293 ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; 1293 ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
1294 } 1294 }
1295 out: 1295 out:
1296 dout("filemap_fault %p %llu~%zd read inline data ret %d\n", 1296 dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
1297 inode, off, (size_t)PAGE_CACHE_SIZE, ret); 1297 inode, off, (size_t)PAGE_CACHE_SIZE, ret);
1298 return ret; 1298 return ret;
1299 } 1299 }
1300 1300
1301 /* 1301 /*
1302 * Reuse write_begin here for simplicity. 1302 * Reuse write_begin here for simplicity.
1303 */ 1303 */
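/*
 * Uninline any inline data first, take CEPH_CAP_FILE_BUFFER cap refs
 * for the affected range, then dirty the page via
 * ceph_update_writeable_page().  On success the page stays locked
 * (VM_FAULT_LOCKED).
 */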
1304 static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 1304 static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1305 { 1305 {
1306 struct inode *inode = file_inode(vma->vm_file); 1306 struct inode *inode = file_inode(vma->vm_file);
1307 struct ceph_inode_info *ci = ceph_inode(inode); 1307 struct ceph_inode_info *ci = ceph_inode(inode);
1308 struct ceph_file_info *fi = vma->vm_file->private_data; 1308 struct ceph_file_info *fi = vma->vm_file->private_data;
1309 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 1309 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1310 struct page *page = vmf->page; 1310 struct page *page = vmf->page;
1311 loff_t off = page_offset(page); 1311 loff_t off = page_offset(page);
1312 loff_t size = i_size_read(inode); 1312 loff_t size = i_size_read(inode);
1313 size_t len; 1313 size_t len;
1314 int want, got, ret; 1314 int want, got, ret;
1315 1315
1316 if (ci->i_inline_version != CEPH_INLINE_NONE) { 1316 if (ci->i_inline_version != CEPH_INLINE_NONE) {
1317 struct page *locked_page = NULL; 1317 struct page *locked_page = NULL;
1318 if (off == 0) { 1318 if (off == 0) {
1319 lock_page(page); 1319 lock_page(page);
1320 locked_page = page; 1320 locked_page = page;
1321 } 1321 }
1322 ret = ceph_uninline_data(vma->vm_file, locked_page); 1322 ret = ceph_uninline_data(vma->vm_file, locked_page);
1323 if (locked_page) 1323 if (locked_page)
1324 unlock_page(locked_page); 1324 unlock_page(locked_page);
1325 if (ret < 0) 1325 if (ret < 0)
1326 return VM_FAULT_SIGBUS; 1326 return VM_FAULT_SIGBUS;
1327 } 1327 }
1328 1328
1329 if (off + PAGE_CACHE_SIZE <= size) 1329 if (off + PAGE_CACHE_SIZE <= size)
1330 len = PAGE_CACHE_SIZE; 1330 len = PAGE_CACHE_SIZE;
1331 else 1331 else
1332 len = size & ~PAGE_CACHE_MASK; 1332 len = size & ~PAGE_CACHE_MASK;
1333 1333
1334 dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", 1334 dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
1335 inode, ceph_vinop(inode), off, len, size); 1335 inode, ceph_vinop(inode), off, len, size);
1336 if (fi->fmode & CEPH_FILE_MODE_LAZY) 1336 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1337 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; 1337 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1338 else 1338 else
1339 want = CEPH_CAP_FILE_BUFFER; 1339 want = CEPH_CAP_FILE_BUFFER;
1340 while (1) { 1340 while (1) {
1341 got = 0; 1341 got = 0;
1342 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, 1342 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len,
1343 &got, NULL); 1343 &got, NULL);
1344 if (ret == 0) 1344 if (ret == 0)
1345 break; 1345 break;
1346 if (ret != -ERESTARTSYS) { 1346 if (ret != -ERESTARTSYS) {
1347 WARN_ON(1); 1347 WARN_ON(1);
1348 return VM_FAULT_SIGBUS; 1348 return VM_FAULT_SIGBUS;
1349 } 1349 }
1350 } 1350 }
1351 dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", 1351 dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
1352 inode, off, len, ceph_cap_string(got)); 1352 inode, off, len, ceph_cap_string(got));
1353 1353
1354 /* Update time before taking page lock */ 1354 /* Update time before taking page lock */
1355 file_update_time(vma->vm_file); 1355 file_update_time(vma->vm_file);
1356 1356
1357 lock_page(page); 1357 lock_page(page);
1358 1358
1359 ret = VM_FAULT_NOPAGE; 1359 ret = VM_FAULT_NOPAGE;
1360 if ((off > size) || 1360 if ((off > size) ||
1361 (page->mapping != inode->i_mapping)) 1361 (page->mapping != inode->i_mapping))
1362 goto out; 1362 goto out;
1363 1363
1364 ret = ceph_update_writeable_page(vma->vm_file, off, len, page); 1364 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1365 if (ret == 0) { 1365 if (ret == 0) {
1366 /* success. we'll keep the page locked. */ 1366 /* success. we'll keep the page locked. */
1367 set_page_dirty(page); 1367 set_page_dirty(page);
1368 up_read(&mdsc->snap_rwsem); 1368 up_read(&mdsc->snap_rwsem);
1369 ret = VM_FAULT_LOCKED; 1369 ret = VM_FAULT_LOCKED;
1370 } else { 1370 } else {
1371 if (ret == -ENOMEM) 1371 if (ret == -ENOMEM)
1372 ret = VM_FAULT_OOM; 1372 ret = VM_FAULT_OOM;
1373 else 1373 else
1374 ret = VM_FAULT_SIGBUS; 1374 ret = VM_FAULT_SIGBUS;
1375 } 1375 }
1376 out: 1376 out:
1377 if (ret != VM_FAULT_LOCKED) 1377 if (ret != VM_FAULT_LOCKED)
1378 unlock_page(page); 1378 unlock_page(page);
1379 if (ret == VM_FAULT_LOCKED || 1379 if (ret == VM_FAULT_LOCKED ||
1380 ci->i_inline_version != CEPH_INLINE_NONE) { 1380 ci->i_inline_version != CEPH_INLINE_NONE) {
1381 int dirty; 1381 int dirty;
1382 spin_lock(&ci->i_ceph_lock); 1382 spin_lock(&ci->i_ceph_lock);
1383 ci->i_inline_version = CEPH_INLINE_NONE; 1383 ci->i_inline_version = CEPH_INLINE_NONE;
1384 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1384 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
1385 spin_unlock(&ci->i_ceph_lock); 1385 spin_unlock(&ci->i_ceph_lock);
1386 if (dirty) 1386 if (dirty)
1387 __mark_inode_dirty(inode, dirty); 1387 __mark_inode_dirty(inode, dirty);
1388 } 1388 }
1389 1389
1390 dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", 1390 dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n",
1391 inode, off, len, ceph_cap_string(got), ret); 1391 inode, off, len, ceph_cap_string(got), ret);
1392 ceph_put_cap_refs(ci, got); 1392 ceph_put_cap_refs(ci, got);
1393 1393
1394 return ret; 1394 return ret;
1395 } 1395 }
1396 1396
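/*
 * Copy up to one page of inline data into page 0 of the mapping
 * (the caller's locked_page, if given, else one we grab ourselves);
 * for a page we grabbed, zero the tail and mark it uptodate before
 * releasing it.
 */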
1397 void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, 1397 void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
1398 char *data, size_t len) 1398 char *data, size_t len)
1399 { 1399 {
1400 struct address_space *mapping = inode->i_mapping; 1400 struct address_space *mapping = inode->i_mapping;
1401 struct page *page; 1401 struct page *page;
1402 1402
1403 if (locked_page) { 1403 if (locked_page) {
1404 page = locked_page; 1404 page = locked_page;
1405 } else { 1405 } else {
1406 if (i_size_read(inode) == 0) 1406 if (i_size_read(inode) == 0)
1407 return; 1407 return;
1408 page = find_or_create_page(mapping, 0, 1408 page = find_or_create_page(mapping, 0,
1409 mapping_gfp_mask(mapping) & ~__GFP_FS); 1409 mapping_gfp_mask(mapping) & ~__GFP_FS);
1410 if (!page) 1410 if (!page)
1411 return; 1411 return;
1412 if (PageUptodate(page)) { 1412 if (PageUptodate(page)) {
1413 unlock_page(page); 1413 unlock_page(page);
1414 page_cache_release(page); 1414 page_cache_release(page);
1415 return; 1415 return;
1416 } 1416 }
1417 } 1417 }
1418 1418
1419 dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n", 1419 dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
1420 inode, ceph_vinop(inode), len, locked_page); 1420 inode, ceph_vinop(inode), len, locked_page);
1421 1421
1422 if (len > 0) { 1422 if (len > 0) {
1423 void *kaddr = kmap_atomic(page); 1423 void *kaddr = kmap_atomic(page);
1424 memcpy(kaddr, data, len); 1424 memcpy(kaddr, data, len);
1425 kunmap_atomic(kaddr); 1425 kunmap_atomic(kaddr);
1426 } 1426 }
1427 1427
1428 if (page != locked_page) { 1428 if (page != locked_page) {
1429 if (len < PAGE_CACHE_SIZE) 1429 if (len < PAGE_CACHE_SIZE)
1430 zero_user_segment(page, len, PAGE_CACHE_SIZE); 1430 zero_user_segment(page, len, PAGE_CACHE_SIZE);
1431 else 1431 else
1432 flush_dcache_page(page); 1432 flush_dcache_page(page);
1433 1433
1434 SetPageUptodate(page); 1434 SetPageUptodate(page);
1435 unlock_page(page); 1435 unlock_page(page);
1436 page_cache_release(page); 1436 page_cache_release(page);
1437 } 1437 }
1438 } 1438 }
1439 1439
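/*
 * Convert an inlined file back to a regular object: get the inline
 * data (from the caller's page, the page cache, or a getattr), issue
 * a CREATE for the object, then a compound request in which a
 * CMPXATTR on "inline_version" guards the data WRITE and the
 * "inline_version" SETXATTR.  If another client raced us and already
 * uninlined, the guard fails with -ECANCELED, which we treat as
 * success.
 */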
1440 int ceph_uninline_data(struct file *filp, struct page *locked_page) 1440 int ceph_uninline_data(struct file *filp, struct page *locked_page)
1441 { 1441 {
1442 struct inode *inode = file_inode(filp); 1442 struct inode *inode = file_inode(filp);
1443 struct ceph_inode_info *ci = ceph_inode(inode); 1443 struct ceph_inode_info *ci = ceph_inode(inode);
1444 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1444 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1445 struct ceph_osd_request *req; 1445 struct ceph_osd_request *req;
1446 struct page *page = NULL; 1446 struct page *page = NULL;
1447 u64 len, inline_version; 1447 u64 len, inline_version;
1448 int err = 0; 1448 int err = 0;
1449 bool from_pagecache = false; 1449 bool from_pagecache = false;
1450 1450
1451 spin_lock(&ci->i_ceph_lock); 1451 spin_lock(&ci->i_ceph_lock);
1452 inline_version = ci->i_inline_version; 1452 inline_version = ci->i_inline_version;
1453 spin_unlock(&ci->i_ceph_lock); 1453 spin_unlock(&ci->i_ceph_lock);
1454 1454
1455 dout("uninline_data %p %llx.%llx inline_version %llu\n", 1455 dout("uninline_data %p %llx.%llx inline_version %llu\n",
1456 inode, ceph_vinop(inode), inline_version); 1456 inode, ceph_vinop(inode), inline_version);
1457 1457
1458 if (inline_version == 1 || /* initial version, no data */ 1458 if (inline_version == 1 || /* initial version, no data */
1459 inline_version == CEPH_INLINE_NONE) 1459 inline_version == CEPH_INLINE_NONE)
1460 goto out; 1460 goto out;
1461 1461
1462 if (locked_page) { 1462 if (locked_page) {
1463 page = locked_page; 1463 page = locked_page;
1464 WARN_ON(!PageUptodate(page)); 1464 WARN_ON(!PageUptodate(page));
1465 } else if (ceph_caps_issued(ci) & 1465 } else if (ceph_caps_issued(ci) &
1466 (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { 1466 (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
1467 page = find_get_page(inode->i_mapping, 0); 1467 page = find_get_page(inode->i_mapping, 0);
1468 if (page) { 1468 if (page) {
1469 if (PageUptodate(page)) { 1469 if (PageUptodate(page)) {
1470 from_pagecache = true; 1470 from_pagecache = true;
1471 lock_page(page); 1471 lock_page(page);
1472 } else { 1472 } else {
1473 page_cache_release(page); 1473 page_cache_release(page);
1474 page = NULL; 1474 page = NULL;
1475 } 1475 }
1476 } 1476 }
1477 } 1477 }
1478 1478
1479 if (page) { 1479 if (page) {
1480 len = i_size_read(inode); 1480 len = i_size_read(inode);
1481 if (len > PAGE_CACHE_SIZE) 1481 if (len > PAGE_CACHE_SIZE)
1482 len = PAGE_CACHE_SIZE; 1482 len = PAGE_CACHE_SIZE;
1483 } else { 1483 } else {
1484 page = __page_cache_alloc(GFP_NOFS); 1484 page = __page_cache_alloc(GFP_NOFS);
1485 if (!page) { 1485 if (!page) {
1486 err = -ENOMEM; 1486 err = -ENOMEM;
1487 goto out; 1487 goto out;
1488 } 1488 }
1489 err = __ceph_do_getattr(inode, page, 1489 err = __ceph_do_getattr(inode, page,
1490 CEPH_STAT_CAP_INLINE_DATA, true); 1490 CEPH_STAT_CAP_INLINE_DATA, true);
1491 if (err < 0) { 1491 if (err < 0) {
1492 /* no inline data */ 1492 /* no inline data */
1493 if (err == -ENODATA) 1493 if (err == -ENODATA)
1494 err = 0; 1494 err = 0;
1495 goto out; 1495 goto out;
1496 } 1496 }
1497 len = err; 1497 len = err;
1498 } 1498 }
1499 1499
1500 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1500 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
1501 ceph_vino(inode), 0, &len, 0, 1, 1501 ceph_vino(inode), 0, &len, 0, 1,
1502 CEPH_OSD_OP_CREATE, 1502 CEPH_OSD_OP_CREATE,
1503 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1503 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1504 ci->i_snap_realm->cached_context, 1504 ci->i_snap_realm->cached_context,
1505 0, 0, false); 1505 0, 0, false);
1506 if (IS_ERR(req)) { 1506 if (IS_ERR(req)) {
1507 err = PTR_ERR(req); 1507 err = PTR_ERR(req);
1508 goto out; 1508 goto out;
1509 } 1509 }
1510 1510
1511 ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); 1511 ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
1512 err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1512 err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1513 if (!err) 1513 if (!err)
1514 err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1514 err = ceph_osdc_wait_request(&fsc->client->osdc, req);
1515 ceph_osdc_put_request(req); 1515 ceph_osdc_put_request(req);
1516 if (err < 0) 1516 if (err < 0)
1517 goto out; 1517 goto out;
1518 1518
1519 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1519 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
1520 ceph_vino(inode), 0, &len, 1, 3, 1520 ceph_vino(inode), 0, &len, 1, 3,
1521 CEPH_OSD_OP_WRITE, 1521 CEPH_OSD_OP_WRITE,
1522 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1522 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1523 ci->i_snap_realm->cached_context, 1523 ci->i_snap_realm->cached_context,
1524 ci->i_truncate_seq, ci->i_truncate_size, 1524 ci->i_truncate_seq, ci->i_truncate_size,
1525 false); 1525 false);
1526 if (IS_ERR(req)) { 1526 if (IS_ERR(req)) {
1527 err = PTR_ERR(req); 1527 err = PTR_ERR(req);
1528 goto out; 1528 goto out;
1529 } 1529 }
1530 1530
1531 osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); 1531 osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
1532 1532
1533 err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, 1533 err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
1534 "inline_version", &inline_version, 1534 "inline_version", &inline_version,
1535 sizeof(inline_version), 1535 sizeof(inline_version),
1536 CEPH_OSD_CMPXATTR_OP_GT, 1536 CEPH_OSD_CMPXATTR_OP_GT,
1537 CEPH_OSD_CMPXATTR_MODE_U64); 1537 CEPH_OSD_CMPXATTR_MODE_U64);
1538 if (err) 1538 if (err)
1539 goto out_put; 1539 goto out_put;
1540 1540
1541 err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, 1541 err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
1542 "inline_version", &inline_version, 1542 "inline_version", &inline_version,
1543 sizeof(inline_version), 0, 0); 1543 sizeof(inline_version), 0, 0);
1544 if (err) 1544 if (err)
1545 goto out_put; 1545 goto out_put;
1546 1546
1547 ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); 1547 ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime);
1548 err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1548 err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1549 if (!err) 1549 if (!err)
1550 err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1550 err = ceph_osdc_wait_request(&fsc->client->osdc, req);
1551 out_put: 1551 out_put:
1552 ceph_osdc_put_request(req); 1552 ceph_osdc_put_request(req);
1553 if (err == -ECANCELED) 1553 if (err == -ECANCELED)
1554 err = 0; 1554 err = 0;
1555 out: 1555 out:
1556 if (page && page != locked_page) { 1556 if (page && page != locked_page) {
1557 if (from_pagecache) { 1557 if (from_pagecache) {
1558 unlock_page(page); 1558 unlock_page(page);
1559 page_cache_release(page); 1559 page_cache_release(page);
1560 } else 1560 } else
1561 __free_pages(page, 0); 1561 __free_pages(page, 0);
1562 } 1562 }
1563 1563
1564 dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", 1564 dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
1565 inode, ceph_vinop(inode), inline_version, err); 1565 inode, ceph_vinop(inode), inline_version, err);
1566 return err; 1566 return err;
1567 } 1567 }
1568 1568
1569 static struct vm_operations_struct ceph_vmops = { 1569 static struct vm_operations_struct ceph_vmops = {
1570 .fault = ceph_filemap_fault, 1570 .fault = ceph_filemap_fault,
1571 .page_mkwrite = ceph_page_mkwrite, 1571 .page_mkwrite = ceph_page_mkwrite,
1572 .remap_pages = generic_file_remap_pages, 1572 .remap_pages = generic_file_remap_pages,
1573 }; 1573 };
1574 1574
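/* install ceph_vmops; refuse (-ENOEXEC) mappings without a readpage a_op */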
1575 int ceph_mmap(struct file *file, struct vm_area_struct *vma) 1575 int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1576 { 1576 {
1577 struct address_space *mapping = file->f_mapping; 1577 struct address_space *mapping = file->f_mapping;
1578 1578
1579 if (!mapping->a_ops->readpage) 1579 if (!mapping->a_ops->readpage)
1580 return -ENOEXEC; 1580 return -ENOEXEC;
1581 file_accessed(file); 1581 file_accessed(file);
1582 vma->vm_ops = &ceph_vmops; 1582 vma->vm_ops = &ceph_vmops;
1583 return 0; 1583 return 0;
1584 } 1584 }
1585 1585
include/linux/ceph/osd_client.h
1 #ifndef _FS_CEPH_OSD_CLIENT_H 1 #ifndef _FS_CEPH_OSD_CLIENT_H
2 #define _FS_CEPH_OSD_CLIENT_H 2 #define _FS_CEPH_OSD_CLIENT_H
3 3
4 #include <linux/completion.h> 4 #include <linux/completion.h>
5 #include <linux/kref.h> 5 #include <linux/kref.h>
6 #include <linux/mempool.h> 6 #include <linux/mempool.h>
7 #include <linux/rbtree.h> 7 #include <linux/rbtree.h>
8 8
9 #include <linux/ceph/types.h> 9 #include <linux/ceph/types.h>
10 #include <linux/ceph/osdmap.h> 10 #include <linux/ceph/osdmap.h>
11 #include <linux/ceph/messenger.h> 11 #include <linux/ceph/messenger.h>
12 #include <linux/ceph/auth.h> 12 #include <linux/ceph/auth.h>
13 #include <linux/ceph/pagelist.h> 13 #include <linux/ceph/pagelist.h>
14 14
15 struct ceph_msg; 15 struct ceph_msg;
16 struct ceph_snap_context; 16 struct ceph_snap_context;
17 struct ceph_osd_request; 17 struct ceph_osd_request;
18 struct ceph_osd_client; 18 struct ceph_osd_client;
19 struct ceph_authorizer; 19 struct ceph_authorizer;
20 20
21 /* 21 /*
22 * completion callback for async writepages 22 * completion callback for async writepages
23 */ 23 */
24 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *, 24 typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
25 struct ceph_msg *); 25 struct ceph_msg *);
26 typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool); 26 typedef void (*ceph_osdc_unsafe_callback_t)(struct ceph_osd_request *, bool);
27 27
28 /* a given osd we're communicating with */ 28 /* a given osd we're communicating with */
29 struct ceph_osd { 29 struct ceph_osd {
30 atomic_t o_ref; 30 atomic_t o_ref;
31 struct ceph_osd_client *o_osdc; 31 struct ceph_osd_client *o_osdc;
32 int o_osd; 32 int o_osd;
33 int o_incarnation; 33 int o_incarnation;
34 struct rb_node o_node; 34 struct rb_node o_node;
35 struct ceph_connection o_con; 35 struct ceph_connection o_con;
36 struct list_head o_requests; 36 struct list_head o_requests;
37 struct list_head o_linger_requests; 37 struct list_head o_linger_requests;
38 struct list_head o_osd_lru; 38 struct list_head o_osd_lru;
39 struct ceph_auth_handshake o_auth; 39 struct ceph_auth_handshake o_auth;
40 unsigned long lru_ttl; 40 unsigned long lru_ttl;
41 int o_marked_for_keepalive; 41 int o_marked_for_keepalive;
42 struct list_head o_keepalive_item; 42 struct list_head o_keepalive_item;
43 }; 43 };
44 44
45 45
46 #define CEPH_OSD_MAX_OP 3 46 #define CEPH_OSD_MAX_OP 3
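/* ops per compound request; ceph_uninline_data() uses all three
 * (cmpxattr guard + write + setxattr) */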
47 47
48 enum ceph_osd_data_type { 48 enum ceph_osd_data_type {
49 CEPH_OSD_DATA_TYPE_NONE = 0, 49 CEPH_OSD_DATA_TYPE_NONE = 0,
50 CEPH_OSD_DATA_TYPE_PAGES, 50 CEPH_OSD_DATA_TYPE_PAGES,
51 CEPH_OSD_DATA_TYPE_PAGELIST, 51 CEPH_OSD_DATA_TYPE_PAGELIST,
52 #ifdef CONFIG_BLOCK 52 #ifdef CONFIG_BLOCK
53 CEPH_OSD_DATA_TYPE_BIO, 53 CEPH_OSD_DATA_TYPE_BIO,
54 #endif /* CONFIG_BLOCK */ 54 #endif /* CONFIG_BLOCK */
55 }; 55 };
56 56
57 struct ceph_osd_data { 57 struct ceph_osd_data {
58 enum ceph_osd_data_type type; 58 enum ceph_osd_data_type type;
59 union { 59 union {
60 struct { 60 struct {
61 struct page **pages; 61 struct page **pages;
62 u64 length; 62 u64 length;
63 u32 alignment; 63 u32 alignment;
64 bool pages_from_pool; 64 bool pages_from_pool;
65 bool own_pages; 65 bool own_pages;
66 }; 66 };
67 struct ceph_pagelist *pagelist; 67 struct ceph_pagelist *pagelist;
68 #ifdef CONFIG_BLOCK 68 #ifdef CONFIG_BLOCK
69 struct { 69 struct {
70 struct bio *bio; /* list of bios */ 70 struct bio *bio; /* list of bios */
71 size_t bio_length; /* total in list */ 71 size_t bio_length; /* total in list */
72 }; 72 };
73 #endif /* CONFIG_BLOCK */ 73 #endif /* CONFIG_BLOCK */
74 }; 74 };
75 }; 75 };
76 76
77 struct ceph_osd_req_op { 77 struct ceph_osd_req_op {
78 u16 op; /* CEPH_OSD_OP_* */ 78 u16 op; /* CEPH_OSD_OP_* */
79 u32 flags; /* CEPH_OSD_OP_FLAG_* */ 79 u32 flags; /* CEPH_OSD_OP_FLAG_* */
80 u32 payload_len; 80 u32 payload_len;
81 union { 81 union {
82 struct ceph_osd_data raw_data_in; 82 struct ceph_osd_data raw_data_in;
83 struct { 83 struct {
84 u64 offset, length; 84 u64 offset, length;
85 u64 truncate_size; 85 u64 truncate_size;
86 u32 truncate_seq; 86 u32 truncate_seq;
87 struct ceph_osd_data osd_data; 87 struct ceph_osd_data osd_data;
88 } extent; 88 } extent;
89 struct { 89 struct {
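			/* lengths are kept in CPU byte order in this
			 * in-memory op struct and converted to
			 * little-endian when the request is encoded for
			 * the wire, hence plain u32 (keeps sparse quiet) */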
90 __le32 name_len; 90 u32 name_len;
91 __le32 value_len; 91 u32 value_len;
92 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 92 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
93 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 93 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
94 struct ceph_osd_data osd_data; 94 struct ceph_osd_data osd_data;
95 } xattr; 95 } xattr;
96 struct { 96 struct {
97 const char *class_name; 97 const char *class_name;
98 const char *method_name; 98 const char *method_name;
99 struct ceph_osd_data request_info; 99 struct ceph_osd_data request_info;
100 struct ceph_osd_data request_data; 100 struct ceph_osd_data request_data;
101 struct ceph_osd_data response_data; 101 struct ceph_osd_data response_data;
102 __u8 class_len; 102 __u8 class_len;
103 __u8 method_len; 103 __u8 method_len;
104 __u8 argc; 104 __u8 argc;
105 } cls; 105 } cls;
106 struct { 106 struct {
107 u64 cookie; 107 u64 cookie;
108 u64 ver; 108 u64 ver;
109 u32 prot_ver; 109 u32 prot_ver;
110 u32 timeout; 110 u32 timeout;
111 __u8 flag; 111 __u8 flag;
112 } watch; 112 } watch;
113 struct { 113 struct {
114 u64 expected_object_size; 114 u64 expected_object_size;
115 u64 expected_write_size; 115 u64 expected_write_size;
116 } alloc_hint; 116 } alloc_hint;
117 }; 117 };
118 }; 118 };
119 119
120 /* an in-flight request */ 120 /* an in-flight request */
121 struct ceph_osd_request { 121 struct ceph_osd_request {
122 u64 r_tid; /* unique for this client */ 122 u64 r_tid; /* unique for this client */
123 struct rb_node r_node; 123 struct rb_node r_node;
124 struct list_head r_req_lru_item; 124 struct list_head r_req_lru_item;
125 struct list_head r_osd_item; 125 struct list_head r_osd_item;
126 struct list_head r_linger_item; 126 struct list_head r_linger_item;
127 struct list_head r_linger_osd_item; 127 struct list_head r_linger_osd_item;
128 struct ceph_osd *r_osd; 128 struct ceph_osd *r_osd;
129 struct ceph_pg r_pgid; 129 struct ceph_pg r_pgid;
130 int r_pg_osds[CEPH_PG_MAX_SIZE]; 130 int r_pg_osds[CEPH_PG_MAX_SIZE];
131 int r_num_pg_osds; 131 int r_num_pg_osds;
132 132
133 struct ceph_msg *r_request, *r_reply; 133 struct ceph_msg *r_request, *r_reply;
134 int r_flags; /* any additional flags for the osd */ 134 int r_flags; /* any additional flags for the osd */
135 u32 r_sent; /* >0 if r_request is sending/sent */ 135 u32 r_sent; /* >0 if r_request is sending/sent */
136 136
137 /* request osd ops array */ 137 /* request osd ops array */
138 unsigned int r_num_ops; 138 unsigned int r_num_ops;
139 struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; 139 struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP];
140 140
141 /* these are updated on each send */ 141 /* these are updated on each send */
142 __le32 *r_request_osdmap_epoch; 142 __le32 *r_request_osdmap_epoch;
143 __le32 *r_request_flags; 143 __le32 *r_request_flags;
144 __le64 *r_request_pool; 144 __le64 *r_request_pool;
145 void *r_request_pgid; 145 void *r_request_pgid;
146 __le32 *r_request_attempts; 146 __le32 *r_request_attempts;
147 bool r_paused; 147 bool r_paused;
148 struct ceph_eversion *r_request_reassert_version; 148 struct ceph_eversion *r_request_reassert_version;
149 149
150 int r_result; 150 int r_result;
151 int r_reply_op_len[CEPH_OSD_MAX_OP]; 151 int r_reply_op_len[CEPH_OSD_MAX_OP];
152 s32 r_reply_op_result[CEPH_OSD_MAX_OP]; 152 s32 r_reply_op_result[CEPH_OSD_MAX_OP];
153 int r_got_reply; 153 int r_got_reply;
154 int r_linger; 154 int r_linger;
155 155
156 struct ceph_osd_client *r_osdc; 156 struct ceph_osd_client *r_osdc;
157 struct kref r_kref; 157 struct kref r_kref;
158 bool r_mempool; 158 bool r_mempool;
159 struct completion r_completion, r_safe_completion; 159 struct completion r_completion, r_safe_completion;
160 ceph_osdc_callback_t r_callback; 160 ceph_osdc_callback_t r_callback;
161 ceph_osdc_unsafe_callback_t r_unsafe_callback; 161 ceph_osdc_unsafe_callback_t r_unsafe_callback;
162 struct ceph_eversion r_reassert_version; 162 struct ceph_eversion r_reassert_version;
163 struct list_head r_unsafe_item; 163 struct list_head r_unsafe_item;
164 164
165 struct inode *r_inode; /* for use by callbacks */ 165 struct inode *r_inode; /* for use by callbacks */
166 void *r_priv; /* ditto */ 166 void *r_priv; /* ditto */
167 167
168 struct ceph_object_locator r_base_oloc; 168 struct ceph_object_locator r_base_oloc;
169 struct ceph_object_id r_base_oid; 169 struct ceph_object_id r_base_oid;
170 struct ceph_object_locator r_target_oloc; 170 struct ceph_object_locator r_target_oloc;
171 struct ceph_object_id r_target_oid; 171 struct ceph_object_id r_target_oid;
172 172
173 u64 r_snapid; 173 u64 r_snapid;
174 unsigned long r_stamp; /* send OR check time */ 174 unsigned long r_stamp; /* send OR check time */
175 175
176 struct ceph_snap_context *r_snapc; /* snap context for writes */ 176 struct ceph_snap_context *r_snapc; /* snap context for writes */
177 }; 177 };
178 178
179 struct ceph_request_redirect { 179 struct ceph_request_redirect {
180 struct ceph_object_locator oloc; 180 struct ceph_object_locator oloc;
181 }; 181 };
182 182
183 struct ceph_osd_event { 183 struct ceph_osd_event {
184 u64 cookie; 184 u64 cookie;
185 int one_shot; 185 int one_shot;
186 struct ceph_osd_client *osdc; 186 struct ceph_osd_client *osdc;
187 void (*cb)(u64, u64, u8, void *); 187 void (*cb)(u64, u64, u8, void *);
188 void *data; 188 void *data;
189 struct rb_node node; 189 struct rb_node node;
190 struct list_head osd_node; 190 struct list_head osd_node;
191 struct kref kref; 191 struct kref kref;
192 }; 192 };
193 193
194 struct ceph_osd_event_work { 194 struct ceph_osd_event_work {
195 struct work_struct work; 195 struct work_struct work;
196 struct ceph_osd_event *event; 196 struct ceph_osd_event *event;
197 u64 ver; 197 u64 ver;
198 u64 notify_id; 198 u64 notify_id;
199 u8 opcode; 199 u8 opcode;
200 }; 200 };
201 201
202 struct ceph_osd_client { 202 struct ceph_osd_client {
203 struct ceph_client *client; 203 struct ceph_client *client;
204 204
205 struct ceph_osdmap *osdmap; /* current map */ 205 struct ceph_osdmap *osdmap; /* current map */
206 struct rw_semaphore map_sem; 206 struct rw_semaphore map_sem;
207 struct completion map_waiters; 207 struct completion map_waiters;
208 u64 last_requested_map; 208 u64 last_requested_map;
209 209
210 struct mutex request_mutex; 210 struct mutex request_mutex;
211 struct rb_root osds; /* osds */ 211 struct rb_root osds; /* osds */
212 struct list_head osd_lru; /* idle osds */ 212 struct list_head osd_lru; /* idle osds */
213 u64 timeout_tid; /* tid of timeout triggering rq */ 213 u64 timeout_tid; /* tid of timeout triggering rq */
214 u64 last_tid; /* tid of last request */ 214 u64 last_tid; /* tid of last request */
215 struct rb_root requests; /* pending requests */ 215 struct rb_root requests; /* pending requests */
216 struct list_head req_lru; /* in-flight lru */ 216 struct list_head req_lru; /* in-flight lru */
217 struct list_head req_unsent; /* unsent/need-resend queue */ 217 struct list_head req_unsent; /* unsent/need-resend queue */
218 struct list_head req_notarget; /* map to no osd */ 218 struct list_head req_notarget; /* map to no osd */
219 struct list_head req_linger; /* lingering requests */ 219 struct list_head req_linger; /* lingering requests */
220 int num_requests; 220 int num_requests;
221 struct delayed_work timeout_work; 221 struct delayed_work timeout_work;
222 struct delayed_work osds_timeout_work; 222 struct delayed_work osds_timeout_work;
223 #ifdef CONFIG_DEBUG_FS 223 #ifdef CONFIG_DEBUG_FS
224 struct dentry *debugfs_file; 224 struct dentry *debugfs_file;
225 #endif 225 #endif
226 226
227 mempool_t *req_mempool; 227 mempool_t *req_mempool;
228 228
229 struct ceph_msgpool msgpool_op; 229 struct ceph_msgpool msgpool_op;
230 struct ceph_msgpool msgpool_op_reply; 230 struct ceph_msgpool msgpool_op_reply;
231 231
232 spinlock_t event_lock; 232 spinlock_t event_lock;
233 struct rb_root event_tree; 233 struct rb_root event_tree;
234 u64 event_count; 234 u64 event_count;
235 235
236 struct workqueue_struct *notify_wq; 236 struct workqueue_struct *notify_wq;
237 }; 237 };
238 238
239 extern int ceph_osdc_setup(void); 239 extern int ceph_osdc_setup(void);
240 extern void ceph_osdc_cleanup(void); 240 extern void ceph_osdc_cleanup(void);
241 241
242 extern int ceph_osdc_init(struct ceph_osd_client *osdc, 242 extern int ceph_osdc_init(struct ceph_osd_client *osdc,
243 struct ceph_client *client); 243 struct ceph_client *client);
244 extern void ceph_osdc_stop(struct ceph_osd_client *osdc); 244 extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
245 245
246 extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, 246 extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
247 struct ceph_msg *msg); 247 struct ceph_msg *msg);
248 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 248 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
249 struct ceph_msg *msg); 249 struct ceph_msg *msg);
250 250
251 extern void osd_req_op_init(struct ceph_osd_request *osd_req, 251 extern void osd_req_op_init(struct ceph_osd_request *osd_req,
252 unsigned int which, u16 opcode); 252 unsigned int which, u16 opcode);
253 253
254 extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *, 254 extern void osd_req_op_raw_data_in_pages(struct ceph_osd_request *,
255 unsigned int which, 255 unsigned int which,
256 struct page **pages, u64 length, 256 struct page **pages, u64 length,
257 u32 alignment, bool pages_from_pool, 257 u32 alignment, bool pages_from_pool,
258 bool own_pages); 258 bool own_pages);
259 259
260 extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, 260 extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
261 unsigned int which, u16 opcode, 261 unsigned int which, u16 opcode,
262 u64 offset, u64 length, 262 u64 offset, u64 length,
263 u64 truncate_size, u32 truncate_seq); 263 u64 truncate_size, u32 truncate_seq);
264 extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, 264 extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
265 unsigned int which, u64 length); 265 unsigned int which, u64 length);
266 266
267 extern struct ceph_osd_data *osd_req_op_extent_osd_data( 267 extern struct ceph_osd_data *osd_req_op_extent_osd_data(
268 struct ceph_osd_request *osd_req, 268 struct ceph_osd_request *osd_req,
269 unsigned int which); 269 unsigned int which);
270 extern struct ceph_osd_data *osd_req_op_cls_response_data( 270 extern struct ceph_osd_data *osd_req_op_cls_response_data(
271 struct ceph_osd_request *osd_req, 271 struct ceph_osd_request *osd_req,
272 unsigned int which); 272 unsigned int which);
273 273
274 extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, 274 extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *,
275 unsigned int which, 275 unsigned int which,
276 struct page **pages, u64 length, 276 struct page **pages, u64 length,
277 u32 alignment, bool pages_from_pool, 277 u32 alignment, bool pages_from_pool,
278 bool own_pages); 278 bool own_pages);
279 extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, 279 extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *,
280 unsigned int which, 280 unsigned int which,
281 struct ceph_pagelist *pagelist); 281 struct ceph_pagelist *pagelist);
282 #ifdef CONFIG_BLOCK 282 #ifdef CONFIG_BLOCK
283 extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *, 283 extern void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *,
284 unsigned int which, 284 unsigned int which,
285 struct bio *bio, size_t bio_length); 285 struct bio *bio, size_t bio_length);
286 #endif /* CONFIG_BLOCK */ 286 #endif /* CONFIG_BLOCK */
287 287
288 extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, 288 extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *,
289 unsigned int which, 289 unsigned int which,
290 struct ceph_pagelist *pagelist); 290 struct ceph_pagelist *pagelist);
291 extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, 291 extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *,
292 unsigned int which, 292 unsigned int which,
293 struct page **pages, u64 length, 293 struct page **pages, u64 length,
294 u32 alignment, bool pages_from_pool, 294 u32 alignment, bool pages_from_pool,
295 bool own_pages); 295 bool own_pages);
296 extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, 296 extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
297 unsigned int which, 297 unsigned int which,
298 struct page **pages, u64 length, 298 struct page **pages, u64 length,
299 u32 alignment, bool pages_from_pool, 299 u32 alignment, bool pages_from_pool,
300 bool own_pages); 300 bool own_pages);
301 301
302 extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, 302 extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req,
303 unsigned int which, u16 opcode, 303 unsigned int which, u16 opcode,
304 const char *class, const char *method); 304 const char *class, const char *method);
305 extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 305 extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
306 u16 opcode, const char *name, const void *value, 306 u16 opcode, const char *name, const void *value,
307 size_t size, u8 cmp_op, u8 cmp_mode); 307 size_t size, u8 cmp_op, u8 cmp_mode);
308 extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 308 extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
309 unsigned int which, u16 opcode, 309 unsigned int which, u16 opcode,
310 u64 cookie, u64 version, int flag); 310 u64 cookie, u64 version, int flag);
311 extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, 311 extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
312 unsigned int which, 312 unsigned int which,
313 u64 expected_object_size, 313 u64 expected_object_size,
314 u64 expected_write_size); 314 u64 expected_write_size);
315 315
316 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, 316 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
317 struct ceph_snap_context *snapc, 317 struct ceph_snap_context *snapc,
318 unsigned int num_ops, 318 unsigned int num_ops,
319 bool use_mempool, 319 bool use_mempool,
320 gfp_t gfp_flags); 320 gfp_t gfp_flags);
321 321
322 extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, 322 extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
323 struct ceph_snap_context *snapc, 323 struct ceph_snap_context *snapc,
324 u64 snap_id, 324 u64 snap_id,
325 struct timespec *mtime); 325 struct timespec *mtime);
326 326
327 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 327 extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
328 struct ceph_file_layout *layout, 328 struct ceph_file_layout *layout,
329 struct ceph_vino vino, 329 struct ceph_vino vino,
330 u64 offset, u64 *len, 330 u64 offset, u64 *len,
331 unsigned int which, int num_ops, 331 unsigned int which, int num_ops,
332 int opcode, int flags, 332 int opcode, int flags,
333 struct ceph_snap_context *snapc, 333 struct ceph_snap_context *snapc,
334 u32 truncate_seq, u64 truncate_size, 334 u32 truncate_seq, u64 truncate_size,
335 bool use_mempool); 335 bool use_mempool);
336 336
337 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, 337 extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
338 struct ceph_osd_request *req); 338 struct ceph_osd_request *req);
339 339
340 extern void ceph_osdc_get_request(struct ceph_osd_request *req); 340 extern void ceph_osdc_get_request(struct ceph_osd_request *req);
341 extern void ceph_osdc_put_request(struct ceph_osd_request *req); 341 extern void ceph_osdc_put_request(struct ceph_osd_request *req);
342 342
343 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, 343 extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
344 struct ceph_osd_request *req, 344 struct ceph_osd_request *req,
345 bool nofail); 345 bool nofail);
346 extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); 346 extern void ceph_osdc_cancel_request(struct ceph_osd_request *req);
347 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 347 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
348 struct ceph_osd_request *req); 348 struct ceph_osd_request *req);
349 extern void ceph_osdc_sync(struct ceph_osd_client *osdc); 349 extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
350 350
351 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc); 351 extern void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc);
352 352
353 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc, 353 extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
354 struct ceph_vino vino, 354 struct ceph_vino vino,
355 struct ceph_file_layout *layout, 355 struct ceph_file_layout *layout,
356 u64 off, u64 *plen, 356 u64 off, u64 *plen,
357 u32 truncate_seq, u64 truncate_size, 357 u32 truncate_seq, u64 truncate_size,
358 struct page **pages, int nr_pages, 358 struct page **pages, int nr_pages,
359 int page_align); 359 int page_align);
360 360
361 extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, 361 extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
362 struct ceph_vino vino, 362 struct ceph_vino vino,
363 struct ceph_file_layout *layout, 363 struct ceph_file_layout *layout,
364 struct ceph_snap_context *sc, 364 struct ceph_snap_context *sc,
365 u64 off, u64 len, 365 u64 off, u64 len,
366 u32 truncate_seq, u64 truncate_size, 366 u32 truncate_seq, u64 truncate_size,
367 struct timespec *mtime, 367 struct timespec *mtime,
368 struct page **pages, int nr_pages); 368 struct page **pages, int nr_pages);
369 369
370 /* watch/notify events */ 370 /* watch/notify events */
371 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, 371 extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
372 void (*event_cb)(u64, u64, u8, void *), 372 void (*event_cb)(u64, u64, u8, void *),
373 void *data, struct ceph_osd_event **pevent); 373 void *data, struct ceph_osd_event **pevent);
374 extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); 374 extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
375 extern void ceph_osdc_put_event(struct ceph_osd_event *event); 375 extern void ceph_osdc_put_event(struct ceph_osd_event *event);
376 #endif 376 #endif
377 377
378 378
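Editor's note: the declarations above trace the OSD request life cycle: allocate or build a request, submit it, wait (or cancel), then drop the reference. A minimal sketch of a synchronous caller, assuming only the signatures declared above; the opcode and flag constants and the zeroed snap/truncate parameters are illustrative, not part of this commit:

/* Illustrative only: a minimal synchronous read using the API above. */
static int example_sync_read(struct ceph_osd_client *osdc,
			     struct ceph_file_layout *layout,
			     struct ceph_vino vino, u64 off, u64 len)
{
	struct ceph_osd_request *req;
	int ret;

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
				    0, 1, CEPH_OSD_OP_READ,
				    CEPH_OSD_FLAG_READ, NULL, 0, 0, false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	/* a real caller would attach data pages here before finalizing */
	ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);

	ret = ceph_osdc_start_request(osdc, req, false);
	if (!ret)
		ret = ceph_osdc_wait_request(osdc, req);
	ceph_osdc_put_request(req);	/* drop our reference */
	return ret;
}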
net/ceph/auth_x.c
  1                                                                   1
2 #include <linux/ceph/ceph_debug.h> 2 #include <linux/ceph/ceph_debug.h>
3 3
4 #include <linux/err.h> 4 #include <linux/err.h>
5 #include <linux/module.h> 5 #include <linux/module.h>
6 #include <linux/random.h> 6 #include <linux/random.h>
7 #include <linux/slab.h> 7 #include <linux/slab.h>
8 8
9 #include <linux/ceph/decode.h> 9 #include <linux/ceph/decode.h>
10 #include <linux/ceph/auth.h> 10 #include <linux/ceph/auth.h>
11 #include <linux/ceph/messenger.h> 11 #include <linux/ceph/messenger.h>
12 12
13 #include "crypto.h" 13 #include "crypto.h"
14 #include "auth_x.h" 14 #include "auth_x.h"
15 #include "auth_x_protocol.h" 15 #include "auth_x_protocol.h"
16 16
17 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); 17 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
18 18
19 static int ceph_x_is_authenticated(struct ceph_auth_client *ac) 19 static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
20 { 20 {
21 struct ceph_x_info *xi = ac->private; 21 struct ceph_x_info *xi = ac->private;
22 int need; 22 int need;
23 23
24 ceph_x_validate_tickets(ac, &need); 24 ceph_x_validate_tickets(ac, &need);
25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", 25 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
26 ac->want_keys, need, xi->have_keys); 26 ac->want_keys, need, xi->have_keys);
27 return (ac->want_keys & xi->have_keys) == ac->want_keys; 27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28 } 28 }
29 29
30 static int ceph_x_should_authenticate(struct ceph_auth_client *ac) 30 static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31 { 31 {
32 struct ceph_x_info *xi = ac->private; 32 struct ceph_x_info *xi = ac->private;
33 int need; 33 int need;
34 34
35 ceph_x_validate_tickets(ac, &need); 35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", 36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys); 37 ac->want_keys, need, xi->have_keys);
38 return need != 0; 38 return need != 0;
39 } 39 }
40 40
41 static int ceph_x_encrypt_buflen(int ilen) 41 static int ceph_x_encrypt_buflen(int ilen)
42 { 42 {
43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + 43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
44 sizeof(u32); 44 sizeof(u32);
45 } 45 }
46 46
47 static int ceph_x_encrypt(struct ceph_crypto_key *secret, 47 static int ceph_x_encrypt(struct ceph_crypto_key *secret,
48 void *ibuf, int ilen, void *obuf, size_t olen) 48 void *ibuf, int ilen, void *obuf, size_t olen)
49 { 49 {
50 struct ceph_x_encrypt_header head = { 50 struct ceph_x_encrypt_header head = {
51 .struct_v = 1, 51 .struct_v = 1,
52 .magic = cpu_to_le64(CEPHX_ENC_MAGIC) 52 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
53 }; 53 };
54 size_t len = olen - sizeof(u32); 54 size_t len = olen - sizeof(u32);
55 int ret; 55 int ret;
56 56
57 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, 57 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
58 &head, sizeof(head), ibuf, ilen); 58 &head, sizeof(head), ibuf, ilen);
59 if (ret) 59 if (ret)
60 return ret; 60 return ret;
61 ceph_encode_32(&obuf, len); 61 ceph_encode_32(&obuf, len);
62 return len + sizeof(u32); 62 return len + sizeof(u32);
63 } 63 }
64 64
65 static int ceph_x_decrypt(struct ceph_crypto_key *secret, 65 static int ceph_x_decrypt(struct ceph_crypto_key *secret,
66 void **p, void *end, void **obuf, size_t olen) 66 void **p, void *end, void **obuf, size_t olen)
67 { 67 {
68 struct ceph_x_encrypt_header head; 68 struct ceph_x_encrypt_header head;
69 size_t head_len = sizeof(head); 69 size_t head_len = sizeof(head);
70 int len, ret; 70 int len, ret;
71 71
72 len = ceph_decode_32(p); 72 len = ceph_decode_32(p);
73 if (*p + len > end) 73 if (*p + len > end)
74 return -EINVAL; 74 return -EINVAL;
75 75
76 dout("ceph_x_decrypt len %d\n", len); 76 dout("ceph_x_decrypt len %d\n", len);
77 if (*obuf == NULL) { 77 if (*obuf == NULL) {
78 *obuf = kmalloc(len, GFP_NOFS); 78 *obuf = kmalloc(len, GFP_NOFS);
79 if (!*obuf) 79 if (!*obuf)
80 return -ENOMEM; 80 return -ENOMEM;
81 olen = len; 81 olen = len;
82 } 82 }
83 83
84 ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len); 84 ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len);
85 if (ret) 85 if (ret)
86 return ret; 86 return ret;
87 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) 87 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
88 return -EPERM; 88 return -EPERM;
89 *p += len; 89 *p += len;
90 return olen; 90 return olen;
91 } 91 }
92 92
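In short, ceph_x_encrypt() frames its output as a little-endian u32 length followed by the ciphertext of a fixed header plus the payload, and ceph_x_decrypt() strips the same framing, rejecting anything whose decrypted header lacks struct_v == 1 and CEPHX_ENC_MAGIC. A sketch of the layout, with the buflen arithmetic spelled out (the 16 is padding slack; the exact pad depends on the cipher, so treat the numbers as illustrative):

/*
 * Framing produced by ceph_x_encrypt() (little-endian on the wire):
 *
 *   u32 len                           ciphertext length, excludes itself
 *   [ encrypted: u8     struct_v = 1
 *                __le64 magic = CEPHX_ENC_MAGIC
 *                payload (ilen bytes) ]
 *
 * ceph_x_encrypt_buflen(ilen) reserves
 *     sizeof(struct ceph_x_encrypt_header)   header (struct_v + magic)
 *   + ilen                                   payload
 *   + 16                                     cipher padding slack
 *   + sizeof(u32)                            length prefix
 */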
93 /* 93 /*
94 * get existing (or insert new) ticket handler 94 * get existing (or insert new) ticket handler
95 */ 95 */
96 static struct ceph_x_ticket_handler * 96 static struct ceph_x_ticket_handler *
97 get_ticket_handler(struct ceph_auth_client *ac, int service) 97 get_ticket_handler(struct ceph_auth_client *ac, int service)
98 { 98 {
99 struct ceph_x_ticket_handler *th; 99 struct ceph_x_ticket_handler *th;
100 struct ceph_x_info *xi = ac->private; 100 struct ceph_x_info *xi = ac->private;
101 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node; 101 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
102 102
103 while (*p) { 103 while (*p) {
104 parent = *p; 104 parent = *p;
105 th = rb_entry(parent, struct ceph_x_ticket_handler, node); 105 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
106 if (service < th->service) 106 if (service < th->service)
107 p = &(*p)->rb_left; 107 p = &(*p)->rb_left;
108 else if (service > th->service) 108 else if (service > th->service)
109 p = &(*p)->rb_right; 109 p = &(*p)->rb_right;
110 else 110 else
111 return th; 111 return th;
112 } 112 }
113 113
114 /* add it */ 114 /* add it */
115 th = kzalloc(sizeof(*th), GFP_NOFS); 115 th = kzalloc(sizeof(*th), GFP_NOFS);
116 if (!th) 116 if (!th)
117 return ERR_PTR(-ENOMEM); 117 return ERR_PTR(-ENOMEM);
118 th->service = service; 118 th->service = service;
119 rb_link_node(&th->node, parent, p); 119 rb_link_node(&th->node, parent, p);
120 rb_insert_color(&th->node, &xi->ticket_handlers); 120 rb_insert_color(&th->node, &xi->ticket_handlers);
121 return th; 121 return th;
122 } 122 }
123 123
124 static void remove_ticket_handler(struct ceph_auth_client *ac, 124 static void remove_ticket_handler(struct ceph_auth_client *ac,
125 struct ceph_x_ticket_handler *th) 125 struct ceph_x_ticket_handler *th)
126 { 126 {
127 struct ceph_x_info *xi = ac->private; 127 struct ceph_x_info *xi = ac->private;
128 128
129 dout("remove_ticket_handler %p %d\n", th, th->service); 129 dout("remove_ticket_handler %p %d\n", th, th->service);
130 rb_erase(&th->node, &xi->ticket_handlers); 130 rb_erase(&th->node, &xi->ticket_handlers);
131 ceph_crypto_key_destroy(&th->session_key); 131 ceph_crypto_key_destroy(&th->session_key);
132 if (th->ticket_blob) 132 if (th->ticket_blob)
133 ceph_buffer_put(th->ticket_blob); 133 ceph_buffer_put(th->ticket_blob);
134 kfree(th); 134 kfree(th);
135 } 135 }
136 136
137 static int process_one_ticket(struct ceph_auth_client *ac, 137 static int process_one_ticket(struct ceph_auth_client *ac,
138 struct ceph_crypto_key *secret, 138 struct ceph_crypto_key *secret,
139 void **p, void *end) 139 void **p, void *end)
140 { 140 {
141 struct ceph_x_info *xi = ac->private; 141 struct ceph_x_info *xi = ac->private;
142 int type; 142 int type;
143 u8 tkt_struct_v, blob_struct_v; 143 u8 tkt_struct_v, blob_struct_v;
144 struct ceph_x_ticket_handler *th; 144 struct ceph_x_ticket_handler *th;
145 void *dbuf = NULL; 145 void *dbuf = NULL;
146 void *dp, *dend; 146 void *dp, *dend;
147 int dlen; 147 int dlen;
148 char is_enc; 148 char is_enc;
149 struct timespec validity; 149 struct timespec validity;
150 struct ceph_crypto_key old_key; 150 struct ceph_crypto_key old_key;
151 void *ticket_buf = NULL; 151 void *ticket_buf = NULL;
152 void *tp, *tpend; 152 void *tp, *tpend;
153 void **ptp; 153 void **ptp;
154 struct ceph_timespec new_validity; 154 struct ceph_timespec new_validity;
155 struct ceph_crypto_key new_session_key; 155 struct ceph_crypto_key new_session_key;
156 struct ceph_buffer *new_ticket_blob; 156 struct ceph_buffer *new_ticket_blob;
157 unsigned long new_expires, new_renew_after; 157 unsigned long new_expires, new_renew_after;
158 u64 new_secret_id; 158 u64 new_secret_id;
159 int ret; 159 int ret;
160 160
161 ceph_decode_need(p, end, sizeof(u32) + 1, bad); 161 ceph_decode_need(p, end, sizeof(u32) + 1, bad);
162 162
163 type = ceph_decode_32(p); 163 type = ceph_decode_32(p);
164 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); 164 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
165 165
166 tkt_struct_v = ceph_decode_8(p); 166 tkt_struct_v = ceph_decode_8(p);
167 if (tkt_struct_v != 1) 167 if (tkt_struct_v != 1)
168 goto bad; 168 goto bad;
169 169
170 th = get_ticket_handler(ac, type); 170 th = get_ticket_handler(ac, type);
171 if (IS_ERR(th)) { 171 if (IS_ERR(th)) {
172 ret = PTR_ERR(th); 172 ret = PTR_ERR(th);
173 goto out; 173 goto out;
174 } 174 }
175 175
176 /* blob for me */ 176 /* blob for me */
177 dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0); 177 dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0);
178 if (dlen <= 0) { 178 if (dlen <= 0) {
179 ret = dlen; 179 ret = dlen;
180 goto out; 180 goto out;
181 } 181 }
182 dout(" decrypted %d bytes\n", dlen); 182 dout(" decrypted %d bytes\n", dlen);
183 dp = dbuf; 183 dp = dbuf;
184 dend = dp + dlen; 184 dend = dp + dlen;
185 185
186 tkt_struct_v = ceph_decode_8(&dp); 186 tkt_struct_v = ceph_decode_8(&dp);
187 if (tkt_struct_v != 1) 187 if (tkt_struct_v != 1)
188 goto bad; 188 goto bad;
189 189
190 memcpy(&old_key, &th->session_key, sizeof(old_key)); 190 memcpy(&old_key, &th->session_key, sizeof(old_key));
191 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); 191 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
192 if (ret) 192 if (ret)
193 goto out; 193 goto out;
194 194
195 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); 195 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
196 ceph_decode_timespec(&validity, &new_validity); 196 ceph_decode_timespec(&validity, &new_validity);
197 new_expires = get_seconds() + validity.tv_sec; 197 new_expires = get_seconds() + validity.tv_sec;
198 new_renew_after = new_expires - (validity.tv_sec / 4); 198 new_renew_after = new_expires - (validity.tv_sec / 4);
199 dout(" expires=%lu renew_after=%lu\n", new_expires, 199 dout(" expires=%lu renew_after=%lu\n", new_expires,
200 new_renew_after); 200 new_renew_after);
201 201
202 /* ticket blob for service */ 202 /* ticket blob for service */
203 ceph_decode_8_safe(p, end, is_enc, bad); 203 ceph_decode_8_safe(p, end, is_enc, bad);
204 if (is_enc) { 204 if (is_enc) {
205 /* encrypted */ 205 /* encrypted */
206 dout(" encrypted ticket\n"); 206 dout(" encrypted ticket\n");
207 dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0); 207 dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0);
208 if (dlen < 0) { 208 if (dlen < 0) {
209 ret = dlen; 209 ret = dlen;
210 goto out; 210 goto out;
211 } 211 }
212 tp = ticket_buf; 212 tp = ticket_buf;
213 ptp = &tp; 213 ptp = &tp;
214 tpend = *ptp + dlen; 214 tpend = *ptp + dlen;
215 } else { 215 } else {
216 /* unencrypted */ 216 /* unencrypted */
217 ptp = p; 217 ptp = p;
218 tpend = end; 218 tpend = end;
219 } 219 }
220 ceph_decode_32_safe(ptp, tpend, dlen, bad); 220 ceph_decode_32_safe(ptp, tpend, dlen, bad);
221 dout(" ticket blob is %d bytes\n", dlen); 221 dout(" ticket blob is %d bytes\n", dlen);
222 ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad); 222 ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad);
223 blob_struct_v = ceph_decode_8(ptp); 223 blob_struct_v = ceph_decode_8(ptp);
224 new_secret_id = ceph_decode_64(ptp); 224 new_secret_id = ceph_decode_64(ptp);
225 ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend); 225 ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend);
226 if (ret) 226 if (ret)
227 goto out; 227 goto out;
228 228
229 /* all is well, update our ticket */ 229 /* all is well, update our ticket */
230 ceph_crypto_key_destroy(&th->session_key); 230 ceph_crypto_key_destroy(&th->session_key);
231 if (th->ticket_blob) 231 if (th->ticket_blob)
232 ceph_buffer_put(th->ticket_blob); 232 ceph_buffer_put(th->ticket_blob);
233 th->session_key = new_session_key; 233 th->session_key = new_session_key;
234 th->ticket_blob = new_ticket_blob; 234 th->ticket_blob = new_ticket_blob;
235 th->validity = new_validity; 235 th->validity = new_validity;
236 th->secret_id = new_secret_id; 236 th->secret_id = new_secret_id;
237 th->expires = new_expires; 237 th->expires = new_expires;
238 th->renew_after = new_renew_after; 238 th->renew_after = new_renew_after;
239 dout(" got ticket service %d (%s) secret_id %lld len %d\n", 239 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
240 type, ceph_entity_type_name(type), th->secret_id, 240 type, ceph_entity_type_name(type), th->secret_id,
241 (int)th->ticket_blob->vec.iov_len); 241 (int)th->ticket_blob->vec.iov_len);
242 xi->have_keys |= th->service; 242 xi->have_keys |= th->service;
243 243
244 out: 244 out:
245 kfree(ticket_buf); 245 kfree(ticket_buf);
246 kfree(dbuf); 246 kfree(dbuf);
247 return ret; 247 return ret;
248 248
249 bad: 249 bad:
250 ret = -EINVAL; 250 ret = -EINVAL;
251 goto out; 251 goto out;
252 } 252 }
253 253
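process_one_ticket() above parses one entry of the monitor's ticket reply. Its shape, reconstructed from the decode calls only (field order as decoded; this is a reading aid, not a spec):

/*
 * u32 type                  service this ticket is for
 * u8  struct_v = 1
 * [ encrypted with our secret:
 *     u8  struct_v = 1
 *     session key            (ceph_crypto_key encoding)
 *     ceph_timespec validity ]
 * u8  is_enc                 is the ticket blob encrypted?
 * [ if is_enc, the rest is decrypted with the *old* session key:
 *     u32 blob_len
 *     u8  blob_struct_v
 *     u64 secret_id
 *     ticket blob bytes ]
 */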
254 static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, 254 static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
255 struct ceph_crypto_key *secret, 255 struct ceph_crypto_key *secret,
256 void *buf, void *end) 256 void *buf, void *end)
257 { 257 {
258 void *p = buf; 258 void *p = buf;
259 u8 reply_struct_v; 259 u8 reply_struct_v;
260 u32 num; 260 u32 num;
261 int ret; 261 int ret;
262 262
263 ceph_decode_8_safe(&p, end, reply_struct_v, bad); 263 ceph_decode_8_safe(&p, end, reply_struct_v, bad);
264 if (reply_struct_v != 1) 264 if (reply_struct_v != 1)
265 return -EINVAL; 265 return -EINVAL;
266 266
267 ceph_decode_32_safe(&p, end, num, bad); 267 ceph_decode_32_safe(&p, end, num, bad);
268 dout("%d tickets\n", num); 268 dout("%d tickets\n", num);
269 269
270 while (num--) { 270 while (num--) {
271 ret = process_one_ticket(ac, secret, &p, end); 271 ret = process_one_ticket(ac, secret, &p, end);
272 if (ret) 272 if (ret)
273 return ret; 273 return ret;
274 } 274 }
275 275
276 return 0; 276 return 0;
277 277
278 bad: 278 bad:
279 return -EINVAL; 279 return -EINVAL;
280 } 280 }
281 281
282 static int ceph_x_build_authorizer(struct ceph_auth_client *ac, 282 static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
283 struct ceph_x_ticket_handler *th, 283 struct ceph_x_ticket_handler *th,
284 struct ceph_x_authorizer *au) 284 struct ceph_x_authorizer *au)
285 { 285 {
286 int maxlen; 286 int maxlen;
287 struct ceph_x_authorize_a *msg_a; 287 struct ceph_x_authorize_a *msg_a;
288 struct ceph_x_authorize_b msg_b; 288 struct ceph_x_authorize_b msg_b;
289 void *p, *end; 289 void *p, *end;
290 int ret; 290 int ret;
291 int ticket_blob_len = 291 int ticket_blob_len =
292 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0); 292 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
293 293
294 dout("build_authorizer for %s %p\n", 294 dout("build_authorizer for %s %p\n",
295 ceph_entity_type_name(th->service), au); 295 ceph_entity_type_name(th->service), au);
296 296
297 ceph_crypto_key_destroy(&au->session_key); 297 ceph_crypto_key_destroy(&au->session_key);
298 ret = ceph_crypto_key_clone(&au->session_key, &th->session_key); 298 ret = ceph_crypto_key_clone(&au->session_key, &th->session_key);
299 if (ret) 299 if (ret)
300 return ret; 300 return ret;
301 301
302 maxlen = sizeof(*msg_a) + sizeof(msg_b) + 302 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
303 ceph_x_encrypt_buflen(ticket_blob_len); 303 ceph_x_encrypt_buflen(ticket_blob_len);
304 dout(" need len %d\n", maxlen); 304 dout(" need len %d\n", maxlen);
305 if (au->buf && au->buf->alloc_len < maxlen) { 305 if (au->buf && au->buf->alloc_len < maxlen) {
306 ceph_buffer_put(au->buf); 306 ceph_buffer_put(au->buf);
307 au->buf = NULL; 307 au->buf = NULL;
308 } 308 }
309 if (!au->buf) { 309 if (!au->buf) {
310 au->buf = ceph_buffer_new(maxlen, GFP_NOFS); 310 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
311 if (!au->buf) { 311 if (!au->buf) {
312 ceph_crypto_key_destroy(&au->session_key); 312 ceph_crypto_key_destroy(&au->session_key);
313 return -ENOMEM; 313 return -ENOMEM;
314 } 314 }
315 } 315 }
316 au->service = th->service; 316 au->service = th->service;
317 au->secret_id = th->secret_id; 317 au->secret_id = th->secret_id;
318 318
319 msg_a = au->buf->vec.iov_base; 319 msg_a = au->buf->vec.iov_base;
320 msg_a->struct_v = 1; 320 msg_a->struct_v = 1;
321 msg_a->global_id = cpu_to_le64(ac->global_id); 321 msg_a->global_id = cpu_to_le64(ac->global_id);
322 msg_a->service_id = cpu_to_le32(th->service); 322 msg_a->service_id = cpu_to_le32(th->service);
323 msg_a->ticket_blob.struct_v = 1; 323 msg_a->ticket_blob.struct_v = 1;
324 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id); 324 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
325 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len); 325 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
326 if (ticket_blob_len) { 326 if (ticket_blob_len) {
327 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base, 327 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
328 th->ticket_blob->vec.iov_len); 328 th->ticket_blob->vec.iov_len);
329 } 329 }
330 dout(" th %p secret_id %lld %lld\n", th, th->secret_id, 330 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
331 le64_to_cpu(msg_a->ticket_blob.secret_id)); 331 le64_to_cpu(msg_a->ticket_blob.secret_id));
332 332
333 p = msg_a + 1; 333 p = msg_a + 1;
334 p += ticket_blob_len; 334 p += ticket_blob_len;
335 end = au->buf->vec.iov_base + au->buf->vec.iov_len; 335 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
336 336
337 get_random_bytes(&au->nonce, sizeof(au->nonce)); 337 get_random_bytes(&au->nonce, sizeof(au->nonce));
338 msg_b.struct_v = 1; 338 msg_b.struct_v = 1;
339 msg_b.nonce = cpu_to_le64(au->nonce); 339 msg_b.nonce = cpu_to_le64(au->nonce);
340 ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b), 340 ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
341 p, end - p); 341 p, end - p);
342 if (ret < 0) 342 if (ret < 0)
343 goto out_buf; 343 goto out_buf;
344 p += ret; 344 p += ret;
345 au->buf->vec.iov_len = p - au->buf->vec.iov_base; 345 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
346 dout(" built authorizer nonce %llx len %d\n", au->nonce, 346 dout(" built authorizer nonce %llx len %d\n", au->nonce,
347 (int)au->buf->vec.iov_len); 347 (int)au->buf->vec.iov_len);
348 BUG_ON(au->buf->vec.iov_len > maxlen); 348 BUG_ON(au->buf->vec.iov_len > maxlen);
349 return 0; 349 return 0;
350 350
351 out_buf: 351 out_buf:
352 ceph_buffer_put(au->buf); 352 ceph_buffer_put(au->buf);
353 au->buf = NULL; 353 au->buf = NULL;
354 return ret; 354 return ret;
355 } 355 }
356 356
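The authorizer buffer assembled above is two parts: a cleartext msg_a carrying our identity and current ticket blob, and a msg_b nonce encrypted with the session key. A sketch of the layout, with the challenge-response it sets up:

/*
 * Authorizer layout:
 *   struct ceph_x_authorize_a    struct_v, global_id, service_id,
 *                                ticket_blob (secret_id + blob bytes)
 *   ceph_x_encrypt( msg_b )      struct_v, random 64-bit nonce
 *
 * The peer decrypts msg_b with the shared session key and proves
 * possession of it by replying with nonce + 1, which is checked in
 * ceph_x_verify_authorizer_reply() below.
 */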
357 static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th, 357 static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
358 void **p, void *end) 358 void **p, void *end)
359 { 359 {
360 ceph_decode_need(p, end, 1 + sizeof(u64), bad); 360 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
361 ceph_encode_8(p, 1); 361 ceph_encode_8(p, 1);
362 ceph_encode_64(p, th->secret_id); 362 ceph_encode_64(p, th->secret_id);
363 if (th->ticket_blob) { 363 if (th->ticket_blob) {
364 const char *buf = th->ticket_blob->vec.iov_base; 364 const char *buf = th->ticket_blob->vec.iov_base;
365 u32 len = th->ticket_blob->vec.iov_len; 365 u32 len = th->ticket_blob->vec.iov_len;
366 366
367 ceph_encode_32_safe(p, end, len, bad); 367 ceph_encode_32_safe(p, end, len, bad);
368 ceph_encode_copy_safe(p, end, buf, len, bad); 368 ceph_encode_copy_safe(p, end, buf, len, bad);
369 } else { 369 } else {
370 ceph_encode_32_safe(p, end, 0, bad); 370 ceph_encode_32_safe(p, end, 0, bad);
371 } 371 }
372 372
373 return 0; 373 return 0;
374 bad: 374 bad:
375 return -ERANGE; 375 return -ERANGE;
376 } 376 }
377 377
378 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) 378 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
379 { 379 {
380 int want = ac->want_keys; 380 int want = ac->want_keys;
381 struct ceph_x_info *xi = ac->private; 381 struct ceph_x_info *xi = ac->private;
382 int service; 382 int service;
383 383
384 *pneed = ac->want_keys & ~(xi->have_keys); 384 *pneed = ac->want_keys & ~(xi->have_keys);
385 385
386 for (service = 1; service <= want; service <<= 1) { 386 for (service = 1; service <= want; service <<= 1) {
387 struct ceph_x_ticket_handler *th; 387 struct ceph_x_ticket_handler *th;
388 388
389 if (!(ac->want_keys & service)) 389 if (!(ac->want_keys & service))
390 continue; 390 continue;
391 391
392 if (*pneed & service) 392 if (*pneed & service)
393 continue; 393 continue;
394 394
395 th = get_ticket_handler(ac, service); 395 th = get_ticket_handler(ac, service);
396 396
397 if (IS_ERR(th)) { 397 if (IS_ERR(th)) {
398 *pneed |= service; 398 *pneed |= service;
399 continue; 399 continue;
400 } 400 }
401 401
402 if (get_seconds() >= th->renew_after) 402 if (get_seconds() >= th->renew_after)
403 *pneed |= service; 403 *pneed |= service;
404 if (get_seconds() >= th->expires) 404 if (get_seconds() >= th->expires)
405 xi->have_keys &= ~service; 405 xi->have_keys &= ~service;
406 } 406 }
407 } 407 }
408 408
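The want/have/need bookkeeping above is plain bit arithmetic over the CEPH_ENTITY_TYPE_* service bits. A worked example, assuming the usual bit values from the shared ceph headers (MON 0x01, MDS 0x02, OSD 0x04):

/*
 * want_keys = MON|MDS|OSD    = 0x07
 * have_keys = MON|OSD        = 0x05
 * need      = want & ~have   = 0x02   (MDS ticket still missing)
 *
 * The loop then walks service = 1, 2, 4, ... up to want, setting a
 * need bit once a ticket is past renew_after and clearing a have bit
 * once it is past expires.
 */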
409 409
410 static int ceph_x_build_request(struct ceph_auth_client *ac, 410 static int ceph_x_build_request(struct ceph_auth_client *ac,
411 void *buf, void *end) 411 void *buf, void *end)
412 { 412 {
413 struct ceph_x_info *xi = ac->private; 413 struct ceph_x_info *xi = ac->private;
414 int need; 414 int need;
415 struct ceph_x_request_header *head = buf; 415 struct ceph_x_request_header *head = buf;
416 int ret; 416 int ret;
417 struct ceph_x_ticket_handler *th = 417 struct ceph_x_ticket_handler *th =
418 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 418 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
419 419
420 if (IS_ERR(th)) 420 if (IS_ERR(th))
421 return PTR_ERR(th); 421 return PTR_ERR(th);
422 422
423 ceph_x_validate_tickets(ac, &need); 423 ceph_x_validate_tickets(ac, &need);
424 424
425 dout("build_request want %x have %x need %x\n", 425 dout("build_request want %x have %x need %x\n",
426 ac->want_keys, xi->have_keys, need); 426 ac->want_keys, xi->have_keys, need);
427 427
428 if (need & CEPH_ENTITY_TYPE_AUTH) { 428 if (need & CEPH_ENTITY_TYPE_AUTH) {
429 struct ceph_x_authenticate *auth = (void *)(head + 1); 429 struct ceph_x_authenticate *auth = (void *)(head + 1);
430 void *p = auth + 1; 430 void *p = auth + 1;
431 struct ceph_x_challenge_blob tmp; 431 struct ceph_x_challenge_blob tmp;
432 char tmp_enc[40]; 432 char tmp_enc[40];
433 u64 *u; 433 u64 *u;
434 434
435 if (p > end) 435 if (p > end)
436 return -ERANGE; 436 return -ERANGE;
437 437
438 dout(" get_auth_session_key\n"); 438 dout(" get_auth_session_key\n");
439 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY); 439 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
440 440
441 /* encrypt and hash */ 441 /* encrypt and hash */
442 get_random_bytes(&auth->client_challenge, sizeof(u64)); 442 get_random_bytes(&auth->client_challenge, sizeof(u64));
443 tmp.client_challenge = auth->client_challenge; 443 tmp.client_challenge = auth->client_challenge;
444 tmp.server_challenge = cpu_to_le64(xi->server_challenge); 444 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
445 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp), 445 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
446 tmp_enc, sizeof(tmp_enc)); 446 tmp_enc, sizeof(tmp_enc));
447 if (ret < 0) 447 if (ret < 0)
448 return ret; 448 return ret;
449 449
450 auth->struct_v = 1; 450 auth->struct_v = 1;
451 auth->key = 0; 451 auth->key = 0;
452 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) 452 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
453 auth->key ^= *(__le64 *)u; 453 auth->key ^= *(__le64 *)u;
454 dout(" server_challenge %llx client_challenge %llx key %llx\n", 454 dout(" server_challenge %llx client_challenge %llx key %llx\n",
455 xi->server_challenge, le64_to_cpu(auth->client_challenge), 455 xi->server_challenge, le64_to_cpu(auth->client_challenge),
456 le64_to_cpu(auth->key)); 456 le64_to_cpu(auth->key));
457 457
458 		/* now encode the old ticket if it exists */               458 		/* now encode the old ticket if it exists */
459 ret = ceph_x_encode_ticket(th, &p, end); 459 ret = ceph_x_encode_ticket(th, &p, end);
460 if (ret < 0) 460 if (ret < 0)
461 return ret; 461 return ret;
462 462
463 return p - buf; 463 return p - buf;
464 } 464 }
465 465
466 if (need) { 466 if (need) {
467 void *p = head + 1; 467 void *p = head + 1;
468 struct ceph_x_service_ticket_request *req; 468 struct ceph_x_service_ticket_request *req;
469 469
470 if (p > end) 470 if (p > end)
471 return -ERANGE; 471 return -ERANGE;
472 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); 472 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
473 473
474 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); 474 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
475 if (ret) 475 if (ret)
476 return ret; 476 return ret;
477 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, 477 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
478 xi->auth_authorizer.buf->vec.iov_len); 478 xi->auth_authorizer.buf->vec.iov_len);
479 479
480 req = p; 480 req = p;
481 req->keys = cpu_to_le32(need); 481 req->keys = cpu_to_le32(need);
482 p += sizeof(*req); 482 p += sizeof(*req);
483 return p - buf; 483 return p - buf;
484 } 484 }
485 485
486 return 0; 486 return 0;
487 } 487 }
488 488
489 static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, 489 static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
490 void *buf, void *end) 490 void *buf, void *end)
491 { 491 {
492 struct ceph_x_info *xi = ac->private; 492 struct ceph_x_info *xi = ac->private;
493 struct ceph_x_reply_header *head = buf; 493 struct ceph_x_reply_header *head = buf;
494 struct ceph_x_ticket_handler *th; 494 struct ceph_x_ticket_handler *th;
495 int len = end - buf; 495 int len = end - buf;
496 int op; 496 int op;
497 int ret; 497 int ret;
498 498
499 if (result) 499 if (result)
500 return result; /* XXX hmm? */ 500 return result; /* XXX hmm? */
501 501
502 if (xi->starting) { 502 if (xi->starting) {
503 /* it's a hello */ 503 /* it's a hello */
504 struct ceph_x_server_challenge *sc = buf; 504 struct ceph_x_server_challenge *sc = buf;
505 505
506 if (len != sizeof(*sc)) 506 if (len != sizeof(*sc))
507 return -EINVAL; 507 return -EINVAL;
508 xi->server_challenge = le64_to_cpu(sc->server_challenge); 508 xi->server_challenge = le64_to_cpu(sc->server_challenge);
509 dout("handle_reply got server challenge %llx\n", 509 dout("handle_reply got server challenge %llx\n",
510 xi->server_challenge); 510 xi->server_challenge);
511 xi->starting = false; 511 xi->starting = false;
512 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH; 512 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
513 return -EAGAIN; 513 return -EAGAIN;
514 } 514 }
515 515
516 op = le16_to_cpu(head->op); 516 op = le16_to_cpu(head->op);
517 result = le32_to_cpu(head->result); 517 result = le32_to_cpu(head->result);
518 dout("handle_reply op %d result %d\n", op, result); 518 dout("handle_reply op %d result %d\n", op, result);
519 switch (op) { 519 switch (op) {
520 case CEPHX_GET_AUTH_SESSION_KEY: 520 case CEPHX_GET_AUTH_SESSION_KEY:
521 /* verify auth key */ 521 /* verify auth key */
522 ret = ceph_x_proc_ticket_reply(ac, &xi->secret, 522 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
523 buf + sizeof(*head), end); 523 buf + sizeof(*head), end);
524 break; 524 break;
525 525
526 case CEPHX_GET_PRINCIPAL_SESSION_KEY: 526 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
527 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); 527 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
528 if (IS_ERR(th)) 528 if (IS_ERR(th))
529 return PTR_ERR(th); 529 return PTR_ERR(th);
530 ret = ceph_x_proc_ticket_reply(ac, &th->session_key, 530 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
531 buf + sizeof(*head), end); 531 buf + sizeof(*head), end);
532 break; 532 break;
533 533
534 default: 534 default:
535 return -EINVAL; 535 return -EINVAL;
536 } 536 }
537 if (ret) 537 if (ret)
538 return ret; 538 return ret;
539 if (ac->want_keys == xi->have_keys) 539 if (ac->want_keys == xi->have_keys)
540 return 0; 540 return 0;
541 return -EAGAIN; 541 return -EAGAIN;
542 } 542 }
543 543
544 static int ceph_x_create_authorizer( 544 static int ceph_x_create_authorizer(
545 struct ceph_auth_client *ac, int peer_type, 545 struct ceph_auth_client *ac, int peer_type,
546 struct ceph_auth_handshake *auth) 546 struct ceph_auth_handshake *auth)
547 { 547 {
548 struct ceph_x_authorizer *au; 548 struct ceph_x_authorizer *au;
549 struct ceph_x_ticket_handler *th; 549 struct ceph_x_ticket_handler *th;
550 int ret; 550 int ret;
551 551
552 th = get_ticket_handler(ac, peer_type); 552 th = get_ticket_handler(ac, peer_type);
553 if (IS_ERR(th)) 553 if (IS_ERR(th))
554 return PTR_ERR(th); 554 return PTR_ERR(th);
555 555
556 au = kzalloc(sizeof(*au), GFP_NOFS); 556 au = kzalloc(sizeof(*au), GFP_NOFS);
557 if (!au) 557 if (!au)
558 return -ENOMEM; 558 return -ENOMEM;
559 559
560 ret = ceph_x_build_authorizer(ac, th, au); 560 ret = ceph_x_build_authorizer(ac, th, au);
561 if (ret) { 561 if (ret) {
562 kfree(au); 562 kfree(au);
563 return ret; 563 return ret;
564 } 564 }
565 565
566 auth->authorizer = (struct ceph_authorizer *) au; 566 auth->authorizer = (struct ceph_authorizer *) au;
567 auth->authorizer_buf = au->buf->vec.iov_base; 567 auth->authorizer_buf = au->buf->vec.iov_base;
568 auth->authorizer_buf_len = au->buf->vec.iov_len; 568 auth->authorizer_buf_len = au->buf->vec.iov_len;
569 auth->authorizer_reply_buf = au->reply_buf; 569 auth->authorizer_reply_buf = au->reply_buf;
570 auth->authorizer_reply_buf_len = sizeof (au->reply_buf); 570 auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
571 auth->sign_message = ac->ops->sign_message; 571 auth->sign_message = ac->ops->sign_message;
572 auth->check_message_signature = ac->ops->check_message_signature; 572 auth->check_message_signature = ac->ops->check_message_signature;
573 573
574 return 0; 574 return 0;
575 } 575 }
576 576
577 static int ceph_x_update_authorizer( 577 static int ceph_x_update_authorizer(
578 struct ceph_auth_client *ac, int peer_type, 578 struct ceph_auth_client *ac, int peer_type,
579 struct ceph_auth_handshake *auth) 579 struct ceph_auth_handshake *auth)
580 { 580 {
581 struct ceph_x_authorizer *au; 581 struct ceph_x_authorizer *au;
582 struct ceph_x_ticket_handler *th; 582 struct ceph_x_ticket_handler *th;
583 583
584 th = get_ticket_handler(ac, peer_type); 584 th = get_ticket_handler(ac, peer_type);
585 if (IS_ERR(th)) 585 if (IS_ERR(th))
586 return PTR_ERR(th); 586 return PTR_ERR(th);
587 587
588 au = (struct ceph_x_authorizer *)auth->authorizer; 588 au = (struct ceph_x_authorizer *)auth->authorizer;
589 if (au->secret_id < th->secret_id) { 589 if (au->secret_id < th->secret_id) {
590 dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", 590 dout("ceph_x_update_authorizer service %u secret %llu < %llu\n",
591 au->service, au->secret_id, th->secret_id); 591 au->service, au->secret_id, th->secret_id);
592 return ceph_x_build_authorizer(ac, th, au); 592 return ceph_x_build_authorizer(ac, th, au);
593 } 593 }
594 return 0; 594 return 0;
595 } 595 }
596 596
597 static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, 597 static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
598 struct ceph_authorizer *a, size_t len) 598 struct ceph_authorizer *a, size_t len)
599 { 599 {
600 struct ceph_x_authorizer *au = (void *)a; 600 struct ceph_x_authorizer *au = (void *)a;
601 int ret = 0; 601 int ret = 0;
602 struct ceph_x_authorize_reply reply; 602 struct ceph_x_authorize_reply reply;
603 void *preply = &reply; 603 void *preply = &reply;
604 void *p = au->reply_buf; 604 void *p = au->reply_buf;
605 void *end = p + sizeof(au->reply_buf); 605 void *end = p + sizeof(au->reply_buf);
606 606
607 ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply)); 607 ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply));
608 if (ret < 0) 608 if (ret < 0)
609 return ret; 609 return ret;
610 if (ret != sizeof(reply)) 610 if (ret != sizeof(reply))
611 return -EPERM; 611 return -EPERM;
612 612
613 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) 613 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
614 ret = -EPERM; 614 ret = -EPERM;
615 else 615 else
616 ret = 0; 616 ret = 0;
617 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", 617 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
618 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); 618 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
619 return ret; 619 return ret;
620 } 620 }
621 621
622 static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, 622 static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
623 struct ceph_authorizer *a) 623 struct ceph_authorizer *a)
624 { 624 {
625 struct ceph_x_authorizer *au = (void *)a; 625 struct ceph_x_authorizer *au = (void *)a;
626 626
627 ceph_crypto_key_destroy(&au->session_key); 627 ceph_crypto_key_destroy(&au->session_key);
628 ceph_buffer_put(au->buf); 628 ceph_buffer_put(au->buf);
629 kfree(au); 629 kfree(au);
630 } 630 }
631 631
632 632
633 static void ceph_x_reset(struct ceph_auth_client *ac) 633 static void ceph_x_reset(struct ceph_auth_client *ac)
634 { 634 {
635 struct ceph_x_info *xi = ac->private; 635 struct ceph_x_info *xi = ac->private;
636 636
637 dout("reset\n"); 637 dout("reset\n");
638 xi->starting = true; 638 xi->starting = true;
639 xi->server_challenge = 0; 639 xi->server_challenge = 0;
640 } 640 }
641 641
642 static void ceph_x_destroy(struct ceph_auth_client *ac) 642 static void ceph_x_destroy(struct ceph_auth_client *ac)
643 { 643 {
644 struct ceph_x_info *xi = ac->private; 644 struct ceph_x_info *xi = ac->private;
645 struct rb_node *p; 645 struct rb_node *p;
646 646
647 dout("ceph_x_destroy %p\n", ac); 647 dout("ceph_x_destroy %p\n", ac);
648 ceph_crypto_key_destroy(&xi->secret); 648 ceph_crypto_key_destroy(&xi->secret);
649 649
650 while ((p = rb_first(&xi->ticket_handlers)) != NULL) { 650 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
651 struct ceph_x_ticket_handler *th = 651 struct ceph_x_ticket_handler *th =
652 rb_entry(p, struct ceph_x_ticket_handler, node); 652 rb_entry(p, struct ceph_x_ticket_handler, node);
653 remove_ticket_handler(ac, th); 653 remove_ticket_handler(ac, th);
654 } 654 }
655 655
656 if (xi->auth_authorizer.buf) 656 if (xi->auth_authorizer.buf)
657 ceph_buffer_put(xi->auth_authorizer.buf); 657 ceph_buffer_put(xi->auth_authorizer.buf);
658 658
659 kfree(ac->private); 659 kfree(ac->private);
660 ac->private = NULL; 660 ac->private = NULL;
661 } 661 }
662 662
663 static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, 663 static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
664 int peer_type) 664 int peer_type)
665 { 665 {
666 struct ceph_x_ticket_handler *th; 666 struct ceph_x_ticket_handler *th;
667 667
668 th = get_ticket_handler(ac, peer_type); 668 th = get_ticket_handler(ac, peer_type);
669 if (!IS_ERR(th)) 669 if (!IS_ERR(th))
670 memset(&th->validity, 0, sizeof(th->validity)); 670 memset(&th->validity, 0, sizeof(th->validity));
671 } 671 }
672 672
673 static int calcu_signature(struct ceph_x_authorizer *au, 673 static int calcu_signature(struct ceph_x_authorizer *au,
674 struct ceph_msg *msg, __le64 *sig) 674 struct ceph_msg *msg, __le64 *sig)
675 { 675 {
676 int ret; 676 int ret;
677 char tmp_enc[40]; 677 char tmp_enc[40];
678 __le32 tmp[5] = { 678 __le32 tmp[5] = {
679 16u, msg->hdr.crc, msg->footer.front_crc, 679 cpu_to_le32(16), msg->hdr.crc, msg->footer.front_crc,
680 msg->footer.middle_crc, msg->footer.data_crc, 680 msg->footer.middle_crc, msg->footer.data_crc,
681 }; 681 };
682 ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp), 682 ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp),
683 tmp_enc, sizeof(tmp_enc)); 683 tmp_enc, sizeof(tmp_enc));
684 if (ret < 0) 684 if (ret < 0)
685 return ret; 685 return ret;
686 *sig = *(__le64*)(tmp_enc + 4); 686 *sig = *(__le64*)(tmp_enc + 4);
687 return 0; 687 return 0;
688 } 688 }
689 689
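calcu_signature() is the hunk the "libceph: fix sparse endianness warnings" patch touches here: tmp[] is declared __le32, so seeding its first element with a bare 16u mixed a CPU-order constant into a little-endian array, which sparse flags; cpu_to_le32(16) keeps the initializer uniformly little-endian (a byte-wise no-op on little-endian hosts, a swap on big-endian ones). The general pattern sparse enforces, as a minimal illustration:

	__le32 wire;	/* wire-format value, always little-endian */
	u32 host;	/* native CPU byte order */

	wire = cpu_to_le32(host);	/* host -> wire: clean */
	host = le32_to_cpu(wire);	/* wire -> host: clean */
	/* wire = host; would draw a sparse "incorrect type" warning */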
690 static int ceph_x_sign_message(struct ceph_auth_handshake *auth, 690 static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
691 struct ceph_msg *msg) 691 struct ceph_msg *msg)
692 { 692 {
693 int ret; 693 int ret;
694 if (!auth->authorizer) 694 if (!auth->authorizer)
695 return 0; 695 return 0;
696 ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, 696 ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
697 msg, &msg->footer.sig); 697 msg, &msg->footer.sig);
698 if (ret < 0) 698 if (ret < 0)
699 return ret; 699 return ret;
700 msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED; 700 msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED;
701 return 0; 701 return 0;
702 } 702 }
703 703
704 static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, 704 static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
705 struct ceph_msg *msg) 705 struct ceph_msg *msg)
706 { 706 {
707 __le64 sig_check; 707 __le64 sig_check;
708 int ret; 708 int ret;
709 709
710 if (!auth->authorizer) 710 if (!auth->authorizer)
711 return 0; 711 return 0;
712 ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, 712 ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
713 msg, &sig_check); 713 msg, &sig_check);
714 if (ret < 0) 714 if (ret < 0)
715 return ret; 715 return ret;
716 if (sig_check == msg->footer.sig) 716 if (sig_check == msg->footer.sig)
717 return 0; 717 return 0;
718 if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED) 718 if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED)
719 dout("ceph_x_check_message_signature %p has signature %llx " 719 dout("ceph_x_check_message_signature %p has signature %llx "
720 "expect %llx\n", msg, msg->footer.sig, sig_check); 720 "expect %llx\n", msg, msg->footer.sig, sig_check);
721 else 721 else
722 dout("ceph_x_check_message_signature %p sender did not set " 722 dout("ceph_x_check_message_signature %p sender did not set "
723 "CEPH_MSG_FOOTER_SIGNED\n", msg); 723 "CEPH_MSG_FOOTER_SIGNED\n", msg);
724 return -EBADMSG; 724 return -EBADMSG;
725 } 725 }
726 726
727 static const struct ceph_auth_client_ops ceph_x_ops = { 727 static const struct ceph_auth_client_ops ceph_x_ops = {
728 .name = "x", 728 .name = "x",
729 .is_authenticated = ceph_x_is_authenticated, 729 .is_authenticated = ceph_x_is_authenticated,
730 .should_authenticate = ceph_x_should_authenticate, 730 .should_authenticate = ceph_x_should_authenticate,
731 .build_request = ceph_x_build_request, 731 .build_request = ceph_x_build_request,
732 .handle_reply = ceph_x_handle_reply, 732 .handle_reply = ceph_x_handle_reply,
733 .create_authorizer = ceph_x_create_authorizer, 733 .create_authorizer = ceph_x_create_authorizer,
734 .update_authorizer = ceph_x_update_authorizer, 734 .update_authorizer = ceph_x_update_authorizer,
735 .verify_authorizer_reply = ceph_x_verify_authorizer_reply, 735 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
736 .destroy_authorizer = ceph_x_destroy_authorizer, 736 .destroy_authorizer = ceph_x_destroy_authorizer,
737 .invalidate_authorizer = ceph_x_invalidate_authorizer, 737 .invalidate_authorizer = ceph_x_invalidate_authorizer,
738 .reset = ceph_x_reset, 738 .reset = ceph_x_reset,
739 .destroy = ceph_x_destroy, 739 .destroy = ceph_x_destroy,
740 .sign_message = ceph_x_sign_message, 740 .sign_message = ceph_x_sign_message,
741 .check_message_signature = ceph_x_check_message_signature, 741 .check_message_signature = ceph_x_check_message_signature,
742 }; 742 };
743 743
744 744
745 int ceph_x_init(struct ceph_auth_client *ac) 745 int ceph_x_init(struct ceph_auth_client *ac)
746 { 746 {
747 struct ceph_x_info *xi; 747 struct ceph_x_info *xi;
748 int ret; 748 int ret;
749 749
750 dout("ceph_x_init %p\n", ac); 750 dout("ceph_x_init %p\n", ac);
751 ret = -ENOMEM; 751 ret = -ENOMEM;
752 xi = kzalloc(sizeof(*xi), GFP_NOFS); 752 xi = kzalloc(sizeof(*xi), GFP_NOFS);
753 if (!xi) 753 if (!xi)
754 goto out; 754 goto out;
755 755
756 ret = -EINVAL; 756 ret = -EINVAL;
757 if (!ac->key) { 757 if (!ac->key) {
758 pr_err("no secret set (for auth_x protocol)\n"); 758 pr_err("no secret set (for auth_x protocol)\n");
759 goto out_nomem; 759 goto out_nomem;
760 } 760 }
761 761
762 ret = ceph_crypto_key_clone(&xi->secret, ac->key); 762 ret = ceph_crypto_key_clone(&xi->secret, ac->key);
763 if (ret < 0) { 763 if (ret < 0) {
764 pr_err("cannot clone key: %d\n", ret); 764 pr_err("cannot clone key: %d\n", ret);
765 goto out_nomem; 765 goto out_nomem;
766 } 766 }
767 767
768 xi->starting = true; 768 xi->starting = true;
769 xi->ticket_handlers = RB_ROOT; 769 xi->ticket_handlers = RB_ROOT;
770 770
771 ac->protocol = CEPH_AUTH_CEPHX; 771 ac->protocol = CEPH_AUTH_CEPHX;
772 ac->private = xi; 772 ac->private = xi;
773 ac->ops = &ceph_x_ops; 773 ac->ops = &ceph_x_ops;
774 return 0; 774 return 0;
775 775
776 out_nomem: 776 out_nomem:
777 kfree(xi); 777 kfree(xi);
778 out: 778 out:
779 return ret; 779 return ret;
780 } 780 }
781 781
782 782
783 783
net/ceph/mon_client.c
1 #include <linux/ceph/ceph_debug.h> 1 #include <linux/ceph/ceph_debug.h>
2 2
3 #include <linux/module.h> 3 #include <linux/module.h>
4 #include <linux/types.h> 4 #include <linux/types.h>
5 #include <linux/slab.h> 5 #include <linux/slab.h>
6 #include <linux/random.h> 6 #include <linux/random.h>
7 #include <linux/sched.h> 7 #include <linux/sched.h>
8 8
9 #include <linux/ceph/mon_client.h> 9 #include <linux/ceph/mon_client.h>
10 #include <linux/ceph/libceph.h> 10 #include <linux/ceph/libceph.h>
11 #include <linux/ceph/debugfs.h> 11 #include <linux/ceph/debugfs.h>
12 #include <linux/ceph/decode.h> 12 #include <linux/ceph/decode.h>
13 #include <linux/ceph/auth.h> 13 #include <linux/ceph/auth.h>
14 14
15 /* 15 /*
16 * Interact with Ceph monitor cluster. Handle requests for new map 16 * Interact with Ceph monitor cluster. Handle requests for new map
17 * versions, and periodically resend as needed. Also implement 17 * versions, and periodically resend as needed. Also implement
18 * statfs() and umount(). 18 * statfs() and umount().
19 * 19 *
20 * A small cluster of Ceph "monitors" is responsible for managing critical    20 * A small cluster of Ceph "monitors" is responsible for managing critical
21 * cluster configuration and state information. An odd number (e.g., 3, 5) 21 * cluster configuration and state information. An odd number (e.g., 3, 5)
22 * of cmon daemons use a modified version of the Paxos part-time parliament 22 * of cmon daemons use a modified version of the Paxos part-time parliament
23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and 23 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
24 * list of clients who have mounted the file system. 24 * list of clients who have mounted the file system.
25 * 25 *
26 * We maintain an open, active session with a monitor at all times in order to 26 * We maintain an open, active session with a monitor at all times in order to
27 * receive timely MDSMap updates. We periodically send a keepalive byte on the 27 * receive timely MDSMap updates. We periodically send a keepalive byte on the
28 * TCP socket to ensure we detect a failure. If the connection does break, we 28 * TCP socket to ensure we detect a failure. If the connection does break, we
29 * randomly hunt for a new monitor. Once the connection is reestablished, we 29 * randomly hunt for a new monitor. Once the connection is reestablished, we
30 * resend any outstanding requests. 30 * resend any outstanding requests.
31 */ 31 */
32 32
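The comment above compresses a small state machine; schematically (the real driver is the delayed work and connection fault handling in this file, so this is a reading aid only):

/*
 * open -> authenticate -> subscribe (monmap, mdsmap, osdmap)
 *   |        keepalives and renewals ride a delayed-work timer
 *   v
 * connection breaks -> hunt: pick a random monitor, reopen the
 * session, then resend any outstanding requests.
 */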
33 static const struct ceph_connection_operations mon_con_ops; 33 static const struct ceph_connection_operations mon_con_ops;
34 34
35 static int __validate_auth(struct ceph_mon_client *monc); 35 static int __validate_auth(struct ceph_mon_client *monc);
36 36
37 /* 37 /*
38 * Decode a monmap blob (e.g., during mount). 38 * Decode a monmap blob (e.g., during mount).
39 */ 39 */
40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end) 40 struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
41 { 41 {
42 struct ceph_monmap *m = NULL; 42 struct ceph_monmap *m = NULL;
43 int i, err = -EINVAL; 43 int i, err = -EINVAL;
44 struct ceph_fsid fsid; 44 struct ceph_fsid fsid;
45 u32 epoch, num_mon; 45 u32 epoch, num_mon;
46 u16 version; 46 u16 version;
47 u32 len; 47 u32 len;
48 48
49 ceph_decode_32_safe(&p, end, len, bad); 49 ceph_decode_32_safe(&p, end, len, bad);
50 ceph_decode_need(&p, end, len, bad); 50 ceph_decode_need(&p, end, len, bad);
51 51
52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); 52 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
53 53
54 ceph_decode_16_safe(&p, end, version, bad); 54 ceph_decode_16_safe(&p, end, version, bad);
55 55
56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); 56 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
57 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 57 ceph_decode_copy(&p, &fsid, sizeof(fsid));
58 epoch = ceph_decode_32(&p); 58 epoch = ceph_decode_32(&p);
59 59
60 num_mon = ceph_decode_32(&p); 60 num_mon = ceph_decode_32(&p);
61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); 61 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
62 62
63 if (num_mon >= CEPH_MAX_MON) 63 if (num_mon >= CEPH_MAX_MON)
64 goto bad; 64 goto bad;
65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); 65 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
66 if (m == NULL) 66 if (m == NULL)
67 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
68 m->fsid = fsid; 68 m->fsid = fsid;
69 m->epoch = epoch; 69 m->epoch = epoch;
70 m->num_mon = num_mon; 70 m->num_mon = num_mon;
71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); 71 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
72 for (i = 0; i < num_mon; i++) 72 for (i = 0; i < num_mon; i++)
73 ceph_decode_addr(&m->mon_inst[i].addr); 73 ceph_decode_addr(&m->mon_inst[i].addr);
74 74
75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, 75 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
76 m->num_mon); 76 m->num_mon);
77 for (i = 0; i < m->num_mon; i++) 77 for (i = 0; i < m->num_mon; i++)
78 dout("monmap_decode mon%d is %s\n", i, 78 dout("monmap_decode mon%d is %s\n", i,
79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); 79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
80 return m; 80 return m;
81 81
82 bad: 82 bad:
83 dout("monmap_decode failed with %d\n", err); 83 dout("monmap_decode failed with %d\n", err);
84 kfree(m); 84 kfree(m);
85 return ERR_PTR(err); 85 return ERR_PTR(err);
86 } 86 }
87 87
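A hedged sketch of a caller of ceph_monmap_decode(); buf, buflen and my_addr are hypothetical placeholders, not names from this commit:

	/* Illustrative caller only. */
	struct ceph_monmap *map;

	map = ceph_monmap_decode(buf, buf + buflen);
	if (IS_ERR(map))
		return PTR_ERR(map);
	if (!ceph_monmap_contains(map, &my_addr))
		pr_warn("peer is not in the monmap\n");
	kfree(map);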
88 /* 88 /*
89 * return true if *addr is included in the monmap. 89 * return true if *addr is included in the monmap.
90 */ 90 */
91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) 91 int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
92 { 92 {
93 int i; 93 int i;
94 94
95 for (i = 0; i < m->num_mon; i++) 95 for (i = 0; i < m->num_mon; i++)
96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) 96 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
97 return 1; 97 return 1;
98 return 0; 98 return 0;
99 } 99 }
100 100
101 /* 101 /*
102 * Send an auth request. 102 * Send an auth request.
103 */ 103 */
104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) 104 static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
105 { 105 {
106 monc->pending_auth = 1; 106 monc->pending_auth = 1;
107 monc->m_auth->front.iov_len = len; 107 monc->m_auth->front.iov_len = len;
108 monc->m_auth->hdr.front_len = cpu_to_le32(len); 108 monc->m_auth->hdr.front_len = cpu_to_le32(len);
109 ceph_msg_revoke(monc->m_auth); 109 ceph_msg_revoke(monc->m_auth);
110 ceph_msg_get(monc->m_auth); /* keep our ref */ 110 ceph_msg_get(monc->m_auth); /* keep our ref */
111 ceph_con_send(&monc->con, monc->m_auth); 111 ceph_con_send(&monc->con, monc->m_auth);
112 } 112 }
113 113
114 /* 114 /*
115 * Close monitor session, if any. 115 * Close monitor session, if any.
116 */ 116 */
117 static void __close_session(struct ceph_mon_client *monc) 117 static void __close_session(struct ceph_mon_client *monc)
118 { 118 {
119 dout("__close_session closing mon%d\n", monc->cur_mon); 119 dout("__close_session closing mon%d\n", monc->cur_mon);
120 ceph_msg_revoke(monc->m_auth); 120 ceph_msg_revoke(monc->m_auth);
121 ceph_msg_revoke_incoming(monc->m_auth_reply); 121 ceph_msg_revoke_incoming(monc->m_auth_reply);
122 ceph_msg_revoke(monc->m_subscribe); 122 ceph_msg_revoke(monc->m_subscribe);
123 ceph_msg_revoke_incoming(monc->m_subscribe_ack); 123 ceph_msg_revoke_incoming(monc->m_subscribe_ack);
124 ceph_con_close(&monc->con); 124 ceph_con_close(&monc->con);
125 monc->cur_mon = -1; 125 monc->cur_mon = -1;
126 monc->pending_auth = 0; 126 monc->pending_auth = 0;
127 ceph_auth_reset(monc->auth); 127 ceph_auth_reset(monc->auth);
128 } 128 }
129 129
130 /* 130 /*
131 * Open a session with a (new) monitor. 131 * Open a session with a (new) monitor.
132 */ 132 */
133 static int __open_session(struct ceph_mon_client *monc) 133 static int __open_session(struct ceph_mon_client *monc)
134 { 134 {
135 char r; 135 char r;
136 int ret; 136 int ret;
137 137
138 if (monc->cur_mon < 0) { 138 if (monc->cur_mon < 0) {
139 get_random_bytes(&r, 1); 139 get_random_bytes(&r, 1);
140 monc->cur_mon = r % monc->monmap->num_mon; 140 monc->cur_mon = r % monc->monmap->num_mon;
141 dout("open_session num=%d r=%d -> mon%d\n", 141 dout("open_session num=%d r=%d -> mon%d\n",
142 monc->monmap->num_mon, r, monc->cur_mon); 142 monc->monmap->num_mon, r, monc->cur_mon);
143 monc->sub_sent = 0; 143 monc->sub_sent = 0;
144 monc->sub_renew_after = jiffies; /* i.e., expired */ 144 monc->sub_renew_after = jiffies; /* i.e., expired */
145 monc->want_next_osdmap = !!monc->want_next_osdmap; 145 monc->want_next_osdmap = !!monc->want_next_osdmap;
146 146
147 dout("open_session mon%d opening\n", monc->cur_mon); 147 dout("open_session mon%d opening\n", monc->cur_mon);
148 ceph_con_open(&monc->con, 148 ceph_con_open(&monc->con,
149 CEPH_ENTITY_TYPE_MON, monc->cur_mon, 149 CEPH_ENTITY_TYPE_MON, monc->cur_mon,
150 &monc->monmap->mon_inst[monc->cur_mon].addr); 150 &monc->monmap->mon_inst[monc->cur_mon].addr);
151 151
152 		/* initiate authentication handshake */                   152 		/* initiate authentication handshake */
153 ret = ceph_auth_build_hello(monc->auth, 153 ret = ceph_auth_build_hello(monc->auth,
154 monc->m_auth->front.iov_base, 154 monc->m_auth->front.iov_base,
155 monc->m_auth->front_alloc_len); 155 monc->m_auth->front_alloc_len);
156 __send_prepared_auth_request(monc, ret); 156 __send_prepared_auth_request(monc, ret);
157 } else { 157 } else {
158 dout("open_session mon%d already open\n", monc->cur_mon); 158 dout("open_session mon%d already open\n", monc->cur_mon);
159 } 159 }
160 return 0; 160 return 0;
161 } 161 }
162 162
163 static bool __sub_expired(struct ceph_mon_client *monc) 163 static bool __sub_expired(struct ceph_mon_client *monc)
164 { 164 {
165 return time_after_eq(jiffies, monc->sub_renew_after); 165 return time_after_eq(jiffies, monc->sub_renew_after);
166 } 166 }
167 167
168 /* 168 /*
169 * Reschedule delayed work timer. 169 * Reschedule delayed work timer.
170 */ 170 */
171 static void __schedule_delayed(struct ceph_mon_client *monc) 171 static void __schedule_delayed(struct ceph_mon_client *monc)
172 { 172 {
173 unsigned int delay; 173 unsigned int delay;
174 174
175 if (monc->cur_mon < 0 || __sub_expired(monc)) 175 if (monc->cur_mon < 0 || __sub_expired(monc))
176 delay = 10 * HZ; 176 delay = 10 * HZ;
177 else 177 else
178 delay = 20 * HZ; 178 delay = 20 * HZ;
179 dout("__schedule_delayed after %u\n", delay); 179 dout("__schedule_delayed after %u\n", delay);
180 schedule_delayed_work(&monc->delayed_work, delay); 180 schedule_delayed_work(&monc->delayed_work, delay);
181 } 181 }
182 182
183 /* 183 /*
184 * Send subscribe request for mdsmap and/or osdmap. 184 * Send subscribe request for mdsmap and/or osdmap.
185 */ 185 */
186 static void __send_subscribe(struct ceph_mon_client *monc) 186 static void __send_subscribe(struct ceph_mon_client *monc)
187 { 187 {
188 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", 188 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
189 (unsigned int)monc->sub_sent, __sub_expired(monc), 189 (unsigned int)monc->sub_sent, __sub_expired(monc),
190 monc->want_next_osdmap); 190 monc->want_next_osdmap);
191 if ((__sub_expired(monc) && !monc->sub_sent) || 191 if ((__sub_expired(monc) && !monc->sub_sent) ||
192 monc->want_next_osdmap == 1) { 192 monc->want_next_osdmap == 1) {
193 struct ceph_msg *msg = monc->m_subscribe; 193 struct ceph_msg *msg = monc->m_subscribe;
194 struct ceph_mon_subscribe_item *i; 194 struct ceph_mon_subscribe_item *i;
195 void *p, *end; 195 void *p, *end;
196 int num; 196 int num;
197 197
198 p = msg->front.iov_base; 198 p = msg->front.iov_base;
199 end = p + msg->front_alloc_len; 199 end = p + msg->front_alloc_len;
200 200
201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; 201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
202 ceph_encode_32(&p, num); 202 ceph_encode_32(&p, num);
203 203
204 if (monc->want_next_osdmap) { 204 if (monc->want_next_osdmap) {
205 dout("__send_subscribe to 'osdmap' %u\n", 205 dout("__send_subscribe to 'osdmap' %u\n",
206 (unsigned int)monc->have_osdmap); 206 (unsigned int)monc->have_osdmap);
207 ceph_encode_string(&p, end, "osdmap", 6); 207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p; 208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap); 209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1; 210 i->onetime = 1;
211 p += sizeof(*i); 211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */ 212 monc->want_next_osdmap = 2; /* requested */
213 } 213 }
214 if (monc->want_mdsmap) { 214 if (monc->want_mdsmap) {
215 dout("__send_subscribe to 'mdsmap' %u+\n", 215 dout("__send_subscribe to 'mdsmap' %u+\n",
216 (unsigned int)monc->have_mdsmap); 216 (unsigned int)monc->have_mdsmap);
217 ceph_encode_string(&p, end, "mdsmap", 6); 217 ceph_encode_string(&p, end, "mdsmap", 6);
218 i = p; 218 i = p;
219 i->have = cpu_to_le64(monc->have_mdsmap); 219 i->have = cpu_to_le64(monc->have_mdsmap);
220 i->onetime = 0; 220 i->onetime = 0;
221 p += sizeof(*i); 221 p += sizeof(*i);
222 } 222 }
223 ceph_encode_string(&p, end, "monmap", 6); 223 ceph_encode_string(&p, end, "monmap", 6);
224 i = p; 224 i = p;
225 i->have = 0; 225 i->have = 0;
226 i->onetime = 0; 226 i->onetime = 0;
227 p += sizeof(*i); 227 p += sizeof(*i);
228 228
229 msg->front.iov_len = p - msg->front.iov_base; 229 msg->front.iov_len = p - msg->front.iov_base;
230 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 230 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
231 ceph_msg_revoke(msg); 231 ceph_msg_revoke(msg);
232 ceph_con_send(&monc->con, ceph_msg_get(msg)); 232 ceph_con_send(&monc->con, ceph_msg_get(msg));
233 233
234 monc->sub_sent = jiffies | 1; /* never 0 */ 234 monc->sub_sent = jiffies | 1; /* never 0 */
235 } 235 }
236 } 236 }
237 237
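For reference, the subscribe payload assembled above is just a u32 entry count followed by that many (length-prefixed map name, ceph_mon_subscribe_item) pairs. Below is a minimal userspace sketch of that wire layout, assuming the packed "le64 have + u8 onetime" item the code above writes; the helper names are mine, not libceph's:

#include <stdint.h>
#include <string.h>

static void put_le32(uint8_t **p, uint32_t v)
{
        (*p)[0] = v; (*p)[1] = v >> 8; (*p)[2] = v >> 16; (*p)[3] = v >> 24;
        *p += 4;
}

static void put_le64(uint8_t **p, uint64_t v)
{
        put_le32(p, (uint32_t)v);
        put_le32(p, (uint32_t)(v >> 32));
}

/* one subscription entry: length-prefixed name, then have + onetime */
static void put_sub(uint8_t **p, const char *name, uint64_t have, int onetime)
{
        uint32_t len = (uint32_t)strlen(name);

        put_le32(p, len);
        memcpy(*p, name, len);
        *p += len;
        put_le64(p, have);              /* newest epoch we already have */
        *(*p)++ = onetime ? 1 : 0;      /* 1 = one-shot, 0 = standing sub */
}

/* the three-entry message __send_subscribe() builds when both maps are wanted */
static size_t build_subscribe(uint8_t *buf, uint64_t have_osd, uint64_t have_mds)
{
        uint8_t *p = buf;

        put_le32(&p, 3);                /* number of entries */
        put_sub(&p, "osdmap", have_osd, 1);
        put_sub(&p, "mdsmap", have_mds, 0);
        put_sub(&p, "monmap", 0, 0);
        return (size_t)(p - buf);
}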
238 static void handle_subscribe_ack(struct ceph_mon_client *monc, 238 static void handle_subscribe_ack(struct ceph_mon_client *monc,
239 struct ceph_msg *msg) 239 struct ceph_msg *msg)
240 { 240 {
241 unsigned int seconds; 241 unsigned int seconds;
242 struct ceph_mon_subscribe_ack *h = msg->front.iov_base; 242 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
243 243
244 if (msg->front.iov_len < sizeof(*h)) 244 if (msg->front.iov_len < sizeof(*h))
245 goto bad; 245 goto bad;
246 seconds = le32_to_cpu(h->duration); 246 seconds = le32_to_cpu(h->duration);
247 247
248 mutex_lock(&monc->mutex); 248 mutex_lock(&monc->mutex);
249 if (monc->hunting) { 249 if (monc->hunting) {
250 pr_info("mon%d %s session established\n", 250 pr_info("mon%d %s session established\n",
251 monc->cur_mon, 251 monc->cur_mon,
252 ceph_pr_addr(&monc->con.peer_addr.in_addr)); 252 ceph_pr_addr(&monc->con.peer_addr.in_addr));
253 monc->hunting = false; 253 monc->hunting = false;
254 } 254 }
255 dout("handle_subscribe_ack after %u seconds\n", seconds); 255 dout("handle_subscribe_ack after %u seconds\n", seconds);
256 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; 256 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
257 monc->sub_sent = 0; 257 monc->sub_sent = 0;
258 mutex_unlock(&monc->mutex); 258 mutex_unlock(&monc->mutex);
259 return; 259 return;
260 bad: 260 bad:
261 pr_err("got corrupt subscribe-ack msg\n"); 261 pr_err("got corrupt subscribe-ack msg\n");
262 ceph_msg_dump(msg); 262 ceph_msg_dump(msg);
263 } 263 }
264 264
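The renewal deadline set above lands halfway through the granted duration, biased one jiffy early. A tiny userspace illustration of the arithmetic, with an assumed HZ value:

#include <stdio.h>

#define HZ 250          /* assumed tick rate for the example */

int main(void)
{
        unsigned long sub_sent = 100000;        /* jiffies at send time */
        unsigned int seconds = 300;             /* h->duration from the ack */
        unsigned long renew_after = sub_sent + (seconds >> 1) * HZ - 1;

        printf("renew at jiffy %lu (%u s after send)\n",
               renew_after, seconds >> 1);
        return 0;
}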
265 /* 265 /*
266 * Keep track of which maps we have 266 * Keep track of which maps we have
267 */ 267 */
268 int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) 268 int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
269 { 269 {
270 mutex_lock(&monc->mutex); 270 mutex_lock(&monc->mutex);
271 monc->have_mdsmap = got; 271 monc->have_mdsmap = got;
272 mutex_unlock(&monc->mutex); 272 mutex_unlock(&monc->mutex);
273 return 0; 273 return 0;
274 } 274 }
275 EXPORT_SYMBOL(ceph_monc_got_mdsmap); 275 EXPORT_SYMBOL(ceph_monc_got_mdsmap);
276 276
277 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) 277 int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
278 { 278 {
279 mutex_lock(&monc->mutex); 279 mutex_lock(&monc->mutex);
280 monc->have_osdmap = got; 280 monc->have_osdmap = got;
281 monc->want_next_osdmap = 0; 281 monc->want_next_osdmap = 0;
282 mutex_unlock(&monc->mutex); 282 mutex_unlock(&monc->mutex);
283 return 0; 283 return 0;
284 } 284 }
285 285
286 /* 286 /*
287 * Register interest in the next osdmap 287 * Register interest in the next osdmap
288 */ 288 */
289 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) 289 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
290 { 290 {
291 dout("request_next_osdmap have %u\n", monc->have_osdmap); 291 dout("request_next_osdmap have %u\n", monc->have_osdmap);
292 mutex_lock(&monc->mutex); 292 mutex_lock(&monc->mutex);
293 if (!monc->want_next_osdmap) 293 if (!monc->want_next_osdmap)
294 monc->want_next_osdmap = 1; 294 monc->want_next_osdmap = 1;
295 if (monc->want_next_osdmap < 2) 295 if (monc->want_next_osdmap < 2)
296 __send_subscribe(monc); 296 __send_subscribe(monc);
297 mutex_unlock(&monc->mutex); 297 mutex_unlock(&monc->mutex);
298 } 298 }
299 EXPORT_SYMBOL(ceph_monc_request_next_osdmap); 299 EXPORT_SYMBOL(ceph_monc_request_next_osdmap);
300 300
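Note that want_next_osdmap is a small state machine rather than a flag: 0 means no interest, 1 means a subscription should be sent, and 2 means one is already in flight (set in __send_subscribe() above; the !! in __open_session() collapses 2 back to 1 so a reconnect re-requests the map). A hypothetical enum, purely illustrative since the code uses raw values:

/* hypothetical names for the want_next_osdmap states */
enum {
        OSDMAP_NOT_WANTED = 0,  /* nobody asked for a new map */
        OSDMAP_WANTED     = 1,  /* subscribe on the next opportunity */
        OSDMAP_REQUESTED  = 2,  /* subscription sent, awaiting the map */
};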
301 int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 301 int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
302 unsigned long timeout) 302 unsigned long timeout)
303 { 303 {
304 unsigned long started = jiffies; 304 unsigned long started = jiffies;
305 int ret; 305 int ret;
306 306
307 mutex_lock(&monc->mutex); 307 mutex_lock(&monc->mutex);
308 while (monc->have_osdmap < epoch) { 308 while (monc->have_osdmap < epoch) {
309 mutex_unlock(&monc->mutex); 309 mutex_unlock(&monc->mutex);
310 310
311 if (timeout != 0 && time_after_eq(jiffies, started + timeout)) 311 if (timeout != 0 && time_after_eq(jiffies, started + timeout))
312 return -ETIMEDOUT; 312 return -ETIMEDOUT;
313 313
314 ret = wait_event_interruptible_timeout(monc->client->auth_wq, 314 ret = wait_event_interruptible_timeout(monc->client->auth_wq,
315 monc->have_osdmap >= epoch, timeout); 315 monc->have_osdmap >= epoch, timeout);
316 if (ret < 0) 316 if (ret < 0)
317 return ret; 317 return ret;
318 318
319 mutex_lock(&monc->mutex); 319 mutex_lock(&monc->mutex);
320 } 320 }
321 321
322 mutex_unlock(&monc->mutex); 322 mutex_unlock(&monc->mutex);
323 return 0; 323 return 0;
324 } 324 }
325 EXPORT_SYMBOL(ceph_monc_wait_osdmap); 325 EXPORT_SYMBOL(ceph_monc_wait_osdmap);
326 326
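A hypothetical caller would pair the wait with a request so a subscription is actually in flight; want_epoch and client are assumed names:

/* request a fresh osdmap, then wait up to 30 s for epoch want_epoch */
ceph_monc_request_next_osdmap(&client->monc);
err = ceph_monc_wait_osdmap(&client->monc, want_epoch, 30 * HZ);
if (err == -ETIMEDOUT)
        pr_err("osdmap epoch %u did not arrive in time\n", want_epoch);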
327 /* 327 /*
328 * Open a session with the monitor cluster and schedule the keepalive work. 328 * Open a session with the monitor cluster and schedule the keepalive work.
329 */ 329 */
330 int ceph_monc_open_session(struct ceph_mon_client *monc) 330 int ceph_monc_open_session(struct ceph_mon_client *monc)
331 { 331 {
332 mutex_lock(&monc->mutex); 332 mutex_lock(&monc->mutex);
333 __open_session(monc); 333 __open_session(monc);
334 __schedule_delayed(monc); 334 __schedule_delayed(monc);
335 mutex_unlock(&monc->mutex); 335 mutex_unlock(&monc->mutex);
336 return 0; 336 return 0;
337 } 337 }
338 EXPORT_SYMBOL(ceph_monc_open_session); 338 EXPORT_SYMBOL(ceph_monc_open_session);
339 339
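For context, a sketch of the mount-time calling sequence, with assumed error labels:

err = ceph_monc_init(&client->monc, client);
if (err)
        goto out;
err = ceph_monc_open_session(&client->monc);
if (err)
        goto out_monc;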
340 /* 340 /*
341 * We require the fsid and global_id in order to initialize our 341 * We require the fsid and global_id in order to initialize our
342 * debugfs dir. 342 * debugfs dir.
343 */ 343 */
344 static bool have_debugfs_info(struct ceph_mon_client *monc) 344 static bool have_debugfs_info(struct ceph_mon_client *monc)
345 { 345 {
346 dout("have_debugfs_info fsid %d globalid %lld\n", 346 dout("have_debugfs_info fsid %d globalid %lld\n",
347 (int)monc->client->have_fsid, monc->auth->global_id); 347 (int)monc->client->have_fsid, monc->auth->global_id);
348 return monc->client->have_fsid && monc->auth->global_id > 0; 348 return monc->client->have_fsid && monc->auth->global_id > 0;
349 } 349 }
350 350
351 /* 351 /*
352 * The monitor responds with a mount ack to indicate mount success. The 352 * The monitor responds with a mount ack to indicate mount success. The
353 * included client ticket allows the client to talk to MDSs and OSDs. 353 * included client ticket allows the client to talk to MDSs and OSDs.
354 */ 354 */
355 static void ceph_monc_handle_map(struct ceph_mon_client *monc, 355 static void ceph_monc_handle_map(struct ceph_mon_client *monc,
356 struct ceph_msg *msg) 356 struct ceph_msg *msg)
357 { 357 {
358 struct ceph_client *client = monc->client; 358 struct ceph_client *client = monc->client;
359 struct ceph_monmap *monmap = NULL, *old = monc->monmap; 359 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
360 void *p, *end; 360 void *p, *end;
361 int had_debugfs_info, init_debugfs = 0; 361 int had_debugfs_info, init_debugfs = 0;
362 362
363 mutex_lock(&monc->mutex); 363 mutex_lock(&monc->mutex);
364 364
365 had_debugfs_info = have_debugfs_info(monc); 365 had_debugfs_info = have_debugfs_info(monc);
366 366
367 dout("handle_monmap\n"); 367 dout("handle_monmap\n");
368 p = msg->front.iov_base; 368 p = msg->front.iov_base;
369 end = p + msg->front.iov_len; 369 end = p + msg->front.iov_len;
370 370
371 monmap = ceph_monmap_decode(p, end); 371 monmap = ceph_monmap_decode(p, end);
372 if (IS_ERR(monmap)) { 372 if (IS_ERR(monmap)) {
373 pr_err("problem decoding monmap, %d\n", 373 pr_err("problem decoding monmap, %d\n",
374 (int)PTR_ERR(monmap)); 374 (int)PTR_ERR(monmap));
375 goto out; 375 goto out;
376 } 376 }
377 377
378 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) { 378 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
379 kfree(monmap); 379 kfree(monmap);
380 goto out; 380 goto out;
381 } 381 }
382 382
383 client->monc.monmap = monmap; 383 client->monc.monmap = monmap;
384 kfree(old); 384 kfree(old);
385 385
386 if (!client->have_fsid) { 386 if (!client->have_fsid) {
387 client->have_fsid = true; 387 client->have_fsid = true;
388 if (!had_debugfs_info && have_debugfs_info(monc)) { 388 if (!had_debugfs_info && have_debugfs_info(monc)) {
389 pr_info("client%lld fsid %pU\n", 389 pr_info("client%lld fsid %pU\n",
390 ceph_client_id(monc->client), 390 ceph_client_id(monc->client),
391 &monc->client->fsid); 391 &monc->client->fsid);
392 init_debugfs = 1; 392 init_debugfs = 1;
393 } 393 }
394 mutex_unlock(&monc->mutex); 394 mutex_unlock(&monc->mutex);
395 395
396 if (init_debugfs) { 396 if (init_debugfs) {
397 /* 397 /*
398 * do debugfs initialization without mutex to avoid 398 * do debugfs initialization without mutex to avoid
399 * creating a locking dependency 399 * creating a locking dependency
400 */ 400 */
401 ceph_debugfs_client_init(monc->client); 401 ceph_debugfs_client_init(monc->client);
402 } 402 }
403 403
404 goto out_unlocked; 404 goto out_unlocked;
405 } 405 }
406 out: 406 out:
407 mutex_unlock(&monc->mutex); 407 mutex_unlock(&monc->mutex);
408 out_unlocked: 408 out_unlocked:
409 wake_up_all(&client->auth_wq); 409 wake_up_all(&client->auth_wq);
410 } 410 }
411 411
412 /* 412 /*
413 * generic requests (e.g., statfs, poolop) 413 * generic requests (e.g., statfs, poolop)
414 */ 414 */
415 static struct ceph_mon_generic_request *__lookup_generic_req( 415 static struct ceph_mon_generic_request *__lookup_generic_req(
416 struct ceph_mon_client *monc, u64 tid) 416 struct ceph_mon_client *monc, u64 tid)
417 { 417 {
418 struct ceph_mon_generic_request *req; 418 struct ceph_mon_generic_request *req;
419 struct rb_node *n = monc->generic_request_tree.rb_node; 419 struct rb_node *n = monc->generic_request_tree.rb_node;
420 420
421 while (n) { 421 while (n) {
422 req = rb_entry(n, struct ceph_mon_generic_request, node); 422 req = rb_entry(n, struct ceph_mon_generic_request, node);
423 if (tid < req->tid) 423 if (tid < req->tid)
424 n = n->rb_left; 424 n = n->rb_left;
425 else if (tid > req->tid) 425 else if (tid > req->tid)
426 n = n->rb_right; 426 n = n->rb_right;
427 else 427 else
428 return req; 428 return req;
429 } 429 }
430 return NULL; 430 return NULL;
431 } 431 }
432 432
433 static void __insert_generic_request(struct ceph_mon_client *monc, 433 static void __insert_generic_request(struct ceph_mon_client *monc,
434 struct ceph_mon_generic_request *new) 434 struct ceph_mon_generic_request *new)
435 { 435 {
436 struct rb_node **p = &monc->generic_request_tree.rb_node; 436 struct rb_node **p = &monc->generic_request_tree.rb_node;
437 struct rb_node *parent = NULL; 437 struct rb_node *parent = NULL;
438 struct ceph_mon_generic_request *req = NULL; 438 struct ceph_mon_generic_request *req = NULL;
439 439
440 while (*p) { 440 while (*p) {
441 parent = *p; 441 parent = *p;
442 req = rb_entry(parent, struct ceph_mon_generic_request, node); 442 req = rb_entry(parent, struct ceph_mon_generic_request, node);
443 if (new->tid < req->tid) 443 if (new->tid < req->tid)
444 p = &(*p)->rb_left; 444 p = &(*p)->rb_left;
445 else if (new->tid > req->tid) 445 else if (new->tid > req->tid)
446 p = &(*p)->rb_right; 446 p = &(*p)->rb_right;
447 else 447 else
448 BUG(); 448 BUG();
449 } 449 }
450 450
451 rb_link_node(&new->node, parent, p); 451 rb_link_node(&new->node, parent, p);
452 rb_insert_color(&new->node, &monc->generic_request_tree); 452 rb_insert_color(&new->node, &monc->generic_request_tree);
453 } 453 }
454 454
455 static void release_generic_request(struct kref *kref) 455 static void release_generic_request(struct kref *kref)
456 { 456 {
457 struct ceph_mon_generic_request *req = 457 struct ceph_mon_generic_request *req =
458 container_of(kref, struct ceph_mon_generic_request, kref); 458 container_of(kref, struct ceph_mon_generic_request, kref);
459 459
460 if (req->reply) 460 if (req->reply)
461 ceph_msg_put(req->reply); 461 ceph_msg_put(req->reply);
462 if (req->request) 462 if (req->request)
463 ceph_msg_put(req->request); 463 ceph_msg_put(req->request);
464 464
465 kfree(req); 465 kfree(req);
466 } 466 }
467 467
468 static void put_generic_request(struct ceph_mon_generic_request *req) 468 static void put_generic_request(struct ceph_mon_generic_request *req)
469 { 469 {
470 kref_put(&req->kref, release_generic_request); 470 kref_put(&req->kref, release_generic_request);
471 } 471 }
472 472
473 static void get_generic_request(struct ceph_mon_generic_request *req) 473 static void get_generic_request(struct ceph_mon_generic_request *req)
474 { 474 {
475 kref_get(&req->kref); 475 kref_get(&req->kref);
476 } 476 }
477 477
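Every reply handler below follows the same lookup-then-pin pattern: take a reference while the tid is still in the tree, drop the mutex, then complete and put. The extra reference is what lets the waiter in __do_generic_request() erase and free the request without racing the handler. Schematically:

mutex_lock(&monc->mutex);
req = __lookup_generic_req(monc, tid);
if (req)
        get_generic_request(req);       /* pin across the unlock */
mutex_unlock(&monc->mutex);
if (req) {
        complete_all(&req->completion); /* wake __do_generic_request() */
        put_generic_request(req);       /* may be the final put */
}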
478 static struct ceph_msg *get_generic_reply(struct ceph_connection *con, 478 static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
479 struct ceph_msg_header *hdr, 479 struct ceph_msg_header *hdr,
480 int *skip) 480 int *skip)
481 { 481 {
482 struct ceph_mon_client *monc = con->private; 482 struct ceph_mon_client *monc = con->private;
483 struct ceph_mon_generic_request *req; 483 struct ceph_mon_generic_request *req;
484 u64 tid = le64_to_cpu(hdr->tid); 484 u64 tid = le64_to_cpu(hdr->tid);
485 struct ceph_msg *m; 485 struct ceph_msg *m;
486 486
487 mutex_lock(&monc->mutex); 487 mutex_lock(&monc->mutex);
488 req = __lookup_generic_req(monc, tid); 488 req = __lookup_generic_req(monc, tid);
489 if (!req) { 489 if (!req) {
490 dout("get_generic_reply %lld dne\n", tid); 490 dout("get_generic_reply %lld dne\n", tid);
491 *skip = 1; 491 *skip = 1;
492 m = NULL; 492 m = NULL;
493 } else { 493 } else {
494 dout("get_generic_reply %lld got %p\n", tid, req->reply); 494 dout("get_generic_reply %lld got %p\n", tid, req->reply);
495 *skip = 0; 495 *skip = 0;
496 m = ceph_msg_get(req->reply); 496 m = ceph_msg_get(req->reply);
497 /* 497 /*
498 * we don't need to track the connection reading into 498 * we don't need to track the connection reading into
499 * this reply because we only have one open connection 499 * this reply because we only have one open connection
500 * at a time, ever. 500 * at a time, ever.
501 */ 501 */
502 } 502 }
503 mutex_unlock(&monc->mutex); 503 mutex_unlock(&monc->mutex);
504 return m; 504 return m;
505 } 505 }
506 506
507 static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, 507 static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
508 struct ceph_mon_generic_request *req) 508 struct ceph_mon_generic_request *req)
509 { 509 {
510 int err; 510 int err;
511 511
512 /* register request */ 512 /* register request */
513 req->tid = tid != 0 ? tid : ++monc->last_tid; 513 req->tid = tid != 0 ? tid : ++monc->last_tid;
514 req->request->hdr.tid = cpu_to_le64(req->tid); 514 req->request->hdr.tid = cpu_to_le64(req->tid);
515 __insert_generic_request(monc, req); 515 __insert_generic_request(monc, req);
516 monc->num_generic_requests++; 516 monc->num_generic_requests++;
517 ceph_con_send(&monc->con, ceph_msg_get(req->request)); 517 ceph_con_send(&monc->con, ceph_msg_get(req->request));
518 mutex_unlock(&monc->mutex); 518 mutex_unlock(&monc->mutex);
519 519
520 err = wait_for_completion_interruptible(&req->completion); 520 err = wait_for_completion_interruptible(&req->completion);
521 521
522 mutex_lock(&monc->mutex); 522 mutex_lock(&monc->mutex);
523 rb_erase(&req->node, &monc->generic_request_tree); 523 rb_erase(&req->node, &monc->generic_request_tree);
524 monc->num_generic_requests--; 524 monc->num_generic_requests--;
525 525
526 if (!err) 526 if (!err)
527 err = req->result; 527 err = req->result;
528 return err; 528 return err;
529 } 529 }
530 530
531 static int do_generic_request(struct ceph_mon_client *monc, 531 static int do_generic_request(struct ceph_mon_client *monc,
532 struct ceph_mon_generic_request *req) 532 struct ceph_mon_generic_request *req)
533 { 533 {
534 int err; 534 int err;
535 535
536 mutex_lock(&monc->mutex); 536 mutex_lock(&monc->mutex);
537 err = __do_generic_request(monc, 0, req); 537 err = __do_generic_request(monc, 0, req);
538 mutex_unlock(&monc->mutex); 538 mutex_unlock(&monc->mutex);
539 539
540 return err; 540 return err;
541 } 541 }
542 542
543 /* 543 /*
544 * statfs 544 * statfs
545 */ 545 */
546 static void handle_statfs_reply(struct ceph_mon_client *monc, 546 static void handle_statfs_reply(struct ceph_mon_client *monc,
547 struct ceph_msg *msg) 547 struct ceph_msg *msg)
548 { 548 {
549 struct ceph_mon_generic_request *req; 549 struct ceph_mon_generic_request *req;
550 struct ceph_mon_statfs_reply *reply = msg->front.iov_base; 550 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
551 u64 tid = le64_to_cpu(msg->hdr.tid); 551 u64 tid = le64_to_cpu(msg->hdr.tid);
552 552
553 if (msg->front.iov_len != sizeof(*reply)) 553 if (msg->front.iov_len != sizeof(*reply))
554 goto bad; 554 goto bad;
555 dout("handle_statfs_reply %p tid %llu\n", msg, tid); 555 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
556 556
557 mutex_lock(&monc->mutex); 557 mutex_lock(&monc->mutex);
558 req = __lookup_generic_req(monc, tid); 558 req = __lookup_generic_req(monc, tid);
559 if (req) { 559 if (req) {
560 *(struct ceph_statfs *)req->buf = reply->st; 560 *(struct ceph_statfs *)req->buf = reply->st;
561 req->result = 0; 561 req->result = 0;
562 get_generic_request(req); 562 get_generic_request(req);
563 } 563 }
564 mutex_unlock(&monc->mutex); 564 mutex_unlock(&monc->mutex);
565 if (req) { 565 if (req) {
566 complete_all(&req->completion); 566 complete_all(&req->completion);
567 put_generic_request(req); 567 put_generic_request(req);
568 } 568 }
569 return; 569 return;
570 570
571 bad: 571 bad:
572 pr_err("corrupt generic reply, tid %llu\n", tid); 572 pr_err("corrupt generic reply, tid %llu\n", tid);
573 ceph_msg_dump(msg); 573 ceph_msg_dump(msg);
574 } 574 }
575 575
576 /* 576 /*
577 * Do a synchronous statfs(). 577 * Do a synchronous statfs().
578 */ 578 */
579 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) 579 int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
580 { 580 {
581 struct ceph_mon_generic_request *req; 581 struct ceph_mon_generic_request *req;
582 struct ceph_mon_statfs *h; 582 struct ceph_mon_statfs *h;
583 int err; 583 int err;
584 584
585 req = kzalloc(sizeof(*req), GFP_NOFS); 585 req = kzalloc(sizeof(*req), GFP_NOFS);
586 if (!req) 586 if (!req)
587 return -ENOMEM; 587 return -ENOMEM;
588 588
589 kref_init(&req->kref); 589 kref_init(&req->kref);
590 req->buf = buf; 590 req->buf = buf;
591 req->buf_len = sizeof(*buf); 591 req->buf_len = sizeof(*buf);
592 init_completion(&req->completion); 592 init_completion(&req->completion);
593 593
594 err = -ENOMEM; 594 err = -ENOMEM;
595 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, 595 req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS,
596 true); 596 true);
597 if (!req->request) 597 if (!req->request)
598 goto out; 598 goto out;
599 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, 599 req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS,
600 true); 600 true);
601 if (!req->reply) 601 if (!req->reply)
602 goto out; 602 goto out;
603 603
604 /* fill out request */ 604 /* fill out request */
605 h = req->request->front.iov_base; 605 h = req->request->front.iov_base;
606 h->monhdr.have_version = 0; 606 h->monhdr.have_version = 0;
607 h->monhdr.session_mon = cpu_to_le16(-1); 607 h->monhdr.session_mon = cpu_to_le16(-1);
608 h->monhdr.session_mon_tid = 0; 608 h->monhdr.session_mon_tid = 0;
609 h->fsid = monc->monmap->fsid; 609 h->fsid = monc->monmap->fsid;
610 610
611 err = do_generic_request(monc, req); 611 err = do_generic_request(monc, req);
612 612
613 out: 613 out:
614 kref_put(&req->kref, release_generic_request); 614 kref_put(&req->kref, release_generic_request);
615 return err; 615 return err;
616 } 616 }
617 EXPORT_SYMBOL(ceph_monc_do_statfs); 617 EXPORT_SYMBOL(ceph_monc_do_statfs);
618 618
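A hypothetical caller, say a filesystem ->statfs hook, passes a stack buffer and byte-swaps the result; the kb/kb_used/kb_avail names are assumed to be the __le64 members of struct ceph_statfs:

struct ceph_statfs st;
int err;

err = ceph_monc_do_statfs(&client->monc, &st);
if (!err)
        pr_info("kb=%llu used=%llu avail=%llu\n",
                le64_to_cpu(st.kb),
                le64_to_cpu(st.kb_used),
                le64_to_cpu(st.kb_avail));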
619 static void handle_get_version_reply(struct ceph_mon_client *monc, 619 static void handle_get_version_reply(struct ceph_mon_client *monc,
620 struct ceph_msg *msg) 620 struct ceph_msg *msg)
621 { 621 {
622 struct ceph_mon_generic_request *req; 622 struct ceph_mon_generic_request *req;
623 u64 tid = le64_to_cpu(msg->hdr.tid); 623 u64 tid = le64_to_cpu(msg->hdr.tid);
624 void *p = msg->front.iov_base; 624 void *p = msg->front.iov_base;
625 void *end = p + msg->front_alloc_len; 625 void *end = p + msg->front_alloc_len;
626 u64 handle; 626 u64 handle;
627 627
628 dout("%s %p tid %llu\n", __func__, msg, tid); 628 dout("%s %p tid %llu\n", __func__, msg, tid);
629 629
630 ceph_decode_need(&p, end, 2*sizeof(u64), bad); 630 ceph_decode_need(&p, end, 2*sizeof(u64), bad);
631 handle = ceph_decode_64(&p); 631 handle = ceph_decode_64(&p);
632 if (tid != 0 && tid != handle) 632 if (tid != 0 && tid != handle)
633 goto bad; 633 goto bad;
634 634
635 mutex_lock(&monc->mutex); 635 mutex_lock(&monc->mutex);
636 req = __lookup_generic_req(monc, handle); 636 req = __lookup_generic_req(monc, handle);
637 if (req) { 637 if (req) {
638 *(u64 *)req->buf = ceph_decode_64(&p); 638 *(u64 *)req->buf = ceph_decode_64(&p);
639 req->result = 0; 639 req->result = 0;
640 get_generic_request(req); 640 get_generic_request(req);
641 } 641 }
642 mutex_unlock(&monc->mutex); 642 mutex_unlock(&monc->mutex);
643 if (req) { 643 if (req) {
644 complete_all(&req->completion); 644 complete_all(&req->completion);
645 put_generic_request(req); 645 put_generic_request(req);
646 } 646 }
647 647
648 return; 648 return;
649 bad: 649 bad:
650 pr_err("corrupt mon_get_version reply\n"); 650 pr_err("corrupt mon_get_version reply\n");
651 ceph_msg_dump(msg); 651 ceph_msg_dump(msg);
652 } 652 }
653 653
654 /* 654 /*
655 * Send MMonGetVersion and wait for the reply. 655 * Send MMonGetVersion and wait for the reply.
656 * 656 *
657 * @what: one of "mdsmap", "osdmap" or "monmap" 657 * @what: one of "mdsmap", "osdmap" or "monmap"
658 */ 658 */
659 int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, 659 int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
660 u64 *newest) 660 u64 *newest)
661 { 661 {
662 struct ceph_mon_generic_request *req; 662 struct ceph_mon_generic_request *req;
663 void *p, *end; 663 void *p, *end;
664 u64 tid; 664 u64 tid;
665 int err; 665 int err;
666 666
667 req = kzalloc(sizeof(*req), GFP_NOFS); 667 req = kzalloc(sizeof(*req), GFP_NOFS);
668 if (!req) 668 if (!req)
669 return -ENOMEM; 669 return -ENOMEM;
670 670
671 kref_init(&req->kref); 671 kref_init(&req->kref);
672 req->buf = newest; 672 req->buf = newest;
673 req->buf_len = sizeof(*newest); 673 req->buf_len = sizeof(*newest);
674 init_completion(&req->completion); 674 init_completion(&req->completion);
675 675
676 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, 676 req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
677 sizeof(u64) + sizeof(u32) + strlen(what), 677 sizeof(u64) + sizeof(u32) + strlen(what),
678 GFP_NOFS, true); 678 GFP_NOFS, true);
679 if (!req->request) { 679 if (!req->request) {
680 err = -ENOMEM; 680 err = -ENOMEM;
681 goto out; 681 goto out;
682 } 682 }
683 683
684 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, 684 req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
685 GFP_NOFS, true); 685 GFP_NOFS, true);
686 if (!req->reply) { 686 if (!req->reply) {
687 err = -ENOMEM; 687 err = -ENOMEM;
688 goto out; 688 goto out;
689 } 689 }
690 690
691 p = req->request->front.iov_base; 691 p = req->request->front.iov_base;
692 end = p + req->request->front_alloc_len; 692 end = p + req->request->front_alloc_len;
693 693
694 /* fill out request */ 694 /* fill out request */
695 mutex_lock(&monc->mutex); 695 mutex_lock(&monc->mutex);
696 tid = ++monc->last_tid; 696 tid = ++monc->last_tid;
697 ceph_encode_64(&p, tid); /* handle */ 697 ceph_encode_64(&p, tid); /* handle */
698 ceph_encode_string(&p, end, what, strlen(what)); 698 ceph_encode_string(&p, end, what, strlen(what));
699 699
700 err = __do_generic_request(monc, tid, req); 700 err = __do_generic_request(monc, tid, req);
701 701
702 mutex_unlock(&monc->mutex); 702 mutex_unlock(&monc->mutex);
703 out: 703 out:
704 kref_put(&req->kref, release_generic_request); 704 kref_put(&req->kref, release_generic_request);
705 return err; 705 return err;
706 } 706 }
707 EXPORT_SYMBOL(ceph_monc_do_get_version); 707 EXPORT_SYMBOL(ceph_monc_do_get_version);
708 708
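A hypothetical caller asking the monitors for the newest osdmap epoch without subscribing; "what" must be one of the map names listed above:

u64 newest = 0;
int err;

err = ceph_monc_do_get_version(&client->monc, "osdmap", &newest);
if (!err)
        dout("cluster's newest osdmap epoch is %llu\n", newest);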
709 /* 709 /*
710 * pool ops 710 * pool ops
711 */ 711 */
712 static int get_poolop_reply_buf(const char *src, size_t src_len, 712 static int get_poolop_reply_buf(const char *src, size_t src_len,
713 char *dst, size_t dst_len) 713 char *dst, size_t dst_len)
714 { 714 {
715 u32 buf_len; 715 u32 buf_len;
716 716
717 if (src_len != sizeof(u32) + dst_len) 717 if (src_len != sizeof(u32) + dst_len)
718 return -EINVAL; 718 return -EINVAL;
719 719
720 buf_len = le32_to_cpu(*(u32 *)src); 720 buf_len = le32_to_cpu(*(__le32 *)src);
721 if (buf_len != dst_len) 721 if (buf_len != dst_len)
722 return -EINVAL; 722 return -EINVAL;
723 723
724 memcpy(dst, src + sizeof(u32), dst_len); 724 memcpy(dst, src + sizeof(u32), dst_len);
725 return 0; 725 return 0;
726 } 726 }
727 727
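The one-cast change on line 720 above is one of the two fixes in this pull: under sparse (make C=1), __le32 is a bitwise-annotated type distinct from u32, so reading the buffer through a plain u32 * and handing the value to le32_to_cpu() draws a type warning, while the __le32 * cast states that the bytes are wire order and checks cleanly. Side by side:

/* before: value is typed u32, but le32_to_cpu() expects __le32 */
buf_len = le32_to_cpu(*(u32 *)src);     /* sparse: incorrect type warning */

/* after: the cast documents that the bytes are little-endian wire order */
buf_len = le32_to_cpu(*(__le32 *)src);  /* clean under make C=1 */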
728 static void handle_poolop_reply(struct ceph_mon_client *monc, 728 static void handle_poolop_reply(struct ceph_mon_client *monc,
729 struct ceph_msg *msg) 729 struct ceph_msg *msg)
730 { 730 {
731 struct ceph_mon_generic_request *req; 731 struct ceph_mon_generic_request *req;
732 struct ceph_mon_poolop_reply *reply = msg->front.iov_base; 732 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
733 u64 tid = le64_to_cpu(msg->hdr.tid); 733 u64 tid = le64_to_cpu(msg->hdr.tid);
734 734
735 if (msg->front.iov_len < sizeof(*reply)) 735 if (msg->front.iov_len < sizeof(*reply))
736 goto bad; 736 goto bad;
737 dout("handle_poolop_reply %p tid %llu\n", msg, tid); 737 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
738 738
739 mutex_lock(&monc->mutex); 739 mutex_lock(&monc->mutex);
740 req = __lookup_generic_req(monc, tid); 740 req = __lookup_generic_req(monc, tid);
741 if (req) { 741 if (req) {
742 if (req->buf_len && 742 if (req->buf_len &&
743 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply), 743 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
744 msg->front.iov_len - sizeof(*reply), 744 msg->front.iov_len - sizeof(*reply),
745 req->buf, req->buf_len) < 0) { 745 req->buf, req->buf_len) < 0) {
746 mutex_unlock(&monc->mutex); 746 mutex_unlock(&monc->mutex);
747 goto bad; 747 goto bad;
748 } 748 }
749 req->result = le32_to_cpu(reply->reply_code); 749 req->result = le32_to_cpu(reply->reply_code);
750 get_generic_request(req); 750 get_generic_request(req);
751 } 751 }
752 mutex_unlock(&monc->mutex); 752 mutex_unlock(&monc->mutex);
753 if (req) { 753 if (req) {
754 complete(&req->completion); 754 complete(&req->completion);
755 put_generic_request(req); 755 put_generic_request(req);
756 } 756 }
757 return; 757 return;
758 758
759 bad: 759 bad:
760 pr_err("corrupt generic reply, tid %llu\n", tid); 760 pr_err("corrupt generic reply, tid %llu\n", tid);
761 ceph_msg_dump(msg); 761 ceph_msg_dump(msg);
762 } 762 }
763 763
764 /* 764 /*
765 * Do a synchronous pool op. 765 * Do a synchronous pool op.
766 */ 766 */
767 static int do_poolop(struct ceph_mon_client *monc, u32 op, 767 static int do_poolop(struct ceph_mon_client *monc, u32 op,
768 u32 pool, u64 snapid, 768 u32 pool, u64 snapid,
769 char *buf, int len) 769 char *buf, int len)
770 { 770 {
771 struct ceph_mon_generic_request *req; 771 struct ceph_mon_generic_request *req;
772 struct ceph_mon_poolop *h; 772 struct ceph_mon_poolop *h;
773 int err; 773 int err;
774 774
775 req = kzalloc(sizeof(*req), GFP_NOFS); 775 req = kzalloc(sizeof(*req), GFP_NOFS);
776 if (!req) 776 if (!req)
777 return -ENOMEM; 777 return -ENOMEM;
778 778
779 kref_init(&req->kref); 779 kref_init(&req->kref);
780 req->buf = buf; 780 req->buf = buf;
781 req->buf_len = len; 781 req->buf_len = len;
782 init_completion(&req->completion); 782 init_completion(&req->completion);
783 783
784 err = -ENOMEM; 784 err = -ENOMEM;
785 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS, 785 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS,
786 true); 786 true);
787 if (!req->request) 787 if (!req->request)
788 goto out; 788 goto out;
789 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS, 789 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS,
790 true); 790 true);
791 if (!req->reply) 791 if (!req->reply)
792 goto out; 792 goto out;
793 793
794 /* fill out request */ 794 /* fill out request */
795 req->request->hdr.version = cpu_to_le16(2); 795 req->request->hdr.version = cpu_to_le16(2);
796 h = req->request->front.iov_base; 796 h = req->request->front.iov_base;
797 h->monhdr.have_version = 0; 797 h->monhdr.have_version = 0;
798 h->monhdr.session_mon = cpu_to_le16(-1); 798 h->monhdr.session_mon = cpu_to_le16(-1);
799 h->monhdr.session_mon_tid = 0; 799 h->monhdr.session_mon_tid = 0;
800 h->fsid = monc->monmap->fsid; 800 h->fsid = monc->monmap->fsid;
801 h->pool = cpu_to_le32(pool); 801 h->pool = cpu_to_le32(pool);
802 h->op = cpu_to_le32(op); 802 h->op = cpu_to_le32(op);
803 h->auid = 0; 803 h->auid = 0;
804 h->snapid = cpu_to_le64(snapid); 804 h->snapid = cpu_to_le64(snapid);
805 h->name_len = 0; 805 h->name_len = 0;
806 806
807 err = do_generic_request(monc, req); 807 err = do_generic_request(monc, req);
808 808
809 out: 809 out:
810 kref_put(&req->kref, release_generic_request); 810 kref_put(&req->kref, release_generic_request);
811 return err; 811 return err;
812 } 812 }
813 813
814 int ceph_monc_create_snapid(struct ceph_mon_client *monc, 814 int ceph_monc_create_snapid(struct ceph_mon_client *monc,
815 u32 pool, u64 *snapid) 815 u32 pool, u64 *snapid)
816 { 816 {
817 return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, 817 return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
818 pool, 0, (char *)snapid, sizeof(*snapid)); 818 pool, 0, (char *)snapid, sizeof(*snapid));
819 819
820 } 820 }
821 EXPORT_SYMBOL(ceph_monc_create_snapid); 821 EXPORT_SYMBOL(ceph_monc_create_snapid);
822 822
823 int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 823 int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
824 u32 pool, u64 snapid) 824 u32 pool, u64 snapid)
825 { 825 {
826 return do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP, 826 return do_poolop(monc, POOL_OP_DELETE_UNMANAGED_SNAP,
827 pool, snapid, NULL, 0); 827 pool, snapid, NULL, 0);
828 828
829 } 829 }
830 830
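A hypothetical caller managing a RADOS-level ("unmanaged") snapshot with the pair above; pool_id is an assumed name:

u64 snapid;
int err;

err = ceph_monc_create_snapid(&client->monc, pool_id, &snapid);
if (err)
        return err;
/* ... use the snapshot ... */
err = ceph_monc_delete_snapid(&client->monc, pool_id, snapid);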
831 /* 831 /*
832 * Resend pending generic requests. 832 * Resend pending generic requests.
833 */ 833 */
834 static void __resend_generic_request(struct ceph_mon_client *monc) 834 static void __resend_generic_request(struct ceph_mon_client *monc)
835 { 835 {
836 struct ceph_mon_generic_request *req; 836 struct ceph_mon_generic_request *req;
837 struct rb_node *p; 837 struct rb_node *p;
838 838
839 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { 839 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
840 req = rb_entry(p, struct ceph_mon_generic_request, node); 840 req = rb_entry(p, struct ceph_mon_generic_request, node);
841 ceph_msg_revoke(req->request); 841 ceph_msg_revoke(req->request);
842 ceph_msg_revoke_incoming(req->reply); 842 ceph_msg_revoke_incoming(req->reply);
843 ceph_con_send(&monc->con, ceph_msg_get(req->request)); 843 ceph_con_send(&monc->con, ceph_msg_get(req->request));
844 } 844 }
845 } 845 }
846 846
847 /* 847 /*
848 * Delayed work. If we haven't mounted yet, retry. Otherwise, 848 * Delayed work. If we haven't mounted yet, retry. Otherwise,
849 * renew/retry subscription as needed (in case it is timing out, or we 849 * renew/retry subscription as needed (in case it is timing out, or we
850 * got an ENOMEM). And keep the monitor connection alive. 850 * got an ENOMEM). And keep the monitor connection alive.
851 */ 851 */
852 static void delayed_work(struct work_struct *work) 852 static void delayed_work(struct work_struct *work)
853 { 853 {
854 struct ceph_mon_client *monc = 854 struct ceph_mon_client *monc =
855 container_of(work, struct ceph_mon_client, delayed_work.work); 855 container_of(work, struct ceph_mon_client, delayed_work.work);
856 856
857 dout("monc delayed_work\n"); 857 dout("monc delayed_work\n");
858 mutex_lock(&monc->mutex); 858 mutex_lock(&monc->mutex);
859 if (monc->hunting) { 859 if (monc->hunting) {
860 __close_session(monc); 860 __close_session(monc);
861 __open_session(monc); /* continue hunting */ 861 __open_session(monc); /* continue hunting */
862 } else { 862 } else {
863 ceph_con_keepalive(&monc->con); 863 ceph_con_keepalive(&monc->con);
864 864
865 __validate_auth(monc); 865 __validate_auth(monc);
866 866
867 if (ceph_auth_is_authenticated(monc->auth)) 867 if (ceph_auth_is_authenticated(monc->auth))
868 __send_subscribe(monc); 868 __send_subscribe(monc);
869 } 869 }
870 __schedule_delayed(monc); 870 __schedule_delayed(monc);
871 mutex_unlock(&monc->mutex); 871 mutex_unlock(&monc->mutex);
872 } 872 }
873 873
874 /* 874 /*
875 * On startup, we build a temporary monmap populated with the IPs 875 * On startup, we build a temporary monmap populated with the IPs
876 * provided by mount(2). 876 * provided by mount(2).
877 */ 877 */
878 static int build_initial_monmap(struct ceph_mon_client *monc) 878 static int build_initial_monmap(struct ceph_mon_client *monc)
879 { 879 {
880 struct ceph_options *opt = monc->client->options; 880 struct ceph_options *opt = monc->client->options;
881 struct ceph_entity_addr *mon_addr = opt->mon_addr; 881 struct ceph_entity_addr *mon_addr = opt->mon_addr;
882 int num_mon = opt->num_mon; 882 int num_mon = opt->num_mon;
883 int i; 883 int i;
884 884
885 /* build initial monmap */ 885 /* build initial monmap */
886 monc->monmap = kzalloc(sizeof(*monc->monmap) + 886 monc->monmap = kzalloc(sizeof(*monc->monmap) +
887 num_mon*sizeof(monc->monmap->mon_inst[0]), 887 num_mon*sizeof(monc->monmap->mon_inst[0]),
888 GFP_KERNEL); 888 GFP_KERNEL);
889 if (!monc->monmap) 889 if (!monc->monmap)
890 return -ENOMEM; 890 return -ENOMEM;
891 for (i = 0; i < num_mon; i++) { 891 for (i = 0; i < num_mon; i++) {
892 monc->monmap->mon_inst[i].addr = mon_addr[i]; 892 monc->monmap->mon_inst[i].addr = mon_addr[i];
893 monc->monmap->mon_inst[i].addr.nonce = 0; 893 monc->monmap->mon_inst[i].addr.nonce = 0;
894 monc->monmap->mon_inst[i].name.type = 894 monc->monmap->mon_inst[i].name.type =
895 CEPH_ENTITY_TYPE_MON; 895 CEPH_ENTITY_TYPE_MON;
896 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); 896 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
897 } 897 }
898 monc->monmap->num_mon = num_mon; 898 monc->monmap->num_mon = num_mon;
899 return 0; 899 return 0;
900 } 900 }
901 901
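The monmap allocation above is the usual header-plus-flexible-array idiom: one allocation sized for the header plus num_mon trailing entries. A standalone userspace analogue with simplified fields:

#include <stdlib.h>

struct addr {
        unsigned char bytes[16];        /* stand-in for ceph_entity_addr */
};

struct monmap {
        int num_mon;
        struct addr mon[];              /* flexible array member */
};

static struct monmap *monmap_alloc(int num_mon)
{
        struct monmap *m;

        /* one allocation covers the header and all trailing entries */
        m = calloc(1, sizeof(*m) + num_mon * sizeof(m->mon[0]));
        if (m)
                m->num_mon = num_mon;
        return m;
}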
902 int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) 902 int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
903 { 903 {
904 int err = 0; 904 int err = 0;
905 905
906 dout("init\n"); 906 dout("init\n");
907 memset(monc, 0, sizeof(*monc)); 907 memset(monc, 0, sizeof(*monc));
908 monc->client = cl; 908 monc->client = cl;
909 monc->monmap = NULL; 909 monc->monmap = NULL;
910 mutex_init(&monc->mutex); 910 mutex_init(&monc->mutex);
911 911
912 err = build_initial_monmap(monc); 912 err = build_initial_monmap(monc);
913 if (err) 913 if (err)
914 goto out; 914 goto out;
915 915
916 /* connection */ 916 /* connection */
917 /* authentication */ 917 /* authentication */
918 monc->auth = ceph_auth_init(cl->options->name, 918 monc->auth = ceph_auth_init(cl->options->name,
919 cl->options->key); 919 cl->options->key);
920 if (IS_ERR(monc->auth)) { 920 if (IS_ERR(monc->auth)) {
921 err = PTR_ERR(monc->auth); 921 err = PTR_ERR(monc->auth);
922 goto out_monmap; 922 goto out_monmap;
923 } 923 }
924 monc->auth->want_keys = 924 monc->auth->want_keys =
925 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 925 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
926 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS; 926 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
927 927
928 /* msgs */ 928 /* msgs */
929 err = -ENOMEM; 929 err = -ENOMEM;
930 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK, 930 monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
931 sizeof(struct ceph_mon_subscribe_ack), 931 sizeof(struct ceph_mon_subscribe_ack),
932 GFP_NOFS, true); 932 GFP_NOFS, true);
933 if (!monc->m_subscribe_ack) 933 if (!monc->m_subscribe_ack)
934 goto out_auth; 934 goto out_auth;
935 935
936 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS, 936 monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
937 true); 937 true);
938 if (!monc->m_subscribe) 938 if (!monc->m_subscribe)
939 goto out_subscribe_ack; 939 goto out_subscribe_ack;
940 940
941 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS, 941 monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
942 true); 942 true);
943 if (!monc->m_auth_reply) 943 if (!monc->m_auth_reply)
944 goto out_subscribe; 944 goto out_subscribe;
945 945
946 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true); 946 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
947 monc->pending_auth = 0; 947 monc->pending_auth = 0;
948 if (!monc->m_auth) 948 if (!monc->m_auth)
949 goto out_auth_reply; 949 goto out_auth_reply;
950 950
951 ceph_con_init(&monc->con, monc, &mon_con_ops, 951 ceph_con_init(&monc->con, monc, &mon_con_ops,
952 &monc->client->msgr); 952 &monc->client->msgr);
953 953
954 monc->cur_mon = -1; 954 monc->cur_mon = -1;
955 monc->hunting = true; 955 monc->hunting = true;
956 monc->sub_renew_after = jiffies; 956 monc->sub_renew_after = jiffies;
957 monc->sub_sent = 0; 957 monc->sub_sent = 0;
958 958
959 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 959 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
960 monc->generic_request_tree = RB_ROOT; 960 monc->generic_request_tree = RB_ROOT;
961 monc->num_generic_requests = 0; 961 monc->num_generic_requests = 0;
962 monc->last_tid = 0; 962 monc->last_tid = 0;
963 963
964 monc->have_mdsmap = 0; 964 monc->have_mdsmap = 0;
965 monc->have_osdmap = 0; 965 monc->have_osdmap = 0;
966 monc->want_next_osdmap = 1; 966 monc->want_next_osdmap = 1;
967 return 0; 967 return 0;
968 968
969 out_auth_reply: 969 out_auth_reply:
970 ceph_msg_put(monc->m_auth_reply); 970 ceph_msg_put(monc->m_auth_reply);
971 out_subscribe: 971 out_subscribe:
972 ceph_msg_put(monc->m_subscribe); 972 ceph_msg_put(monc->m_subscribe);
973 out_subscribe_ack: 973 out_subscribe_ack:
974 ceph_msg_put(monc->m_subscribe_ack); 974 ceph_msg_put(monc->m_subscribe_ack);
975 out_auth: 975 out_auth:
976 ceph_auth_destroy(monc->auth); 976 ceph_auth_destroy(monc->auth);
977 out_monmap: 977 out_monmap:
978 kfree(monc->monmap); 978 kfree(monc->monmap);
979 out: 979 out:
980 return err; 980 return err;
981 } 981 }
982 EXPORT_SYMBOL(ceph_monc_init); 982 EXPORT_SYMBOL(ceph_monc_init);
983 983
984 void ceph_monc_stop(struct ceph_mon_client *monc) 984 void ceph_monc_stop(struct ceph_mon_client *monc)
985 { 985 {
986 dout("stop\n"); 986 dout("stop\n");
987 cancel_delayed_work_sync(&monc->delayed_work); 987 cancel_delayed_work_sync(&monc->delayed_work);
988 988
989 mutex_lock(&monc->mutex); 989 mutex_lock(&monc->mutex);
990 __close_session(monc); 990 __close_session(monc);
991 991
992 mutex_unlock(&monc->mutex); 992 mutex_unlock(&monc->mutex);
993 993
994 /* 994 /*
995 * flush msgr queue before we destroy ourselves to ensure that: 995 * flush msgr queue before we destroy ourselves to ensure that:
996 * - any work that references our embedded con is finished. 996 * - any work that references our embedded con is finished.
997 * - any osd_client or other work that may reference an authorizer 997 * - any osd_client or other work that may reference an authorizer
998 * finishes before we shut down the auth subsystem. 998 * finishes before we shut down the auth subsystem.
999 */ 999 */
1000 ceph_msgr_flush(); 1000 ceph_msgr_flush();
1001 1001
1002 ceph_auth_destroy(monc->auth); 1002 ceph_auth_destroy(monc->auth);
1003 1003
1004 ceph_msg_put(monc->m_auth); 1004 ceph_msg_put(monc->m_auth);
1005 ceph_msg_put(monc->m_auth_reply); 1005 ceph_msg_put(monc->m_auth_reply);
1006 ceph_msg_put(monc->m_subscribe); 1006 ceph_msg_put(monc->m_subscribe);
1007 ceph_msg_put(monc->m_subscribe_ack); 1007 ceph_msg_put(monc->m_subscribe_ack);
1008 1008
1009 kfree(monc->monmap); 1009 kfree(monc->monmap);
1010 } 1010 }
1011 EXPORT_SYMBOL(ceph_monc_stop); 1011 EXPORT_SYMBOL(ceph_monc_stop);
1012 1012
1013 static void handle_auth_reply(struct ceph_mon_client *monc, 1013 static void handle_auth_reply(struct ceph_mon_client *monc,
1014 struct ceph_msg *msg) 1014 struct ceph_msg *msg)
1015 { 1015 {
1016 int ret; 1016 int ret;
1017 int was_auth = 0; 1017 int was_auth = 0;
1018 int had_debugfs_info, init_debugfs = 0; 1018 int had_debugfs_info, init_debugfs = 0;
1019 1019
1020 mutex_lock(&monc->mutex); 1020 mutex_lock(&monc->mutex);
1021 had_debugfs_info = have_debugfs_info(monc); 1021 had_debugfs_info = have_debugfs_info(monc);
1022 was_auth = ceph_auth_is_authenticated(monc->auth); 1022 was_auth = ceph_auth_is_authenticated(monc->auth);
1023 monc->pending_auth = 0; 1023 monc->pending_auth = 0;
1024 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 1024 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
1025 msg->front.iov_len, 1025 msg->front.iov_len,
1026 monc->m_auth->front.iov_base, 1026 monc->m_auth->front.iov_base,
1027 monc->m_auth->front_alloc_len); 1027 monc->m_auth->front_alloc_len);
1028 if (ret < 0) { 1028 if (ret < 0) {
1029 monc->client->auth_err = ret; 1029 monc->client->auth_err = ret;
1030 wake_up_all(&monc->client->auth_wq); 1030 wake_up_all(&monc->client->auth_wq);
1031 } else if (ret > 0) { 1031 } else if (ret > 0) {
1032 __send_prepared_auth_request(monc, ret); 1032 __send_prepared_auth_request(monc, ret);
1033 } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { 1033 } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
1034 dout("authenticated, starting session\n"); 1034 dout("authenticated, starting session\n");
1035 1035
1036 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 1036 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
1037 monc->client->msgr.inst.name.num = 1037 monc->client->msgr.inst.name.num =
1038 cpu_to_le64(monc->auth->global_id); 1038 cpu_to_le64(monc->auth->global_id);
1039 1039
1040 __send_subscribe(monc); 1040 __send_subscribe(monc);
1041 __resend_generic_request(monc); 1041 __resend_generic_request(monc);
1042 } 1042 }
1043 1043
1044 if (!had_debugfs_info && have_debugfs_info(monc)) { 1044 if (!had_debugfs_info && have_debugfs_info(monc)) {
1045 pr_info("client%lld fsid %pU\n", 1045 pr_info("client%lld fsid %pU\n",
1046 ceph_client_id(monc->client), 1046 ceph_client_id(monc->client),
1047 &monc->client->fsid); 1047 &monc->client->fsid);
1048 init_debugfs = 1; 1048 init_debugfs = 1;
1049 } 1049 }
1050 mutex_unlock(&monc->mutex); 1050 mutex_unlock(&monc->mutex);
1051 1051
1052 if (init_debugfs) { 1052 if (init_debugfs) {
1053 /* 1053 /*
1054 * do debugfs initialization without mutex to avoid 1054 * do debugfs initialization without mutex to avoid
1055 * creating a locking dependency 1055 * creating a locking dependency
1056 */ 1056 */
1057 ceph_debugfs_client_init(monc->client); 1057 ceph_debugfs_client_init(monc->client);
1058 } 1058 }
1059 } 1059 }
1060 1060
1061 static int __validate_auth(struct ceph_mon_client *monc) 1061 static int __validate_auth(struct ceph_mon_client *monc)
1062 { 1062 {
1063 int ret; 1063 int ret;
1064 1064
1065 if (monc->pending_auth) 1065 if (monc->pending_auth)
1066 return 0; 1066 return 0;
1067 1067
1068 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base, 1068 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
1069 monc->m_auth->front_alloc_len); 1069 monc->m_auth->front_alloc_len);
1070 if (ret <= 0) 1070 if (ret <= 0)
1071 return ret; /* either an error, or no need to authenticate */ 1071 return ret; /* either an error, or no need to authenticate */
1072 __send_prepared_auth_request(monc, ret); 1072 __send_prepared_auth_request(monc, ret);
1073 return 0; 1073 return 0;
1074 } 1074 }
1075 1075
1076 int ceph_monc_validate_auth(struct ceph_mon_client *monc) 1076 int ceph_monc_validate_auth(struct ceph_mon_client *monc)
1077 { 1077 {
1078 int ret; 1078 int ret;
1079 1079
1080 mutex_lock(&monc->mutex); 1080 mutex_lock(&monc->mutex);
1081 ret = __validate_auth(monc); 1081 ret = __validate_auth(monc);
1082 mutex_unlock(&monc->mutex); 1082 mutex_unlock(&monc->mutex);
1083 return ret; 1083 return ret;
1084 } 1084 }
1085 EXPORT_SYMBOL(ceph_monc_validate_auth); 1085 EXPORT_SYMBOL(ceph_monc_validate_auth);
1086 1086
1087 /* 1087 /*
1088 * handle incoming message 1088 * handle incoming message
1089 */ 1089 */
1090 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) 1090 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1091 { 1091 {
1092 struct ceph_mon_client *monc = con->private; 1092 struct ceph_mon_client *monc = con->private;
1093 int type = le16_to_cpu(msg->hdr.type); 1093 int type = le16_to_cpu(msg->hdr.type);
1094 1094
1095 if (!monc) 1095 if (!monc)
1096 return; 1096 return;
1097 1097
1098 switch (type) { 1098 switch (type) {
1099 case CEPH_MSG_AUTH_REPLY: 1099 case CEPH_MSG_AUTH_REPLY:
1100 handle_auth_reply(monc, msg); 1100 handle_auth_reply(monc, msg);
1101 break; 1101 break;
1102 1102
1103 case CEPH_MSG_MON_SUBSCRIBE_ACK: 1103 case CEPH_MSG_MON_SUBSCRIBE_ACK:
1104 handle_subscribe_ack(monc, msg); 1104 handle_subscribe_ack(monc, msg);
1105 break; 1105 break;
1106 1106
1107 case CEPH_MSG_STATFS_REPLY: 1107 case CEPH_MSG_STATFS_REPLY:
1108 handle_statfs_reply(monc, msg); 1108 handle_statfs_reply(monc, msg);
1109 break; 1109 break;
1110 1110
1111 case CEPH_MSG_MON_GET_VERSION_REPLY: 1111 case CEPH_MSG_MON_GET_VERSION_REPLY:
1112 handle_get_version_reply(monc, msg); 1112 handle_get_version_reply(monc, msg);
1113 break; 1113 break;
1114 1114
1115 case CEPH_MSG_POOLOP_REPLY: 1115 case CEPH_MSG_POOLOP_REPLY:
1116 handle_poolop_reply(monc, msg); 1116 handle_poolop_reply(monc, msg);
1117 break; 1117 break;
1118 1118
1119 case CEPH_MSG_MON_MAP: 1119 case CEPH_MSG_MON_MAP:
1120 ceph_monc_handle_map(monc, msg); 1120 ceph_monc_handle_map(monc, msg);
1121 break; 1121 break;
1122 1122
1123 case CEPH_MSG_OSD_MAP: 1123 case CEPH_MSG_OSD_MAP:
1124 ceph_osdc_handle_map(&monc->client->osdc, msg); 1124 ceph_osdc_handle_map(&monc->client->osdc, msg);
1125 break; 1125 break;
1126 1126
1127 default: 1127 default:
1128 /* can the chained handler handle it? */ 1128 /* can the chained handler handle it? */
1129 if (monc->client->extra_mon_dispatch && 1129 if (monc->client->extra_mon_dispatch &&
1130 monc->client->extra_mon_dispatch(monc->client, msg) == 0) 1130 monc->client->extra_mon_dispatch(monc->client, msg) == 0)
1131 break; 1131 break;
1132 1132
1133 pr_err("received unknown message type %d %s\n", type, 1133 pr_err("received unknown message type %d %s\n", type,
1134 ceph_msg_type_name(type)); 1134 ceph_msg_type_name(type));
1135 } 1135 }
1136 ceph_msg_put(msg); 1136 ceph_msg_put(msg);
1137 } 1137 }
1138 1138
1139 /* 1139 /*
1140 * Allocate memory for incoming message 1140 * Allocate memory for incoming message
1141 */ 1141 */
1142 static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, 1142 static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
1143 struct ceph_msg_header *hdr, 1143 struct ceph_msg_header *hdr,
1144 int *skip) 1144 int *skip)
1145 { 1145 {
1146 struct ceph_mon_client *monc = con->private; 1146 struct ceph_mon_client *monc = con->private;
1147 int type = le16_to_cpu(hdr->type); 1147 int type = le16_to_cpu(hdr->type);
1148 int front_len = le32_to_cpu(hdr->front_len); 1148 int front_len = le32_to_cpu(hdr->front_len);
1149 struct ceph_msg *m = NULL; 1149 struct ceph_msg *m = NULL;
1150 1150
1151 *skip = 0; 1151 *skip = 0;
1152 1152
1153 switch (type) { 1153 switch (type) {
1154 case CEPH_MSG_MON_SUBSCRIBE_ACK: 1154 case CEPH_MSG_MON_SUBSCRIBE_ACK:
1155 m = ceph_msg_get(monc->m_subscribe_ack); 1155 m = ceph_msg_get(monc->m_subscribe_ack);
1156 break; 1156 break;
1157 case CEPH_MSG_POOLOP_REPLY: 1157 case CEPH_MSG_POOLOP_REPLY:
1158 case CEPH_MSG_STATFS_REPLY: 1158 case CEPH_MSG_STATFS_REPLY:
1159 return get_generic_reply(con, hdr, skip); 1159 return get_generic_reply(con, hdr, skip);
1160 case CEPH_MSG_AUTH_REPLY: 1160 case CEPH_MSG_AUTH_REPLY:
1161 m = ceph_msg_get(monc->m_auth_reply); 1161 m = ceph_msg_get(monc->m_auth_reply);
1162 break; 1162 break;
1163 case CEPH_MSG_MON_GET_VERSION_REPLY: 1163 case CEPH_MSG_MON_GET_VERSION_REPLY:
1164 if (le64_to_cpu(hdr->tid) != 0) 1164 if (le64_to_cpu(hdr->tid) != 0)
1165 return get_generic_reply(con, hdr, skip); 1165 return get_generic_reply(con, hdr, skip);
1166 1166
1167 /* 1167 /*
1168 * Older OSDs don't set the reply tid even if the original 1168 * Older OSDs don't set the reply tid even if the original
1169 * request had a non-zero tid. Work around this weirdness 1169 * request had a non-zero tid. Work around this weirdness
1170 * by falling through to the allocate case. 1170 * by falling through to the allocate case.
1171 */ 1171 */
1172 case CEPH_MSG_MON_MAP: 1172 case CEPH_MSG_MON_MAP:
1173 case CEPH_MSG_MDS_MAP: 1173 case CEPH_MSG_MDS_MAP:
1174 case CEPH_MSG_OSD_MAP: 1174 case CEPH_MSG_OSD_MAP:
1175 m = ceph_msg_new(type, front_len, GFP_NOFS, false); 1175 m = ceph_msg_new(type, front_len, GFP_NOFS, false);
1176 if (!m) 1176 if (!m)
1177 return NULL; /* ENOMEM--return skip == 0 */ 1177 return NULL; /* ENOMEM--return skip == 0 */
1178 break; 1178 break;
1179 } 1179 }
1180 1180
1181 if (!m) { 1181 if (!m) {
1182 pr_info("alloc_msg unknown type %d\n", type); 1182 pr_info("alloc_msg unknown type %d\n", type);
1183 *skip = 1; 1183 *skip = 1;
1184 } else if (front_len > m->front_alloc_len) { 1184 } else if (front_len > m->front_alloc_len) {
1185 pr_warn("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n", 1185 pr_warn("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n",
1186 front_len, m->front_alloc_len, 1186 front_len, m->front_alloc_len,
1187 (unsigned int)con->peer_name.type, 1187 (unsigned int)con->peer_name.type,
1188 le64_to_cpu(con->peer_name.num)); 1188 le64_to_cpu(con->peer_name.num));
1189 ceph_msg_put(m); 1189 ceph_msg_put(m);
1190 m = ceph_msg_new(type, front_len, GFP_NOFS, false); 1190 m = ceph_msg_new(type, front_len, GFP_NOFS, false);
1191 } 1191 }
1192 1192
1193 return m; 1193 return m;
1194 } 1194 }
1195 1195
1196 /* 1196 /*
1197 * If the monitor connection resets, pick a new monitor and resubmit 1197 * If the monitor connection resets, pick a new monitor and resubmit
1198 * any pending requests. 1198 * any pending requests.
1199 */ 1199 */
1200 static void mon_fault(struct ceph_connection *con) 1200 static void mon_fault(struct ceph_connection *con)
1201 { 1201 {
1202 struct ceph_mon_client *monc = con->private; 1202 struct ceph_mon_client *monc = con->private;
1203 1203
1204 if (!monc) 1204 if (!monc)
1205 return; 1205 return;
1206 1206
1207 dout("mon_fault\n"); 1207 dout("mon_fault\n");
1208 mutex_lock(&monc->mutex); 1208 mutex_lock(&monc->mutex);
1209 if (!con->private) 1209 if (!con->private)
1210 goto out; 1210 goto out;
1211 1211
1212 if (!monc->hunting) 1212 if (!monc->hunting)
1213 pr_info("mon%d %s session lost, " 1213 pr_info("mon%d %s session lost, "
1214 "hunting for new mon\n", monc->cur_mon, 1214 "hunting for new mon\n", monc->cur_mon,
1215 ceph_pr_addr(&monc->con.peer_addr.in_addr)); 1215 ceph_pr_addr(&monc->con.peer_addr.in_addr));
1216 1216
1217 __close_session(monc); 1217 __close_session(monc);
1218 if (!monc->hunting) { 1218 if (!monc->hunting) {
1219 /* start hunting */ 1219 /* start hunting */
1220 monc->hunting = true; 1220 monc->hunting = true;
1221 __open_session(monc); 1221 __open_session(monc);
1222 } else { 1222 } else {
1223 /* already hunting, let's wait a bit */ 1223 /* already hunting, let's wait a bit */
1224 __schedule_delayed(monc); 1224 __schedule_delayed(monc);
1225 } 1225 }
1226 out: 1226 out:
1227 mutex_unlock(&monc->mutex); 1227 mutex_unlock(&monc->mutex);
1228 } 1228 }
1229 1229
1230 /* 1230 /*
1231 * We can ignore refcounting on the connection struct, as all references 1231 * We can ignore refcounting on the connection struct, as all references
1232 * will come from the messenger workqueue, which is drained prior to 1232 * will come from the messenger workqueue, which is drained prior to
1233 * mon_client destruction. 1233 * mon_client destruction.
1234 */ 1234 */
1235 static struct ceph_connection *con_get(struct ceph_connection *con) 1235 static struct ceph_connection *con_get(struct ceph_connection *con)
1236 { 1236 {
1237 return con; 1237 return con;
1238 } 1238 }
1239 1239
1240 static void con_put(struct ceph_connection *con) 1240 static void con_put(struct ceph_connection *con)
1241 { 1241 {
1242 } 1242 }
1243 1243
1244 static const struct ceph_connection_operations mon_con_ops = { 1244 static const struct ceph_connection_operations mon_con_ops = {
1245 .get = con_get, 1245 .get = con_get,
1246 .put = con_put, 1246 .put = con_put,
1247 .dispatch = dispatch, 1247 .dispatch = dispatch,
1248 .fault = mon_fault, 1248 .fault = mon_fault,
1249 .alloc_msg = mon_alloc_msg, 1249 .alloc_msg = mon_alloc_msg,
1250 }; 1250 };
1251 1251