Commit 83701246aee8f83b4b42483051b439fbe96ed47d

Authored by Yan, Zheng
Committed by Ilya Dryomov
1 parent 3738daa68a

ceph: sync read inline data

We can't use getattr to fetch inline data while holding the Fr cap,
because doing so can cause a deadlock. If we need to sync read inline
data, drop the cap refs first, then use getattr to fetch the inline data.

Signed-off-by: Yan, Zheng <zyan@redhat.com>

Showing 2 changed files with 116 additions and 13 deletions Side-by-side Diff

... ... @@ -192,17 +192,30 @@
192 192 struct ceph_osd_client *osdc =
193 193 &ceph_inode_to_client(inode)->client->osdc;
194 194 int err = 0;
  195 + u64 off = page_offset(page);
195 196 u64 len = PAGE_CACHE_SIZE;
196 197  
197   - err = ceph_readpage_from_fscache(inode, page);
  198 + if (off >= i_size_read(inode)) {
  199 + zero_user_segment(page, err, PAGE_CACHE_SIZE);
  200 + SetPageUptodate(page);
  201 + return 0;
  202 + }
198 203  
  204 + /*
  205 + * Uptodate inline data should have been added into page cache
  206 + * while getting Fcr caps.
  207 + */
  208 + if (ci->i_inline_version != CEPH_INLINE_NONE)
  209 + return -EINVAL;
  210 +
  211 + err = ceph_readpage_from_fscache(inode, page);
199 212 if (err == 0)
200 213 goto out;
201 214  
202 215 dout("readpage inode %p file %p page %p index %lu\n",
203 216 inode, filp, page, page->index);
204 217 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
205   - (u64) page_offset(page), &len,
  218 + off, &len,
206 219 ci->i_truncate_seq, ci->i_truncate_size,
207 220 &page, 1, 0);
208 221 if (err == -ENOENT)
... ... @@ -384,6 +397,9 @@
384 397 int rc = 0;
385 398 int max = 0;
386 399  
  400 + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
  401 + return -EINVAL;
  402 +
387 403 rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
388 404 &nr_pages);
389 405  
... ... @@ -1219,8 +1235,8 @@
1219 1235 want = CEPH_CAP_FILE_CACHE;
1220 1236 while (1) {
1221 1237 got = 0;
1222   - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1,
1223   - &got, &pinned_page);
  1238 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
  1239 + -1, &got, &pinned_page);
1224 1240 if (ret == 0)
1225 1241 break;
1226 1242 if (ret != -ERESTARTSYS) {
... ... @@ -1231,7 +1247,11 @@
1231 1247 dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
1232 1248 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
1233 1249  
1234   - ret = filemap_fault(vma, vmf);
  1250 + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
  1251 + ci->i_inline_version == CEPH_INLINE_NONE)
  1252 + ret = filemap_fault(vma, vmf);
  1253 + else
  1254 + ret = -EAGAIN;
1235 1255  
1236 1256 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
1237 1257 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
... ... @@ -1239,6 +1259,42 @@
1239 1259 page_cache_release(pinned_page);
1240 1260 ceph_put_cap_refs(ci, got);
1241 1261  
  1262 + if (ret != -EAGAIN)
  1263 + return ret;
  1264 +
  1265 + /* read inline data */
  1266 + if (off >= PAGE_CACHE_SIZE) {
  1267 + /* does not support inline data > PAGE_SIZE */
  1268 + ret = VM_FAULT_SIGBUS;
  1269 + } else {
  1270 + int ret1;
  1271 + struct address_space *mapping = inode->i_mapping;
  1272 + struct page *page = find_or_create_page(mapping, 0,
  1273 + mapping_gfp_mask(mapping) &
  1274 + ~__GFP_FS);
  1275 + if (!page) {
  1276 + ret = VM_FAULT_OOM;
  1277 + goto out;
  1278 + }
  1279 + ret1 = __ceph_do_getattr(inode, page,
  1280 + CEPH_STAT_CAP_INLINE_DATA, true);
  1281 + if (ret1 < 0 || off >= i_size_read(inode)) {
  1282 + unlock_page(page);
  1283 + page_cache_release(page);
  1284 + ret = VM_FAULT_SIGBUS;
  1285 + goto out;
  1286 + }
  1287 + if (ret1 < PAGE_CACHE_SIZE)
  1288 + zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
  1289 + else
  1290 + flush_dcache_page(page);
  1291 + SetPageUptodate(page);
  1292 + vmf->page = page;
  1293 + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
  1294 + }
  1295 +out:
  1296 + dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
  1297 + inode, off, (size_t)PAGE_CACHE_SIZE, ret);
1242 1298 return ret;
1243 1299 }
1244 1300  
... ... @@ -333,6 +333,11 @@
333 333 return 0;
334 334 }
335 335  
  336 +enum {
  337 + CHECK_EOF = 1,
  338 + READ_INLINE = 2,
  339 +};
  340 +
336 341 /*
337 342 * Read a range of bytes striped over one or more objects. Iterate over
338 343 * objects we stripe over. (That's not atomic, but good enough for now.)
... ... @@ -412,7 +417,7 @@
412 417 ret = read;
413 418 /* did we bounce off eof? */
414 419 if (pos + left > inode->i_size)
415   - *checkeof = 1;
  420 + *checkeof = CHECK_EOF;
416 421 }
417 422  
418 423 dout("striped_read returns %d\n", ret);
... ... @@ -808,7 +813,7 @@
808 813 struct page *pinned_page = NULL;
809 814 ssize_t ret;
810 815 int want, got = 0;
811   - int checkeof = 0, read = 0;
  816 + int retry_op = 0, read = 0;
812 817  
813 818 again:
814 819 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
... ... @@ -830,8 +835,12 @@
830 835 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
831 836 ceph_cap_string(got));
832 837  
833   - /* hmm, this isn't really async... */
834   - ret = ceph_sync_read(iocb, to, &checkeof);
  838 + if (ci->i_inline_version == CEPH_INLINE_NONE) {
  839 + /* hmm, this isn't really async... */
  840 + ret = ceph_sync_read(iocb, to, &retry_op);
  841 + } else {
  842 + retry_op = READ_INLINE;
  843 + }
835 844 } else {
836 845 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
837 846 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
838 847  
839 848  
840 849  
... ... @@ -846,12 +855,50 @@
846 855 pinned_page = NULL;
847 856 }
848 857 ceph_put_cap_refs(ci, got);
  858 + if (retry_op && ret >= 0) {
  859 + int statret;
  860 + struct page *page = NULL;
  861 + loff_t i_size;
  862 + if (retry_op == READ_INLINE) {
  863 + page = __page_cache_alloc(GFP_NOFS);
  864 + if (!page)
  865 + return -ENOMEM;
  866 + }
849 867  
850   - if (checkeof && ret >= 0) {
851   - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
  868 + statret = __ceph_do_getattr(inode, page,
  869 + CEPH_STAT_CAP_INLINE_DATA, !!page);
  870 + if (statret < 0) {
  871 + __free_page(page);
  872 + if (statret == -ENODATA) {
  873 + BUG_ON(retry_op != READ_INLINE);
  874 + goto again;
  875 + }
  876 + return statret;
  877 + }
852 878  
  879 + i_size = i_size_read(inode);
  880 + if (retry_op == READ_INLINE) {
  881 + /* does not support inline data > PAGE_SIZE */
  882 + if (i_size > PAGE_CACHE_SIZE) {
  883 + ret = -EIO;
  884 + } else if (iocb->ki_pos < i_size) {
  885 + loff_t end = min_t(loff_t, i_size,
  886 + iocb->ki_pos + len);
  887 + if (statret < end)
  888 + zero_user_segment(page, statret, end);
  889 + ret = copy_page_to_iter(page,
  890 + iocb->ki_pos & ~PAGE_MASK,
  891 + end - iocb->ki_pos, to);
  892 + iocb->ki_pos += ret;
  893 + } else {
  894 + ret = 0;
  895 + }
  896 + __free_pages(page, 0);
  897 + return ret;
  898 + }
  899 +
853 900 /* hit EOF or hole? */
854   - if (statret == 0 && iocb->ki_pos < inode->i_size &&
  901 + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
855 902 ret < len) {
856 903 dout("sync_read hit hole, ppos %lld < size %lld"
857 904 ", reading more\n", iocb->ki_pos,
... ... @@ -859,7 +906,7 @@
859 906  
860 907 read += ret;
861 908 len -= ret;
862   - checkeof = 0;
  909 + retry_op = 0;
863 910 goto again;
864 911 }
865 912 }