Commit 93aaa830fc173560505c3411806509299d8871ce

Authored by Linus Torvalds

Merge tag 'xfs-pnfs-for-linus-3.20-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs

Pull xfs pnfs block layout support from Dave Chinner:
 "This contains the changes to XFS needed to support the PNFS block
  layout server that you pulled in through Bruce's NFS server tree
  merge.

  I originally thought that I'd need to merge changes into the NFS
  server side, but Bruce had already picked them up and so this is
  purely changes to the fs/xfs/ codebase.

  Summary:

  This update contains the implementation of the PNFS server export
  methods that enable use of XFS filesystems as a block layout target"

* tag 'xfs-pnfs-for-linus-3.20-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs:
  xfs: recall pNFS layouts on conflicting access
  xfs: implement pNFS export operations

Showing 10 changed files Side-by-side Diff

... ... @@ -121,4 +121,5 @@
121 121 xfs-$(CONFIG_PROC_FS) += xfs_stats.o
122 122 xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
123 123 xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
  124 +xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
... ... @@ -30,6 +30,7 @@
30 30 #include "xfs_trace.h"
31 31 #include "xfs_icache.h"
32 32 #include "xfs_log.h"
  33 +#include "xfs_pnfs.h"
33 34  
34 35 /*
35 36 * Note that we only accept fileids which are long enough rather than allow
... ... @@ -245,5 +246,10 @@
245 246 .fh_to_parent = xfs_fs_fh_to_parent,
246 247 .get_parent = xfs_fs_get_parent,
247 248 .commit_metadata = xfs_fs_nfs_commit_metadata,
  249 +#ifdef CONFIG_NFSD_PNFS
  250 + .get_uuid = xfs_fs_get_uuid,
  251 + .map_blocks = xfs_fs_map_blocks,
  252 + .commit_blocks = xfs_fs_commit_blocks,
  253 +#endif
248 254 };
... ... @@ -36,6 +36,7 @@
36 36 #include "xfs_trace.h"
37 37 #include "xfs_log.h"
38 38 #include "xfs_icache.h"
  39 +#include "xfs_pnfs.h"
39 40  
40 41 #include <linux/aio.h>
41 42 #include <linux/dcache.h>
... ... @@ -554,6 +555,10 @@
554 555 if (error)
555 556 return error;
556 557  
  558 + error = xfs_break_layouts(inode, iolock);
  559 + if (error)
  560 + return error;
  561 +
557 562 /*
558 563 * If the offset is beyond the size of the file, we need to zero any
559 564 * blocks that fall between the existing EOF and the start of this
... ... @@ -822,6 +827,7 @@
822 827 struct xfs_inode *ip = XFS_I(inode);
823 828 long error;
824 829 enum xfs_prealloc_flags flags = 0;
  830 + uint iolock = XFS_IOLOCK_EXCL;
825 831 loff_t new_size = 0;
826 832  
827 833 if (!S_ISREG(inode->i_mode))
... ... @@ -830,7 +836,11 @@
830 836 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
831 837 return -EOPNOTSUPP;
832 838  
833   - xfs_ilock(ip, XFS_IOLOCK_EXCL);
  839 + xfs_ilock(ip, iolock);
  840 + error = xfs_break_layouts(inode, &iolock);
  841 + if (error)
  842 + goto out_unlock;
  843 +
834 844 if (mode & FALLOC_FL_PUNCH_HOLE) {
835 845 error = xfs_free_file_space(ip, offset, len);
836 846 if (error)
... ... @@ -894,7 +904,7 @@
894 904 }
895 905  
896 906 out_unlock:
897   - xfs_iunlock(ip, XFS_IOLOCK_EXCL);
  907 + xfs_iunlock(ip, iolock);
898 908 return error;
899 909 }
900 910  
... ... @@ -602,6 +602,12 @@
602 602 if (!mutex_trylock(&mp->m_growlock))
603 603 return -EWOULDBLOCK;
604 604 error = xfs_growfs_data_private(mp, in);
  605 + /*
  606 + * Increment the generation unconditionally, the error could be from
  607 + * updating the secondary superblocks, in which case the new size
  608 + * is live already.
  609 + */
  610 + mp->m_generation++;
605 611 mutex_unlock(&mp->m_growlock);
606 612 return error;
607 613 }
... ... @@ -39,6 +39,7 @@
39 39 #include "xfs_icache.h"
40 40 #include "xfs_symlink.h"
41 41 #include "xfs_trans.h"
  42 +#include "xfs_pnfs.h"
42 43  
43 44 #include <linux/capability.h>
44 45 #include <linux/dcache.h>
... ... @@ -608,6 +609,7 @@
608 609 {
609 610 struct iattr iattr;
610 611 enum xfs_prealloc_flags flags = 0;
  612 + uint iolock = XFS_IOLOCK_EXCL;
611 613 int error;
612 614  
613 615 /*
... ... @@ -636,7 +638,10 @@
636 638 if (error)
637 639 return error;
638 640  
639   - xfs_ilock(ip, XFS_IOLOCK_EXCL);
  641 + xfs_ilock(ip, iolock);
  642 + error = xfs_break_layouts(inode, &iolock);
  643 + if (error)
  644 + goto out_unlock;
640 645  
641 646 switch (bf->l_whence) {
642 647 case 0: /*SEEK_SET*/
... ... @@ -725,7 +730,7 @@
725 730 error = xfs_update_prealloc_flags(ip, flags);
726 731  
727 732 out_unlock:
728   - xfs_iunlock(ip, XFS_IOLOCK_EXCL);
  733 + xfs_iunlock(ip, iolock);
729 734 mnt_drop_write_file(filp);
730 735 return error;
731 736 }
... ... @@ -37,6 +37,7 @@
37 37 #include "xfs_da_btree.h"
38 38 #include "xfs_dir2.h"
39 39 #include "xfs_trans_space.h"
  40 +#include "xfs_pnfs.h"
40 41  
41 42 #include <linux/capability.h>
42 43 #include <linux/xattr.h>
... ... @@ -505,7 +506,7 @@
505 506 inode->i_mode |= mode & ~S_IFMT;
506 507 }
507 508  
508   -static void
  509 +void
509 510 xfs_setattr_time(
510 511 struct xfs_inode *ip,
511 512 struct iattr *iattr)
... ... @@ -979,9 +980,13 @@
979 980 int error;
980 981  
981 982 if (iattr->ia_valid & ATTR_SIZE) {
982   - xfs_ilock(ip, XFS_IOLOCK_EXCL);
983   - error = xfs_setattr_size(ip, iattr);
984   - xfs_iunlock(ip, XFS_IOLOCK_EXCL);
  983 + uint iolock = XFS_IOLOCK_EXCL;
  984 +
  985 + xfs_ilock(ip, iolock);
  986 + error = xfs_break_layouts(dentry->d_inode, &iolock);
  987 + if (!error)
  988 + error = xfs_setattr_size(ip, iattr);
  989 + xfs_iunlock(ip, iolock);
985 990 } else {
986 991 error = xfs_setattr_nonsize(ip, iattr, 0);
987 992 }
... ... @@ -32,6 +32,7 @@
32 32 */
33 33 #define XFS_ATTR_NOACL 0x01 /* Don't call posix_acl_chmod */
34 34  
  35 +extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
35 36 extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
36 37 int flags);
37 38 extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
... ... @@ -174,6 +174,17 @@
174 174 struct workqueue_struct *m_reclaim_workqueue;
175 175 struct workqueue_struct *m_log_workqueue;
176 176 struct workqueue_struct *m_eofblocks_workqueue;
  177 +
  178 + /*
  179 + * Generation of the filesystem layout. This is incremented by each
  180 + * growfs, and used by the pNFS server to ensure the client updates
  181 + * its view of the block device once it gets a layout that might
  182 + * reference the newly added blocks. Does not need to be persistent
  183 + * as long as we only allow file system size increments, but if we
  184 + * ever support shrinks it would have to be persisted in addition
  185 + * to various other kinds of pain inflicted on the pNFS server.
  186 + */
  187 + __uint32_t m_generation;
177 188 } xfs_mount_t;
178 189  
179 190 /*
  1 +/*
  2 + * Copyright (c) 2014 Christoph Hellwig.
  3 + */
  4 +#include "xfs.h"
  5 +#include "xfs_format.h"
  6 +#include "xfs_log_format.h"
  7 +#include "xfs_trans_resv.h"
  8 +#include "xfs_sb.h"
  9 +#include "xfs_mount.h"
  10 +#include "xfs_inode.h"
  11 +#include "xfs_trans.h"
  12 +#include "xfs_log.h"
  13 +#include "xfs_bmap.h"
  14 +#include "xfs_bmap_util.h"
  15 +#include "xfs_error.h"
  16 +#include "xfs_iomap.h"
  17 +#include "xfs_shared.h"
  18 +#include "xfs_bit.h"
  19 +#include "xfs_pnfs.h"
  20 +
  21 +/*
  22 + * Ensure that we do not have any outstanding pNFS layouts that can be used by
  23 + * clients to directly read from or write to this inode. This must be called
  24 + * before every operation that can remove blocks from the extent map.
  25 + * Additionally we call it during the write operation, where aren't concerned
  26 + * about exposing unallocated blocks but just want to provide basic
  27 + * synchronization between a local writer and pNFS clients. mmap writes would
  28 + * also benefit from this sort of synchronization, but due to the tricky locking
  29 + * rules in the page fault path we don't bother.
  30 + */
  31 +int
  32 +xfs_break_layouts(
  33 + struct inode *inode,
  34 + uint *iolock)
  35 +{
  36 + struct xfs_inode *ip = XFS_I(inode);
  37 + int error;
  38 +
  39 + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
  40 +
  41 + while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
  42 + xfs_iunlock(ip, *iolock);
  43 + error = break_layout(inode, true);
  44 + *iolock = XFS_IOLOCK_EXCL;
  45 + xfs_ilock(ip, *iolock);
  46 + }
  47 +
  48 + return error;
  49 +}
  50 +
  51 +/*
  52 + * Get a unique ID including its location so that the client can identify
  53 + * the exported device.
  54 + */
  55 +int
  56 +xfs_fs_get_uuid(
  57 + struct super_block *sb,
  58 + u8 *buf,
  59 + u32 *len,
  60 + u64 *offset)
  61 +{
  62 + struct xfs_mount *mp = XFS_M(sb);
  63 +
  64 + printk_once(KERN_NOTICE
  65 +"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
  66 + mp->m_fsname);
  67 +
  68 + if (*len < sizeof(uuid_t))
  69 + return -EINVAL;
  70 +
  71 + memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
  72 + *len = sizeof(uuid_t);
  73 + *offset = offsetof(struct xfs_dsb, sb_uuid);
  74 + return 0;
  75 +}
  76 +
  77 +static void
  78 +xfs_bmbt_to_iomap(
  79 + struct xfs_inode *ip,
  80 + struct iomap *iomap,
  81 + struct xfs_bmbt_irec *imap)
  82 +{
  83 + struct xfs_mount *mp = ip->i_mount;
  84 +
  85 + if (imap->br_startblock == HOLESTARTBLOCK) {
  86 + iomap->blkno = IOMAP_NULL_BLOCK;
  87 + iomap->type = IOMAP_HOLE;
  88 + } else if (imap->br_startblock == DELAYSTARTBLOCK) {
  89 + iomap->blkno = IOMAP_NULL_BLOCK;
  90 + iomap->type = IOMAP_DELALLOC;
  91 + } else {
  92 + iomap->blkno =
  93 + XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
  94 + if (imap->br_state == XFS_EXT_UNWRITTEN)
  95 + iomap->type = IOMAP_UNWRITTEN;
  96 + else
  97 + iomap->type = IOMAP_MAPPED;
  98 + }
  99 + iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
  100 + iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
  101 +}
  102 +
  103 +/*
  104 + * Get a layout for the pNFS client.
  105 + */
  106 +int
  107 +xfs_fs_map_blocks(
  108 + struct inode *inode,
  109 + loff_t offset,
  110 + u64 length,
  111 + struct iomap *iomap,
  112 + bool write,
  113 + u32 *device_generation)
  114 +{
  115 + struct xfs_inode *ip = XFS_I(inode);
  116 + struct xfs_mount *mp = ip->i_mount;
  117 + struct xfs_bmbt_irec imap;
  118 + xfs_fileoff_t offset_fsb, end_fsb;
  119 + loff_t limit;
  120 + int bmapi_flags = XFS_BMAPI_ENTIRE;
  121 + int nimaps = 1;
  122 + uint lock_flags;
  123 + int error = 0;
  124 +
  125 + if (XFS_FORCED_SHUTDOWN(mp))
  126 + return -EIO;
  127 +
  128 + /*
  129 + * We can't export inodes residing on the realtime device. The realtime
  130 + * device doesn't have a UUID to identify it, so the client has no way
  131 + * to find it.
  132 + */
  133 + if (XFS_IS_REALTIME_INODE(ip))
  134 + return -ENXIO;
  135 +
  136 + /*
  137 + * Lock out any other I/O before we flush and invalidate the pagecache,
  138 + * and then hand out a layout to the remote system. This is very
  139 + * similar to direct I/O, except that the synchronization is much more
  140 + * complicated. See the comment near xfs_break_layouts for a detailed
  141 + * explanation.
  142 + */
  143 + xfs_ilock(ip, XFS_IOLOCK_EXCL);
  144 +
  145 + error = -EINVAL;
  146 + limit = mp->m_super->s_maxbytes;
  147 + if (!write)
  148 + limit = max(limit, round_up(i_size_read(inode),
  149 + inode->i_sb->s_blocksize));
  150 + if (offset > limit)
  151 + goto out_unlock;
  152 + if (offset > limit - length)
  153 + length = limit - offset;
  154 +
  155 + error = filemap_write_and_wait(inode->i_mapping);
  156 + if (error)
  157 + goto out_unlock;
  158 + error = invalidate_inode_pages2(inode->i_mapping);
  159 + if (WARN_ON_ONCE(error))
  160 + return error;
  161 +
  162 + end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
  163 + offset_fsb = XFS_B_TO_FSBT(mp, offset);
  164 +
  165 + lock_flags = xfs_ilock_data_map_shared(ip);
  166 + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
  167 + &imap, &nimaps, bmapi_flags);
  168 + xfs_iunlock(ip, lock_flags);
  169 +
  170 + if (error)
  171 + goto out_unlock;
  172 +
  173 + if (write) {
  174 + enum xfs_prealloc_flags flags = 0;
  175 +
  176 + ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
  177 +
  178 + if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
  179 + error = xfs_iomap_write_direct(ip, offset, length,
  180 + &imap, nimaps);
  181 + if (error)
  182 + goto out_unlock;
  183 +
  184 + /*
  185 + * Ensure the next transaction is committed
  186 + * synchronously so that the blocks allocated and
  187 + * handed out to the client are guaranteed to be
  188 + * present even after a server crash.
  189 + */
  190 + flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
  191 + }
  192 +
  193 + error = xfs_update_prealloc_flags(ip, flags);
  194 + if (error)
  195 + goto out_unlock;
  196 + }
  197 + xfs_iunlock(ip, XFS_IOLOCK_EXCL);
  198 +
  199 + xfs_bmbt_to_iomap(ip, iomap, &imap);
  200 + *device_generation = mp->m_generation;
  201 + return error;
  202 +out_unlock:
  203 + xfs_iunlock(ip, XFS_IOLOCK_EXCL);
  204 + return error;
  205 +}
  206 +
  207 +/*
  208 + * Ensure the size update falls into a valid allocated block.
  209 + */
  210 +static int
  211 +xfs_pnfs_validate_isize(
  212 + struct xfs_inode *ip,
  213 + xfs_off_t isize)
  214 +{
  215 + struct xfs_bmbt_irec imap;
  216 + int nimaps = 1;
  217 + int error = 0;
  218 +
  219 + xfs_ilock(ip, XFS_ILOCK_SHARED);
  220 + error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
  221 + &imap, &nimaps, 0);
  222 + xfs_iunlock(ip, XFS_ILOCK_SHARED);
  223 + if (error)
  224 + return error;
  225 +
  226 + if (imap.br_startblock == HOLESTARTBLOCK ||
  227 + imap.br_startblock == DELAYSTARTBLOCK ||
  228 + imap.br_state == XFS_EXT_UNWRITTEN)
  229 + return -EIO;
  230 + return 0;
  231 +}
  232 +
  233 +/*
  234 + * Make sure the blocks described by maps are stable on disk. This includes
  235 + * converting any unwritten extents, flushing the disk cache and updating the
  236 + * time stamps.
  237 + *
  238 + * Note that we rely on the caller to always send us a timestamp update so that
  239 + * we always commit a transaction here. If that stops being true we will have
  240 + * to manually flush the cache here similar to what the fsync code path does
  241 + * for datasyncs on files that have no dirty metadata.
  242 + */
  243 +int
  244 +xfs_fs_commit_blocks(
  245 + struct inode *inode,
  246 + struct iomap *maps,
  247 + int nr_maps,
  248 + struct iattr *iattr)
  249 +{
  250 + struct xfs_inode *ip = XFS_I(inode);
  251 + struct xfs_mount *mp = ip->i_mount;
  252 + struct xfs_trans *tp;
  253 + bool update_isize = false;
  254 + int error, i;
  255 + loff_t size;
  256 +
  257 + ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
  258 +
  259 + xfs_ilock(ip, XFS_IOLOCK_EXCL);
  260 +
  261 + size = i_size_read(inode);
  262 + if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
  263 + update_isize = true;
  264 + size = iattr->ia_size;
  265 + }
  266 +
  267 + for (i = 0; i < nr_maps; i++) {
  268 + u64 start, length, end;
  269 +
  270 + start = maps[i].offset;
  271 + if (start > size)
  272 + continue;
  273 +
  274 + end = start + maps[i].length;
  275 + if (end > size)
  276 + end = size;
  277 +
  278 + length = end - start;
  279 + if (!length)
  280 + continue;
  281 +
  282 + /*
  283 + * Make sure reads through the pagecache see the new data.
  284 + */
  285 + error = invalidate_inode_pages2_range(inode->i_mapping,
  286 + start >> PAGE_CACHE_SHIFT,
  287 + (end - 1) >> PAGE_CACHE_SHIFT);
  288 + WARN_ON_ONCE(error);
  289 +
  290 + error = xfs_iomap_write_unwritten(ip, start, length);
  291 + if (error)
  292 + goto out_drop_iolock;
  293 + }
  294 +
  295 + if (update_isize) {
  296 + error = xfs_pnfs_validate_isize(ip, size);
  297 + if (error)
  298 + goto out_drop_iolock;
  299 + }
  300 +
  301 + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
  302 + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
  303 + if (error)
  304 + goto out_drop_iolock;
  305 +
  306 + xfs_ilock(ip, XFS_ILOCK_EXCL);
  307 + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
  308 + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
  309 +
  310 + xfs_setattr_time(ip, iattr);
  311 + if (update_isize) {
  312 + i_size_write(inode, iattr->ia_size);
  313 + ip->i_d.di_size = iattr->ia_size;
  314 + }
  315 +
  316 + xfs_trans_set_sync(tp);
  317 + error = xfs_trans_commit(tp, 0);
  318 +
  319 +out_drop_iolock:
  320 + xfs_iunlock(ip, XFS_IOLOCK_EXCL);
  321 + return error;
  322 +}
  1 +#ifndef _XFS_PNFS_H
  2 +#define _XFS_PNFS_H 1
  3 +
  4 +#ifdef CONFIG_NFSD_PNFS
  5 +int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
  6 +int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
  7 + struct iomap *iomap, bool write, u32 *device_generation);
  8 +int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
  9 + struct iattr *iattr);
  10 +
  11 +int xfs_break_layouts(struct inode *inode, uint *iolock);
  12 +#else
  13 +static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
  14 +{
  15 + return 0;
  16 +}
  17 +#endif /* CONFIG_NFSD_PNFS */
  18 +#endif /* _XFS_PNFS_H */