Commit a564b8f0398636ba30b07c0eaebdef7ff7837249

Authored by Mel Gorman
Committed by Linus Torvalds
1 parent 29418aa4bd

nfs: enable swap on NFS

Implement the new swapfile a_ops for NFS and hook up ->direct_IO.  This
sets the NFS socket to SOCK_MEMALLOC, runs socket reconnects under
PF_MEMALLOC, and sets SOCK_MEMALLOC again before engaging the protocol
->connect() method.

PF_MEMALLOC should allow the allocation of struct socket and related
objects, and the early (re)setting of SOCK_MEMALLOC should allow us to
receive the packets required for the TCP connection buildup.
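
As a sketch of that save/restore pattern (illustrative names, not the
literal diff below; the real connect workers restore the flag on all
exit paths, per the jlayton fix noted in the attributions):

	static void connect_worker_memalloc(struct rpc_xprt *xprt)
	{
		unsigned long pflags = current->flags;

		if (xprt->swapper)
			current->flags |= PF_MEMALLOC;	/* may dip into reserves */

		do_connect(xprt);	/* stand-in for the ->connect() work */

		/* restore only the PF_MEMALLOC bit, even on error paths */
		current->flags = (current->flags & ~PF_MEMALLOC) |
				 (pflags & PF_MEMALLOC);
	}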

[jlayton@redhat.com: Restore PF_MEMALLOC task flags in all cases]
[dfeng@redhat.com: Fix handling of multiple swap files]
[a.p.zijlstra@chello.nl: Original patch]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: David S. Miller <davem@davemloft.net>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Paris <eparis@redhat.com>
Cc: James Morris <jmorris@namei.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Neil Brown <neilb@suse.de>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: Xiaotian Feng <dfeng@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 9 changed files with 149 additions and 34 deletions.

fs/nfs/Kconfig
@@ -86,6 +86,14 @@
 
 	  If unsure, say Y.
 
+config NFS_SWAP
+	bool "Provide swap over NFS support"
+	default n
+	depends on NFS_FS
+	select SUNRPC_SWAP
+	help
+	  This option enables swapon to work on files located on NFS mounts.
+
 config NFS_V4_1
 	bool "NFS client support for NFSv4.1 (EXPERIMENTAL)"
 	depends on NFS_V4 && EXPERIMENTAL
fs/nfs/direct.c
@@ -115,17 +115,28 @@
  * @nr_segs: size of iovec array
  *
  * The presence of this routine in the address space ops vector means
- * the NFS client supports direct I/O. However, we shunt off direct
- * read and write requests before the VFS gets them, so this method
- * should never be called.
+ * the NFS client supports direct I/O. However, for most direct IO, we
+ * shunt off direct read and write requests before the VFS gets them,
+ * so this method is only ever called for swap.
  */
 ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
 {
+#ifndef CONFIG_NFS_SWAP
 	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
 			iocb->ki_filp->f_path.dentry->d_name.name,
 			(long long) pos, nr_segs);
 
 	return -EINVAL;
+#else
+	VM_BUG_ON(iocb->ki_left != PAGE_SIZE);
+	VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
+
+	if (rw == READ || rw == KERNEL_READ)
+		return nfs_file_direct_read(iocb, iov, nr_segs, pos,
+				rw == READ ? true : false);
+	return nfs_file_direct_write(iocb, iov, nr_segs, pos,
+				rw == WRITE ? true : false);
+#endif /* CONFIG_NFS_SWAP */
 }
 
 static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
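
For context on why this path is now "only ever called for swap": the
swap-out side added elsewhere in this patch series (mm/page_io.c)
issues a single-page, kernel-addressed direct write through the a_ops,
roughly as below; this is what the PAGE_SIZE assertions and the
KERNEL_READ/KERNEL_WRITE rw values anticipate. Details are approximate:

	struct file *swap_file = sis->swap_file;
	struct address_space *mapping = swap_file->f_mapping;
	struct iovec iov = {
		.iov_base = kmap(page),		/* one page of swap data */
		.iov_len  = PAGE_SIZE,
	};
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, swap_file);
	kiocb.ki_pos = page_file_offset(page);
	kiocb.ki_left = PAGE_SIZE;
	kiocb.ki_nbytes = PAGE_SIZE;

	ret = mapping->a_ops->direct_IO(KERNEL_WRITE, &kiocb, &iov,
					kiocb.ki_pos, 1);
	kunmap(page);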
@@ -303,7 +314,7 @@
  */
 static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
 						const struct iovec *iov,
-						loff_t pos)
+						loff_t pos, bool uio)
 {
 	struct nfs_direct_req *dreq = desc->pg_dreq;
 	struct nfs_open_context *ctx = dreq->ctx;
@@ -331,12 +342,20 @@
 				  GFP_KERNEL);
 		if (!pagevec)
 			break;
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
+		if (uio) {
+			down_read(&current->mm->mmap_sem);
+			result = get_user_pages(current, current->mm, user_addr,
 					npages, 1, 0, pagevec, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (result < 0)
-			break;
+			up_read(&current->mm->mmap_sem);
+			if (result < 0)
+				break;
+		} else {
+			WARN_ON(npages != 1);
+			result = get_kernel_page(user_addr, 1, pagevec);
+			if (WARN_ON(result != 1))
+				break;
+		}
+
 		if ((unsigned)result < npages) {
 			bytes = result * PAGE_SIZE;
 			if (bytes <= pgbase) {
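
get_kernel_page() comes from an earlier patch in this series ("mm: add
get_kernel_pages() and get_kernel_page()"); it resolves a kernel
virtual address to its backing struct page, letting the swap path reuse
the page-based I/O code without get_user_pages(). Its implementation is
essentially:

	int get_kernel_page(unsigned long start, int write, struct page **pages)
	{
		const struct kvec kiov = {
			.iov_base = (void *)start,
			.iov_len = PAGE_SIZE
		};

		return get_kernel_pages(&kiov, 1, write, pages);
	}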
@@ -386,7 +405,7 @@
 static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
 					      const struct iovec *iov,
 					      unsigned long nr_segs,
-					      loff_t pos)
+					      loff_t pos, bool uio)
 {
 	struct nfs_pageio_descriptor desc;
 	ssize_t result = -EINVAL;
@@ -400,7 +419,7 @@
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
-		result = nfs_direct_read_schedule_segment(&desc, vec, pos);
+		result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
 		if (result < 0)
 			break;
 		requested_bytes += result;
@@ -426,7 +445,7 @@
 }
 
 static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
-			       unsigned long nr_segs, loff_t pos)
+			       unsigned long nr_segs, loff_t pos, bool uio)
 {
 	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -444,7 +463,7 @@
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
+	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
 	if (!result)
 		result = nfs_direct_wait(dreq);
 	NFS_I(inode)->read_io += result;
@@ -610,7 +629,7 @@
  */
 static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc,
 						 const struct iovec *iov,
-						 loff_t pos)
+						 loff_t pos, bool uio)
 {
 	struct nfs_direct_req *dreq = desc->pg_dreq;
 	struct nfs_open_context *ctx = dreq->ctx;
@@ -638,12 +657,19 @@
 		if (!pagevec)
 			break;
 
-		down_read(&current->mm->mmap_sem);
-		result = get_user_pages(current, current->mm, user_addr,
-					npages, 0, 0, pagevec, NULL);
-		up_read(&current->mm->mmap_sem);
-		if (result < 0)
-			break;
+		if (uio) {
+			down_read(&current->mm->mmap_sem);
+			result = get_user_pages(current, current->mm, user_addr,
+						npages, 0, 0, pagevec, NULL);
+			up_read(&current->mm->mmap_sem);
+			if (result < 0)
+				break;
+		} else {
+			WARN_ON(npages != 1);
+			result = get_kernel_page(user_addr, 0, pagevec);
+			if (WARN_ON(result != 1))
+				break;
+		}
 
 		if ((unsigned)result < npages) {
 			bytes = result * PAGE_SIZE;
@@ -774,7 +800,7 @@
 static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 					       const struct iovec *iov,
 					       unsigned long nr_segs,
-					       loff_t pos)
+					       loff_t pos, bool uio)
 {
 	struct nfs_pageio_descriptor desc;
 	struct inode *inode = dreq->inode;
@@ -790,7 +816,7 @@
 
 	for (seg = 0; seg < nr_segs; seg++) {
 		const struct iovec *vec = &iov[seg];
-		result = nfs_direct_write_schedule_segment(&desc, vec, pos);
+		result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
 		if (result < 0)
 			break;
 		requested_bytes += result;
@@ -818,7 +844,7 @@
 
 static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
 				unsigned long nr_segs, loff_t pos,
-				size_t count)
+				size_t count, bool uio)
 {
 	ssize_t result = -ENOMEM;
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -836,7 +862,7 @@
 	if (!is_sync_kiocb(iocb))
 		dreq->iocb = iocb;
 
-	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos);
+	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
 	if (!result)
 		result = nfs_direct_wait(dreq);
 out_release:
@@ -867,7 +893,7 @@
  * cache.
  */
 ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t pos)
+				unsigned long nr_segs, loff_t pos, bool uio)
 {
 	ssize_t retval = -EINVAL;
 	struct file *file = iocb->ki_filp;
@@ -892,7 +918,7 @@
 
 	task_io_account_read(count);
 
-	retval = nfs_direct_read(iocb, iov, nr_segs, pos);
+	retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
 	if (retval > 0)
 		iocb->ki_pos = pos + retval;
 
@@ -923,7 +949,7 @@
  * is no atomic O_APPEND write facility in the NFS protocol.
  */
 ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-				unsigned long nr_segs, loff_t pos)
+				unsigned long nr_segs, loff_t pos, bool uio)
 {
 	ssize_t retval = -EINVAL;
 	struct file *file = iocb->ki_filp;
@@ -955,7 +981,7 @@
 
 	task_io_account_write(count);
 
-	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
+	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
 	if (retval > 0) {
 		struct inode *inode = mapping->host;
 
fs/nfs/file.c
@@ -175,7 +175,7 @@
 	ssize_t result;
 
 	if (iocb->ki_filp->f_flags & O_DIRECT)
-		return nfs_file_direct_read(iocb, iov, nr_segs, pos);
+		return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
 
 	dprintk("NFS: read(%s/%s, %lu@%lu)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
@@ -482,6 +482,20 @@
 	return nfs_wb_page(inode, page);
 }
 
+#ifdef CONFIG_NFS_SWAP
+static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
+		sector_t *span)
+{
+	*span = sis->pages;
+	return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1);
+}
+
+static void nfs_swap_deactivate(struct file *file)
+{
+	xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0);
+}
+#endif
+
 const struct address_space_operations nfs_file_aops = {
 	.readpage = nfs_readpage,
 	.readpages = nfs_readpages,
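
These hooks are driven from swapon(2)/swapoff(2): setup_swap_extents()
in mm/swapfile.c (extended earlier in this series) prefers
a_ops->swap_activate when the filesystem provides it, approximately:

	if (mapping->a_ops->swap_activate) {
		ret = mapping->a_ops->swap_activate(sis, swap_file, span);
		if (!ret) {
			sis->flags |= SWP_FILE;
			ret = add_swap_extent(sis, 0, sis->max, 0);
			*span = sis->pages;
		}
		return ret;
	}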
@@ -496,6 +510,10 @@
 	.migratepage = nfs_migrate_page,
 	.launder_page = nfs_launder_page,
 	.error_remove_page = generic_error_remove_page,
+#ifdef CONFIG_NFS_SWAP
+	.swap_activate = nfs_swap_activate,
+	.swap_deactivate = nfs_swap_deactivate,
+#endif
 };
 
 /*
@@ -570,7 +588,7 @@
 	size_t count = iov_length(iov, nr_segs);
 
 	if (iocb->ki_filp->f_flags & O_DIRECT)
-		return nfs_file_direct_write(iocb, iov, nr_segs, pos);
+		return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
 
 	dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
 		dentry->d_parent->d_name.name, dentry->d_name.name,
include/linux/nfs_fs.h
@@ -473,10 +473,10 @@
 						unsigned long);
 extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
 			const struct iovec *iov, unsigned long nr_segs,
-			loff_t pos);
+			loff_t pos, bool uio);
 extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
 			const struct iovec *iov, unsigned long nr_segs,
-			loff_t pos);
+			loff_t pos, bool uio);
 
 /*
  * linux/fs/nfs/dir.c
include/linux/sunrpc/xprt.h
@@ -174,6 +174,8 @@
 	unsigned long		state;		/* transport state */
 	unsigned char		shutdown   : 1,	/* being shut down */
 				resvport   : 1; /* use a reserved port */
+	unsigned int		swapper;	/* we're swapping over this
+						   transport */
 	unsigned int		bind_index;	/* bind function index */
 
 	/*
@@ -316,6 +318,7 @@
 void			xprt_disconnect_done(struct rpc_xprt *xprt);
 void			xprt_force_disconnect(struct rpc_xprt *xprt);
 void			xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
+int			xs_swapper(struct rpc_xprt *xprt, int enable);
 
 /*
  * Reserved bit positions in xprt->state
net/sunrpc/Kconfig
@@ -21,6 +21,11 @@
 
 	  If unsure, say N.
 
+config SUNRPC_SWAP
+	bool
+	depends on SUNRPC
+	select NETVM
+
 config RPCSEC_GSS_KRB5
 	tristate "Secure RPC: Kerberos V mechanism"
 	depends on SUNRPC && CRYPTO
net/sunrpc/clnt.c
@@ -717,6 +717,15 @@
 	atomic_inc(&clnt->cl_count);
 	if (clnt->cl_softrtry)
 		task->tk_flags |= RPC_TASK_SOFT;
+	if (sk_memalloc_socks()) {
+		struct rpc_xprt *xprt;
+
+		rcu_read_lock();
+		xprt = rcu_dereference(clnt->cl_xprt);
+		if (xprt->swapper)
+			task->tk_flags |= RPC_TASK_SWAPPER;
+		rcu_read_unlock();
+	}
 	/* Add to the client's list of all tasks */
 	spin_lock(&clnt->cl_lock);
 	list_add_tail(&task->tk_task, &clnt->cl_tasks);
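
RPC_TASK_SWAPPER is a pre-existing tk_flags bit; rpc_malloc() below
tests it via the RPC_IS_SWAPPER() macro from
include/linux/sunrpc/sched.h:

	#define RPC_IS_SWAPPER(t)	((t)->tk_flags & RPC_TASK_SWAPPER)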
net/sunrpc/sched.c
@@ -812,8 +812,11 @@
 void *rpc_malloc(struct rpc_task *task, size_t size)
 {
 	struct rpc_buffer *buf;
-	gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT;
+	gfp_t gfp = GFP_NOWAIT;
 
+	if (RPC_IS_SWAPPER(task))
+		gfp |= __GFP_MEMALLOC;
+
 	size += sizeof(struct rpc_buffer);
 	if (size <= RPC_BUFFER_MAXSIZE)
 		buf = mempool_alloc(rpc_buffer_mempool, gfp);
@@ -886,7 +889,7 @@
 static struct rpc_task *
 rpc_alloc_task(void)
 {
-	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
+	return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
 }
 
 /*
net/sunrpc/xprtsock.c
@@ -1927,6 +1927,45 @@
 	xprt_wake_pending_tasks(xprt, status);
 }
 
+#ifdef CONFIG_SUNRPC_SWAP
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+			xprt);
+
+	if (xprt->swapper)
+		sk_set_memalloc(transport->inet);
+}
+
+/**
+ * xs_swapper - Tag this transport as being used for swap.
+ * @xprt: transport to tag
+ * @enable: enable/disable
+ *
+ */
+int xs_swapper(struct rpc_xprt *xprt, int enable)
+{
+	struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+			xprt);
+	int err = 0;
+
+	if (enable) {
+		xprt->swapper++;
+		xs_set_memalloc(xprt);
+	} else if (xprt->swapper) {
+		xprt->swapper--;
+		sk_clear_memalloc(transport->inet);
+	}
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(xs_swapper);
+#else
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+}
+#endif
+
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
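
sk_set_memalloc() and sk_clear_memalloc() come from the core netvm
patches in this series (net/core/sock.c). Setting SOCK_MEMALLOC lets
the socket's allocations reach the emergency reserves and bumps the
static key behind sk_memalloc_socks(), which the rpc_new_task() hunk
above tests. Roughly:

	void sk_set_memalloc(struct sock *sk)
	{
		sock_set_flag(sk, SOCK_MEMALLOC);
		sk->sk_allocation |= __GFP_MEMALLOC;
		static_key_slow_inc(&memalloc_socks);
	}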
@@ -1951,6 +1990,8 @@
 		transport->sock = sock;
 		transport->inet = sk;
 
+		xs_set_memalloc(xprt);
+
 		write_unlock_bh(&sk->sk_callback_lock);
 	}
 	xs_udp_do_set_buffer_size(xprt);
@@ -2074,6 +2115,8 @@
 
 	if (!xprt_bound(xprt))
 		goto out;
+
+	xs_set_memalloc(xprt);
 
 	/* Tell the socket layer to start connecting... */
 	xprt->stat.connect_count++;