Commit a564b8f0398636ba30b07c0eaebdef7ff7837249
Committed by
Linus Torvalds
1 parent
29418aa4bd
Exists in
smarc-l5.0.0_1.0.0-ga
and in
5 other branches
nfs: enable swap on NFS
Implement the new swapfile a_ops for NFS and hook up ->direct_IO. This will set the NFS socket to SOCK_MEMALLOC and run socket reconnect under PF_MEMALLOC as well as reset SOCK_MEMALLOC before engaging the protocol ->connect() method. PF_MEMALLOC should allow the allocation of struct socket and related objects and the early (re)setting of SOCK_MEMALLOC should allow us to receive the packets required for the TCP connection buildup. [jlayton@redhat.com: Restore PF_MEMALLOC task flags in all cases] [dfeng@redhat.com: Fix handling of multiple swap files] [a.p.zijlstra@chello.nl: Original patch] Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: David S. Miller <davem@davemloft.net> Cc: Eric B Munson <emunson@mgebm.net> Cc: Eric Paris <eparis@redhat.com> Cc: James Morris <jmorris@namei.org> Cc: Mel Gorman <mgorman@suse.de> Cc: Mike Christie <michaelc@cs.wisc.edu> Cc: Neil Brown <neilb@suse.de> Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: Xiaotian Feng <dfeng@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 9 changed files with 149 additions and 34 deletions Side-by-side Diff
fs/nfs/Kconfig
... | ... | @@ -86,6 +86,14 @@ |
86 | 86 | |
87 | 87 | If unsure, say Y. |
88 | 88 | |
89 | +config NFS_SWAP | |
90 | + bool "Provide swap over NFS support" | |
91 | + default n | |
92 | + depends on NFS_FS | |
93 | + select SUNRPC_SWAP | |
94 | + help | |
95 | + This option enables swapon to work on files located on NFS mounts. | |
96 | + | |
89 | 97 | config NFS_V4_1 |
90 | 98 | bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" |
91 | 99 | depends on NFS_V4 && EXPERIMENTAL |
fs/nfs/direct.c
... | ... | @@ -115,17 +115,28 @@ |
115 | 115 | * @nr_segs: size of iovec array |
116 | 116 | * |
117 | 117 | * The presence of this routine in the address space ops vector means |
118 | - * the NFS client supports direct I/O. However, we shunt off direct | |
119 | - * read and write requests before the VFS gets them, so this method | |
120 | - * should never be called. | |
118 | + * the NFS client supports direct I/O. However, for most direct IO, we | |
119 | + * shunt off direct read and write requests before the VFS gets them, | |
120 | + * so this method is only ever called for swap. | |
121 | 121 | */ |
122 | 122 | ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) |
123 | 123 | { |
124 | +#ifndef CONFIG_NFS_SWAP | |
124 | 125 | dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", |
125 | 126 | iocb->ki_filp->f_path.dentry->d_name.name, |
126 | 127 | (long long) pos, nr_segs); |
127 | 128 | |
128 | 129 | return -EINVAL; |
130 | +#else | |
131 | + VM_BUG_ON(iocb->ki_left != PAGE_SIZE); | |
132 | + VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); | |
133 | + | |
134 | + if (rw == READ || rw == KERNEL_READ) | |
135 | + return nfs_file_direct_read(iocb, iov, nr_segs, pos, | |
136 | + rw == READ ? true : false); | |
137 | + return nfs_file_direct_write(iocb, iov, nr_segs, pos, | |
138 | + rw == WRITE ? true : false); | |
139 | +#endif /* CONFIG_NFS_SWAP */ | |
129 | 140 | } |
130 | 141 | |
131 | 142 | static void nfs_direct_release_pages(struct page **pages, unsigned int npages) |
... | ... | @@ -303,7 +314,7 @@ |
303 | 314 | */ |
304 | 315 | static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, |
305 | 316 | const struct iovec *iov, |
306 | - loff_t pos) | |
317 | + loff_t pos, bool uio) | |
307 | 318 | { |
308 | 319 | struct nfs_direct_req *dreq = desc->pg_dreq; |
309 | 320 | struct nfs_open_context *ctx = dreq->ctx; |
310 | 321 | |
... | ... | @@ -331,12 +342,20 @@ |
331 | 342 | GFP_KERNEL); |
332 | 343 | if (!pagevec) |
333 | 344 | break; |
334 | - down_read(&current->mm->mmap_sem); | |
335 | - result = get_user_pages(current, current->mm, user_addr, | |
345 | + if (uio) { | |
346 | + down_read(¤t->mm->mmap_sem); | |
347 | + result = get_user_pages(current, current->mm, user_addr, | |
336 | 348 | npages, 1, 0, pagevec, NULL); |
337 | - up_read(&current->mm->mmap_sem); | |
338 | - if (result < 0) | |
339 | - break; | |
349 | + up_read(&current->mm->mmap_sem); | |
350 | + if (result < 0) | |
351 | + break; | |
352 | + } else { | |
353 | + WARN_ON(npages != 1); | |
354 | + result = get_kernel_page(user_addr, 1, pagevec); | |
355 | + if (WARN_ON(result != 1)) | |
356 | + break; | |
357 | + } | |
358 | + | |
340 | 359 | if ((unsigned)result < npages) { |
341 | 360 | bytes = result * PAGE_SIZE; |
342 | 361 | if (bytes <= pgbase) { |
... | ... | @@ -386,7 +405,7 @@ |
386 | 405 | static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, |
387 | 406 | const struct iovec *iov, |
388 | 407 | unsigned long nr_segs, |
389 | - loff_t pos) | |
408 | + loff_t pos, bool uio) | |
390 | 409 | { |
391 | 410 | struct nfs_pageio_descriptor desc; |
392 | 411 | ssize_t result = -EINVAL; |
... | ... | @@ -400,7 +419,7 @@ |
400 | 419 | |
401 | 420 | for (seg = 0; seg < nr_segs; seg++) { |
402 | 421 | const struct iovec *vec = &iov[seg]; |
403 | - result = nfs_direct_read_schedule_segment(&desc, vec, pos); | |
422 | + result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); | |
404 | 423 | if (result < 0) |
405 | 424 | break; |
406 | 425 | requested_bytes += result; |
... | ... | @@ -426,7 +445,7 @@ |
426 | 445 | } |
427 | 446 | |
428 | 447 | static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, |
429 | - unsigned long nr_segs, loff_t pos) | |
448 | + unsigned long nr_segs, loff_t pos, bool uio) | |
430 | 449 | { |
431 | 450 | ssize_t result = -ENOMEM; |
432 | 451 | struct inode *inode = iocb->ki_filp->f_mapping->host; |
... | ... | @@ -444,7 +463,7 @@ |
444 | 463 | if (!is_sync_kiocb(iocb)) |
445 | 464 | dreq->iocb = iocb; |
446 | 465 | |
447 | - result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); | |
466 | + result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); | |
448 | 467 | if (!result) |
449 | 468 | result = nfs_direct_wait(dreq); |
450 | 469 | NFS_I(inode)->read_io += result; |
... | ... | @@ -610,7 +629,7 @@ |
610 | 629 | */ |
611 | 630 | static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, |
612 | 631 | const struct iovec *iov, |
613 | - loff_t pos) | |
632 | + loff_t pos, bool uio) | |
614 | 633 | { |
615 | 634 | struct nfs_direct_req *dreq = desc->pg_dreq; |
616 | 635 | struct nfs_open_context *ctx = dreq->ctx; |
... | ... | @@ -638,12 +657,19 @@ |
638 | 657 | if (!pagevec) |
639 | 658 | break; |
640 | 659 | |
641 | - down_read(¤t->mm->mmap_sem); | |
642 | - result = get_user_pages(current, current->mm, user_addr, | |
643 | - npages, 0, 0, pagevec, NULL); | |
644 | - up_read(&current->mm->mmap_sem); | |
645 | - if (result < 0) | |
646 | - break; | |
660 | + if (uio) { | |
661 | + down_read(¤t->mm->mmap_sem); | |
662 | + result = get_user_pages(current, current->mm, user_addr, | |
663 | + npages, 0, 0, pagevec, NULL); | |
664 | + up_read(&current->mm->mmap_sem); | |
665 | + if (result < 0) | |
666 | + break; | |
667 | + } else { | |
668 | + WARN_ON(npages != 1); | |
669 | + result = get_kernel_page(user_addr, 0, pagevec); | |
670 | + if (WARN_ON(result != 1)) | |
671 | + break; | |
672 | + } | |
647 | 673 | |
648 | 674 | if ((unsigned)result < npages) { |
649 | 675 | bytes = result * PAGE_SIZE; |
... | ... | @@ -774,7 +800,7 @@ |
774 | 800 | static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, |
775 | 801 | const struct iovec *iov, |
776 | 802 | unsigned long nr_segs, |
777 | - loff_t pos) | |
803 | + loff_t pos, bool uio) | |
778 | 804 | { |
779 | 805 | struct nfs_pageio_descriptor desc; |
780 | 806 | struct inode *inode = dreq->inode; |
... | ... | @@ -790,7 +816,7 @@ |
790 | 816 | |
791 | 817 | for (seg = 0; seg < nr_segs; seg++) { |
792 | 818 | const struct iovec *vec = &iov[seg]; |
793 | - result = nfs_direct_write_schedule_segment(&desc, vec, pos); | |
819 | + result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); | |
794 | 820 | if (result < 0) |
795 | 821 | break; |
796 | 822 | requested_bytes += result; |
... | ... | @@ -818,7 +844,7 @@ |
818 | 844 | |
819 | 845 | static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, |
820 | 846 | unsigned long nr_segs, loff_t pos, |
821 | - size_t count) | |
847 | + size_t count, bool uio) | |
822 | 848 | { |
823 | 849 | ssize_t result = -ENOMEM; |
824 | 850 | struct inode *inode = iocb->ki_filp->f_mapping->host; |
... | ... | @@ -836,7 +862,7 @@ |
836 | 862 | if (!is_sync_kiocb(iocb)) |
837 | 863 | dreq->iocb = iocb; |
838 | 864 | |
839 | - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos); | |
865 | + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); | |
840 | 866 | if (!result) |
841 | 867 | result = nfs_direct_wait(dreq); |
842 | 868 | out_release: |
... | ... | @@ -867,7 +893,7 @@ |
867 | 893 | * cache. |
868 | 894 | */ |
869 | 895 | ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, |
870 | - unsigned long nr_segs, loff_t pos) | |
896 | + unsigned long nr_segs, loff_t pos, bool uio) | |
871 | 897 | { |
872 | 898 | ssize_t retval = -EINVAL; |
873 | 899 | struct file *file = iocb->ki_filp; |
... | ... | @@ -892,7 +918,7 @@ |
892 | 918 | |
893 | 919 | task_io_account_read(count); |
894 | 920 | |
895 | - retval = nfs_direct_read(iocb, iov, nr_segs, pos); | |
921 | + retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); | |
896 | 922 | if (retval > 0) |
897 | 923 | iocb->ki_pos = pos + retval; |
898 | 924 | |
... | ... | @@ -923,7 +949,7 @@ |
923 | 949 | * is no atomic O_APPEND write facility in the NFS protocol. |
924 | 950 | */ |
925 | 951 | ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
926 | - unsigned long nr_segs, loff_t pos) | |
952 | + unsigned long nr_segs, loff_t pos, bool uio) | |
927 | 953 | { |
928 | 954 | ssize_t retval = -EINVAL; |
929 | 955 | struct file *file = iocb->ki_filp; |
... | ... | @@ -955,7 +981,7 @@ |
955 | 981 | |
956 | 982 | task_io_account_write(count); |
957 | 983 | |
958 | - retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); | |
984 | + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); | |
959 | 985 | if (retval > 0) { |
960 | 986 | struct inode *inode = mapping->host; |
961 | 987 |
fs/nfs/file.c
... | ... | @@ -175,7 +175,7 @@ |
175 | 175 | ssize_t result; |
176 | 176 | |
177 | 177 | if (iocb->ki_filp->f_flags & O_DIRECT) |
178 | - return nfs_file_direct_read(iocb, iov, nr_segs, pos); | |
178 | + return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); | |
179 | 179 | |
180 | 180 | dprintk("NFS: read(%s/%s, %lu@%lu)\n", |
181 | 181 | dentry->d_parent->d_name.name, dentry->d_name.name, |
... | ... | @@ -482,6 +482,20 @@ |
482 | 482 | return nfs_wb_page(inode, page); |
483 | 483 | } |
484 | 484 | |
485 | +#ifdef CONFIG_NFS_SWAP | |
486 | +static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, | |
487 | + sector_t *span) | |
488 | +{ | |
489 | + *span = sis->pages; | |
490 | + return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); | |
491 | +} | |
492 | + | |
493 | +static void nfs_swap_deactivate(struct file *file) | |
494 | +{ | |
495 | + xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); | |
496 | +} | |
497 | +#endif | |
498 | + | |
485 | 499 | const struct address_space_operations nfs_file_aops = { |
486 | 500 | .readpage = nfs_readpage, |
487 | 501 | .readpages = nfs_readpages, |
... | ... | @@ -496,6 +510,10 @@ |
496 | 510 | .migratepage = nfs_migrate_page, |
497 | 511 | .launder_page = nfs_launder_page, |
498 | 512 | .error_remove_page = generic_error_remove_page, |
513 | +#ifdef CONFIG_NFS_SWAP | |
514 | + .swap_activate = nfs_swap_activate, | |
515 | + .swap_deactivate = nfs_swap_deactivate, | |
516 | +#endif | |
499 | 517 | }; |
500 | 518 | |
501 | 519 | /* |
... | ... | @@ -570,7 +588,7 @@ |
570 | 588 | size_t count = iov_length(iov, nr_segs); |
571 | 589 | |
572 | 590 | if (iocb->ki_filp->f_flags & O_DIRECT) |
573 | - return nfs_file_direct_write(iocb, iov, nr_segs, pos); | |
591 | + return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); | |
574 | 592 | |
575 | 593 | dprintk("NFS: write(%s/%s, %lu@%Ld)\n", |
576 | 594 | dentry->d_parent->d_name.name, dentry->d_name.name, |
include/linux/nfs_fs.h
... | ... | @@ -473,10 +473,10 @@ |
473 | 473 | unsigned long); |
474 | 474 | extern ssize_t nfs_file_direct_read(struct kiocb *iocb, |
475 | 475 | const struct iovec *iov, unsigned long nr_segs, |
476 | - loff_t pos); | |
476 | + loff_t pos, bool uio); | |
477 | 477 | extern ssize_t nfs_file_direct_write(struct kiocb *iocb, |
478 | 478 | const struct iovec *iov, unsigned long nr_segs, |
479 | - loff_t pos); | |
479 | + loff_t pos, bool uio); | |
480 | 480 | |
481 | 481 | /* |
482 | 482 | * linux/fs/nfs/dir.c |
include/linux/sunrpc/xprt.h
... | ... | @@ -174,6 +174,8 @@ |
174 | 174 | unsigned long state; /* transport state */ |
175 | 175 | unsigned char shutdown : 1, /* being shut down */ |
176 | 176 | resvport : 1; /* use a reserved port */ |
177 | + unsigned int swapper; /* we're swapping over this | |
178 | + transport */ | |
177 | 179 | unsigned int bind_index; /* bind function index */ |
178 | 180 | |
179 | 181 | /* |
... | ... | @@ -316,6 +318,7 @@ |
316 | 318 | void xprt_disconnect_done(struct rpc_xprt *xprt); |
317 | 319 | void xprt_force_disconnect(struct rpc_xprt *xprt); |
318 | 320 | void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); |
321 | +int xs_swapper(struct rpc_xprt *xprt, int enable); | |
319 | 322 | |
320 | 323 | /* |
321 | 324 | * Reserved bit positions in xprt->state |
net/sunrpc/Kconfig
net/sunrpc/clnt.c
... | ... | @@ -717,6 +717,15 @@ |
717 | 717 | atomic_inc(&clnt->cl_count); |
718 | 718 | if (clnt->cl_softrtry) |
719 | 719 | task->tk_flags |= RPC_TASK_SOFT; |
720 | + if (sk_memalloc_socks()) { | |
721 | + struct rpc_xprt *xprt; | |
722 | + | |
723 | + rcu_read_lock(); | |
724 | + xprt = rcu_dereference(clnt->cl_xprt); | |
725 | + if (xprt->swapper) | |
726 | + task->tk_flags |= RPC_TASK_SWAPPER; | |
727 | + rcu_read_unlock(); | |
728 | + } | |
720 | 729 | /* Add to the client's list of all tasks */ |
721 | 730 | spin_lock(&clnt->cl_lock); |
722 | 731 | list_add_tail(&task->tk_task, &clnt->cl_tasks); |
net/sunrpc/sched.c
... | ... | @@ -812,8 +812,11 @@ |
812 | 812 | void *rpc_malloc(struct rpc_task *task, size_t size) |
813 | 813 | { |
814 | 814 | struct rpc_buffer *buf; |
815 | - gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT; | |
815 | + gfp_t gfp = GFP_NOWAIT; | |
816 | 816 | |
817 | + if (RPC_IS_SWAPPER(task)) | |
818 | + gfp |= __GFP_MEMALLOC; | |
819 | + | |
817 | 820 | size += sizeof(struct rpc_buffer); |
818 | 821 | if (size <= RPC_BUFFER_MAXSIZE) |
819 | 822 | buf = mempool_alloc(rpc_buffer_mempool, gfp); |
... | ... | @@ -886,7 +889,7 @@ |
886 | 889 | static struct rpc_task * |
887 | 890 | rpc_alloc_task(void) |
888 | 891 | { |
889 | - return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); | |
892 | + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO); | |
890 | 893 | } |
891 | 894 | |
892 | 895 | /* |
net/sunrpc/xprtsock.c
... | ... | @@ -1927,6 +1927,45 @@ |
1927 | 1927 | xprt_wake_pending_tasks(xprt, status); |
1928 | 1928 | } |
1929 | 1929 | |
1930 | +#ifdef CONFIG_SUNRPC_SWAP | |
1931 | +static void xs_set_memalloc(struct rpc_xprt *xprt) | |
1932 | +{ | |
1933 | + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, | |
1934 | + xprt); | |
1935 | + | |
1936 | + if (xprt->swapper) | |
1937 | + sk_set_memalloc(transport->inet); | |
1938 | +} | |
1939 | + | |
1940 | +/** | |
1941 | + * xs_swapper - Tag this transport as being used for swap. | |
1942 | + * @xprt: transport to tag | |
1943 | + * @enable: enable/disable | |
1944 | + * | |
1945 | + */ | |
1946 | +int xs_swapper(struct rpc_xprt *xprt, int enable) | |
1947 | +{ | |
1948 | + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, | |
1949 | + xprt); | |
1950 | + int err = 0; | |
1951 | + | |
1952 | + if (enable) { | |
1953 | + xprt->swapper++; | |
1954 | + xs_set_memalloc(xprt); | |
1955 | + } else if (xprt->swapper) { | |
1956 | + xprt->swapper--; | |
1957 | + sk_clear_memalloc(transport->inet); | |
1958 | + } | |
1959 | + | |
1960 | + return err; | |
1961 | +} | |
1962 | +EXPORT_SYMBOL_GPL(xs_swapper); | |
1963 | +#else | |
1964 | +static void xs_set_memalloc(struct rpc_xprt *xprt) | |
1965 | +{ | |
1966 | +} | |
1967 | +#endif | |
1968 | + | |
1930 | 1969 | static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) |
1931 | 1970 | { |
1932 | 1971 | struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); |
... | ... | @@ -1951,6 +1990,8 @@ |
1951 | 1990 | transport->sock = sock; |
1952 | 1991 | transport->inet = sk; |
1953 | 1992 | |
1993 | + xs_set_memalloc(xprt); | |
1994 | + | |
1954 | 1995 | write_unlock_bh(&sk->sk_callback_lock); |
1955 | 1996 | } |
1956 | 1997 | xs_udp_do_set_buffer_size(xprt); |
... | ... | @@ -2074,6 +2115,8 @@ |
2074 | 2115 | |
2075 | 2116 | if (!xprt_bound(xprt)) |
2076 | 2117 | goto out; |
2118 | + | |
2119 | + xs_set_memalloc(xprt); | |
2077 | 2120 | |
2078 | 2121 | /* Tell the socket layer to start connecting... */ |
2079 | 2122 | xprt->stat.connect_count++; |