Commit a2e2725541fad72416326798c2d7fa4dafb7d337

Authored by Arnaldo Carvalho de Melo
Committed by David S. Miller
1 parent c05e85a06e

net: Introduce recvmmsg socket syscall

Meaning receive multiple messages, reducing the number of syscalls and
net stack entry/exit operations.

Next patches will introduce mechanisms where protocols that want to
optimize this operation will provide an unlocked_recvmsg operation.

This takes into account comments made by:

. Paul Moore: sock_recvmsg is called only for the first datagram,
  sock_recvmsg_nosec is used for the rest.

. Caitlin Bestler: recvmmsg now has a struct timespec timeout, that
  works in the same fashion as the ppoll one.

  If the underlying protocol returns a datagram with MSG_OOB set, this
  will make recvmmsg return right away with as many datagrams (+ the OOB
  one) it has received so far.

. Rémi Denis-Courmont & Steven Whitehouse: If we receive N < vlen
  datagrams and then recvmsg returns an error, recvmmsg will return
  the successfully received datagrams, store the error and return it
  in the next call.

This paves the way for a subsequent optimization, sk_prot->unlocked_recvmsg,
where we will be able to acquire the lock only at batch start and end, not at
every underlying recvmsg call.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 25 changed files with 261 additions and 50 deletions Side-by-side Diff

arch/alpha/kernel/systbls.S
... ... @@ -497,6 +497,7 @@
497 497 .quad sys_signalfd
498 498 .quad sys_ni_syscall
499 499 .quad sys_eventfd
  500 + .quad sys_recvmmsg
500 501  
501 502 .size sys_call_table, . - sys_call_table
502 503 .type sys_call_table, @object
arch/arm/kernel/calls.S
... ... @@ -374,6 +374,7 @@
374 374 CALL(sys_pwritev)
375 375 CALL(sys_rt_tgsigqueueinfo)
376 376 CALL(sys_perf_event_open)
  377 +/* 365 */ CALL(sys_recvmmsg)
377 378 #ifndef syscalls_counted
378 379 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
379 380 #define syscalls_counted
arch/avr32/kernel/syscall_table.S
... ... @@ -295,5 +295,6 @@
295 295 .long sys_signalfd
296 296 .long sys_ni_syscall /* 280, was sys_timerfd */
297 297 .long sys_eventfd
  298 + .long sys_recvmmsg
298 299 .long sys_ni_syscall /* r8 is saturated at nr_syscalls */
arch/blackfin/mach-common/entry.S
... ... @@ -1621,6 +1621,7 @@
1621 1621 .long _sys_pwritev
1622 1622 .long _sys_rt_tgsigqueueinfo
1623 1623 .long _sys_perf_event_open
  1624 + .long _sys_recvmmsg /* 370 */
1624 1625  
1625 1626 .rept NR_syscalls-(.-_sys_call_table)/4
1626 1627 .long _sys_ni_syscall
arch/ia64/kernel/entry.S
... ... @@ -1806,6 +1806,7 @@
1806 1806 data8 sys_preadv
1807 1807 data8 sys_pwritev // 1320
1808 1808 data8 sys_rt_tgsigqueueinfo
  1809 + data8 sys_recvmmsg
1809 1810  
1810 1811 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
1811 1812 #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */
arch/microblaze/kernel/syscall_table.S
... ... @@ -371,4 +371,5 @@
371 371 .long sys_ni_syscall
372 372 .long sys_rt_tgsigqueueinfo /* 365 */
373 373 .long sys_perf_event_open
  374 + .long sys_recvmmsg
arch/mips/kernel/scall32-o32.S
... ... @@ -583,6 +583,7 @@
583 583 sys sys_rt_tgsigqueueinfo 4
584 584 sys sys_perf_event_open 5
585 585 sys sys_accept4 4
  586 + sys sys_recvmmsg 5
586 587 .endm
587 588  
588 589 /* We pre-compute the number of _instruction_ bytes needed to
arch/mips/kernel/scall64-64.S
... ... @@ -420,5 +420,6 @@
420 420 PTR sys_rt_tgsigqueueinfo
421 421 PTR sys_perf_event_open
422 422 PTR sys_accept4
  423 + PTR sys_recvmmsg
423 424 .size sys_call_table,.-sys_call_table
arch/mips/kernel/scall64-n32.S
... ... @@ -418,5 +418,6 @@
418 418 PTR compat_sys_rt_tgsigqueueinfo /* 5295 */
419 419 PTR sys_perf_event_open
420 420 PTR sys_accept4
  421 + PTR compat_sys_recvmmsg
421 422 .size sysn32_call_table,.-sysn32_call_table
arch/mips/kernel/scall64-o32.S
... ... @@ -538,5 +538,6 @@
538 538 PTR compat_sys_rt_tgsigqueueinfo
539 539 PTR sys_perf_event_open
540 540 PTR sys_accept4
  541 + PTR compat_sys_recvmmsg
541 542 .size sys_call_table,.-sys_call_table
arch/sh/kernel/syscalls_64.S
... ... @@ -391,4 +391,5 @@
391 391 .long sys_pwritev
392 392 .long sys_rt_tgsigqueueinfo
393 393 .long sys_perf_event_open
  394 + .long sys_recvmmsg /* 365 */
arch/sparc/kernel/systbls_32.S
... ... @@ -82,5 +82,5 @@
82 82 /*310*/ .long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
83 83 /*315*/ .long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
84 84 /*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
85   -/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
  85 +/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg
arch/sparc/kernel/systbls_64.S
... ... @@ -83,7 +83,7 @@
83 83 /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate
84 84 .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1
85 85 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv
86   - .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open
  86 + .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open, compat_sys_recvmmsg
87 87  
88 88 #endif /* CONFIG_COMPAT */
89 89  
... ... @@ -158,5 +158,5 @@
158 158 /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate
159 159 .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1
160 160 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv
161   - .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open
  161 + .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg
arch/x86/ia32/ia32entry.S
... ... @@ -832,5 +832,6 @@
832 832 .quad compat_sys_pwritev
833 833 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
834 834 .quad sys_perf_event_open
  835 + .quad compat_sys_recvmmsg
835 836 ia32_syscall_end:
arch/x86/include/asm/unistd_32.h
... ... @@ -342,10 +342,11 @@
342 342 #define __NR_pwritev 334
343 343 #define __NR_rt_tgsigqueueinfo 335
344 344 #define __NR_perf_event_open 336
  345 +#define __NR_recvmmsg 337
345 346  
346 347 #ifdef __KERNEL__
347 348  
348   -#define NR_syscalls 337
  349 +#define NR_syscalls 338
349 350  
350 351 #define __ARCH_WANT_IPC_PARSE_VERSION
351 352 #define __ARCH_WANT_OLD_READDIR
arch/x86/include/asm/unistd_64.h
... ... @@ -661,6 +661,8 @@
661 661 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662 662 #define __NR_perf_event_open 298
663 663 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
  664 +#define __NR_recvmmsg 299
  665 +__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
664 666  
665 667 #ifndef __NO_STUBS
666 668 #define __ARCH_WANT_OLD_READDIR
arch/x86/kernel/syscall_table_32.S
... ... @@ -336,4 +336,5 @@
336 336 .long sys_pwritev
337 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 338 .long sys_perf_event_open
  339 + .long sys_recvmmsg
arch/xtensa/include/asm/unistd.h
... ... @@ -681,8 +681,10 @@
681 681 __SYSCALL(305, sys_ni_syscall, 0)
682 682 #define __NR_eventfd 306
683 683 __SYSCALL(306, sys_eventfd, 1)
  684 +#define __NR_recvmmsg 307
  685 +__SYSCALL(307, sys_recvmmsg, 5)
684 686  
685   -#define __NR_syscall_count 307
  687 +#define __NR_syscall_count 308
686 688  
687 689 /*
688 690 * sysxtensa syscall handler
... ... @@ -41,6 +41,7 @@
41 41 #define SYS_SENDMSG 16 /* sys_sendmsg(2) */
42 42 #define SYS_RECVMSG 17 /* sys_recvmsg(2) */
43 43 #define SYS_ACCEPT4 18 /* sys_accept4(2) */
  44 +#define SYS_RECVMMSG 19 /* sys_recvmmsg(2) */
44 45  
45 46 typedef enum {
46 47 SS_FREE = 0, /* not allocated */
include/linux/socket.h
... ... @@ -65,6 +65,12 @@
65 65 unsigned msg_flags;
66 66 };
67 67  
  68 +/* For recvmmsg/sendmmsg: one message header plus a per-message length slot */
  69 +struct mmsghdr {
  70 + struct msghdr msg_hdr; /* header handed to the recvmsg path for this entry */
  71 + unsigned msg_len; /* written by __sys_recvmmsg with this entry's receive result */
  72 +};
  73 +
68 74 /*
69 75 * POSIX 1003.1g - ancillary data object information
70 76 * Ancillary data consists of a sequence of pairs of
... ... @@ -312,6 +318,10 @@
312 318 extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr);
313 319 extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data);
314 320  
  321 +struct timespec;
  322 +
  323 +extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
  324 + unsigned int flags, struct timespec *timeout);
315 325 #endif
316 326 #endif /* not kernel and not glibc */
317 327 #endif /* _LINUX_SOCKET_H */
include/linux/syscalls.h
... ... @@ -25,6 +25,7 @@
25 25 struct list_head;
26 26 struct msgbuf;
27 27 struct msghdr;
  28 +struct mmsghdr;
28 29 struct msqid_ds;
29 30 struct new_utsname;
30 31 struct nfsctl_arg;
... ... @@ -677,6 +678,9 @@
677 678 asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
678 679 struct sockaddr __user *, int __user *);
679 680 asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
  681 +asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
  682 + unsigned int vlen, unsigned flags,
  683 + struct timespec __user *timeout);
680 684 asmlinkage long sys_socket(int, int, int);
681 685 asmlinkage long sys_socketpair(int, int, int, int __user *);
682 686 asmlinkage long sys_socketcall(int call, unsigned long __user *args);
include/net/compat.h
... ... @@ -18,6 +18,11 @@
18 18 compat_uint_t msg_flags;
19 19 };
20 20  
  21 +struct compat_mmsghdr { /* compat-ABI counterpart of struct mmsghdr, taken by compat_sys_recvmmsg */
  22 + struct compat_msghdr msg_hdr; /* compat-layout message header for this entry */
  23 + compat_uint_t msg_len; /* per-message length slot */
  24 +};
  25 +
21 26 struct compat_cmsghdr {
22 27 compat_size_t cmsg_len;
23 28 compat_int_t cmsg_level;
... ... @@ -35,6 +40,9 @@
35 40 extern int verify_compat_iovec(struct msghdr *, struct iovec *, struct sockaddr *, int);
36 41 extern asmlinkage long compat_sys_sendmsg(int,struct compat_msghdr __user *,unsigned);
37 42 extern asmlinkage long compat_sys_recvmsg(int,struct compat_msghdr __user *,unsigned);
  43 +extern asmlinkage long compat_sys_recvmmsg(int, struct compat_mmsghdr __user *,
  44 + unsigned, unsigned,
  45 + struct timespec __user *);
38 46 extern asmlinkage long compat_sys_getsockopt(int, int, int, char __user *, int __user *);
39 47 extern int put_cmsg_compat(struct msghdr*, int, int, int, void *);
40 48  
... ... @@ -48,8 +48,10 @@
48 48 cond_syscall(sys_sendmsg);
49 49 cond_syscall(compat_sys_sendmsg);
50 50 cond_syscall(sys_recvmsg);
  51 +cond_syscall(sys_recvmmsg);
51 52 cond_syscall(compat_sys_recvmsg);
52 53 cond_syscall(compat_sys_recvfrom);
  54 +cond_syscall(compat_sys_recvmmsg);
53 55 cond_syscall(sys_socketcall);
54 56 cond_syscall(sys_futex);
55 57 cond_syscall(compat_sys_futex);
... ... @@ -727,10 +727,10 @@
727 727  
728 728 /* Argument list sizes for compat_sys_socketcall */
729 729 #define AL(x) ((x) * sizeof(u32))
730   -static unsigned char nas[19]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
  730 +static unsigned char nas[20]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
731 731 AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
732 732 AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
733   - AL(4)};
  733 + AL(4),AL(5)};
734 734 #undef AL
735 735  
736 736 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
737 737  
... ... @@ -755,13 +755,36 @@
755 755 return sys_recvfrom(fd, buf, len, flags | MSG_CMSG_COMPAT, addr, addrlen);
756 756 }
757 757  
  758 +asmlinkage long compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
  759 + unsigned vlen, unsigned int flags,
  760 + struct timespec __user *timeout)
  761 +{
  762 + int datagrams;
  763 + struct timespec ktspec;
  764 + struct compat_timespec __user *utspec =
  765 + (struct compat_timespec __user *)timeout;
  766 +
  767 + if (get_user(ktspec.tv_sec, &utspec->tv_sec) ||
  768 + get_user(ktspec.tv_nsec, &utspec->tv_nsec))
  769 + return -EFAULT;
  770 +
  771 + datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
  772 + flags | MSG_CMSG_COMPAT, &ktspec);
  773 + if (datagrams > 0 &&
  774 + (put_user(ktspec.tv_sec, &utspec->tv_sec) ||
  775 + put_user(ktspec.tv_nsec, &utspec->tv_nsec)))
  776 + datagrams = -EFAULT;
  777 +
  778 + return datagrams;
  779 +}
  780 +
758 781 asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
759 782 {
760 783 int ret;
761 784 u32 a[6];
762 785 u32 a0, a1;
763 786  
764   - if (call < SYS_SOCKET || call > SYS_ACCEPT4)
  787 + if (call < SYS_SOCKET || call > SYS_RECVMMSG)
765 788 return -EINVAL;
766 789 if (copy_from_user(a, args, nas[call]))
767 790 return -EFAULT;
... ... @@ -822,6 +845,10 @@
822 845 break;
823 846 case SYS_RECVMSG:
824 847 ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
  848 + break;
  849 + case SYS_RECVMMSG:
  850 + ret = compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
  851 + compat_ptr(a[4]));
825 852 break;
826 853 case SYS_ACCEPT4:
827 854 ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
... ... @@ -683,10 +683,9 @@
683 683 }
684 684 EXPORT_SYMBOL_GPL(sock_recv_ts_and_drops);
685 685  
686   -static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
687   - struct msghdr *msg, size_t size, int flags)
  686 +static inline int __sock_recvmsg_nosec(struct kiocb *iocb, struct socket *sock,
  687 + struct msghdr *msg, size_t size, int flags)
688 688 {
689   - int err;
690 689 struct sock_iocb *si = kiocb_to_siocb(iocb);
691 690  
692 691 si->sock = sock;
693 692  
... ... @@ -695,13 +694,17 @@
695 694 si->size = size;
696 695 si->flags = flags;
697 696  
698   - err = security_socket_recvmsg(sock, msg, size, flags);
699   - if (err)
700   - return err;
701   -
702 697 return sock->ops->recvmsg(iocb, sock, msg, size, flags);
703 698 }
704 699  
  700 +static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
  701 + struct msghdr *msg, size_t size, int flags)
  702 +{
  703 + int err = security_socket_recvmsg(sock, msg, size, flags);
  704 +
  705 + return err ?: __sock_recvmsg_nosec(iocb, sock, msg, size, flags);
  706 +}
  707 +
705 708 int sock_recvmsg(struct socket *sock, struct msghdr *msg,
706 709 size_t size, int flags)
707 710 {
... ... @@ -717,6 +720,21 @@
717 720 return ret;
718 721 }
719 722  
  723 +static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg,
  724 + size_t size, int flags)
  725 +{
  726 + struct kiocb iocb;
  727 + struct sock_iocb siocb;
  728 + int ret;
  729 +
  730 + init_sync_kiocb(&iocb, NULL);
  731 + iocb.private = &siocb;
  732 + ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags);
  733 + if (-EIOCBQUEUED == ret)
  734 + ret = wait_on_sync_kiocb(&iocb);
  735 + return ret;
  736 +}
  737 +
720 738 int kernel_recvmsg(struct socket *sock, struct msghdr *msg,
721 739 struct kvec *vec, size_t num, size_t size, int flags)
722 740 {
723 741  
724 742  
725 743  
... ... @@ -1983,22 +2001,15 @@
1983 2001 return err;
1984 2002 }
1985 2003  
1986   -/*
1987   - * BSD recvmsg interface
1988   - */
1989   -
1990   -SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
1991   - unsigned int, flags)
  2004 +static int __sys_recvmsg(struct socket *sock, struct msghdr __user *msg,
  2005 + struct msghdr *msg_sys, unsigned flags, int nosec)
1992 2006 {
1993 2007 struct compat_msghdr __user *msg_compat =
1994 2008 (struct compat_msghdr __user *)msg;
1995   - struct socket *sock;
1996 2009 struct iovec iovstack[UIO_FASTIOV];
1997 2010 struct iovec *iov = iovstack;
1998   - struct msghdr msg_sys;
1999 2011 unsigned long cmsg_ptr;
2000 2012 int err, iov_size, total_len, len;
2001   - int fput_needed;
2002 2013  
2003 2014 /* kernel mode address */
2004 2015 struct sockaddr_storage addr;
2005 2016  
2006 2017  
2007 2018  
2008 2019  
2009 2020  
... ... @@ -2008,27 +2019,23 @@
2008 2019 int __user *uaddr_len;
2009 2020  
2010 2021 if (MSG_CMSG_COMPAT & flags) {
2011   - if (get_compat_msghdr(&msg_sys, msg_compat))
  2022 + if (get_compat_msghdr(msg_sys, msg_compat))
2012 2023 return -EFAULT;
2013 2024 }
2014   - else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr)))
  2025 + else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr)))
2015 2026 return -EFAULT;
2016 2027  
2017   - sock = sockfd_lookup_light(fd, &err, &fput_needed);
2018   - if (!sock)
  2028 + err = -EMSGSIZE;
  2029 + if (msg_sys->msg_iovlen > UIO_MAXIOV)
2019 2030 goto out;
2020 2031  
2021   - err = -EMSGSIZE;
2022   - if (msg_sys.msg_iovlen > UIO_MAXIOV)
2023   - goto out_put;
2024   -
2025 2032 /* Check whether to allocate the iovec area */
2026 2033 err = -ENOMEM;
2027   - iov_size = msg_sys.msg_iovlen * sizeof(struct iovec);
2028   - if (msg_sys.msg_iovlen > UIO_FASTIOV) {
  2034 + iov_size = msg_sys->msg_iovlen * sizeof(struct iovec);
  2035 + if (msg_sys->msg_iovlen > UIO_FASTIOV) {
2029 2036 iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL);
2030 2037 if (!iov)
2031   - goto out_put;
  2038 + goto out;
2032 2039 }
2033 2040  
2034 2041 /*
2035 2042  
2036 2043  
2037 2044  
2038 2045  
2039 2046  
2040 2047  
2041 2048  
2042 2049  
... ... @@ -2036,46 +2043,47 @@
2036 2043 * kernel msghdr to use the kernel address space)
2037 2044 */
2038 2045  
2039   - uaddr = (__force void __user *)msg_sys.msg_name;
  2046 + uaddr = (__force void __user *)msg_sys->msg_name;
2040 2047 uaddr_len = COMPAT_NAMELEN(msg);
2041 2048 if (MSG_CMSG_COMPAT & flags) {
2042   - err = verify_compat_iovec(&msg_sys, iov,
  2049 + err = verify_compat_iovec(msg_sys, iov,
2043 2050 (struct sockaddr *)&addr,
2044 2051 VERIFY_WRITE);
2045 2052 } else
2046   - err = verify_iovec(&msg_sys, iov,
  2053 + err = verify_iovec(msg_sys, iov,
2047 2054 (struct sockaddr *)&addr,
2048 2055 VERIFY_WRITE);
2049 2056 if (err < 0)
2050 2057 goto out_freeiov;
2051 2058 total_len = err;
2052 2059  
2053   - cmsg_ptr = (unsigned long)msg_sys.msg_control;
2054   - msg_sys.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
  2060 + cmsg_ptr = (unsigned long)msg_sys->msg_control;
  2061 + msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
2055 2062  
2056 2063 if (sock->file->f_flags & O_NONBLOCK)
2057 2064 flags |= MSG_DONTWAIT;
2058   - err = sock_recvmsg(sock, &msg_sys, total_len, flags);
  2065 + err = (nosec ? sock_recvmsg_nosec : sock_recvmsg)(sock, msg_sys,
  2066 + total_len, flags);
2059 2067 if (err < 0)
2060 2068 goto out_freeiov;
2061 2069 len = err;
2062 2070  
2063 2071 if (uaddr != NULL) {
2064 2072 err = move_addr_to_user((struct sockaddr *)&addr,
2065   - msg_sys.msg_namelen, uaddr,
  2073 + msg_sys->msg_namelen, uaddr,
2066 2074 uaddr_len);
2067 2075 if (err < 0)
2068 2076 goto out_freeiov;
2069 2077 }
2070   - err = __put_user((msg_sys.msg_flags & ~MSG_CMSG_COMPAT),
  2078 + err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT),
2071 2079 COMPAT_FLAGS(msg));
2072 2080 if (err)
2073 2081 goto out_freeiov;
2074 2082 if (MSG_CMSG_COMPAT & flags)
2075   - err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
  2083 + err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
2076 2084 &msg_compat->msg_controllen);
2077 2085 else
2078   - err = __put_user((unsigned long)msg_sys.msg_control - cmsg_ptr,
  2086 + err = __put_user((unsigned long)msg_sys->msg_control - cmsg_ptr,
2079 2087 &msg->msg_controllen);
2080 2088 if (err)
2081 2089 goto out_freeiov;
2082 2090  
2083 2091  
2084 2092  
2085 2093  
... ... @@ -2084,21 +2092,150 @@
2084 2092 out_freeiov:
2085 2093 if (iov != iovstack)
2086 2094 sock_kfree_s(sock->sk, iov, iov_size);
2087   -out_put:
  2095 +out:
  2096 + return err;
  2097 +}
  2098 +
  2099 +/*
  2100 + * BSD recvmsg interface
  2101 + */
  2102 +
  2103 +SYSCALL_DEFINE3(recvmsg, int, fd, struct msghdr __user *, msg,
  2104 + unsigned int, flags)
  2105 +{
  2106 + int fput_needed, err;
  2107 + struct msghdr msg_sys;
  2108 + struct socket *sock = sockfd_lookup_light(fd, &err, &fput_needed);
  2109 +
  2110 + if (!sock)
  2111 + goto out;
  2112 +
  2113 + err = __sys_recvmsg(sock, msg, &msg_sys, flags, 0);
  2114 +
2088 2115 fput_light(sock->file, fput_needed);
2089 2116 out:
2090 2117 return err;
2091 2118 }
2092 2119  
2093   -#ifdef __ARCH_WANT_SYS_SOCKETCALL
  2120 +/*
  2121 + * Linux recvmmsg interface
  2122 + */
2094 2123  
  2124 +int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
  2125 + unsigned int flags, struct timespec *timeout)
  2126 +{
  2127 + int fput_needed, err, datagrams;
  2128 + struct socket *sock;
  2129 + struct mmsghdr __user *entry;
  2130 + struct msghdr msg_sys;
  2131 + struct timespec end_time;
  2132 +
  2133 + if (timeout &&
  2134 + poll_select_set_timeout(&end_time, timeout->tv_sec,
  2135 + timeout->tv_nsec))
  2136 + return -EINVAL;
  2137 +
  2138 + datagrams = 0;
  2139 +
  2140 + sock = sockfd_lookup_light(fd, &err, &fput_needed);
  2141 + if (!sock)
  2142 + return err;
  2143 +
  2144 + err = sock_error(sock->sk);
  2145 + if (err)
  2146 + goto out_put;
  2147 +
  2148 + entry = mmsg;
  2149 +
  2150 + while (datagrams < vlen) {
  2151 + /*
  2152 + * No need to ask LSM for more than the first datagram.
  2153 + */
  2154 + err = __sys_recvmsg(sock, (struct msghdr __user *)entry,
  2155 + &msg_sys, flags, datagrams);
  2156 + if (err < 0)
  2157 + break;
  2158 + err = put_user(err, &entry->msg_len);
  2159 + if (err)
  2160 + break;
  2161 + ++entry;
  2162 + ++datagrams;
  2163 +
  2164 + if (timeout) {
  2165 + ktime_get_ts(timeout);
  2166 + *timeout = timespec_sub(end_time, *timeout);
  2167 + if (timeout->tv_sec < 0) {
  2168 + timeout->tv_sec = timeout->tv_nsec = 0;
  2169 + break;
  2170 + }
  2171 +
  2172 + /* Timeout, return less than vlen datagrams */
  2173 + if (timeout->tv_nsec == 0 && timeout->tv_sec == 0)
  2174 + break;
  2175 + }
  2176 +
  2177 + /* Out of band data, return right away */
  2178 + if (msg_sys.msg_flags & MSG_OOB)
  2179 + break;
  2180 + }
  2181 +
  2182 +out_put:
  2183 + fput_light(sock->file, fput_needed);
  2184 +
  2185 + if (err == 0)
  2186 + return datagrams;
  2187 +
  2188 + if (datagrams != 0) {
  2189 + /*
  2190 + * We may return less entries than requested (vlen) if the
  2191 + * sock is non block and there aren't enough datagrams...
  2192 + */
  2193 + if (err != -EAGAIN) {
  2194 + /*
  2195 + * ... or if recvmsg returns an error after we
  2196 + * received some datagrams, where we record the
  2197 + * error to return on the next call or if the
  2198 + * app asks about it using getsockopt(SO_ERROR).
  2199 + */
  2200 + sock->sk->sk_err = -err;
  2201 + }
  2202 +
  2203 + return datagrams;
  2204 + }
  2205 +
  2206 + return err;
  2207 +}
  2208 +
  2209 +SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
  2210 + unsigned int, vlen, unsigned int, flags,
  2211 + struct timespec __user *, timeout)
  2212 +{
  2213 + int datagrams;
  2214 + struct timespec timeout_sys;
  2215 +
  2216 + if (!timeout)
  2217 + return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL);
  2218 +
  2219 + if (copy_from_user(&timeout_sys, timeout, sizeof(timeout_sys)))
  2220 + return -EFAULT;
  2221 +
  2222 + datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys);
  2223 +
  2224 + if (datagrams > 0 &&
  2225 + copy_to_user(timeout, &timeout_sys, sizeof(timeout_sys)))
  2226 + datagrams = -EFAULT;
  2227 +
  2228 + return datagrams;
  2229 +}
  2230 +
  2231 +#ifdef __ARCH_WANT_SYS_SOCKETCALL
2095 2232 /* Argument list sizes for sys_socketcall */
2096 2233 #define AL(x) ((x) * sizeof(unsigned long))
2097   -static const unsigned char nargs[19]={
  2234 +static const unsigned char nargs[20] = {
2098 2235 AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
2099 2236 AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
2100 2237 AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
2101   - AL(4)
  2238 + AL(4),AL(5)
2102 2239 };
2103 2240  
2104 2241 #undef AL
... ... @@ -2118,7 +2255,7 @@
2118 2255 int err;
2119 2256 unsigned int len;
2120 2257  
2121   - if (call < 1 || call > SYS_ACCEPT4)
  2258 + if (call < 1 || call > SYS_RECVMMSG)
2122 2259 return -EINVAL;
2123 2260  
2124 2261 len = nargs[call];
... ... @@ -2195,6 +2332,10 @@
2195 2332 break;
2196 2333 case SYS_RECVMSG:
2197 2334 err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
  2335 + break;
  2336 + case SYS_RECVMMSG:
  2337 + err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
  2338 + (struct timespec __user *)a[4]);
2198 2339 break;
2199 2340 case SYS_ACCEPT4:
2200 2341 err = sys_accept4(a0, (struct sockaddr __user *)a1,