Commit c845acb324aa85a39650a14e7696982ceea75dc1

Authored by Rainer Weikusat
Committed by David S. Miller
1 parent b5bdacf3bb

af_unix: Fix splice-bind deadlock

On 2015/11/06, Dmitry Vyukov reported a deadlock involving the splice
system call and AF_UNIX sockets,

http://lists.openwall.net/netdev/2015/11/06/24

The situation was analyzed as

(a while ago) A: socketpair()
B: splice() from a pipe to /mnt/regular_file
	does sb_start_write() on /mnt
C: try to freeze /mnt
	wait for B to finish with /mnt
A: bind() try to bind our socket to /mnt/new_socket_name
	lock our socket, see it not bound yet
	decide that it needs to create something in /mnt
	try to do sb_start_write() on /mnt, block (it's
	waiting for C).
D: splice() from the same pipe to our socket
	lock the pipe, see that socket is connected
	try to lock the socket, block waiting for A
B:	get around to actually feeding a chunk from
	pipe to file, try to lock the pipe.  Deadlock.

on 2015/11/10 by Al Viro,

http://lists.openwall.net/netdev/2015/11/10/4

The patch fixes this by removing the kern_path_create related code from
unix_mknod and executing it as part of unix_bind prior acquiring the
readlock of the socket in question. This means that A (as used above)
will sb_start_write on /mnt before it acquires the readlock, hence, it
won't indirectly block B which first did a sb_start_write and then
waited for a thread trying to acquire the readlock. Consequently, A
being blocked by C waiting for B won't cause a deadlock anymore
(effectively, both A and B acquire two locks in opposite order in the
situation described above).

Dmitry Vyukov(<dvyukov@google.com>) tested the original patch.

Signed-off-by: Rainer Weikusat <rweikusat@mobileactivedefense.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Showing 1 changed file with 40 additions and 26 deletions Side-by-side Diff

... ... @@ -953,32 +953,20 @@
953 953 return NULL;
954 954 }
955 955  
956   -static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
  956 +static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode,
  957 + struct path *res)
957 958 {
958   - struct dentry *dentry;
959   - struct path path;
960   - int err = 0;
961   - /*
962   - * Get the parent directory, calculate the hash for last
963   - * component.
964   - */
965   - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
966   - err = PTR_ERR(dentry);
967   - if (IS_ERR(dentry))
968   - return err;
  959 + int err;
969 960  
970   - /*
971   - * All right, let's create it.
972   - */
973   - err = security_path_mknod(&path, dentry, mode, 0);
  961 + err = security_path_mknod(path, dentry, mode, 0);
974 962 if (!err) {
975   - err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
  963 + err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0);
976 964 if (!err) {
977   - res->mnt = mntget(path.mnt);
  965 + res->mnt = mntget(path->mnt);
978 966 res->dentry = dget(dentry);
979 967 }
980 968 }
981   - done_path_create(&path, dentry);
  969 +
982 970 return err;
983 971 }
984 972  
985 973  
... ... @@ -989,10 +977,12 @@
989 977 struct unix_sock *u = unix_sk(sk);
990 978 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
991 979 char *sun_path = sunaddr->sun_path;
992   - int err;
  980 + int err, name_err;
993 981 unsigned int hash;
994 982 struct unix_address *addr;
995 983 struct hlist_head *list;
  984 + struct path path;
  985 + struct dentry *dentry;
996 986  
997 987 err = -EINVAL;
998 988 if (sunaddr->sun_family != AF_UNIX)
999 989  
1000 990  
... ... @@ -1008,14 +998,34 @@
1008 998 goto out;
1009 999 addr_len = err;
1010 1000  
  1001 + name_err = 0;
  1002 + dentry = NULL;
  1003 + if (sun_path[0]) {
  1004 + /* Get the parent directory, calculate the hash for last
  1005 + * component.
  1006 + */
  1007 + dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
  1008 +
  1009 + if (IS_ERR(dentry)) {
  1010 + /* delay report until after 'already bound' check */
  1011 + name_err = PTR_ERR(dentry);
  1012 + dentry = NULL;
  1013 + }
  1014 + }
  1015 +
1011 1016 err = mutex_lock_interruptible(&u->readlock);
1012 1017 if (err)
1013   - goto out;
  1018 + goto out_path;
1014 1019  
1015 1020 err = -EINVAL;
1016 1021 if (u->addr)
1017 1022 goto out_up;
1018 1023  
  1024 + if (name_err) {
  1025 + err = name_err == -EEXIST ? -EADDRINUSE : name_err;
  1026 + goto out_up;
  1027 + }
  1028 +
1019 1029 err = -ENOMEM;
1020 1030 addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1021 1031 if (!addr)
1022 1032  
... ... @@ -1026,11 +1036,11 @@
1026 1036 addr->hash = hash ^ sk->sk_type;
1027 1037 atomic_set(&addr->refcnt, 1);
1028 1038  
1029   - if (sun_path[0]) {
1030   - struct path path;
  1039 + if (dentry) {
  1040 + struct path u_path;
1031 1041 umode_t mode = S_IFSOCK |
1032 1042 (SOCK_INODE(sock)->i_mode & ~current_umask());
1033   - err = unix_mknod(sun_path, mode, &path);
  1043 + err = unix_mknod(dentry, &path, mode, &u_path);
1034 1044 if (err) {
1035 1045 if (err == -EEXIST)
1036 1046 err = -EADDRINUSE;
1037 1047  
... ... @@ -1038,9 +1048,9 @@
1038 1048 goto out_up;
1039 1049 }
1040 1050 addr->hash = UNIX_HASH_SIZE;
1041   - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
  1051 + hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1042 1052 spin_lock(&unix_table_lock);
1043   - u->path = path;
  1053 + u->path = u_path;
1044 1054 list = &unix_socket_table[hash];
1045 1055 } else {
1046 1056 spin_lock(&unix_table_lock);
... ... @@ -1063,6 +1073,10 @@
1063 1073 spin_unlock(&unix_table_lock);
1064 1074 out_up:
1065 1075 mutex_unlock(&u->readlock);
  1076 +out_path:
  1077 + if (dentry)
  1078 + done_path_create(&path, dentry);
  1079 +
1066 1080 out:
1067 1081 return err;
1068 1082 }