Commit d0725992c8a6fb63a16bc9e8b2a50094cc4db3cd

Authored by Thomas Gleixner
1 parent c82e6d450f

futex: Fix the write access fault problem for real

commit 64d1304a64 (futex: setup writeable mapping for futex ops which
modify user space data) addressed only half of the write access fault
problem.

The patch was based on two wrong assumptions:

1) access_ok(VERIFY_WRITE,...) would actually check write access.

   On x86 it does _NOT_. It's a pure address range check.

2) an RW-mapped region cannot go away under us.

   That's wrong as well. Nobody can prevent another thread from calling
   mprotect(PROT_READ) on the region where the futex resides, as the
   sketch below demonstrates. If that call hits between the
   get_user_pages_fast() verification and the actual write access in
   the atomic region, we are toast again.
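
An illustrative aside, not part of the patch: a minimal, self-contained
userspace sketch of assumption 2 failing. All names here are ours and
it assumes Linux/POSIX. It maps a page read-write, revokes write
permission exactly the way any sibling thread could, and shows that the
next write faults:

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf env;

static void on_segv(int sig)
{
        siglongjmp(env, 1);             /* the write access faulted */
}

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        unsigned int *futex = mmap(NULL, page, PROT_READ | PROT_WRITE,
                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        *futex = 0;                     /* fine: the mapping is RW */

        /* What another thread can do to us at any time: */
        mprotect(futex, page, PROT_READ);

        signal(SIGSEGV, on_segv);
        if (sigsetjmp(env, 1) == 0) {
                *futex = 1;             /* faults: the region is now RO */
                puts("write succeeded");
        } else {
                puts("write faulted after mprotect(PROT_READ)");
        }
        return 0;
}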

The solution is not to rely on access_ok() and get_user() for any
write-access-related fault on private and shared futexes. Instead, we
need to fault the page in with explicit verification of write access.

There is no generic non-destructive write mechanism that would fault
the user page in through a #PF, but since we already know that we will
fault, we might as well call get_user_pages() directly and avoid the
#PF overhead.

If get_user_pages() returns -EFAULT, we know that we cannot fix it and
need to bail out to user space.
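
The resulting control flow in every fault path is the same: drop the
locks, fault the word in writable, then either retry the operation or
propagate -EFAULT. A compilable userspace model of that pattern; the
stubs merely stand in for futex_atomic_op_inuser() and
fault_in_user_writeable(), and none of this is kernel code:

#include <errno.h>
#include <stdio.h>

static int attempts;

/* Stand-in for the atomic futex op: fault once, then succeed. */
static int atomic_op(unsigned int *uaddr)
{
        if (attempts++ == 0)
                return -EFAULT;
        *uaddr = 1;
        return 0;
}

/* Stand-in for fault_in_user_writeable(): pretend get_user_pages()
 * faulted the page in with write permission. */
static int fault_in_writeable(unsigned int *uaddr)
{
        return 0;
}

static int futex_modify(unsigned int *uaddr)
{
        int ret;
retry:
        ret = atomic_op(uaddr);
        if (ret == -EFAULT) {
                /* The real code drops the hash bucket locks here. */
                if (fault_in_writeable(uaddr))
                        return -EFAULT; /* unfixable: back to user space */
                goto retry;             /* the page is writable again */
        }
        return ret;
}

int main(void)
{
        unsigned int word = 0;
        printf("futex_modify() = %d, word = %u\n",
               futex_modify(&word), word);
        return 0;
}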

Remove a bunch of confusing comments on this issue as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@kernel.org

Showing 1 changed file (kernel/futex.c) with 24 additions and 21 deletions

... ... @@ -284,6 +284,25 @@
284 284 drop_futex_key_refs(key);
285 285 }
286 286  
  287 +/*
  288 + * fault_in_user_writeable - fault in user address and verify RW access
  289 + * @uaddr: pointer to faulting user space address
  290 + *
  291 + * Slow path to fix up the fault we just took in the atomic write
  292 + * access to @uaddr.
  293 + *
  294 + * We have no generic implementation of a non-destructive write to
  295 + * the user address. We know that we faulted in the atomic,
  296 + * pagefault-disabled section, so we might as well avoid the #PF
  297 + * overhead by calling get_user_pages() right away.
  298 + */
  299 +static int fault_in_user_writeable(u32 __user *uaddr)
  300 +{
  301 + int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
  302 + sizeof(*uaddr), 1, 0, NULL, NULL);
  303 + return ret < 0 ? ret : 0;
  304 +}
  305 +
287 306 /**
288 307 * futex_top_waiter() - Return the highest priority waiter on a futex
289 308 * @hb: the hash bucket the futex_q's reside in
... ... @@ -896,7 +915,6 @@
896 915 retry_private:
897 916 op_ret = futex_atomic_op_inuser(op, uaddr2);
898 917 if (unlikely(op_ret < 0)) {
899   - u32 dummy;
900 918  
901 919 double_unlock_hb(hb1, hb2);
902 920  
... ... @@ -914,7 +932,7 @@
914 932 goto out_put_keys;
915 933 }
916 934  
917   - ret = get_user(dummy, uaddr2);
  935 + ret = fault_in_user_writeable(uaddr2);
918 936 if (ret)
919 937 goto out_put_keys;
920 938  
... ... @@ -1204,7 +1222,7 @@
1204 1222 double_unlock_hb(hb1, hb2);
1205 1223 put_futex_key(fshared, &key2);
1206 1224 put_futex_key(fshared, &key1);
1207   - ret = get_user(curval2, uaddr2);
  1225 + ret = fault_in_user_writeable(uaddr2);
1208 1226 if (!ret)
1209 1227 goto retry;
1210 1228 goto out;
... ... @@ -1482,7 +1500,7 @@
1482 1500 handle_fault:
1483 1501 spin_unlock(q->lock_ptr);
1484 1502  
1485   - ret = get_user(uval, uaddr);
  1503 + ret = fault_in_user_writeable(uaddr);
1486 1504  
1487 1505 spin_lock(q->lock_ptr);
1488 1506  
... ... @@ -1807,7 +1825,6 @@
1807 1825 {
1808 1826 struct hrtimer_sleeper timeout, *to = NULL;
1809 1827 struct futex_hash_bucket *hb;
1810   - u32 uval;
1811 1828 struct futex_q q;
1812 1829 int res, ret;
1813 1830  
1814 1831  
... ... @@ -1909,16 +1926,9 @@
1909 1926 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1910 1927  
1911 1928 uaddr_faulted:
1912   - /*
1913   - * We have to r/w *(int __user *)uaddr, and we have to modify it
1914   - * atomically. Therefore, if we continue to fault after get_user()
1915   - * below, we need to handle the fault ourselves, while still holding
1916   - * the mmap_sem. This can occur if the uaddr is under contention as
1917   - * we have to drop the mmap_sem in order to call get_user().
1918   - */
1919 1929 queue_unlock(&q, hb);
1920 1930  
1921   - ret = get_user(uval, uaddr);
  1931 + ret = fault_in_user_writeable(uaddr);
1922 1932 if (ret)
1923 1933 goto out_put_key;
1924 1934  
1925 1935  
... ... @@ -2013,17 +2023,10 @@
2013 2023 return ret;
2014 2024  
2015 2025 pi_faulted:
2016   - /*
2017   - * We have to r/w *(int __user *)uaddr, and we have to modify it
2018   - * atomically. Therefore, if we continue to fault after get_user()
2019   - * below, we need to handle the fault ourselves, while still holding
2020   - * the mmap_sem. This can occur if the uaddr is under contention as
2021   - * we have to drop the mmap_sem in order to call get_user().
2022   - */
2023 2026 spin_unlock(&hb->lock);
2024 2027 put_futex_key(fshared, &key);
2025 2028  
2026   - ret = get_user(uval, uaddr);
  2029 + ret = fault_in_user_writeable(uaddr);
2027 2030 if (!ret)
2028 2031 goto retry;
2029 2032