Commit 867578cbccb0893cc14fc29c670f7185809c90d6

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 0263c12c12

memcg: fix oom kill behavior

In the current page-fault code:

	handle_mm_fault()
		-> ...
		-> mem_cgroup_charge()
		-> map page or handle error.
	-> check return code.

If the page fault's return code is VM_FAULT_OOM, pagefault_out_of_memory() is
called.  But if the fault was caused by a memcg limit, the memcg OOM killer
should already have been invoked, so there is no point in running the global
one again.
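
For reference, the caller side looks roughly like this.  This is a condensed,
illustrative sketch modelled on the arch fault handlers (e.g. x86's), not the
literal kernel code:

	fault = handle_mm_fault(mm, vma, address, flags);
	if (unlikely(fault & VM_FAULT_OOM)) {
		/* release mmap_sem before going into the OOM path */
		up_read(&mm->mmap_sem);
		/* may invoke the global OOM killer */
		pagefault_out_of_memory();
		return;
	}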

Then, I added a patch, a636b327f731143ccc544b966cfd8de6cb6d72c6, which records
last_oom_jiffies for the memcg's sub-hierarchy and prevents
pagefault_out_of_memory() from being invoked again in the near future.

But Nishimura-san reported that the jiffies check is not enough when the
system is under very heavy load.

This patch changes memcg's OOM logic as follows:
 * If memcg causes an OOM kill, continue to retry the charge.
 * Remove the jiffies check that is used now.
 * Add a memcg OOM lock which works like the per-zone OOM lock.
 * If current has been killed (as a process), bypass the charge.

Something more sophisticated can be added later, but this patch does the
fundamental things.
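
In code terms, the resulting charge path looks roughly like this (a condensed
sketch of the mm/memcontrol.c hunks shown below; reference counting and other
details are elided):

	if (unlikely(test_thread_flag(TIF_MEMDIE)
			|| fatal_signal_pending(current)))
		goto bypass;	/* task is already being killed: don't account */
	...
	if (!nr_retries--) {
		if (!oom)
			goto nomem;
		if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
			/* OOM killer ran (or we waited for it): retry the charge */
			nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
			continue;
		}
		/* current is dying: bypass the charge so it can exit */
		goto bypass;
	}
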
TODO:
 - add an OOM notifier
 - add a per-memcg disable-oom-kill flag and a freezer at OOM
 - more chances to wake up OOM waiters (when changing the memory limit, etc.)

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

3 changed files with 107 additions and 41 deletions

include/linux/memcontrol.h
... ... @@ -124,7 +124,6 @@
124 124 return false;
125 125 }
126 126  
127   -extern bool mem_cgroup_oom_called(struct task_struct *task);
128 127 void mem_cgroup_update_file_mapped(struct page *page, int val);
129 128 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
130 129 gfp_t gfp_mask, int nid,
... ... @@ -256,11 +255,6 @@
256 255 static inline bool mem_cgroup_disabled(void)
257 256 {
258 257 return true;
259   -}
260   -
261   -static inline bool mem_cgroup_oom_called(struct task_struct *task)
262   -{
263   - return false;
264 258 }
265 259  
266 260 static inline int
mm/memcontrol.c
... ... @@ -203,7 +203,7 @@
203 203 * Should the accounting and control be hierarchical, per subtree?
204 204 */
205 205 bool use_hierarchy;
206   - unsigned long last_oom_jiffies;
  206 + atomic_t oom_lock;
207 207 atomic_t refcnt;
208 208  
209 209 unsigned int swappiness;
... ... @@ -1246,35 +1246,105 @@
1246 1246 return total;
1247 1247 }
1248 1248  
1249   -bool mem_cgroup_oom_called(struct task_struct *task)
  1249 +static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1250 1250 {
1251   - bool ret = false;
1252   - struct mem_cgroup *mem;
1253   - struct mm_struct *mm;
  1251 + int *val = (int *)data;
  1252 + int x;
  1253 + /*
  1254 + * Logically, we can stop scanning immediately when we find
  1255 + * a memcg is already locked. But considering unlock ops and
  1256 + * creation/removal of memcg, scanning all is the simpler operation.
  1257 + */
  1258 + x = atomic_inc_return(&mem->oom_lock);
  1259 + *val = max(x, *val);
  1260 + return 0;
  1261 +}
  1262 +/*
  1263 + * Check whether the OOM killer is already running under our hierarchy.
  1264 + * If someone is running, return false.
  1265 + */
  1266 +static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
  1267 +{
  1268 + int lock_count = 0;
1254 1269  
1255   - rcu_read_lock();
1256   - mm = task->mm;
1257   - if (!mm)
1258   - mm = &init_mm;
1259   - mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
1260   - if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
1261   - ret = true;
1262   - rcu_read_unlock();
1263   - return ret;
  1270 + mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
  1271 +
  1272 + if (lock_count == 1)
  1273 + return true;
  1274 + return false;
1264 1275 }
1265 1276  
1266   -static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
  1277 +static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1267 1278 {
1268   - mem->last_oom_jiffies = jiffies;
  1279 + /*
  1280 + * When a new child is created while the hierarchy is under oom,
  1281 + * mem_cgroup_oom_lock() may not be called. We have to use
  1282 + * atomic_add_unless() here.
  1283 + */
  1284 + atomic_add_unless(&mem->oom_lock, -1, 0);
1269 1285 return 0;
1270 1286 }
1271 1287  
1272   -static void record_last_oom(struct mem_cgroup *mem)
  1288 +static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1273 1289 {
1274   - mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
  1290 + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1275 1291 }
1276 1292  
  1293 +static DEFINE_MUTEX(memcg_oom_mutex);
  1294 +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
  1295 +
1277 1296 /*
  1297 + * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  1298 + */
  1299 +bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
  1300 +{
  1301 + DEFINE_WAIT(wait);
  1302 + bool locked;
  1303 +
  1304 + /* At first, try to OOM lock hierarchy under mem.*/
  1305 + mutex_lock(&memcg_oom_mutex);
  1306 + locked = mem_cgroup_oom_lock(mem);
  1307 + /*
  1308 + * Even if signal_pending(), we can't quit charge() loop without
  1309 + * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
  1310 + * under OOM is always welcome, so use TASK_KILLABLE here.
  1311 + */
  1312 + if (!locked)
  1313 + prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
  1314 + mutex_unlock(&memcg_oom_mutex);
  1315 +
  1316 + if (locked)
  1317 + mem_cgroup_out_of_memory(mem, mask);
  1318 + else {
  1319 + schedule();
  1320 + finish_wait(&memcg_oom_waitq, &wait);
  1321 + }
  1322 + mutex_lock(&memcg_oom_mutex);
  1323 + mem_cgroup_oom_unlock(mem);
  1324 + /*
  1325 + * Here, we use a global waitq ... a more fine-grained waitq?
  1326 + * Assume the following hierarchy.
  1327 + * A/
  1328 + * 01
  1329 + * 02
  1330 + * assume OOM happens both in A and 01 at the same time. They are
  1331 + * mutually exclusive by lock. (kill in 01 helps A.)
  1332 + * When we use a per-memcg waitq, we have to wake up waiters on A and 02
  1333 + * in addition to waiters on 01. We use a global waitq to avoid the mess.
  1334 + * It will not be a big problem.
  1335 + * (And a task may be moved to other groups while it's waiting for OOM.)
  1336 + */
  1337 + wake_up_all(&memcg_oom_waitq);
  1338 + mutex_unlock(&memcg_oom_mutex);
  1339 +
  1340 + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
  1341 + return false;
  1342 + /* Give chance to dying process */
  1343 + schedule_timeout(1);
  1344 + return true;
  1345 +}
  1346 +
  1347 +/*
1278 1348 * Currently used to update mapped file statistics, but the routine can be
1279 1349 * generalized to update other statistics as well.
1280 1350 */
... ... @@ -1443,11 +1513,14 @@
1443 1513 struct res_counter *fail_res;
1444 1514 int csize = CHARGE_SIZE;
1445 1515  
1446   - if (unlikely(test_thread_flag(TIF_MEMDIE))) {
1447   - /* Don't account this! */
1448   - *memcg = NULL;
1449   - return 0;
1450   - }
  1516 + /*
  1517 + * Unlike the global VM's OOM kill, we're not in a system-level
  1518 + * memory shortage. So, allow a dying process to go ahead, in addition to a
  1519 + * MEMDIE process.
  1520 + */
  1521 + if (unlikely(test_thread_flag(TIF_MEMDIE)
  1522 + || fatal_signal_pending(current)))
  1523 + goto bypass;
1451 1524  
1452 1525 /*
1453 1526 * We always charge the cgroup the mm_struct belongs to.
1454 1527  
... ... @@ -1560,11 +1633,15 @@
1560 1633 }
1561 1634  
1562 1635 if (!nr_retries--) {
1563   - if (oom) {
1564   - mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
1565   - record_last_oom(mem_over_limit);
  1636 + if (!oom)
  1637 + goto nomem;
  1638 + if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
  1639 + nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
  1640 + continue;
1566 1641 }
1567   - goto nomem;
  1642 + /* When we reach here, the current task is dying. */
  1643 + css_put(&mem->css);
  1644 + goto bypass;
1568 1645 }
1569 1646 }
1570 1647 if (csize > PAGE_SIZE)
... ... @@ -1574,6 +1651,9 @@
1574 1651 nomem:
1575 1652 css_put(&mem->css);
1576 1653 return -ENOMEM;
  1654 +bypass:
  1655 + *memcg = NULL;
  1656 + return 0;
1577 1657 }
1578 1658  
1579 1659 /*
mm/oom_kill.c
... ... @@ -603,13 +603,6 @@
603 603 /* Got some memory back in the last second. */
604 604 return;
605 605  
606   - /*
607   - * If this is from memcg, oom-killer is already invoked.
608   - * and not worth to go system-wide-oom.
609   - */
610   - if (mem_cgroup_oom_called(current))
611   - goto rest_and_return;
612   -
613 606 if (sysctl_panic_on_oom)
614 607 panic("out of memory from page fault. panic_on_oom is selected.\n");
615 608  
... ... @@ -621,7 +614,6 @@
621 614 * Give "p" a good chance of killing itself before we
622 615 * retry to allocate memory.
623 616 */
624   -rest_and_return:
625 617 if (!test_thread_flag(TIF_MEMDIE))
626 618 schedule_timeout_uninterruptible(1);
627 619 }