Commit 867578cbccb0893cc14fc29c670f7185809c90d6

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 0263c12c12

memcg: fix oom kill behavior

In the current page-fault code:

	handle_mm_fault()
		-> ...
		-> mem_cgroup_charge()
		-> map page or handle error.
	-> check return code.

If the page fault's return code is VM_FAULT_OOM, pagefault_out_of_memory() is
called.  But if the fault was caused by a memcg limit, the memcg OOM killer
should already have been invoked, so there is no point in running the global
one again.
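
For reference, the caller side looks roughly like this.  This is a condensed,
illustrative sketch modelled on the arch fault handlers (e.g. x86's), not the
literal kernel code:

	fault = handle_mm_fault(mm, vma, address, flags);
	if (unlikely(fault & VM_FAULT_OOM)) {
		/* release mmap_sem before going into the OOM path */
		up_read(&mm->mmap_sem);
		/* may invoke the global OOM killer */
		pagefault_out_of_memory();
		return;
	}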

Then, I added a patch, a636b327f731143ccc544b966cfd8de6cb6d72c6, which records
last_oom_jiffies for the memcg's sub-hierarchy and prevents
pagefault_out_of_memory() from being invoked again in the near future.

But Nishimura-san reported that the jiffies check is not enough when the
system is under very heavy load.

This patch changes memcg's OOM logic as follows:
 * If memcg causes an OOM kill, continue to retry the charge.
 * Remove the jiffies check that is used now.
 * Add a memcg OOM lock which works like the per-zone OOM lock.
 * If current has been killed (as a process), bypass the charge.

Something more sophisticated can be added later, but this patch does the
fundamental things.
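
In code terms, the resulting charge path looks roughly like this (a condensed
sketch of the mm/memcontrol.c hunks shown below; reference counting and other
details are elided):

	if (unlikely(test_thread_flag(TIF_MEMDIE)
			|| fatal_signal_pending(current)))
		goto bypass;	/* task is already being killed: don't account */
	...
	if (!nr_retries--) {
		if (!oom)
			goto nomem;
		if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
			/* OOM killer ran (or we waited for it): retry the charge */
			nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
			continue;
		}
		/* current is dying: bypass the charge so it can exit */
		goto bypass;
	}
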
TODO:
 - add an OOM notifier
 - add a per-memcg disable-oom-kill flag and a freezer at OOM
 - more chances to wake up OOM waiters (when changing the memory limit, etc.)

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

3 changed files with 107 additions and 41 deletions

include/linux/memcontrol.h
... ... @@ -124,7 +124,6 @@
124 124 return false;
125 125 }
126 126  
127   -extern bool mem_cgroup_oom_called(struct task_struct *task);
128 127 void mem_cgroup_update_file_mapped(struct page *page, int val);
129 128 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
130 129 gfp_t gfp_mask, int nid,
... ... @@ -256,11 +255,6 @@
256 255 static inline bool mem_cgroup_disabled(void)
257 256 {
258 257 return true;
259   -}
260   -
261   -static inline bool mem_cgroup_oom_called(struct task_struct *task)
262   -{
263   - return false;
264 258 }
265 259  
266 260 static inline int
mm/memcontrol.c
... ... @@ -203,7 +203,7 @@
203 203 * Should the accounting and control be hierarchical, per subtree?
204 204 */
205 205 bool use_hierarchy;
206   - unsigned long last_oom_jiffies;
  206 + atomic_t oom_lock;
207 207 atomic_t refcnt;
208 208  
209 209 unsigned int swappiness;
... ... @@ -1246,35 +1246,105 @@
1246 1246 return total;
1247 1247 }
1248 1248  
1249   -bool mem_cgroup_oom_called(struct task_struct *task)
  1249 +static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1250 1250 {
1251   - bool ret = false;
1252   - struct mem_cgroup *mem;
1253   - struct mm_struct *mm;
  1251 + int *val = (int *)data;
  1252 + int x;
  1253 + /*
  1254 + * Logically, we can stop scanning immediately when we find
  1255 + * a memcg is already locked. But considering unlock ops and
  1256 + * creation/removal of memcg, scanning all is the simpler operation.
  1257 + */
  1258 + x = atomic_inc_return(&mem->oom_lock);
  1259 + *val = max(x, *val);
  1260 + return 0;
  1261 +}
  1262 +/*
  1263 + * Check whether the OOM killer is already running under our hierarchy.
  1264 + * If someone is running, return false.
  1265 + */
  1266 +static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
  1267 +{
  1268 + int lock_count = 0;
1254 1269  
1255   - rcu_read_lock();
1256   - mm = task->mm;
1257   - if (!mm)
1258   - mm = &init_mm;
1259   - mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
1260   - if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
1261   - ret = true;
1262   - rcu_read_unlock();
1263   - return ret;
  1270 + mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
  1271 +
  1272 + if (lock_count == 1)
  1273 + return true;
  1274 + return false;
1264 1275 }
1265 1276  
1266   -static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
  1277 +static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
1267 1278 {
1268   - mem->last_oom_jiffies = jiffies;
  1279 + /*
  1280 + * When a new child is created while the hierarchy is under oom,
  1281 + * mem_cgroup_oom_lock() may not be called. We have to use
  1282 + * atomic_add_unless() here.
  1283 + */
  1284 + atomic_add_unless(&mem->oom_lock, -1, 0);
1269 1285 return 0;
1270 1286 }
1271 1287  
1272   -static void record_last_oom(struct mem_cgroup *mem)
  1288 +static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1273 1289 {
1274   - mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
  1290 + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1275 1291 }
1276 1292  
  1293 +static DEFINE_MUTEX(memcg_oom_mutex);
  1294 +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
  1295 +
1277 1296 /*
  1297 + * try to call OOM killer. returns false if we should exit memory-reclaim loop.
  1298 + */
  1299 +bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
  1300 +{
  1301 + DEFINE_WAIT(wait);
  1302 + bool locked;
  1303 +
  1304 + /* At first, try to OOM lock hierarchy under mem.*/
  1305 + mutex_lock(&memcg_oom_mutex);
  1306 + locked = mem_cgroup_oom_lock(mem);
  1307 + /*
  1308 + * Even if signal_pending(), we can't quit charge() loop without
  1309 + * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
  1310 + * under OOM is always welcome, so use TASK_KILLABLE here.
  1311 + */
  1312 + if (!locked)
  1313 + prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
  1314 + mutex_unlock(&memcg_oom_mutex);
  1315 +
  1316 + if (locked)
  1317 + mem_cgroup_out_of_memory(mem, mask);
  1318 + else {
  1319 + schedule();
  1320 + finish_wait(&memcg_oom_waitq, &wait);
  1321 + }
  1322 + mutex_lock(&memcg_oom_mutex);
  1323 + mem_cgroup_oom_unlock(mem);
  1324 + /*
  1325 + * Here, we use a global waitq ... a more fine-grained waitq?
  1326 + * Assume the following hierarchy.
  1327 + * A/
  1328 + * 01
  1329 + * 02
  1330 + * assume OOM happens both in A and 01 at the same time. They are
  1331 + * mutually exclusive by lock. (kill in 01 helps A.)
  1332 + * When we use a per-memcg waitq, we have to wake up waiters on A and 02
  1333 + * in addition to waiters on 01. We use a global waitq to avoid the mess.
  1334 + * It will not be a big problem.
  1335 + * (And a task may be moved to other groups while it's waiting for OOM.)
  1336 + */
  1337 + wake_up_all(&memcg_oom_waitq);
  1338 + mutex_unlock(&memcg_oom_mutex);
  1339 +
  1340 + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
  1341 + return false;
  1342 + /* Give chance to dying process */
  1343 + schedule_timeout(1);
  1344 + return true;
  1345 +}
  1346 +
  1347 +/*
1278 1348 * Currently used to update mapped file statistics, but the routine can be
1279 1349 * generalized to update other statistics as well.
1280 1350 */
... ... @@ -1443,11 +1513,14 @@
1443 1513 struct res_counter *fail_res;
1444 1514 int csize = CHARGE_SIZE;
1445 1515  
1446   - if (unlikely(test_thread_flag(TIF_MEMDIE))) {
1447   - /* Don't account this! */
1448   - *memcg = NULL;
1449   - return 0;
1450   - }
  1516 + /*
  1517 + * Unlike the global VM's OOM kill, we're not in a system-level
  1518 + * memory shortage. So, allow a dying process to go ahead, in addition to a
  1519 + * MEMDIE process.
  1520 + */
  1521 + if (unlikely(test_thread_flag(TIF_MEMDIE)
  1522 + || fatal_signal_pending(current)))
  1523 + goto bypass;
1451 1524  
1452 1525 /*
1453 1526 * We always charge the cgroup the mm_struct belongs to.
1454 1527  
... ... @@ -1560,11 +1633,15 @@
1560 1633 }
1561 1634  
1562 1635 if (!nr_retries--) {
1563   - if (oom) {
1564   - mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
1565   - record_last_oom(mem_over_limit);
  1636 + if (!oom)
  1637 + goto nomem;
  1638 + if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
  1639 + nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
  1640 + continue;
1566 1641 }
1567   - goto nomem;
  1642 + /* When we reach here, the current task is dying. */
  1643 + css_put(&mem->css);
  1644 + goto bypass;
1568 1645 }
1569 1646 }
1570 1647 if (csize > PAGE_SIZE)
... ... @@ -1574,6 +1651,9 @@
1574 1651 nomem:
1575 1652 css_put(&mem->css);
1576 1653 return -ENOMEM;
  1654 +bypass:
  1655 + *memcg = NULL;
  1656 + return 0;
1577 1657 }
1578 1658  
1579 1659 /*
mm/oom_kill.c
... ... @@ -603,13 +603,6 @@
603 603 /* Got some memory back in the last second. */
604 604 return;
605 605  
606   - /*
607   - * If this is from memcg, oom-killer is already invoked.
608   - * and not worth to go system-wide-oom.
609   - */
610   - if (mem_cgroup_oom_called(current))
611   - goto rest_and_return;
612   -
613 606 if (sysctl_panic_on_oom)
614 607 panic("out of memory from page fault. panic_on_oom is selected.\n");
615 608  
... ... @@ -621,7 +614,6 @@
621 614 * Give "p" a good chance of killing itself before we
622 615 * retry to allocate memory.
623 616 */
624   -rest_and_return:
625 617 if (!test_thread_flag(TIF_MEMDIE))
626 618 schedule_timeout_uninterruptible(1);
627 619 }