Commit 867578cbccb0893cc14fc29c670f7185809c90d6
Committed by: Linus Torvalds
1 parent: 0263c12c12
Exists in: master and 4 other branches
memcg: fix oom kill behavior
In the current page-fault code:

  handle_mm_fault()
    -> ...
    -> mem_cgroup_charge()
    -> map page or handle error
  -> check return code

If the page fault's return code is VM_FAULT_OOM, page_fault_out_of_memory() is called. But if the fault was caused by memcg, the OOM killer should already have been invoked. To handle that, I earlier added a patch: a636b327f731143ccc544b966cfd8de6cb6d72c6. That patch records last_oom_jiffies for memcg's sub-hierarchy and prevents page_fault_out_of_memory() from being invoked in the near future.

But Nishimura-san reported that the jiffies check is not enough when the system is terribly heavy.

This patch changes memcg's OOM logic as follows:
 * If memcg causes an OOM kill, continue to retry.
 * Remove the jiffies check that is used now.
 * Add a memcg-oom-lock which works like the per-zone OOM lock.
 * If current is killed (as a process), bypass the charge.

Something more sophisticated can be added later, but this patch does the fundamental things.

TODO:
 - add an OOM notifier
 - add a per-memcg disable-oom-kill flag and a freezer at OOM
 - more chances to wake up OOM waiters (when changing the memory limit, etc.)

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
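At the heart of the fix is the memcg-oom-lock added below: every memcg carries an atomic_t oom_lock counter, mem_cgroup_oom_lock() walks the whole sub-hierarchy incrementing each counter while remembering the largest value seen, and only the walker whose maximum is 1 owns the OOM kill; everyone else sleeps on the global memcg_oom_waitq and retries. The following is a minimal, stand-alone user-space sketch of that counting scheme, not the kernel code itself; the names (struct memcg_node, oom_trylock, oom_unlock, walk) are made up for illustration, and the real code additionally serializes the walks with memcg_oom_mutex.

/*
 * Minimal user-space model of the memcg-oom-lock counting scheme.
 * NOTE: illustrative code only, not the kernel implementation;
 * struct memcg_node, oom_trylock(), oom_unlock() and walk() are
 * invented names. Build with: cc -std=c11 oom_lock_demo.c
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct memcg_node {
	atomic_int oom_lock;               /* like mem_cgroup->oom_lock */
	struct memcg_node *children[4];
	int nr_children;
};

/* Visit every node in the sub-hierarchy, like mem_cgroup_walk_tree(). */
static void walk(struct memcg_node *mem,
		 void (*fn)(struct memcg_node *, int *), int *val)
{
	fn(mem, val);
	for (int i = 0; i < mem->nr_children; i++)
		walk(mem->children[i], fn, val);
}

static void lock_cb(struct memcg_node *mem, int *val)
{
	int x = atomic_fetch_add(&mem->oom_lock, 1) + 1;

	if (x > *val)
		*val = x;                  /* remember the largest count seen */
}

static void unlock_cb(struct memcg_node *mem, int *val)
{
	int old = atomic_load(&mem->oom_lock);

	(void)val;
	/* like atomic_add_unless(): never let the counter go below zero */
	while (old > 0 &&
	       !atomic_compare_exchange_weak(&mem->oom_lock, &old, old - 1))
		;
}

/* True if no other OOM kill is running anywhere in this sub-hierarchy. */
static bool oom_trylock(struct memcg_node *mem)
{
	int max_count = 0;

	walk(mem, lock_cb, &max_count);
	return max_count == 1;
}

static void oom_unlock(struct memcg_node *mem)
{
	walk(mem, unlock_cb, NULL);
}

int main(void)
{
	static struct memcg_node parent, child;   /* zero-initialized */

	parent.children[0] = &child;
	parent.nr_children = 1;

	printf("parent owns the OOM kill: %d\n", oom_trylock(&parent)); /* 1 */
	printf("child owns the OOM kill:  %d\n", oom_trylock(&child));  /* 0 */
	oom_unlock(&child);
	oom_unlock(&parent);
	return 0;
}

In the demo, the parent's walk sees a maximum of 1 on every counter and so owns the kill, while the child's later attempt sees 2 on its own counter and must wait; this mirrors the mutual exclusion between A and 01 described in the comment inside mem_cgroup_handle_oom() below.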
Showing 3 changed files with 107 additions and 41 deletions (side-by-side diff)
include/linux/memcontrol.h
... | ... | @@ -124,7 +124,6 @@ |
124 | 124 | return false; |
125 | 125 | } |
126 | 126 | |
127 | -extern bool mem_cgroup_oom_called(struct task_struct *task); | |
128 | 127 | void mem_cgroup_update_file_mapped(struct page *page, int val); |
129 | 128 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
130 | 129 | gfp_t gfp_mask, int nid, |
... | ... | @@ -256,11 +255,6 @@ |
256 | 255 | static inline bool mem_cgroup_disabled(void) |
257 | 256 | { |
258 | 257 | return true; |
259 | -} | |
260 | - | |
261 | -static inline bool mem_cgroup_oom_called(struct task_struct *task) | |
262 | -{ | |
263 | - return false; | |
264 | 258 | } |
265 | 259 | |
266 | 260 | static inline int |
mm/memcontrol.c
... | ... | @@ -203,7 +203,7 @@ |
203 | 203 | * Should the accounting and control be hierarchical, per subtree? |
204 | 204 | */ |
205 | 205 | bool use_hierarchy; |
206 | - unsigned long last_oom_jiffies; | |
206 | + atomic_t oom_lock; | |
207 | 207 | atomic_t refcnt; |
208 | 208 | |
209 | 209 | unsigned int swappiness; |
210 | 210 | |
211 | 211 | |
212 | 212 | |
213 | 213 | |
214 | 214 | |
215 | 215 | |
216 | 216 | |
217 | 217 | |
... | ... | @@ -1246,35 +1246,105 @@ |
1246 | 1246 | return total; |
1247 | 1247 | } |
1248 | 1248 | |
1249 | -bool mem_cgroup_oom_called(struct task_struct *task) | |
1249 | +static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) | |
1250 | 1250 | { |
1251 | - bool ret = false; | |
1252 | - struct mem_cgroup *mem; | |
1253 | - struct mm_struct *mm; | |
1251 | + int *val = (int *)data; | |
1252 | + int x; | |
1253 | + /* | |
1254 | + * Logically, we can stop scanning immediately when we find | |
1255 | + * a memcg is already locked. But considering unlock ops and | 
1256 | + * creation/removal of memcg, scanning all is the simpler operation. | 
1257 | + */ | |
1258 | + x = atomic_inc_return(&mem->oom_lock); | |
1259 | + *val = max(x, *val); | |
1260 | + return 0; | |
1261 | +} | |
1262 | +/* | |
1263 | + * Check whether the OOM killer is already running under our hierarchy. | 
1264 | + * If someone else is running it, return false. | 
1265 | + */ | |
1266 | +static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | |
1267 | +{ | |
1268 | + int lock_count = 0; | |
1254 | 1269 | |
1255 | - rcu_read_lock(); | |
1256 | - mm = task->mm; | |
1257 | - if (!mm) | |
1258 | - mm = &init_mm; | |
1259 | - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | |
1260 | - if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | |
1261 | - ret = true; | |
1262 | - rcu_read_unlock(); | |
1263 | - return ret; | |
1270 | + mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); | |
1271 | + | |
1272 | + if (lock_count == 1) | |
1273 | + return true; | |
1274 | + return false; | |
1264 | 1275 | } |
1265 | 1276 | |
1266 | -static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | |
1277 | +static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) | |
1267 | 1278 | { |
1268 | - mem->last_oom_jiffies = jiffies; | |
1279 | + /* | |
1280 | + * When a new child is created while the hierarchy is under oom, | |
1281 | + * mem_cgroup_oom_lock() may not be called. We have to use | |
1282 | + * atomic_add_unless() here. | |
1283 | + */ | |
1284 | + atomic_add_unless(&mem->oom_lock, -1, 0); | |
1269 | 1285 | return 0; |
1270 | 1286 | } |
1271 | 1287 | |
1272 | -static void record_last_oom(struct mem_cgroup *mem) | |
1288 | +static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) | |
1273 | 1289 | { |
1274 | - mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | |
1290 | + mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); | |
1275 | 1291 | } |
1276 | 1292 | |
1293 | +static DEFINE_MUTEX(memcg_oom_mutex); | |
1294 | +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | |
1295 | + | |
1277 | 1296 | /* |
1297 | + * Try to call the OOM killer; returns false if we should exit the memory-reclaim loop. | 
1298 | + */ | |
1299 | +bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |
1300 | +{ | |
1301 | + DEFINE_WAIT(wait); | |
1302 | + bool locked; | |
1303 | + | |
1304 | + /* First, try to take the OOM lock on the hierarchy under mem. */ | 
1305 | + mutex_lock(&memcg_oom_mutex); | |
1306 | + locked = mem_cgroup_oom_lock(mem); | |
1307 | + /* | |
1308 | + * Even if signal_pending(), we can't quit charge() loop without | |
1309 | + * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | |
1310 | + * under OOM is always welcome, so use TASK_KILLABLE here. | 
1311 | + */ | |
1312 | + if (!locked) | |
1313 | + prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | |
1314 | + mutex_unlock(&memcg_oom_mutex); | |
1315 | + | |
1316 | + if (locked) | |
1317 | + mem_cgroup_out_of_memory(mem, mask); | |
1318 | + else { | |
1319 | + schedule(); | |
1320 | + finish_wait(&memcg_oom_waitq, &wait); | |
1321 | + } | |
1322 | + mutex_lock(&memcg_oom_mutex); | |
1323 | + mem_cgroup_oom_unlock(mem); | |
1324 | + /* | |
1325 | + * Here, we use a global waitq; should it be a more fine-grained waitq? | 
1326 | + * Assume following hierarchy. | |
1327 | + * A/ | |
1328 | + * 01 | |
1329 | + * 02 | |
1330 | + * Assume OOM happens in both A and 01 at the same time. They are | 
1331 | + * mutually exclusive by the lock. (A kill in 01 helps A.) | 
1332 | + * If we used a per-memcg waitq, we would have to wake up waiters on A and 02 | 
1333 | + * in addition to waiters on 01. We use a global waitq to avoid the mess. | 
1334 | + * It will not be a big problem. | |
1335 | + * (And a task may be moved to other groups while it's waiting for OOM.) | |
1336 | + */ | |
1337 | + wake_up_all(&memcg_oom_waitq); | |
1338 | + mutex_unlock(&memcg_oom_mutex); | |
1339 | + | |
1340 | + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | |
1341 | + return false; | |
1342 | + /* Give a chance to the dying process */ | 
1343 | + schedule_timeout(1); | |
1344 | + return true; | |
1345 | +} | |
1346 | + | |
1347 | +/* | |
1278 | 1348 | * Currently used to update mapped file statistics, but the routine can be |
1279 | 1349 | * generalized to update other statistics as well. |
1280 | 1350 | */ |
... | ... | @@ -1443,11 +1513,14 @@ |
1443 | 1513 | struct res_counter *fail_res; |
1444 | 1514 | int csize = CHARGE_SIZE; |
1445 | 1515 | |
1446 | - if (unlikely(test_thread_flag(TIF_MEMDIE))) { | |
1447 | - /* Don't account this! */ | |
1448 | - *memcg = NULL; | |
1449 | - return 0; | |
1450 | - } | |
1516 | + /* | |
1517 | + * Unlike the global VM's OOM kill, we're not under a system-level | 
1518 | + * memory shortage. So, allow a dying process to go ahead, in addition | 
1519 | + * to a MEMDIE process. | 
1520 | + */ | |
1521 | + if (unlikely(test_thread_flag(TIF_MEMDIE) | |
1522 | + || fatal_signal_pending(current))) | |
1523 | + goto bypass; | |
1451 | 1524 | |
1452 | 1525 | /* |
1453 | 1526 | * We always charge the cgroup the mm_struct belongs to. |
1454 | 1527 | |
... | ... | @@ -1560,11 +1633,15 @@ |
1560 | 1633 | } |
1561 | 1634 | |
1562 | 1635 | if (!nr_retries--) { |
1563 | - if (oom) { | |
1564 | - mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | |
1565 | - record_last_oom(mem_over_limit); | |
1636 | + if (!oom) | |
1637 | + goto nomem; | |
1638 | + if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { | |
1639 | + nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | |
1640 | + continue; | |
1566 | 1641 | } |
1567 | - goto nomem; | |
1642 | + /* When we reach here, the current task is dying. */ | 
1643 | + css_put(&mem->css); | |
1644 | + goto bypass; | |
1568 | 1645 | } |
1569 | 1646 | } |
1570 | 1647 | if (csize > PAGE_SIZE) |
... | ... | @@ -1574,6 +1651,9 @@ |
1574 | 1651 | nomem: |
1575 | 1652 | css_put(&mem->css); |
1576 | 1653 | return -ENOMEM; |
1654 | +bypass: | |
1655 | + *memcg = NULL; | |
1656 | + return 0; | |
1577 | 1657 | } |
1578 | 1658 | |
1579 | 1659 | /* |
mm/oom_kill.c
... | ... | @@ -603,13 +603,6 @@ |
603 | 603 | /* Got some memory back in the last second. */ |
604 | 604 | return; |
605 | 605 | |
606 | - /* | |
607 | - * If this is from memcg, oom-killer is already invoked. | |
608 | - * and not worth to go system-wide-oom. | |
609 | - */ | |
610 | - if (mem_cgroup_oom_called(current)) | |
611 | - goto rest_and_return; | |
612 | - | |
613 | 606 | if (sysctl_panic_on_oom) |
614 | 607 | panic("out of memory from page fault. panic_on_oom is selected.\n"); |
615 | 608 | |
... | ... | @@ -621,7 +614,6 @@ |
621 | 614 | * Give "p" a good chance of killing itself before we |
622 | 615 | * retry to allocate memory. |
623 | 616 | */ |
624 | -rest_and_return: | |
625 | 617 | if (!test_thread_flag(TIF_MEMDIE)) |
626 | 618 | schedule_timeout_uninterruptible(1); |
627 | 619 | } |