Commit dc98df5a1b7be402a0e1c71f1b89ccf249ac15ee

Authored by KAMEZAWA Hiroyuki
Committed by Linus Torvalds
1 parent 595f4b694c

memcg: oom wakeup filter

memcg's oom waitqueue is a system-wide wait_queue (for handling the
hierarchy). So, it's better to add a custom wake function and do the
filtering in the wake-up path.

This patch adds a filtering feature for waking up OOM waiters.  The
hierarchy is properly handled.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 46 additions and 17 deletions Side-by-side Diff

... ... @@ -1293,14 +1293,56 @@
1293 1293 static DEFINE_MUTEX(memcg_oom_mutex);
1294 1294 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1295 1295  
  1296 +struct oom_wait_info {
  1297 + struct mem_cgroup *mem;
  1298 + wait_queue_t wait;
  1299 +};
  1300 +
  1301 +static int memcg_oom_wake_function(wait_queue_t *wait,
  1302 + unsigned mode, int sync, void *arg)
  1303 +{
  1304 + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
  1305 + struct oom_wait_info *oom_wait_info;
  1306 +
  1307 + oom_wait_info = container_of(wait, struct oom_wait_info, wait);
  1308 +
  1309 + if (oom_wait_info->mem == wake_mem)
  1310 + goto wakeup;
  1311 + /* if no hierarchy, no match */
  1312 + if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
  1313 + return 0;
  1314 + /*
  1315 + * Both of oom_wait_info->mem and wake_mem are stable under us.
  1316 + * Then we can use css_is_ancestor without taking care of RCU.
  1317 + */
  1318 + if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
  1319 + !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
  1320 + return 0;
  1321 +
  1322 +wakeup:
  1323 + return autoremove_wake_function(wait, mode, sync, arg);
  1324 +}
  1325 +
  1326 +static void memcg_wakeup_oom(struct mem_cgroup *mem)
  1327 +{
  1328 + /* for filtering, pass "mem" as argument. */
  1329 + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
  1330 +}
  1331 +
1296 1332 /*
1297 1333 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1298 1334 */
1299 1335 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1300 1336 {
1301   - DEFINE_WAIT(wait);
  1337 + struct oom_wait_info owait;
1302 1338 bool locked;
1303 1339  
  1340 + owait.mem = mem;
  1341 + owait.wait.flags = 0;
  1342 + owait.wait.func = memcg_oom_wake_function;
  1343 + owait.wait.private = current;
  1344 + INIT_LIST_HEAD(&owait.wait.task_list);
  1345 +
1304 1346 /* At first, try to OOM lock hierarchy under mem.*/
1305 1347 mutex_lock(&memcg_oom_mutex);
1306 1348 locked = mem_cgroup_oom_lock(mem);
1307 1349  
1308 1350  
... ... @@ -1310,31 +1352,18 @@
1310 1352 * under OOM is always welcomed, use TASK_KILLABLE here.
1311 1353 */
1312 1354 if (!locked)
1313   - prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
  1355 + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1314 1356 mutex_unlock(&memcg_oom_mutex);
1315 1357  
1316 1358 if (locked)
1317 1359 mem_cgroup_out_of_memory(mem, mask);
1318 1360 else {
1319 1361 schedule();
1320   - finish_wait(&memcg_oom_waitq, &wait);
  1362 + finish_wait(&memcg_oom_waitq, &owait.wait);
1321 1363 }
1322 1364 mutex_lock(&memcg_oom_mutex);
1323 1365 mem_cgroup_oom_unlock(mem);
1324   - /*
1325   - * Here, we use global waitq .....more fine grained waitq ?
1326   - * Assume following hierarchy.
1327   - * A/
1328   - * 01
1329   - * 02
1330   - * assume OOM happens both in A and 01 at the same time. Tthey are
1331   - * mutually exclusive by lock. (kill in 01 helps A.)
1332   - * When we use per memcg waitq, we have to wake up waiters on A and 02
1333   - * in addtion to waiters on 01. We use global waitq for avoiding mess.
1334   - * It will not be a big problem.
1335   - * (And a task may be moved to other groups while it's waiting for OOM.)
1336   - */
1337   - wake_up_all(&memcg_oom_waitq);
  1366 + memcg_wakeup_oom(mem);
1338 1367 mutex_unlock(&memcg_oom_mutex);
1339 1368  
1340 1369 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))