Commit 6f48d0ebd907ae419387f27b602ee98870cfa7bb

Authored by David Rientjes
Committed by Linus Torvalds
1 parent 5e9d834a0e

oom: select task from tasklist for mempolicy ooms

The oom killer presently kills current whenever there is no more memory
free or reclaimable on its mempolicy's nodes.  There is no guarantee that
current is a memory-hogging task or that killing it will free any
substantial amount of memory, however.

In such situations, it is better to scan the tasklist for tasks that are
allowed to allocate on current's set of nodes and kill the task with the
highest badness() score.  This ensures that the most memory-hogging task,
or the one configured by the user with /proc/pid/oom_adj, is always
selected in such scenarios.
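
As a rough illustration of the new selection rule (not part of the commit; all
names and the bitmask node representation below are hypothetical user-space
stand-ins for the kernel's tasklist scan and badness() heuristic):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the task_struct fields used by the selection logic. */
struct task {
	const char *comm;          /* task name */
	unsigned long badness;     /* stand-in for a badness() score */
	unsigned long mems;        /* bitmask of nodes the task may allocate from */
};

/* Eligible only if the task's allowed nodes intersect the constrained nodemask. */
static bool intersects(const struct task *t, unsigned long oom_mask)
{
	return (t->mems & oom_mask) != 0;
}

int main(void)
{
	struct task tasks[] = {
		{ "current-small-task",  10, 0x1 },   /* bound to node 0 */
		{ "memory-hog",         500, 0x3 },   /* nodes 0 and 1 */
		{ "hog-on-other-node",  900, 0x4 },   /* node 2 only */
	};
	unsigned long oom_mask = 0x1;             /* node 0 is out of memory */
	const struct task *chosen = NULL;
	unsigned int i;

	/* Scan the "tasklist"; keep the eligible task with the highest score. */
	for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		if (!intersects(&tasks[i], oom_mask))
			continue;
		if (!chosen || tasks[i].badness > chosen->badness)
			chosen = &tasks[i];
	}

	/* Prints "memory-hog": the hog on node 2 is ineligible and current is
	 * not the biggest consumer, so current is no longer killed by default. */
	printf("would kill: %s\n", chosen ? chosen->comm : "(none)");
	return 0;
}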

Signed-off-by: David Rientjes <rientjes@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 124 additions and 37 deletions

include/linux/mempolicy.h
... ... @@ -210,6 +210,8 @@
210 210 unsigned long addr, gfp_t gfp_flags,
211 211 struct mempolicy **mpol, nodemask_t **nodemask);
212 212 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
  213 +extern bool mempolicy_nodemask_intersects(struct task_struct *tsk,
  214 + const nodemask_t *mask);
213 215 extern unsigned slab_node(struct mempolicy *policy);
214 216  
215 217 extern enum zone_type policy_zone;
... ... @@ -338,7 +340,16 @@
338 340 return node_zonelist(0, gfp_flags);
339 341 }
340 342  
341   -static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; }
  343 +static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
  344 +{
  345 + return false;
  346 +}
  347 +
  348 +static inline bool mempolicy_nodemask_intersects(struct task_struct *tsk,
  349 + const nodemask_t *mask)
  350 +{
  351 + return false;
  352 +}
342 353  
343 354 static inline int do_migrate_pages(struct mm_struct *mm,
344 355 const nodemask_t *from_nodes,

mm/mempolicy.c
... ... @@ -1712,6 +1712,50 @@
1712 1712 }
1713 1713 #endif
1714 1714  
  1715 +/*
  1716 + * mempolicy_nodemask_intersects
  1717 + *
  1718 + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
  1719 + * policy. Otherwise, check for intersection between mask and the policy
  1720 + * policy nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
  1721 + * policy, always return true since it may allocate elsewhere on fallback.
  1722 + *
  1723 + * Takes task_lock(tsk) to prevent freeing of its mempolicy.
  1724 + */
  1725 +bool mempolicy_nodemask_intersects(struct task_struct *tsk,
  1726 + const nodemask_t *mask)
  1727 +{
  1728 + struct mempolicy *mempolicy;
  1729 + bool ret = true;
  1730 +
  1731 + if (!mask)
  1732 + return ret;
  1733 + task_lock(tsk);
  1734 + mempolicy = tsk->mempolicy;
  1735 + if (!mempolicy)
  1736 + goto out;
  1737 +
  1738 + switch (mempolicy->mode) {
  1739 + case MPOL_PREFERRED:
  1740 + /*
  1741 + * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
  1742 + * allocate from, they may fallback to other nodes when oom.
  1743 + * Thus, it's possible for tsk to have allocated memory from
  1744 + * nodes in mask.
  1745 + */
  1746 + break;
  1747 + case MPOL_BIND:
  1748 + case MPOL_INTERLEAVE:
  1749 + ret = nodes_intersects(mempolicy->v.nodes, *mask);
  1750 + break;
  1751 + default:
  1752 + BUG();
  1753 + }
  1754 +out:
  1755 + task_unlock(tsk);
  1756 + return ret;
  1757 +}
  1758 +
1715 1759 /* Allocate a page in interleaved policy.
1716 1760 Own path because it needs to do special accounting. */
1717 1761 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
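
A rough user-space model of the decision table implemented by
mempolicy_nodemask_intersects() above (illustrative only; the enum and bitmask
types are hypothetical stand-ins for the kernel's mempolicy modes and
nodemask_t): 'default', 'preferred', and 'local' policies are always treated as
possibly intersecting, while 'bind' and 'interleave' require an actual overlap
with the oom nodemask.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's mempolicy modes. */
enum mode { MODE_DEFAULT, MODE_PREFERRED, MODE_BIND, MODE_INTERLEAVE };

/* Mirror of the intersection rule: preferred/local may fall back anywhere. */
static bool policy_intersects(enum mode mode, unsigned long policy_nodes,
			      unsigned long oom_nodes)
{
	switch (mode) {
	case MODE_DEFAULT:
	case MODE_PREFERRED:
		return true;                            /* may allocate elsewhere on fallback */
	case MODE_BIND:
	case MODE_INTERLEAVE:
		return (policy_nodes & oom_nodes) != 0; /* needs a real overlap */
	}
	return true;
}

int main(void)
{
	assert(policy_intersects(MODE_BIND, 0x3, 0x4) == false);       /* bind {0,1} vs oom {2} */
	assert(policy_intersects(MODE_INTERLEAVE, 0x6, 0x4) == true);  /* interleave {1,2} vs oom {2} */
	assert(policy_intersects(MODE_PREFERRED, 0x4, 0x1) == true);   /* preferred: always eligible */
	printf("all cases behave as expected\n");
	return 0;
}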

mm/oom_kill.c
... ... @@ -27,6 +27,7 @@
27 27 #include <linux/module.h>
28 28 #include <linux/notifier.h>
29 29 #include <linux/memcontrol.h>
  30 +#include <linux/mempolicy.h>
30 31 #include <linux/security.h>
31 32  
32 33 int sysctl_panic_on_oom;
... ... @@ -35,23 +36,57 @@
35 36 static DEFINE_SPINLOCK(zone_scan_lock);
36 37 /* #define DEBUG */
37 38  
38   -/*
39   - * Is all threads of the target process nodes overlap ours?
  39 +#ifdef CONFIG_NUMA
  40 +/**
  41 + * has_intersects_mems_allowed() - check task eligibility for kill
  42 + * @tsk: task struct of which task to consider
  43 + * @mask: nodemask passed to page allocator for mempolicy ooms
  44 + *
  45 + * Task eligibility is determined by whether or not a candidate task, @tsk,
  46 + * shares the same mempolicy nodes as current if it is bound by such a policy
  47 + * and whether or not it has the same set of allowed cpuset nodes.
40 48 */
41   -static int has_intersects_mems_allowed(struct task_struct *tsk)
  49 +static bool has_intersects_mems_allowed(struct task_struct *tsk,
  50 + const nodemask_t *mask)
42 51 {
43   - struct task_struct *t;
  52 + struct task_struct *start = tsk;
44 53  
45   - t = tsk;
46 54 do {
47   - if (cpuset_mems_allowed_intersects(current, t))
48   - return 1;
49   - t = next_thread(t);
50   - } while (t != tsk);
51   -
52   - return 0;
  55 + if (mask) {
  56 + /*
  57 + * If this is a mempolicy constrained oom, tsk's
  58 + * cpuset is irrelevant. Only return true if its
  59 + * mempolicy intersects current, otherwise it may be
  60 + * needlessly killed.
  61 + */
  62 + if (mempolicy_nodemask_intersects(tsk, mask))
  63 + return true;
  64 + } else {
  65 + /*
  66 + * This is not a mempolicy constrained oom, so only
  67 + * check the mems of tsk's cpuset.
  68 + */
  69 + if (cpuset_mems_allowed_intersects(current, tsk))
  70 + return true;
  71 + }
  72 + tsk = next_thread(tsk);
  73 + } while (tsk != start);
  74 + return false;
53 75 }
  76 +#else
  77 +static bool has_intersects_mems_allowed(struct task_struct *tsk,
  78 + const nodemask_t *mask)
  79 +{
  80 + return true;
  81 +}
  82 +#endif /* CONFIG_NUMA */
54 83  
  84 +/*
  85 + * The process p may have detached its own ->mm while exiting or through
  86 + * use_mm(), but one or more of its subthreads may still have a valid
  87 + * pointer. Return p, or any of its subthreads with a valid ->mm, with
  88 + * task_lock() held.
  89 + */
55 90 static struct task_struct *find_lock_task_mm(struct task_struct *p)
56 91 {
57 92 struct task_struct *t = p;
... ... @@ -106,10 +141,6 @@
106 141 * The memory size of the process is the basis for the badness.
107 142 */
108 143 points = p->mm->total_vm;
109   -
110   - /*
111   - * After this unlock we can no longer dereference local variable `mm'
112   - */
113 144 task_unlock(p);
114 145  
115 146 /*
... ... @@ -253,7 +284,8 @@
253 284 * (not docbooked, we don't want this one cluttering up the manual)
254 285 */
255 286 static struct task_struct *select_bad_process(unsigned long *ppoints,
256   - struct mem_cgroup *mem)
  287 + struct mem_cgroup *mem, enum oom_constraint constraint,
  288 + const nodemask_t *mask)
257 289 {
258 290 struct task_struct *p;
259 291 struct task_struct *chosen = NULL;
... ... @@ -269,7 +301,9 @@
269 301 continue;
270 302 if (mem && !task_in_mem_cgroup(p, mem))
271 303 continue;
272   - if (!has_intersects_mems_allowed(p))
  304 + if (!has_intersects_mems_allowed(p,
  305 + constraint == CONSTRAINT_MEMORY_POLICY ? mask :
  306 + NULL))
273 307 continue;
274 308  
275 309 /*
... ... @@ -497,7 +531,7 @@
497 531 panic("out of memory(memcg). panic_on_oom is selected.\n");
498 532 read_lock(&tasklist_lock);
499 533 retry:
500   - p = select_bad_process(&points, mem);
  534 + p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL);
501 535 if (!p || PTR_ERR(p) == -1UL)
502 536 goto out;
503 537  
... ... @@ -576,7 +610,8 @@
576 610 /*
577 611 * Must be called with tasklist_lock held for read.
578 612 */
579   -static void __out_of_memory(gfp_t gfp_mask, int order)
  613 +static void __out_of_memory(gfp_t gfp_mask, int order,
  614 + enum oom_constraint constraint, const nodemask_t *mask)
580 615 {
581 616 struct task_struct *p;
582 617 unsigned long points;
... ... @@ -590,7 +625,7 @@
590 625 * Rambo mode: Shoot down a process and hope it solves whatever
591 626 * issues we may have.
592 627 */
593   - p = select_bad_process(&points, NULL);
  628 + p = select_bad_process(&points, NULL, constraint, mask);
594 629  
595 630 if (PTR_ERR(p) == -1UL)
596 631 return;
... ... @@ -624,7 +659,8 @@
624 659 panic("out of memory from page fault. panic_on_oom is selected.\n");
625 660  
626 661 read_lock(&tasklist_lock);
627   - __out_of_memory(0, 0); /* unknown gfp_mask and order */
  662 + /* unknown gfp_mask and order */
  663 + __out_of_memory(0, 0, CONSTRAINT_NONE, NULL);
628 664 read_unlock(&tasklist_lock);
629 665  
630 666 /*
... ... @@ -640,6 +676,7 @@
640 676 * @zonelist: zonelist pointer
641 677 * @gfp_mask: memory allocation flags
642 678 * @order: amount of memory being requested as a power of 2
  679 + * @nodemask: nodemask passed to page allocator
643 680 *
644 681 * If we run out of memory, we have the choice between either
645 682 * killing a random task (bad), letting the system crash (worse)
... ... @@ -678,24 +715,19 @@
678 715 */
679 716 constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
680 717 read_lock(&tasklist_lock);
681   -
682   - switch (constraint) {
683   - case CONSTRAINT_MEMORY_POLICY:
684   - oom_kill_process(current, gfp_mask, order, 0, NULL,
685   - "No available memory (MPOL_BIND)");
686   - break;
687   -
688   - case CONSTRAINT_NONE:
689   - if (sysctl_panic_on_oom) {
  718 + if (unlikely(sysctl_panic_on_oom)) {
  719 + /*
  720 + * panic_on_oom only affects CONSTRAINT_NONE, the kernel
  721 + * should not panic for cpuset or mempolicy induced memory
  722 + * failures.
  723 + */
  724 + if (constraint == CONSTRAINT_NONE) {
690 725 dump_header(NULL, gfp_mask, order, NULL);
691   - panic("out of memory. panic_on_oom is selected\n");
  726 + read_unlock(&tasklist_lock);
  727 + panic("Out of memory: panic_on_oom is enabled\n");
692 728 }
693   - /* Fall-through */
694   - case CONSTRAINT_CPUSET:
695   - __out_of_memory(gfp_mask, order);
696   - break;
697 729 }
698   -
  730 + __out_of_memory(gfp_mask, order, constraint, nodemask);
699 731 read_unlock(&tasklist_lock);
700 732  
701 733 /*