Blame view

mm/oom_kill.c 19.9 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
  /*
   *  linux/mm/oom_kill.c
   * 
   *  Copyright (C)  1998,2000  Rik van Riel
   *	Thanks go out to Claus Fischer for some serious inspiration and
   *	for goading me into coding this file...
a63d83f42   David Rientjes   oom: badness heur...
7
8
   *  Copyright (C)  2010  Google, Inc.
   *	Rewritten by David Rientjes
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
9
10
   *
   *  The routines in this file are used to kill a process when
a49335cce   Paul Jackson   [PATCH] cpusets: ...
11
12
   *  we're seriously out of memory. This gets called from __alloc_pages()
   *  in mm/page_alloc.c when we really run out of memory.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
13
14
15
16
17
18
   *
   *  Since we won't call these routines often (on a well-configured
   *  machine) this file will double as a 'coding guide' and a signpost
   *  for newbie kernel hackers. It features several pointers to major
   *  kernel subsystems and hints as to where to find out what things do.
   */
8ac773b4f   Alexey Dobriyan   [PATCH] OOM kille...
19
  #include <linux/oom.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
20
  #include <linux/mm.h>
4e950f6f0   Alexey Dobriyan   Remove fs.h from ...
21
  #include <linux/err.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
22
  #include <linux/gfp.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
23
24
25
26
  #include <linux/sched.h>
  #include <linux/swap.h>
  #include <linux/timex.h>
  #include <linux/jiffies.h>
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
27
  #include <linux/cpuset.h>
8bc719d3c   Martin Schwidefsky   [PATCH] out of me...
28
29
  #include <linux/module.h>
  #include <linux/notifier.h>
c7ba5c9e8   Pavel Emelianov   Memory controller...
30
  #include <linux/memcontrol.h>
6f48d0ebd   David Rientjes   oom: select task ...
31
  #include <linux/mempolicy.h>
5cd9c58fb   David Howells   security: Fix set...
32
  #include <linux/security.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
33

fadd8fbd1   KAMEZAWA Hiroyuki   [PATCH] support f...
34
  /*
   * Panic policy consulted by check_panic_on_oom(): 0 never panics, 2 panics
   * for every constraint type, any other non-zero value panics only for
   * CONSTRAINT_NONE.
   */
  int sysctl_panic_on_oom;
fe071d7e8   David Rientjes   oom: add oom_kill...
35
  /*
   * NOTE(review): not referenced in the visible part of this file; presumably
   * makes the oom killer pick the allocating task -- confirm against the
   * out_of_memory() implementation.
   */
  int sysctl_oom_kill_allocating_task;
ad915c432   David Rientjes   oom: enable oom t...
36
  /* When non-zero (the default), dump_header() also calls dump_tasks(). */
  int sysctl_oom_dump_tasks = 1;
c7d4caeb1   David Rientjes   oom: fix zone_sca...
37
  /*
   * Held around every test, set and clear of ZONE_OOM_LOCKED performed by the
   * try_set_*_oom() and clear_*_oom() helpers in this file.
   */
  static DEFINE_SPINLOCK(zone_scan_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
38

6f48d0ebd   David Rientjes   oom: select task ...
39
40
41
42
43
44
45
46
47
  #ifdef CONFIG_NUMA
  /**
   * has_intersects_mems_allowed() - check task eligibility for kill
   * @tsk: task struct of which task to consider
   * @mask: nodemask passed to page allocator for mempolicy ooms
   *
   * Task eligibility is determined by whether or not a candidate task, @tsk,
   * shares the same mempolicy nodes as current if it is bound by such a policy
   * and whether or not it has the same set of allowed cpuset nodes.
495789a51   KOSAKI Motohiro   oom: make oom_sco...
48
   */
6f48d0ebd   David Rientjes   oom: select task ...
49
50
  static bool has_intersects_mems_allowed(struct task_struct *tsk,
  					const nodemask_t *mask)
495789a51   KOSAKI Motohiro   oom: make oom_sco...
51
  {
6f48d0ebd   David Rientjes   oom: select task ...
52
  	struct task_struct *start = tsk;
495789a51   KOSAKI Motohiro   oom: make oom_sco...
53

495789a51   KOSAKI Motohiro   oom: make oom_sco...
54
  	do {
6f48d0ebd   David Rientjes   oom: select task ...
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  		if (mask) {
  			/*
  			 * If this is a mempolicy constrained oom, tsk's
  			 * cpuset is irrelevant.  Only return true if its
  			 * mempolicy intersects current, otherwise it may be
  			 * needlessly killed.
  			 */
  			if (mempolicy_nodemask_intersects(tsk, mask))
  				return true;
  		} else {
  			/*
  			 * This is not a mempolicy constrained oom, so only
  			 * check the mems of tsk's cpuset.
  			 */
  			if (cpuset_mems_allowed_intersects(current, tsk))
  				return true;
  		}
df1090a8d   KOSAKI Motohiro   oom: cleanup has_...
72
  	} while_each_thread(start, tsk);
6f48d0ebd   David Rientjes   oom: select task ...
73
74
75
76
77
78
79
  	return false;
  }
  #else
  /*
   * !CONFIG_NUMA variant: neither argument is examined and every task is
   * reported as eligible.
   */
  static bool has_intersects_mems_allowed(struct task_struct *tsk,
  					const nodemask_t *mask)
  {
  	return true;
495789a51   KOSAKI Motohiro   oom: make oom_sco...
80
  }
6f48d0ebd   David Rientjes   oom: select task ...
81
  #endif /* CONFIG_NUMA */
495789a51   KOSAKI Motohiro   oom: make oom_sco...
82

6f48d0ebd   David Rientjes   oom: select task ...
83
  /*
93b43fa55   Luis Claudio R. Goncalves   oom: give the dyi...
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
   * If this is a system OOM (not a memcg OOM) and the task selected to be
   * killed is not already running at high (RT) priorities, speed up the
   * recovery by boosting the dying task to the lowest FIFO priority.
   * That helps with the recovery and avoids interfering with RT tasks.
   */
  static void boost_dying_task_prio(struct task_struct *p,
  				  struct mem_cgroup *mem)
  {
  	struct sched_param param = { .sched_priority = 1 };
  
  	if (mem)
  		return;
  
  	if (!rt_task(p))
  		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
  }
  
  /*
6f48d0ebd   David Rientjes   oom: select task ...
102
103
104
105
106
   * The process p may have detached its own ->mm while exiting or through
   * use_mm(), but one or more of its subthreads may still have a valid
   * pointer.  Return p, or any of its subthreads with a valid ->mm, with
   * task_lock() held.
   */
158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
107
  struct task_struct *find_lock_task_mm(struct task_struct *p)
dd8e8f405   Oleg Nesterov   oom: introduce fi...
108
109
110
111
112
113
114
115
116
117
118
119
  {
  	struct task_struct *t = p;
  
  	do {
  		task_lock(t);
  		if (likely(t->mm))
  			return t;
  		task_unlock(t);
  	} while_each_thread(p, t);
  
  	return NULL;
  }
ab290adba   KOSAKI Motohiro   oom: make oom_unk...
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
  /*
   * Return true when p must not be considered as an oom victim: it is the
   * global init task, a kernel thread, outside the memcg being reclaimed
   * (when mem is given), or has no memory overlapping nodemask.
   */
  static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
  			   const nodemask_t *nodemask)
  {
  	if (is_global_init(p) || (p->flags & PF_KTHREAD))
  		return true;

  	/* mem_cgroup_out_of_memory() case: p is not a member of the group */
  	if (mem && !task_in_mem_cgroup(p, mem))
  		return true;

  	/* otherwise p is unkillable only if it has nothing to free in nodemask */
  	return !has_intersects_mems_allowed(p, nodemask);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
139
  /**
a63d83f42   David Rientjes   oom: badness heur...
140
   * oom_badness - heuristic function to determine which candidate task to kill
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
141
   * @p: task struct of which task we should calculate
a63d83f42   David Rientjes   oom: badness heur...
142
   * @totalpages: total present RAM allowed for page allocation
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
143
   *
a63d83f42   David Rientjes   oom: badness heur...
144
145
146
   * The heuristic for determining which task to kill is made to be as simple and
   * predictable as possible.  The goal is to return the highest value for the
   * task consuming the most memory to avoid subsequent oom failures.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
147
   */
a63d83f42   David Rientjes   oom: badness heur...
148
149
  unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
  		      const nodemask_t *nodemask, unsigned long totalpages)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
150
  {
a63d83f42   David Rientjes   oom: badness heur...
151
  	int points;
28b83c519   KOSAKI Motohiro   oom: move oom_adj...
152

26ebc9849   KOSAKI Motohiro   oom: /proc/<pid>/...
153
154
  	if (oom_unkillable_task(p, mem, nodemask))
  		return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
155

dd8e8f405   Oleg Nesterov   oom: introduce fi...
156
157
  	p = find_lock_task_mm(p);
  	if (!p)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
158
159
160
  		return 0;
  
  	/*
a63d83f42   David Rientjes   oom: badness heur...
161
162
  	 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
  	 * need to be executed for something that cannot be killed.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
163
  	 */
a63d83f42   David Rientjes   oom: badness heur...
164
165
166
167
  	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
  		task_unlock(p);
  		return 0;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
168
169
  
  	/*
a63d83f42   David Rientjes   oom: badness heur...
170
171
  	 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
  	 * priority for oom killing.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
172
  	 */
a63d83f42   David Rientjes   oom: badness heur...
173
174
175
176
  	if (p->flags & PF_OOM_ORIGIN) {
  		task_unlock(p);
  		return 1000;
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
177
178
  
  	/*
a63d83f42   David Rientjes   oom: badness heur...
179
180
  	 * The memory controller may have a limit of 0 bytes, so avoid a divide
  	 * by zero, if necessary.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
181
  	 */
a63d83f42   David Rientjes   oom: badness heur...
182
183
  	if (!totalpages)
  		totalpages = 1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
184
185
  
  	/*
a63d83f42   David Rientjes   oom: badness heur...
186
187
  	 * The baseline for the badness score is the proportion of RAM that each
  	 * task's rss and swap space use.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
188
  	 */
a63d83f42   David Rientjes   oom: badness heur...
189
190
191
  	points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
  			totalpages;
  	task_unlock(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
192
193
  
  	/*
a63d83f42   David Rientjes   oom: badness heur...
194
195
  	 * Root processes get 3% bonus, just like the __vm_enough_memory()
  	 * implementation used by LSMs.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
196
  	 */
a63d83f42   David Rientjes   oom: badness heur...
197
198
  	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
  		points -= 30;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
199
200
  
  	/*
a63d83f42   David Rientjes   oom: badness heur...
201
202
203
  	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
  	 * either completely disable oom killing or always prefer a certain
  	 * task.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
204
  	 */
a63d83f42   David Rientjes   oom: badness heur...
205
  	points += p->signal->oom_score_adj;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
206

a63d83f42   David Rientjes   oom: badness heur...
207
208
209
  	if (points < 0)
  		return 0;
  	return (points < 1000) ? points : 1000;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
210
211
212
  }
  
  /*
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
213
214
   * Determine the type of allocation constraint.
   */
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
215
  #ifdef CONFIG_NUMA
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
216
  static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
a63d83f42   David Rientjes   oom: badness heur...
217
218
  				gfp_t gfp_mask, nodemask_t *nodemask,
  				unsigned long *totalpages)
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
219
  {
54a6eb5c4   Mel Gorman   mm: use two zonel...
220
  	struct zone *zone;
dd1a239f6   Mel Gorman   mm: have zonelist...
221
  	struct zoneref *z;
54a6eb5c4   Mel Gorman   mm: use two zonel...
222
  	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
a63d83f42   David Rientjes   oom: badness heur...
223
224
  	bool cpuset_limited = false;
  	int nid;
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
225

a63d83f42   David Rientjes   oom: badness heur...
226
227
228
229
230
  	/* Default to all available memory */
  	*totalpages = totalram_pages + total_swap_pages;
  
  	if (!zonelist)
  		return CONSTRAINT_NONE;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
231
232
233
234
235
236
237
  	/*
  	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
  	 * to kill current.We have to random task kill in this case.
  	 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
  	 */
  	if (gfp_mask & __GFP_THISNODE)
  		return CONSTRAINT_NONE;
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
238

4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
239
  	/*
a63d83f42   David Rientjes   oom: badness heur...
240
241
242
  	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
  	 * the page allocator means a mempolicy is in effect.  Cpuset policy
  	 * is enforced in get_page_from_freelist().
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
243
  	 */
a63d83f42   David Rientjes   oom: badness heur...
244
245
246
247
  	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
  		*totalpages = total_swap_pages;
  		for_each_node_mask(nid, *nodemask)
  			*totalpages += node_spanned_pages(nid);
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
248
  		return CONSTRAINT_MEMORY_POLICY;
a63d83f42   David Rientjes   oom: badness heur...
249
  	}
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
250
251
252
253
254
  
  	/* Check this allocation failure is caused by cpuset's wall function */
  	for_each_zone_zonelist_nodemask(zone, z, zonelist,
  			high_zoneidx, nodemask)
  		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
a63d83f42   David Rientjes   oom: badness heur...
255
  			cpuset_limited = true;
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
256

a63d83f42   David Rientjes   oom: badness heur...
257
258
259
260
261
262
  	if (cpuset_limited) {
  		*totalpages = total_swap_pages;
  		for_each_node_mask(nid, cpuset_current_mems_allowed)
  			*totalpages += node_spanned_pages(nid);
  		return CONSTRAINT_CPUSET;
  	}
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
263
264
  	return CONSTRAINT_NONE;
  }
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
265
266
  #else
  /*
   * !CONFIG_NUMA variant: zonelist, gfp_mask and nodemask are not examined.
   * Stores total RAM plus total swap pages in *totalpages and always reports
   * CONSTRAINT_NONE.
   */
  static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
a63d83f42   David Rientjes   oom: badness heur...
267
268
  				gfp_t gfp_mask, nodemask_t *nodemask,
  				unsigned long *totalpages)
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
269
  {
a63d83f42   David Rientjes   oom: badness heur...
270
  	*totalpages = totalram_pages + total_swap_pages;
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
271
272
273
  	return CONSTRAINT_NONE;
  }
  #endif
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
274
275
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
276
277
278
279
280
   * Simple selection loop. We chose the process with the highest
   * number of 'points'. We expect the caller will lock the tasklist.
   *
   * (not docbooked, we don't want this one cluttering up the manual)
   */
a63d83f42   David Rientjes   oom: badness heur...
281
282
283
  static struct task_struct *select_bad_process(unsigned int *ppoints,
  		unsigned long totalpages, struct mem_cgroup *mem,
  		const nodemask_t *nodemask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
284
  {
495789a51   KOSAKI Motohiro   oom: make oom_sco...
285
  	struct task_struct *p;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
286
  	struct task_struct *chosen = NULL;
9827b781f   Kurt Garloff   [PATCH] OOM kill:...
287
  	*ppoints = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
288

495789a51   KOSAKI Motohiro   oom: make oom_sco...
289
  	for_each_process(p) {
a63d83f42   David Rientjes   oom: badness heur...
290
  		unsigned int points;
a49335cce   Paul Jackson   [PATCH] cpusets: ...
291

ab290adba   KOSAKI Motohiro   oom: make oom_unk...
292
  		if (oom_unkillable_task(p, mem, nodemask))
6cf86ac6f   David Rientjes   oom: filter tasks...
293
  			continue;
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
294

a49335cce   Paul Jackson   [PATCH] cpusets: ...
295
  		/*
b78483a4b   Nick Piggin   [PATCH] oom: don'...
296
297
298
299
300
301
302
303
304
305
306
307
  		 * This task already has access to memory reserves and is
  		 * being killed. Don't allow any other task access to the
  		 * memory reserve.
  		 *
  		 * Note: this may have a chance of deadlock if it gets
  		 * blocked waiting for another task which itself is waiting
  		 * for memory. Is there a better alternative?
  		 */
  		if (test_tsk_thread_flag(p, TIF_MEMDIE))
  			return ERR_PTR(-1UL);
  
  		/*
6937a25cf   Dave Peterson   [PATCH] mm: fix t...
308
  		 * This is in the process of releasing memory so wait for it
a49335cce   Paul Jackson   [PATCH] cpusets: ...
309
  		 * to finish before killing some other task by mistake.
50ec3bbff   Nick Piggin   [PATCH] oom: hand...
310
311
312
313
314
  		 *
  		 * However, if p is the current task, we allow the 'kill' to
  		 * go ahead if it is exiting: this will simply set TIF_MEMDIE,
  		 * which will allow it to gain access to memory reserves in
  		 * the process of exiting and releasing its resources.
b78483a4b   Nick Piggin   [PATCH] oom: don'...
315
  		 * Otherwise we could get an easy OOM deadlock.
a49335cce   Paul Jackson   [PATCH] cpusets: ...
316
  		 */
cef1d3523   KOSAKI Motohiro   oom: multi thread...
317
  		if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
b78483a4b   Nick Piggin   [PATCH] oom: don'...
318
319
  			if (p != current)
  				return ERR_PTR(-1UL);
972c4ea59   Oleg Nesterov   [PATCH] select_ba...
320
  			chosen = p;
a63d83f42   David Rientjes   oom: badness heur...
321
  			*ppoints = 1000;
50ec3bbff   Nick Piggin   [PATCH] oom: hand...
322
  		}
972c4ea59   Oleg Nesterov   [PATCH] select_ba...
323

a63d83f42   David Rientjes   oom: badness heur...
324
325
  		points = oom_badness(p, mem, nodemask, totalpages);
  		if (points > *ppoints) {
a49335cce   Paul Jackson   [PATCH] cpusets: ...
326
  			chosen = p;
9827b781f   Kurt Garloff   [PATCH] OOM kill:...
327
  			*ppoints = points;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
328
  		}
495789a51   KOSAKI Motohiro   oom: make oom_sco...
329
  	}
972c4ea59   Oleg Nesterov   [PATCH] select_ba...
330

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
331
332
333
334
  	return chosen;
  }
  
  /**
1b578df02   Randy Dunlap   mm/oom_kill: fix ...
335
   * dump_tasks - dump current memory state of all system tasks
74ab7f1d3   David Rientjes   oom: improve comm...
336
   * @mem: current's memory controller, if constrained
1b578df02   Randy Dunlap   mm/oom_kill: fix ...
337
   *
fef1bdd68   David Rientjes   oom: add sysctl t...
338
339
   * Dumps the current memory state of all system tasks, excluding kernel threads.
   * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
a63d83f42   David Rientjes   oom: badness heur...
340
   * value, oom_score_adj value, and name.
fef1bdd68   David Rientjes   oom: add sysctl t...
341
342
343
344
345
346
347
348
   *
   * If @mem is non-NULL, only tasks that are a member of the mem_cgroup are
   * shown.
   *
   * Call with tasklist_lock read-locked.
   */
  static void dump_tasks(const struct mem_cgroup *mem)
  {
c55db9578   KOSAKI Motohiro   oom: dump_tasks u...
349
350
  	struct task_struct *p;
  	struct task_struct *task;
fef1bdd68   David Rientjes   oom: add sysctl t...
351

a63d83f42   David Rientjes   oom: badness heur...
352
353
  	pr_info("[ pid ]   uid  tgid total_vm      rss cpu oom_adj oom_score_adj name
  ");
c55db9578   KOSAKI Motohiro   oom: dump_tasks u...
354
  	for_each_process(p) {
c55db9578   KOSAKI Motohiro   oom: dump_tasks u...
355
  		if (p->flags & PF_KTHREAD)
fef1bdd68   David Rientjes   oom: add sysctl t...
356
  			continue;
c55db9578   KOSAKI Motohiro   oom: dump_tasks u...
357
  		if (mem && !task_in_mem_cgroup(p, mem))
b4416d2be   David Rientjes   oom: do not dump ...
358
  			continue;
fef1bdd68   David Rientjes   oom: add sysctl t...
359

c55db9578   KOSAKI Motohiro   oom: dump_tasks u...
360
361
  		task = find_lock_task_mm(p);
  		if (!task) {
6d2661ede   David Rientjes   oom: fix possible...
362
  			/*
74ab7f1d3   David Rientjes   oom: improve comm...
363
364
  			 * This is a kthread or all of p's threads have already
  			 * detached their mm's.  There's no need to report
c55db9578   KOSAKI Motohiro   oom: dump_tasks u...
365
  			 * them; they can't be oom killed anyway.
6d2661ede   David Rientjes   oom: fix possible...
366
  			 */
6d2661ede   David Rientjes   oom: fix possible...
367
368
  			continue;
  		}
c55db9578   KOSAKI Motohiro   oom: dump_tasks u...
369

a63d83f42   David Rientjes   oom: badness heur...
370
371
372
373
374
375
  		pr_info("[%5d] %5d %5d %8lu %8lu %3u     %3d         %5d %s
  ",
  			task->pid, __task_cred(task)->uid, task->tgid,
  			task->mm->total_vm, get_mm_rss(task->mm),
  			task_cpu(task), task->signal->oom_adj,
  			task->signal->oom_score_adj, task->comm);
c55db9578   KOSAKI Motohiro   oom: dump_tasks u...
376
377
  		task_unlock(task);
  	}
fef1bdd68   David Rientjes   oom: add sysctl t...
378
  }
d31f56dbf   Daisuke Nishimura   memcg: avoid oom-...
379
380
  /*
   * Log the context of an oom-killer invocation.
   * @p: task handed on to mem_cgroup_print_oom_info(); callers in this file
   *     pass either the selected task or NULL
   * @gfp_mask: allocation mask, printed in hex
   * @order: allocation order, printed
   * @mem: memcg passed to mem_cgroup_print_oom_info() and dump_tasks()
   *
   * Prints current's name and oom settings and its allowed cpuset mems, a
   * stack trace, the memcg oom info and show_mem() output, followed by the
   * task table when sysctl_oom_dump_tasks is set.
   */
  static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
  							struct mem_cgroup *mem)
  {
  	/* current's name and cpuset info are printed under task_lock(). */
  	task_lock(current);
  	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
  		"oom_adj=%d, oom_score_adj=%d\n",
  		current->comm, gfp_mask, order, current->signal->oom_adj,
  		current->signal->oom_score_adj);
  	cpuset_print_task_mems_allowed(current);
  	task_unlock(current);
  	dump_stack();
  	mem_cgroup_print_oom_info(mem, p);
  	show_mem();
  	if (sysctl_oom_dump_tasks)
  		dump_tasks(mem);
  }
3b4798cbc   KOSAKI Motohiro   oom-kill: show vi...
396
  #define K(x) ((x) << (PAGE_SHIFT-10))
93b43fa55   Luis Claudio R. Goncalves   oom: give the dyi...
397
  static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
398
  {
dd8e8f405   Oleg Nesterov   oom: introduce fi...
399
  	p = find_lock_task_mm(p);
a96cfd6e9   KOSAKI Motohiro   oom: move OOM_DIS...
400
  	if (!p) {
b940fd703   David Rientjes   oom: remove unnec...
401
402
403
404
405
406
407
408
  		task_unlock(p);
  		return 1;
  	}
  	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB
  ",
  		task_pid_nr(p), p->comm, K(p->mm->total_vm),
  		K(get_mm_counter(p->mm, MM_ANONPAGES)),
  		K(get_mm_counter(p->mm, MM_FILEPAGES)));
3b4798cbc   KOSAKI Motohiro   oom-kill: show vi...
409
  	task_unlock(p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
410

93b43fa55   Luis Claudio R. Goncalves   oom: give the dyi...
411

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
412
  	set_tsk_thread_flag(p, TIF_MEMDIE);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
413
  	force_sig(SIGKILL, p);
93b43fa55   Luis Claudio R. Goncalves   oom: give the dyi...
414
415
416
417
418
419
420
  
  	/*
  	 * We give our sacrificial lamb high priority and access to
  	 * all the memory it needs. That way it should be able to
  	 * exit() and clear out its resources quickly...
  	 */
  	boost_dying_task_prio(p, mem);
013159227   Dave Peterson   [PATCH] mm: fix m...
421
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
422
  }
b940fd703   David Rientjes   oom: remove unnec...
423
  #undef K
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
424

7213f5066   David Rientjes   oom: suppress ext...
425
  /*
   * Kill @p, or sacrifice one of its children in its place.
   * @p: task picked by the caller
   * @gfp_mask, @order: only forwarded to dump_header()
   * @points: @p's score, only used in the log message
   * @totalpages, @mem, @nodemask: forwarded to oom_badness() when scoring
   *                               children
   * @message: prefix of the log message
   *
   * Returns 0 if @p was already exiting (it is only given TIF_MEMDIE and a
   * priority boost), otherwise the result of oom_kill_task() on the victim.
   */
  static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
  			    unsigned int points, unsigned long totalpages,
  			    struct mem_cgroup *mem, nodemask_t *nodemask,
  			    const char *message)
  {
  	struct task_struct *victim = p;
  	struct task_struct *child;
  	struct task_struct *t = p;
  	unsigned int victim_points = 0;

  	if (printk_ratelimit())
  		dump_header(p, gfp_mask, order, mem);

  	/*
  	 * If the task is already exiting, don't alarm the sysadmin or kill
  	 * its children or threads, just set TIF_MEMDIE so it can die quickly
  	 */
  	if (p->flags & PF_EXITING) {
  		set_tsk_thread_flag(p, TIF_MEMDIE);
  		boost_dying_task_prio(p, mem);
  		return 0;
  	}

  	/* p->comm is printed under task_lock(). */
  	task_lock(p);
  	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
  		message, task_pid_nr(p), p->comm, points);
  	task_unlock(p);

  	/*
  	 * Walk the children of every thread of p and remember the one with
  	 * the highest oom_badness() score; it is sacrificed for its parent.
  	 * This attempts to lose the minimal amount of work done while still
  	 * freeing memory.  victim stays p when no child scores above 0.
  	 */
  	do {
  		list_for_each_entry(child, &t->children, sibling) {
  			unsigned int child_points;

  			/*
  			 * oom_badness() returns 0 if the thread is unkillable
  			 */
  			child_points = oom_badness(child, mem, nodemask,
  								totalpages);
  			if (child_points > victim_points) {
  				victim = child;
  				victim_points = child_points;
  			}
  		}
  	} while_each_thread(p, t);

  	return oom_kill_task(victim, mem);
  }
309ed8825   David Rientjes   oom: extract pani...
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
  /*
   * Determines whether the kernel must panic because of the panic_on_oom sysctl.
   */
  static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
  				int order)
  {
  	/* Common case: the sysctl is 0 and nothing happens. */
  	if (likely(!sysctl_panic_on_oom))
  		return;
  	if (sysctl_panic_on_oom != 2) {
  		/*
  		 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
  		 * does not panic for cpuset, mempolicy, or memcg allocation
  		 * failures.
  		 */
  		if (constraint != CONSTRAINT_NONE)
  			return;
  	}
  	/* dump_tasks(), reached via dump_header(), wants tasklist_lock read-held. */
  	read_lock(&tasklist_lock);
  	dump_header(NULL, gfp_mask, order, NULL);
  	read_unlock(&tasklist_lock);
  	panic("Out of memory: %s panic_on_oom is enabled\n",
  		sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
  }
00f0b8259   Balbir Singh   Memory controller...
500
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR
c7ba5c9e8   Pavel Emelianov   Memory controller...
501
502
  /*
   * oom-kill entry point for a memory cgroup: pick and kill a task that
   * belongs to @mem, scoring tasks against the cgroup's limit in pages.
   */
  void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
  {
  	unsigned long limit_pages;
  	unsigned int score = 0;
  	struct task_struct *victim;

  	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
  	limit_pages = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;

  	/* tasklist_lock stays read-held across selection and kill. */
  	read_lock(&tasklist_lock);
  	for (;;) {
  		victim = select_bad_process(&score, limit_pages, mem, NULL);
  		/* Nothing to kill, or an earlier victim is still exiting. */
  		if (!victim || PTR_ERR(victim) == -1UL)
  			break;
  		/* A non-zero result means the kill failed: select again. */
  		if (!oom_kill_process(victim, gfp_mask, 0, score, limit_pages,
  				mem, NULL, "Memory cgroup out of memory"))
  			break;
  	}
  	read_unlock(&tasklist_lock);
  }
  #endif
8bc719d3c   Martin Schwidefsky   [PATCH] out of me...
520
521
522
523
524
525
526
527
528
529
530
531
532
  static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
  
  /*
   * Add @nb to the oom_notify_list blocking notifier chain.  The return value
   * is passed through unchanged from blocking_notifier_chain_register().
   */
  int register_oom_notifier(struct notifier_block *nb)
  {
  	return blocking_notifier_chain_register(&oom_notify_list, nb);
  }
  EXPORT_SYMBOL_GPL(register_oom_notifier);
  
  /*
   * Remove @nb from the oom_notify_list blocking notifier chain.  The return
   * value is passed through unchanged from
   * blocking_notifier_chain_unregister().
   */
  int unregister_oom_notifier(struct notifier_block *nb)
  {
  	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
  }
  EXPORT_SYMBOL_GPL(unregister_oom_notifier);
098d7f128   David Rientjes   oom: add per-zone...
533
534
535
536
537
  /*
   * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
   * if a parallel OOM killing is already taking place that includes a zone in
   * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
   */
ff321feac   Minchan Kim   mm: rename try_se...
538
  int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
  {
  	struct zoneref *zref;
  	struct zone *zone;
  	int acquired = 1;

  	spin_lock(&zone_scan_lock);

  	/* First pass: back off if any zone is already oom locked. */
  	for_each_zone_zonelist(zone, zref, zonelist, gfp_zone(gfp_mask)) {
  		if (zone_is_oom_locked(zone)) {
  			acquired = 0;
  			break;
  		}
  	}

  	/*
  	 * Second pass: lock each zone in the zonelist while still holding
  	 * zone_scan_lock so a parallel invocation of try_set_zonelist_oom()
  	 * doesn't succeed when it shouldn't.
  	 */
  	if (acquired) {
  		for_each_zone_zonelist(zone, zref, zonelist, gfp_zone(gfp_mask))
  			zone_set_flag(zone, ZONE_OOM_LOCKED);
  	}

  	spin_unlock(&zone_scan_lock);
  	return acquired;
  }
  
  /*
   * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
   * allocation attempts with zonelists containing them may now recall the OOM
   * killer, if necessary.
   */
dd1a239f6   Mel Gorman   mm: have zonelist...
570
  void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
098d7f128   David Rientjes   oom: add per-zone...
571
  {
dd1a239f6   Mel Gorman   mm: have zonelist...
572
573
  	struct zoneref *z;
  	struct zone *zone;
098d7f128   David Rientjes   oom: add per-zone...
574

c7d4caeb1   David Rientjes   oom: fix zone_sca...
575
  	spin_lock(&zone_scan_lock);
dd1a239f6   Mel Gorman   mm: have zonelist...
576
577
578
  	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
  		zone_clear_flag(zone, ZONE_OOM_LOCKED);
  	}
c7d4caeb1   David Rientjes   oom: fix zone_sca...
579
  	spin_unlock(&zone_scan_lock);
098d7f128   David Rientjes   oom: add per-zone...
580
  }
1c0fe6e3b   Nick Piggin   mm: invoke oom-ki...
581
  /*
e36589323   David Rientjes   oom: remove speci...
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
   * Try to acquire the oom killer lock for all system zones.  Returns zero if a
   * parallel oom killing is taking place, otherwise locks all zones and returns
   * non-zero.
   */
  static int try_set_system_oom(void)
  {
  	struct zone *zone;
  	int ret = 1;
  
  	spin_lock(&zone_scan_lock);
  	for_each_populated_zone(zone)
  		if (zone_is_oom_locked(zone)) {
  			ret = 0;
  			goto out;
  		}
  	for_each_populated_zone(zone)
  		zone_set_flag(zone, ZONE_OOM_LOCKED);
  out:
  	spin_unlock(&zone_scan_lock);
  	return ret;
  }
  
  /*
   * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
   * attempts or page faults may now recall the oom killer, if necessary.
   */
  static void clear_system_oom(void)
  {
  	struct zone *zone;
  
  	spin_lock(&zone_scan_lock);
  	for_each_populated_zone(zone)
  		zone_clear_flag(zone, ZONE_OOM_LOCKED);
  	spin_unlock(&zone_scan_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
617
  /**
6937a25cf   Dave Peterson   [PATCH] mm: fix t...
618
   * out_of_memory - kill the "best" process when we run out of memory
1b578df02   Randy Dunlap   mm/oom_kill: fix ...
619
620
621
   * @zonelist: zonelist pointer
   * @gfp_mask: memory allocation flags
   * @order: amount of memory being requested as a power of 2
6f48d0ebd   David Rientjes   oom: select task ...
622
   * @nodemask: nodemask passed to page allocator
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
623
624
625
626
627
628
   *
   * If we run out of memory, we have the choice between either
   * killing a random task (bad), letting the system crash (worse)
   * OR try to be smart about which process to kill. Note that we
   * don't have to be perfect here, we just have to be good.
   */
4365a5676   KAMEZAWA Hiroyuki   oom-kill: fix NUM...
629
630
  void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
  		int order, nodemask_t *nodemask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
631
  {
0aad4b312   David Rientjes   oom: fold __out_o...
632
  	struct task_struct *p;
a63d83f42   David Rientjes   oom: badness heur...
633
  	unsigned long totalpages;
8bc719d3c   Martin Schwidefsky   [PATCH] out of me...
634
  	unsigned long freed = 0;
a63d83f42   David Rientjes   oom: badness heur...
635
  	unsigned int points;
e36589323   David Rientjes   oom: remove speci...
636
  	enum oom_constraint constraint = CONSTRAINT_NONE;
8bc719d3c   Martin Schwidefsky   [PATCH] out of me...
637
638
639
640
641
  
  	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
  	if (freed > 0)
  		/* Got some memory back in the last second. */
  		return;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
642

7b98c2e40   David Rientjes   oom: give current...
643
644
645
646
647
648
649
  	/*
  	 * If current has a pending SIGKILL, then automatically select it.  The
  	 * goal is to allow it to allocate so that it may quickly exit and free
  	 * its memory.
  	 */
  	if (fatal_signal_pending(current)) {
  		set_thread_flag(TIF_MEMDIE);
93b43fa55   Luis Claudio R. Goncalves   oom: give the dyi...
650
  		boost_dying_task_prio(current, NULL);
7b98c2e40   David Rientjes   oom: give current...
651
652
  		return;
  	}
9b0f8b040   Christoph Lameter   [PATCH] Terminate...
653
654
655
656
  	/*
  	 * Check if there were limitations on the allocation (only relevant for
  	 * NUMA) that may require different handling.
  	 */
a63d83f42   David Rientjes   oom: badness heur...
657
658
  	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
  						&totalpages);
309ed8825   David Rientjes   oom: extract pani...
659
  	check_panic_on_oom(constraint, gfp_mask, order);
0aad4b312   David Rientjes   oom: fold __out_o...
660

2b45ab339   David Rientjes   oom: fix constrai...
661
  	read_lock(&tasklist_lock);
f88ccad58   KOSAKI Motohiro   oom: oom_kill_pro...
662
  	if (sysctl_oom_kill_allocating_task &&
a96cfd6e9   KOSAKI Motohiro   oom: move OOM_DIS...
663
664
  	    !oom_unkillable_task(current, NULL, nodemask) &&
  	    (current->signal->oom_adj != OOM_DISABLE)) {
0aad4b312   David Rientjes   oom: fold __out_o...
665
666
667
668
669
  		/*
  		 * oom_kill_process() needs tasklist_lock held.  If it returns
  		 * non-zero, current could not be killed so we must fallback to
  		 * the tasklist scan.
  		 */
a63d83f42   David Rientjes   oom: badness heur...
670
671
  		if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
  				NULL, nodemask,
0aad4b312   David Rientjes   oom: fold __out_o...
672
673
674
675
676
  				"Out of memory (oom_kill_allocating_task)"))
  			return;
  	}
  
  retry:
a63d83f42   David Rientjes   oom: badness heur...
677
  	p = select_bad_process(&points, totalpages, NULL,
f44200320   David Rientjes   oom: remove const...
678
679
  			constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
  								 NULL);
0aad4b312   David Rientjes   oom: fold __out_o...
680
681
682
683
684
685
686
687
688
689
  	if (PTR_ERR(p) == -1UL)
  		return;
  
  	/* Found nothing?!?! Either we hang forever, or we panic. */
  	if (!p) {
  		dump_header(NULL, gfp_mask, order, NULL);
  		read_unlock(&tasklist_lock);
  		panic("Out of memory and no killable processes...
  ");
  	}
a63d83f42   David Rientjes   oom: badness heur...
690
691
  	if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
  				nodemask, "Out of memory"))
0aad4b312   David Rientjes   oom: fold __out_o...
692
  		goto retry;
140ffcec4   Andrew Morton   [PATCH] out_of_me...
693
  	read_unlock(&tasklist_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
694
695
696
  
  	/*
  	 * Give "p" a good chance of killing itself before we
2f659f462   Kirill Korotaev   [PATCH] Optimise ...
697
  	 * retry to allocate memory unless "p" is current
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
698
  	 */
2f659f462   Kirill Korotaev   [PATCH] Optimise ...
699
  	if (!test_thread_flag(TIF_MEMDIE))
140ffcec4   Andrew Morton   [PATCH] out_of_me...
700
  		schedule_timeout_uninterruptible(1);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
701
  }
e36589323   David Rientjes   oom: remove speci...
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
  
  /*
   * Called by the pagefault handler when it is out of memory, to kill a
   * memory-hogging task.  The oom killer is only invoked if the system-wide oom
   * lock can be taken; if some populated zone already has ZONE_OOM_LOCKED set, a
   * parallel oom killing is in progress and nothing is killed here.  Afterwards
   * the caller sleeps for one tick unless it has TIF_MEMDIE set itself, in which
   * case it has been killed and is allowed to proceed to exit.
   */
  void pagefault_out_of_memory(void)
  {
  	int have_oom_lock = try_set_system_oom();
  
  	if (have_oom_lock) {
  		out_of_memory(NULL, 0, 0, NULL);
  		clear_system_oom();
  	}
  
  	if (test_thread_flag(TIF_MEMDIE))
  		return;
  
  	schedule_timeout_uninterruptible(1);
  }