Commit 0753ba01e126020bf0f8150934903b48935b697d
Committed by
Linus Torvalds
1 parent
89a4eb4b66
Exists in
master
and in
20 other branches
mm: revert "oom: move oom_adj value"
The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: David Rientjes <rientjes@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Rik van Riel <riel@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 6 changed files with 48 additions and 54 deletions Side-by-side Diff
Documentation/filesystems/proc.txt
... | ... | @@ -1167,13 +1167,11 @@ |
1167 | 1167 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score |
1168 | 1168 | ------------------------------------------------------ |
1169 | 1169 | |
1170 | -This file can be used to adjust the score used to select which processes should | |
1171 | -be killed in an out-of-memory situation. The oom_adj value is a characteristic | |
1172 | -of the task's mm, so all threads that share an mm with pid will have the same | |
1173 | -oom_adj value. A high value will increase the likelihood of this process being | |
1174 | -killed by the oom-killer. Valid values are in the range -16 to +15 as | |
1175 | -explained below and a special value of -17, which disables oom-killing | |
1176 | -altogether for threads sharing pid's mm. | |
1170 | +This file can be used to adjust the score used to select which processes | |
1171 | +should be killed in an out-of-memory situation. Giving it a high score will | |
1172 | +increase the likelihood of this process being killed by the oom-killer. Valid | |
1173 | +values are in the range -16 to +15, plus the special value -17, which disables | |
1174 | +oom-killing altogether for this process. | |
1177 | 1175 | |
1178 | 1176 | The process to be killed in an out-of-memory situation is selected among all others |
1179 | 1177 | based on its badness score. This value equals the original memory size of the process |
... | ... | @@ -1186,9 +1184,6 @@ |
1186 | 1184 | the parent's score if they do not share the same memory. Thus forking servers |
1187 | 1185 | are the prime candidates to be killed. Having only one 'hungry' child will make |
1188 | 1186 | parent less preferable than the child. |
1189 | - | |
1190 | -/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from | |
1191 | -oom-killing already. | |
1192 | 1187 | |
1193 | 1188 | /proc/<pid>/oom_score shows process' current badness score. |
1194 | 1189 |
fs/proc/base.c
... | ... | @@ -1003,12 +1003,7 @@ |
1003 | 1003 | |
1004 | 1004 | if (!task) |
1005 | 1005 | return -ESRCH; |
1006 | - task_lock(task); | |
1007 | - if (task->mm) | |
1008 | - oom_adjust = task->mm->oom_adj; | |
1009 | - else | |
1010 | - oom_adjust = OOM_DISABLE; | |
1011 | - task_unlock(task); | |
1006 | + oom_adjust = task->oomkilladj; | |
1012 | 1007 | put_task_struct(task); |
1013 | 1008 | |
1014 | 1009 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); |
1015 | 1010 | |
1016 | 1011 | |
... | ... | @@ -1037,19 +1032,11 @@ |
1037 | 1032 | task = get_proc_task(file->f_path.dentry->d_inode); |
1038 | 1033 | if (!task) |
1039 | 1034 | return -ESRCH; |
1040 | - task_lock(task); | |
1041 | - if (!task->mm) { | |
1042 | - task_unlock(task); | |
1035 | + if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { | |
1043 | 1036 | put_task_struct(task); |
1044 | - return -EINVAL; | |
1045 | - } | |
1046 | - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { | |
1047 | - task_unlock(task); | |
1048 | - put_task_struct(task); | |
1049 | 1037 | return -EACCES; |
1050 | 1038 | } |
1051 | - task->mm->oom_adj = oom_adjust; | |
1052 | - task_unlock(task); | |
1039 | + task->oomkilladj = oom_adjust; | |
1053 | 1040 | put_task_struct(task); |
1054 | 1041 | if (end - buffer == 0) |
1055 | 1042 | return -EIO; |
include/linux/mm_types.h
include/linux/sched.h
kernel/fork.c
... | ... | @@ -426,7 +426,6 @@ |
426 | 426 | init_rwsem(&mm->mmap_sem); |
427 | 427 | INIT_LIST_HEAD(&mm->mmlist); |
428 | 428 | mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; |
429 | - mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; | |
430 | 429 | mm->core_state = NULL; |
431 | 430 | mm->nr_ptes = 0; |
432 | 431 | set_mm_counter(mm, file_rss, 0); |
mm/oom_kill.c
... | ... | @@ -58,7 +58,6 @@ |
58 | 58 | unsigned long points, cpu_time, run_time; |
59 | 59 | struct mm_struct *mm; |
60 | 60 | struct task_struct *child; |
61 | - int oom_adj; | |
62 | 61 | |
63 | 62 | task_lock(p); |
64 | 63 | mm = p->mm; |
... | ... | @@ -66,11 +65,6 @@ |
66 | 65 | task_unlock(p); |
67 | 66 | return 0; |
68 | 67 | } |
69 | - oom_adj = mm->oom_adj; | |
70 | - if (oom_adj == OOM_DISABLE) { | |
71 | - task_unlock(p); | |
72 | - return 0; | |
73 | - } | |
74 | 68 | |
75 | 69 | /* |
76 | 70 | * The memory size of the process is the basis for the badness. |
77 | 71 | |
78 | 72 | |
79 | 73 | |
... | ... | @@ -154,15 +148,15 @@ |
154 | 148 | points /= 8; |
155 | 149 | |
156 | 150 | /* |
157 | - * Adjust the score by oom_adj. | |
151 | + * Adjust the score by oomkilladj. | |
158 | 152 | */ |
159 | - if (oom_adj) { | |
160 | - if (oom_adj > 0) { | |
153 | + if (p->oomkilladj) { | |
154 | + if (p->oomkilladj > 0) { | |
161 | 155 | if (!points) |
162 | 156 | points = 1; |
163 | - points <<= oom_adj; | |
157 | + points <<= p->oomkilladj; | |
164 | 158 | } else |
165 | - points >>= -(oom_adj); | |
159 | + points >>= -(p->oomkilladj); | |
166 | 160 | } |
167 | 161 | |
168 | 162 | #ifdef DEBUG |
169 | 163 | |
... | ... | @@ -257,8 +251,11 @@ |
257 | 251 | *ppoints = ULONG_MAX; |
258 | 252 | } |
259 | 253 | |
254 | + if (p->oomkilladj == OOM_DISABLE) | |
255 | + continue; | |
256 | + | |
260 | 257 | points = badness(p, uptime.tv_sec); |
261 | - if (points > *ppoints) { | |
258 | + if (points > *ppoints || !chosen) { | |
262 | 259 | chosen = p; |
263 | 260 | *ppoints = points; |
264 | 261 | } |
... | ... | @@ -307,7 +304,8 @@ |
307 | 304 | } |
308 | 305 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
309 | 306 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
310 | - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); | |
307 | + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, | |
308 | + p->comm); | |
311 | 309 | task_unlock(p); |
312 | 310 | } while_each_thread(g, p); |
313 | 311 | } |
314 | 312 | |
... | ... | @@ -325,8 +323,11 @@ |
325 | 323 | return; |
326 | 324 | } |
327 | 325 | |
328 | - if (!p->mm) | |
326 | + if (!p->mm) { | |
327 | + WARN_ON(1); | |
328 | + printk(KERN_WARNING "tried to kill an mm-less task!\n"); | |
329 | 329 | return; |
330 | + } | |
330 | 331 | |
331 | 332 | if (verbose) |
332 | 333 | printk(KERN_ERR "Killed process %d (%s)\n", |
333 | 334 | |
334 | 335 | |
... | ... | @@ -348,13 +349,28 @@ |
348 | 349 | struct mm_struct *mm; |
349 | 350 | struct task_struct *g, *q; |
350 | 351 | |
351 | - task_lock(p); | |
352 | 352 | mm = p->mm; |
353 | - if (!mm || mm->oom_adj == OOM_DISABLE) { | |
354 | - task_unlock(p); | |
353 | + | |
354 | + /* WARNING: mm may not be dereferenced since we did not obtain its | |
355 | + * value from get_task_mm(p). This is OK since all we need to do is | |
356 | + * compare mm to q->mm below. | |
357 | + * | |
358 | + * Furthermore, even if mm contains a non-NULL value, p->mm may | |
359 | + * change to NULL at any time since we do not hold task_lock(p). | |
360 | + * However, this is of no concern to us. | |
361 | + */ | |
362 | + | |
363 | + if (mm == NULL) | |
355 | 364 | return 1; |
356 | - } | |
357 | - task_unlock(p); | |
365 | + | |
366 | + /* | |
367 | + * Don't kill the process if any threads are set to OOM_DISABLE | |
368 | + */ | |
369 | + do_each_thread(g, q) { | |
370 | + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | |
371 | + return 1; | |
372 | + } while_each_thread(g, q); | |
373 | + | |
358 | 374 | __oom_kill_task(p, 1); |
359 | 375 | |
360 | 376 | /* |
361 | 377 | |
... | ... | @@ -377,11 +393,10 @@ |
377 | 393 | struct task_struct *c; |
378 | 394 | |
379 | 395 | if (printk_ratelimit()) { |
380 | - task_lock(current); | |
381 | 396 | printk(KERN_WARNING "%s invoked oom-killer: " |
382 | - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | |
383 | - current->comm, gfp_mask, order, | |
384 | - current->mm ? current->mm->oom_adj : OOM_DISABLE); | |
397 | + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | |
398 | + current->comm, gfp_mask, order, current->oomkilladj); | |
399 | + task_lock(current); | |
385 | 400 | cpuset_print_task_mems_allowed(current); |
386 | 401 | task_unlock(current); |
387 | 402 | dump_stack(); |
388 | 403 | |
... | ... | @@ -394,9 +409,8 @@ |
394 | 409 | /* |
395 | 410 | * If the task is already exiting, don't alarm the sysadmin or kill |
396 | 411 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
397 | - * if its mm is still attached. | |
398 | 412 | */ |
399 | - if (p->mm && (p->flags & PF_EXITING)) { | |
413 | + if (p->flags & PF_EXITING) { | |
400 | 414 | __oom_kill_task(p, 0); |
401 | 415 | return 0; |
402 | 416 | } |