Commit b34a6b1da371ed8af1221459a18c67970f7e3d53

Authored by Vasiliy Kulikov
Committed by Linus Torvalds
1 parent d40dcdb017

ipc: introduce shm_rmid_forced sysctl

Add support for the shm_rmid_forced sysctl.  If set to 1, all shared
memory objects in current ipc namespace will be automatically forced to
use IPC_RMID.

The POSIX way of handling shmem allows one to create shm objects and
call shmdt(), leaving shm object associated with no process, thus
consuming memory not counted via rlimits.

With shm_rmid_forced=1 the shared memory object is counted at least for
one process, so OOM killer may effectively kill the fat process holding
the shared memory.

It obviously breaks POSIX - some programs relying on the feature would
stop working.  So set shm_rmid_forced=1 only if you're sure nobody uses
"orphaned" memory.  Use shm_rmid_forced=0 by default for compatibility
reasons.

The feature was previously implemented in -ow as a configure option.

[akpm@linux-foundation.org: fix documentation, per Randy]
[akpm@linux-foundation.org: fix warning]
[akpm@linux-foundation.org: readability/conventionality tweaks]
[akpm@linux-foundation.org: fix shm_rmid_forced/shm_forced_rmid confusion, use standard comment layout]
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Cc: Randy Dunlap <rdunlap@xenotime.net>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Serge E. Hallyn" <serge.hallyn@canonical.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Solar Designer <solar@openwall.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 6 changed files with 163 additions and 4 deletions Side-by-side Diff

Documentation/sysctl/kernel.txt
... ... @@ -61,6 +61,7 @@
61 61 - rtsig-nr
62 62 - sem
63 63 - sg-big-buff [ generic SCSI device (sg) ]
  64 +- shm_rmid_forced
64 65 - shmall
65 66 - shmmax [ sysv ipc ]
66 67 - shmmni
... ... @@ -515,6 +516,27 @@
515 516 on the maximum shared memory segment size that can be created.
516 517 Shared memory segments up to 1Gb are now supported in the
517 518 kernel. This value defaults to SHMMAX.
  519 +
  520 +==============================================================
  521 +
  522 +shm_rmid_forced:
  523 +
  524 +Linux lets you set resource limits, including how much memory one
  525 +process can consume, via setrlimit(2). Unfortunately, shared memory
  526 +segments are allowed to exist without association with any process, and
  527 +thus might not be counted against any resource limits. If enabled,
  528 +shared memory segments are automatically destroyed when their attach
  529 +count becomes zero after a detach or a process termination. It will
  530 +also destroy segments that were created, but never attached to, on exit
  531 +from the process. The only use left for IPC_RMID is to immediately
  532 +destroy an unattached segment. Of course, this breaks the way things are
  533 +defined, so some applications might stop working. Note that this
  534 +feature will do you no good unless you also configure your resource
  535 +limits (in particular, RLIMIT_AS and RLIMIT_NPROC). Most systems don't
  536 +need this.
  537 +
  538 +Note that if you change this from 0 to 1, already created segments
  539 +without users and with a dead originative process will be destroyed.
518 540  
519 541 ==============================================================
520 542  
include/linux/ipc_namespace.h
... ... @@ -44,6 +44,11 @@
44 44 size_t shm_ctlall;
45 45 int shm_ctlmni;
46 46 int shm_tot;
  47 + /*
  48 + * Defines whether IPC_RMID is forced for _all_ shm segments regardless
  49 + * of shmctl()
  50 + */
  51 + int shm_rmid_forced;
47 52  
48 53 struct notifier_block ipcns_nb;
49 54  
... ... @@ -72,6 +77,7 @@
72 77 extern int cond_register_ipcns_notifier(struct ipc_namespace *);
73 78 extern void unregister_ipcns_notifier(struct ipc_namespace *);
74 79 extern int ipcns_notify(unsigned long);
  80 +extern void shm_destroy_orphaned(struct ipc_namespace *ns);
75 81 #else /* CONFIG_SYSVIPC */
76 82 static inline int register_ipcns_notifier(struct ipc_namespace *ns)
77 83 { return 0; }
... ... @@ -79,6 +85,7 @@
79 85 { return 0; }
80 86 static inline void unregister_ipcns_notifier(struct ipc_namespace *ns) { }
81 87 static inline int ipcns_notify(unsigned long l) { return 0; }
  88 +static inline void shm_destroy_orphaned(struct ipc_namespace *ns) {}
82 89 #endif /* CONFIG_SYSVIPC */
83 90  
84 91 #ifdef CONFIG_POSIX_MQUEUE
... ... @@ -106,6 +106,7 @@
106 106 #ifdef CONFIG_SYSVIPC
107 107 long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr);
108 108 extern int is_file_shm_hugepages(struct file *file);
  109 +extern void exit_shm(struct task_struct *task);
109 110 #else
110 111 static inline long do_shmat(int shmid, char __user *shmaddr,
111 112 int shmflg, unsigned long *addr)
... ... @@ -115,6 +116,9 @@
115 116 static inline int is_file_shm_hugepages(struct file *file)
116 117 {
117 118 return 0;
  119 +}
  120 +static inline void exit_shm(struct task_struct *task)
  121 +{
118 122 }
119 123 #endif
120 124  
... ... @@ -31,12 +31,37 @@
31 31 void __user *buffer, size_t *lenp, loff_t *ppos)
32 32 {
33 33 struct ctl_table ipc_table;
  34 +
34 35 memcpy(&ipc_table, table, sizeof(ipc_table));
35 36 ipc_table.data = get_ipc(table);
36 37  
37 38 return proc_dointvec(&ipc_table, write, buffer, lenp, ppos);
38 39 }
39 40  
  41 +static int proc_ipc_dointvec_minmax(ctl_table *table, int write,
  42 + void __user *buffer, size_t *lenp, loff_t *ppos)
  43 +{
  44 + struct ctl_table ipc_table;
  45 +
  46 + memcpy(&ipc_table, table, sizeof(ipc_table));
  47 + ipc_table.data = get_ipc(table);
  48 +
  49 + return proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos);
  50 +}
  51 +
  52 +static int proc_ipc_dointvec_minmax_orphans(ctl_table *table, int write,
  53 + void __user *buffer, size_t *lenp, loff_t *ppos)
  54 +{
  55 + struct ipc_namespace *ns = current->nsproxy->ipc_ns;
  56 + int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
  57 +
  58 + if (err < 0)
  59 + return err;
  60 + if (ns->shm_rmid_forced)
  61 + shm_destroy_orphaned(ns);
  62 + return err;
  63 +}
  64 +
40 65 static int proc_ipc_callback_dointvec(ctl_table *table, int write,
41 66 void __user *buffer, size_t *lenp, loff_t *ppos)
42 67 {
... ... @@ -125,6 +150,8 @@
125 150 #else
126 151 #define proc_ipc_doulongvec_minmax NULL
127 152 #define proc_ipc_dointvec NULL
  153 +#define proc_ipc_dointvec_minmax NULL
  154 +#define proc_ipc_dointvec_minmax_orphans NULL
128 155 #define proc_ipc_callback_dointvec NULL
129 156 #define proc_ipcauto_dointvec_minmax NULL
130 157 #endif
... ... @@ -153,6 +180,15 @@
153 180 .maxlen = sizeof (init_ipc_ns.shm_ctlmni),
154 181 .mode = 0644,
155 182 .proc_handler = proc_ipc_dointvec,
  183 + },
  184 + {
  185 + .procname = "shm_rmid_forced",
  186 + .data = &init_ipc_ns.shm_rmid_forced,
  187 + .maxlen = sizeof(init_ipc_ns.shm_rmid_forced),
  188 + .mode = 0644,
  189 + .proc_handler = proc_ipc_dointvec_minmax_orphans,
  190 + .extra1 = &zero,
  191 + .extra2 = &one,
156 192 },
157 193 {
158 194 .procname = "msgmax",
... ... @@ -74,6 +74,7 @@
74 74 ns->shm_ctlmax = SHMMAX;
75 75 ns->shm_ctlall = SHMALL;
76 76 ns->shm_ctlmni = SHMMNI;
  77 + ns->shm_rmid_forced = 0;
77 78 ns->shm_tot = 0;
78 79 ipc_init_ids(&shm_ids(ns));
79 80 }
... ... @@ -187,6 +188,23 @@
187 188 }
188 189  
189 190 /*
  191 + * shm_may_destroy - identifies whether shm segment should be destroyed now
  192 + *
  193 + * Returns true if and only if there are no active users of the segment and
  194 + * one of the following is true:
  195 + *
  196 + * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
  197 + *
  198 + * 2) sysctl kernel.shm_rmid_forced is set to 1.
  199 + */
  200 +static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
  201 +{
  202 + return (shp->shm_nattch == 0) &&
  203 + (ns->shm_rmid_forced ||
  204 + (shp->shm_perm.mode & SHM_DEST));
  205 +}
  206 +
  207 +/*
190 208 * remove the attach descriptor vma.
191 209 * free memory for segment if it is marked destroyed.
192 210 * The descriptor has already been removed from the current->mm->mmap list
193 211  
... ... @@ -206,14 +224,86 @@
206 224 shp->shm_lprid = task_tgid_vnr(current);
207 225 shp->shm_dtim = get_seconds();
208 226 shp->shm_nattch--;
209   - if(shp->shm_nattch == 0 &&
210   - shp->shm_perm.mode & SHM_DEST)
  227 + if (shm_may_destroy(ns, shp))
211 228 shm_destroy(ns, shp);
212 229 else
213 230 shm_unlock(shp);
214 231 up_write(&shm_ids(ns).rw_mutex);
215 232 }
216 233  
  234 +static int shm_try_destroy_current(int id, void *p, void *data)
  235 +{
  236 + struct ipc_namespace *ns = data;
  237 + struct shmid_kernel *shp = shm_lock(ns, id);
  238 +
  239 + if (IS_ERR(shp))
  240 + return 0;
  241 +
  242 + if (shp->shm_cprid != task_tgid_vnr(current)) {
  243 + shm_unlock(shp);
  244 + return 0;
  245 + }
  246 +
  247 + if (shm_may_destroy(ns, shp))
  248 + shm_destroy(ns, shp);
  249 + else
  250 + shm_unlock(shp);
  251 + return 0;
  252 +}
  253 +
  254 +static int shm_try_destroy_orphaned(int id, void *p, void *data)
  255 +{
  256 + struct ipc_namespace *ns = data;
  257 + struct shmid_kernel *shp = shm_lock(ns, id);
  258 + struct task_struct *task;
  259 +
  260 + if (IS_ERR(shp))
  261 + return 0;
  262 +
  263 + /*
  264 + * We want to destroy segments without users and with already
  265 + * exit'ed originating process.
  266 + *
  267 + * XXX: the originating process may exist in another pid namespace.
  268 + */
  269 + task = find_task_by_vpid(shp->shm_cprid);
  270 + if (task != NULL) {
  271 + shm_unlock(shp);
  272 + return 0;
  273 + }
  274 +
  275 + if (shm_may_destroy(ns, shp))
  276 + shm_destroy(ns, shp);
  277 + else
  278 + shm_unlock(shp);
  279 + return 0;
  280 +}
  281 +
  282 +void shm_destroy_orphaned(struct ipc_namespace *ns)
  283 +{
  284 + down_write(&shm_ids(ns).rw_mutex);
  285 + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns);
  286 + up_write(&shm_ids(ns).rw_mutex);
  287 +}
  288 +
  289 +
  290 +void exit_shm(struct task_struct *task)
  291 +{
  292 + struct nsproxy *nsp = task->nsproxy;
  293 + struct ipc_namespace *ns;
  294 +
  295 + if (!nsp)
  296 + return;
  297 + ns = nsp->ipc_ns;
  298 + if (!ns || !ns->shm_rmid_forced)
  299 + return;
  300 +
  301 + /* Destroy all already created segments, but not mapped yet */
  302 + down_write(&shm_ids(ns).rw_mutex);
  303 + idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns);
  304 + up_write(&shm_ids(ns).rw_mutex);
  305 +}
  306 +
217 307 static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
218 308 {
219 309 struct file *file = vma->vm_file;
... ... @@ -950,8 +1040,7 @@
950 1040 shp = shm_lock(ns, shmid);
951 1041 BUG_ON(IS_ERR(shp));
952 1042 shp->shm_nattch--;
953   - if(shp->shm_nattch == 0 &&
954   - shp->shm_perm.mode & SHM_DEST)
  1043 + if (shm_may_destroy(ns, shp))
955 1044 shm_destroy(ns, shp);
956 1045 else
957 1046 shm_unlock(shp);
... ... @@ -980,6 +980,7 @@
980 980 trace_sched_process_exit(tsk);
981 981  
982 982 exit_sem(tsk);
  983 + exit_shm(tsk);
983 984 exit_files(tsk);
984 985 exit_fs(tsk);
985 986 check_stack_usage();