Blame view

kernel/pid_namespace.c 10 KB
74bd59bb3   Pavel Emelyanov   namespaces: clean...
1
2
3
4
5
6
7
8
9
10
11
12
  /*
   * Pid namespaces
   *
   * Authors:
   *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
   *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
   *     Many thanks to Oleg Nesterov for comments and help
   *
   */
  
  #include <linux/pid.h>
  #include <linux/pid_namespace.h>
49f4d8b93   Eric W. Biederman   pidns: Capture th...
13
  #include <linux/user_namespace.h>
74bd59bb3   Pavel Emelyanov   namespaces: clean...
14
15
  #include <linux/syscalls.h>
  #include <linux/err.h>
0b6b030fc   Pavel Emelyanov   bsdacct: switch f...
16
  #include <linux/acct.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
17
  #include <linux/slab.h>
0bb80f240   David Howells   proc: Split the n...
18
  #include <linux/proc_ns.h>
cf3f89214   Daniel Lezcano   pidns: add reboot...
19
  #include <linux/reboot.h>
523a6a945   Eric W. Biederman   pidns: Export fre...
20
  #include <linux/export.h>
74bd59bb3   Pavel Emelyanov   namespaces: clean...
21

74bd59bb3   Pavel Emelyanov   namespaces: clean...
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  /*
   * One entry per distinct nr_ids value that create_pid_cachep() has been
   * asked for; each entry owns the kmem cache created for that value.
   */
  struct pid_cache {
  	int nr_ids;			/* lookup key compared in create_pid_cachep() */
  	char name[16];			/* cache name, formatted as "pid_<nr_ids>" */
  	struct kmem_cache *cachep;	/* kmem cache created for this nr_ids */
  	struct list_head list;		/* link in pid_caches_lh */
  };
  
  /* All pid_cache entries; searched and extended under pid_caches_mutex. */
  static LIST_HEAD(pid_caches_lh);
  static DEFINE_MUTEX(pid_caches_mutex);
  /* Cache from which struct pid_namespace objects are allocated and freed. */
  static struct kmem_cache *pid_ns_cachep;
  
  /*
   * creates the kmem cache to allocate pids from.
   * @nr_ids: the number of numerical ids this pid will have to carry
   */
  
  static struct kmem_cache *create_pid_cachep(int nr_ids)
  {
  	struct pid_cache *pcache;
  	struct kmem_cache *cachep;
  
  	mutex_lock(&pid_caches_mutex);
  	list_for_each_entry(pcache, &pid_caches_lh, list)
  		if (pcache->nr_ids == nr_ids)
  			goto out;
  
  	pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
  	if (pcache == NULL)
  		goto err_alloc;
  
  	snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
  	cachep = kmem_cache_create(pcache->name,
  			sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
  			0, SLAB_HWCACHE_ALIGN, NULL);
  	if (cachep == NULL)
  		goto err_cachep;
  
  	pcache->nr_ids = nr_ids;
  	pcache->cachep = cachep;
  	list_add(&pcache->list, &pid_caches_lh);
  out:
  	mutex_unlock(&pid_caches_mutex);
  	return pcache->cachep;
  
  err_cachep:
  	kfree(pcache);
  err_alloc:
  	mutex_unlock(&pid_caches_mutex);
  	return NULL;
  }
0a01f2cc3   Eric W. Biederman   pidns: Make the p...
72
73
74
75
76
  /*
   * Work callback installed by create_pid_namespace(): recover the pid
   * namespace that embeds @work as its proc_work member and pass it to
   * pid_ns_release_proc().
   */
  static void proc_cleanup_work(struct work_struct *work)
  {
  	pid_ns_release_proc(container_of(work, struct pid_namespace, proc_work));
  }
f23025057   Andrew Vagin   pidns: limit the ...
77
78
  /* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
  #define MAX_PID_NS_LEVEL 32
49f4d8b93   Eric W. Biederman   pidns: Capture th...
79
80
  static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
  	struct pid_namespace *parent_pid_ns)
74bd59bb3   Pavel Emelyanov   namespaces: clean...
81
82
  {
  	struct pid_namespace *ns;
ed469a63c   Alexey Dobriyan   pidns: make creat...
83
  	unsigned int level = parent_pid_ns->level + 1;
f23025057   Andrew Vagin   pidns: limit the ...
84
85
86
87
88
89
90
  	int i;
  	int err;
  
  	if (level > MAX_PID_NS_LEVEL) {
  		err = -EINVAL;
  		goto out;
  	}
74bd59bb3   Pavel Emelyanov   namespaces: clean...
91

f23025057   Andrew Vagin   pidns: limit the ...
92
  	err = -ENOMEM;
84406c153   Pavel Emelyanov   pidns: use kzallo...
93
  	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
94
95
96
97
98
99
100
101
102
103
  	if (ns == NULL)
  		goto out;
  
  	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!ns->pidmap[0].page)
  		goto out_free;
  
  	ns->pid_cachep = create_pid_cachep(level + 1);
  	if (ns->pid_cachep == NULL)
  		goto out_free_map;
6344c433a   Al Viro   new helpers: ns_a...
104
  	err = ns_alloc_inum(&ns->ns);
98f842e67   Eric W. Biederman   proc: Usable inod...
105
106
  	if (err)
  		goto out_free_map;
33c429405   Al Viro   copy address of p...
107
  	ns->ns.ops = &pidns_operations;
98f842e67   Eric W. Biederman   proc: Usable inod...
108

74bd59bb3   Pavel Emelyanov   namespaces: clean...
109
  	kref_init(&ns->kref);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
110
  	ns->level = level;
ed469a63c   Alexey Dobriyan   pidns: make creat...
111
  	ns->parent = get_pid_ns(parent_pid_ns);
49f4d8b93   Eric W. Biederman   pidns: Capture th...
112
  	ns->user_ns = get_user_ns(user_ns);
c876ad768   Eric W. Biederman   pidns: Stop pid a...
113
  	ns->nr_hashed = PIDNS_HASH_ADDING;
0a01f2cc3   Eric W. Biederman   pidns: Make the p...
114
  	INIT_WORK(&ns->proc_work, proc_cleanup_work);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
115
116
117
  
  	set_bit(0, ns->pidmap[0].page);
  	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
84406c153   Pavel Emelyanov   pidns: use kzallo...
118
  	for (i = 1; i < PIDMAP_ENTRIES; i++)
74bd59bb3   Pavel Emelyanov   namespaces: clean...
119
  		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
120
121
122
123
124
125
126
127
  
  	return ns;
  
  out_free_map:
  	kfree(ns->pidmap[0].page);
  out_free:
  	kmem_cache_free(pid_ns_cachep, ns);
  out:
4308eebbe   Eric W. Biederman   pidns: call pid_n...
128
  	return ERR_PTR(err);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
129
  }
1adfcb03e   Al Viro   pid_namespace: ma...
130
131
132
133
134
  /*
   * RCU callback queued by destroy_pid_namespace(): return the namespace
   * that embeds @p as its rcu member to pid_ns_cachep.
   */
  static void delayed_free_pidns(struct rcu_head *p)
  {
  	struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
  
  	kmem_cache_free(pid_ns_cachep, ns);
  }
74bd59bb3   Pavel Emelyanov   namespaces: clean...
135
136
137
  static void destroy_pid_namespace(struct pid_namespace *ns)
  {
  	int i;
6344c433a   Al Viro   new helpers: ns_a...
138
  	ns_free_inum(&ns->ns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
139
140
  	for (i = 0; i < PIDMAP_ENTRIES; i++)
  		kfree(ns->pidmap[i].page);
49f4d8b93   Eric W. Biederman   pidns: Capture th...
141
  	put_user_ns(ns->user_ns);
1adfcb03e   Al Viro   pid_namespace: ma...
142
  	call_rcu(&ns->rcu, delayed_free_pidns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
143
  }
49f4d8b93   Eric W. Biederman   pidns: Capture th...
144
145
  struct pid_namespace *copy_pid_ns(unsigned long flags,
  	struct user_namespace *user_ns, struct pid_namespace *old_ns)
74bd59bb3   Pavel Emelyanov   namespaces: clean...
146
  {
74bd59bb3   Pavel Emelyanov   namespaces: clean...
147
  	if (!(flags & CLONE_NEWPID))
dca4a9796   Alexey Dobriyan   pidns: rewrite co...
148
  		return get_pid_ns(old_ns);
225778d68   Eric W. Biederman   pidns: Deny stran...
149
150
  	if (task_active_pid_ns(current) != old_ns)
  		return ERR_PTR(-EINVAL);
49f4d8b93   Eric W. Biederman   pidns: Capture th...
151
  	return create_pid_namespace(user_ns, old_ns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
152
  }
bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
153
  static void free_pid_ns(struct kref *kref)
74bd59bb3   Pavel Emelyanov   namespaces: clean...
154
  {
bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
155
  	struct pid_namespace *ns;
74bd59bb3   Pavel Emelyanov   namespaces: clean...
156
157
  
  	ns = container_of(kref, struct pid_namespace, kref);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
158
  	destroy_pid_namespace(ns);
bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
159
  }
74bd59bb3   Pavel Emelyanov   namespaces: clean...
160

bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
161
162
163
164
165
166
167
168
169
170
  /*
   * Drop a reference on @ns. Whenever a put releases a namespace, the
   * walk continues with a put on its parent; it stops at the first
   * namespace that survives the put or on reaching init_pid_ns, which
   * is never put.
   */
  void put_pid_ns(struct pid_namespace *ns)
  {
  	struct pid_namespace *next;
  
  	for (; ns != &init_pid_ns; ns = next) {
  		/* Read the parent first: the put below may release ns. */
  		next = ns->parent;
  		if (!kref_put(&ns->kref, free_pid_ns))
  			return;
  	}
  }
bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
172
  EXPORT_SYMBOL_GPL(put_pid_ns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
173
174
175
176
177
  
  void zap_pid_ns_processes(struct pid_namespace *pid_ns)
  {
  	int nr;
  	int rc;
00c10bc13   Eric W. Biederman   pidns: make kille...
178
  	struct task_struct *task, *me = current;
751c644b9   Eric W. Biederman   pid: Handle the e...
179
  	int init_pids = thread_group_leader(me) ? 1 : 2;
00c10bc13   Eric W. Biederman   pidns: make kille...
180

c876ad768   Eric W. Biederman   pidns: Stop pid a...
181
182
  	/* Don't allow any more processes into the pid namespace */
  	disable_pid_allocation(pid_ns);
a53b83154   Oleg Nesterov   exit: pidns: fix/...
183
184
185
186
187
  	/*
  	 * Ignore SIGCHLD causing any terminated children to autoreap.
  	 * This speeds up the namespace shutdown, plus see the comment
  	 * below.
  	 */
00c10bc13   Eric W. Biederman   pidns: make kille...
188
189
190
  	spin_lock_irq(&me->sighand->siglock);
  	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
  	spin_unlock_irq(&me->sighand->siglock);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
  
  	/*
  	 * The last thread in the cgroup-init thread group is terminating.
  	 * Find remaining pid_ts in the namespace, signal and wait for them
  	 * to exit.
  	 *
  	 * Note:  This signals each threads in the namespace - even those that
  	 * 	  belong to the same thread group, To avoid this, we would have
  	 * 	  to walk the entire tasklist looking a processes in this
  	 * 	  namespace, but that could be unnecessarily expensive if the
  	 * 	  pid namespace has just a few processes. Or we need to
  	 * 	  maintain a tasklist for each pid namespace.
  	 *
  	 */
  	read_lock(&tasklist_lock);
  	nr = next_pidmap(pid_ns, 1);
  	while (nr > 0) {
e4da026f9   Sukadev Bhattiprolu   signals: zap_pid_...
208
  		rcu_read_lock();
e4da026f9   Sukadev Bhattiprolu   signals: zap_pid_...
209
  		task = pid_task(find_vpid(nr), PIDTYPE_PID);
a02d6fd64   Oleg Nesterov   signal: zap_pid_n...
210
211
  		if (task && !__fatal_signal_pending(task))
  			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
e4da026f9   Sukadev Bhattiprolu   signals: zap_pid_...
212
213
  
  		rcu_read_unlock();
74bd59bb3   Pavel Emelyanov   namespaces: clean...
214
215
216
  		nr = next_pidmap(pid_ns, nr);
  	}
  	read_unlock(&tasklist_lock);
a53b83154   Oleg Nesterov   exit: pidns: fix/...
217
218
219
220
221
  	/*
  	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
  	 * sys_wait4() will also block until our children traced from the
  	 * parent namespace are detached and become EXIT_DEAD.
  	 */
74bd59bb3   Pavel Emelyanov   namespaces: clean...
222
223
224
225
  	do {
  		clear_thread_flag(TIF_SIGPENDING);
  		rc = sys_wait4(-1, NULL, __WALL, NULL);
  	} while (rc != -ECHILD);
6347e9009   Eric W. Biederman   pidns: guarantee ...
226
  	/*
a53b83154   Oleg Nesterov   exit: pidns: fix/...
227
228
229
230
231
232
233
234
235
236
237
238
239
240
  	 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
  	 * really care, we could reparent them to the global init. We could
  	 * exit and reap ->child_reaper even if it is not the last thread in
  	 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
  	 * pid_ns can not go away until proc_kill_sb() drops the reference.
  	 *
  	 * But this ns can also have other tasks injected by setns()+fork().
  	 * Again, ignoring the user visible semantics we do not really need
  	 * to wait until they are all reaped, but they can be reparented to
  	 * us and thus we need to ensure that pid->child_reaper stays valid
  	 * until they all go away. See free_pid()->wake_up_process().
  	 *
  	 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
  	 * if reparented.
6347e9009   Eric W. Biederman   pidns: guarantee ...
241
242
  	 */
  	for (;;) {
af4b8a83a   Eric W. Biederman   pidns: Wait in za...
243
  		set_current_state(TASK_UNINTERRUPTIBLE);
751c644b9   Eric W. Biederman   pid: Handle the e...
244
  		if (pid_ns->nr_hashed == init_pids)
6347e9009   Eric W. Biederman   pidns: guarantee ...
245
246
247
  			break;
  		schedule();
  	}
af4b8a83a   Eric W. Biederman   pidns: Wait in za...
248
  	__set_current_state(TASK_RUNNING);
6347e9009   Eric W. Biederman   pidns: guarantee ...
249

cf3f89214   Daniel Lezcano   pidns: add reboot...
250
251
  	if (pid_ns->reboot)
  		current->signal->group_exit_code = pid_ns->reboot;
0b6b030fc   Pavel Emelyanov   bsdacct: switch f...
252
  	acct_exit_ns(pid_ns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
253
254
  	return;
  }
98ed57eef   Cyrill Gorcunov   sysctl: make kern...
255
  #ifdef CONFIG_CHECKPOINT_RESTORE
b8f566b04   Pavel Emelyanov   sysctl: add the k...
256
257
258
  /*
   * sysctl handler for "ns_last_pid": reads or writes the last_pid field
   * of the caller's active pid namespace via proc_dointvec_minmax().
   *
   * Writes require CAP_SYS_ADMIN in the user namespace owning that pid
   * namespace; without it -EPERM is returned. Otherwise the result of
   * proc_dointvec_minmax() is returned.
   */
  static int pid_ns_ctl_handler(struct ctl_table *table, int write,
  		void __user *buffer, size_t *lenp, loff_t *ppos)
  {
  	struct pid_namespace *active_ns = task_active_pid_ns(current);
  	struct ctl_table local;
  
  	if (write) {
  		if (!ns_capable(active_ns->user_ns, CAP_SYS_ADMIN))
  			return -EPERM;
  	}
  
  	/*
  	 * Writing directly to ns' last_pid field is OK, since this field
  	 * is volatile in a living namespace anyway and a code writing to
  	 * it should synchronize its usage with external means.
  	 *
  	 * Work on a private copy of the table entry so that only this
  	 * call sees the per-namespace data pointer.
  	 */
  	local = *table;
  	local.data = &active_ns->last_pid;
  
  	return proc_dointvec_minmax(&local, write, buffer, lenp, ppos);
  }
579035dc5   Andrew Vagin   pid-namespace: li...
272
273
  /*
   * Referenced as .extra1 (zero) and .extra2 (pid_max) of the "ns_last_pid"
   * entry in pid_ns_ctl_table, whose handler ends in proc_dointvec_minmax().
   */
  extern int pid_max;
  static int zero = 0;
b8f566b04   Pavel Emelyanov   sysctl: add the k...
274
275
276
277
278
279
  static struct ctl_table pid_ns_ctl_table[] = {
  	{
  		.procname = "ns_last_pid",
  		.maxlen = sizeof(int),
  		.mode = 0666, /* permissions are checked in the handler */
  		.proc_handler = pid_ns_ctl_handler,
579035dc5   Andrew Vagin   pid-namespace: li...
280
281
  		.extra1 = &zero,
  		.extra2 = &pid_max,
b8f566b04   Pavel Emelyanov   sysctl: add the k...
282
283
284
  	},
  	{ }
  };
b8f566b04   Pavel Emelyanov   sysctl: add the k...
285
  /* sysctl path ("kernel") passed with pid_ns_ctl_table to register_sysctl_paths(). */
  static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
98ed57eef   Cyrill Gorcunov   sysctl: make kern...
286
  #endif	/* CONFIG_CHECKPOINT_RESTORE */
b8f566b04   Pavel Emelyanov   sysctl: add the k...
287

cf3f89214   Daniel Lezcano   pidns: add reboot...
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
  /*
   * Handle a reboot request issued inside a non-initial pid namespace.
   *
   * For init_pid_ns nothing is done and 0 is returned. Otherwise @cmd is
   * mapped to a signal number stored in pid_ns->reboot (SIGHUP for the
   * restart commands, SIGINT for halt/power-off; any other command yields
   * -EINVAL), the namespace's child_reaper is sent SIGKILL and the caller
   * exits through do_exit(0).
   */
  int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
  {
  	int exit_sig;
  
  	if (pid_ns == &init_pid_ns)
  		return 0;
  
  	switch (cmd) {
  	case LINUX_REBOOT_CMD_RESTART:
  	case LINUX_REBOOT_CMD_RESTART2:
  		exit_sig = SIGHUP;
  		break;
  	case LINUX_REBOOT_CMD_HALT:
  	case LINUX_REBOOT_CMD_POWER_OFF:
  		exit_sig = SIGINT;
  		break;
  	default:
  		return -EINVAL;
  	}
  
  	/* Picked up by zap_pid_ns_processes() as the group exit code. */
  	pid_ns->reboot = exit_sig;
  
  	read_lock(&tasklist_lock);
  	force_sig(SIGKILL, pid_ns->child_reaper);
  	read_unlock(&tasklist_lock);
  
  	do_exit(0);
  
  	/* Not reached */
  	return 0;
  }
3c0411846   Al Viro   switch the rest o...
316
317
318
319
  /* Map an embedded ns_common back to the pid_namespace that contains it. */
  static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
  {
  	struct pid_namespace *pid_ns = container_of(ns, struct pid_namespace, ns);
  
  	return pid_ns;
  }
64964528b   Al Viro   make proc_ns_oper...
320
  static struct ns_common *pidns_get(struct task_struct *task)
57e8391d3   Eric W. Biederman   pidns: Add setns ...
321
322
323
324
  {
  	struct pid_namespace *ns;
  
  	rcu_read_lock();
d23082257   Oleg Nesterov   pid_namespace: pi...
325
326
327
  	ns = task_active_pid_ns(task);
  	if (ns)
  		get_pid_ns(ns);
57e8391d3   Eric W. Biederman   pidns: Add setns ...
328
  	rcu_read_unlock();
3c0411846   Al Viro   switch the rest o...
329
  	return ns ? &ns->ns : NULL;
57e8391d3   Eric W. Biederman   pidns: Add setns ...
330
  }
64964528b   Al Viro   make proc_ns_oper...
331
  /* proc_ns_operations .put hook: put_pid_ns() on the owning pid namespace. */
  static void pidns_put(struct ns_common *ns)
  {
  	struct pid_namespace *pid_ns = to_pid_ns(ns);
  
  	put_pid_ns(pid_ns);
  }
64964528b   Al Viro   make proc_ns_oper...
335
  static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
57e8391d3   Eric W. Biederman   pidns: Add setns ...
336
337
  {
  	struct pid_namespace *active = task_active_pid_ns(current);
3c0411846   Al Viro   switch the rest o...
338
  	struct pid_namespace *ancestor, *new = to_pid_ns(ns);
57e8391d3   Eric W. Biederman   pidns: Add setns ...
339

5e4a08476   Eric W. Biederman   userns: Require C...
340
  	if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
c7b96acf1   Eric W. Biederman   userns: Kill nso...
341
  	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
57e8391d3   Eric W. Biederman   pidns: Add setns ...
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
  		return -EPERM;
  
  	/*
  	 * Only allow entering the current active pid namespace
  	 * or a child of the current active pid namespace.
  	 *
  	 * This is required for fork to return a usable pid value and
  	 * this maintains the property that processes and their
  	 * children can not escape their current pid namespace.
  	 */
  	if (new->level < active->level)
  		return -EINVAL;
  
  	ancestor = new;
  	while (ancestor->level > active->level)
  		ancestor = ancestor->parent;
  	if (ancestor != active)
  		return -EINVAL;
c2b1df2eb   Andy Lutomirski   Rename nsproxy.pi...
360
361
  	put_pid_ns(nsproxy->pid_ns_for_children);
  	nsproxy->pid_ns_for_children = get_pid_ns(new);
57e8391d3   Eric W. Biederman   pidns: Add setns ...
362
363
364
365
366
367
368
369
370
371
  	return 0;
  }
  
  /*
   * Namespace operations for pid namespaces. create_pid_namespace() stores
   * a pointer to this table in ns->ns.ops of every namespace it creates.
   */
  const struct proc_ns_operations pidns_operations = {
  	.name		= "pid",
  	.type		= CLONE_NEWPID,
  	.get		= pidns_get,
  	.put		= pidns_put,
  	.install	= pidns_install,
  };
74bd59bb3   Pavel Emelyanov   namespaces: clean...
372
373
374
  /*
   * Boot-time initialisation: create the kmem cache for struct
   * pid_namespace and, when CONFIG_CHECKPOINT_RESTORE is set, register
   * pid_ns_ctl_table under kern_path. Always returns 0.
   */
  static __init int pid_namespaces_init(void)
  {
  	/* NOTE(review): no NULL check; presumably SLAB_PANIC makes failure fatal - confirm. */
  	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
98ed57eef   Cyrill Gorcunov   sysctl: make kern...
375
376
  
  #ifdef CONFIG_CHECKPOINT_RESTORE
b8f566b04   Pavel Emelyanov   sysctl: add the k...
377
  	register_sysctl_paths(kern_path, pid_ns_ctl_table);
98ed57eef   Cyrill Gorcunov   sysctl: make kern...
378
  #endif
74bd59bb3   Pavel Emelyanov   namespaces: clean...
379
380
381
382
  	return 0;
  }
  
  __initcall(pid_namespaces_init);