Blame view

kernel/pid_namespace.c 9.05 KB
74bd59bb3   Pavel Emelyanov   namespaces: clean...
1
2
3
4
5
6
7
8
9
10
11
12
  /*
   * Pid namespaces
   *
   * Authors:
   *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
   *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
   *     Many thanks to Oleg Nesterov for comments and help
   *
   */
  
  #include <linux/pid.h>
  #include <linux/pid_namespace.h>
49f4d8b93   Eric W. Biederman   pidns: Capture th...
13
  #include <linux/user_namespace.h>
74bd59bb3   Pavel Emelyanov   namespaces: clean...
14
15
  #include <linux/syscalls.h>
  #include <linux/err.h>
0b6b030fc   Pavel Emelyanov   bsdacct: switch f...
16
  #include <linux/acct.h>
5a0e3ad6a   Tejun Heo   include cleanup: ...
17
  #include <linux/slab.h>
0bb80f240   David Howells   proc: Split the n...
18
  #include <linux/proc_ns.h>
cf3f89214   Daniel Lezcano   pidns: add reboot...
19
  #include <linux/reboot.h>
523a6a945   Eric W. Biederman   pidns: Export fre...
20
  #include <linux/export.h>
74bd59bb3   Pavel Emelyanov   namespaces: clean...
21

74bd59bb3   Pavel Emelyanov   namespaces: clean...
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
  /*
   * Bookkeeping record for one pid kmem cache.  create_pid_cachep() keeps
   * one of these per distinct nr_ids value and links them on pid_caches_lh.
   */
  struct pid_cache {
  	int nr_ids;			/* lookup key: ids carried per pid */
  	char name[16];			/* cache name, formatted as "pid_%d" */
  	struct kmem_cache *cachep;	/* the kmem cache itself */
  	struct list_head list;		/* link in pid_caches_lh */
  };
  
  /* All pid_cache records created so far by create_pid_cachep(). */
  static LIST_HEAD(pid_caches_lh);
  /* Held by create_pid_cachep() across lookup in and insertion into the list. */
  static DEFINE_MUTEX(pid_caches_mutex);
  /* Cache for struct pid_namespace objects; set up in pid_namespaces_init(). */
  static struct kmem_cache *pid_ns_cachep;
  
  /*
   * creates the kmem cache to allocate pids from.
   * @nr_ids: the number of numerical ids this pid will have to carry
   */
  
  static struct kmem_cache *create_pid_cachep(int nr_ids)
  {
  	struct pid_cache *pcache;
  	struct kmem_cache *cachep;
  
  	mutex_lock(&pid_caches_mutex);
  	list_for_each_entry(pcache, &pid_caches_lh, list)
  		if (pcache->nr_ids == nr_ids)
  			goto out;
  
  	pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
  	if (pcache == NULL)
  		goto err_alloc;
  
  	snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
  	cachep = kmem_cache_create(pcache->name,
  			sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
  			0, SLAB_HWCACHE_ALIGN, NULL);
  	if (cachep == NULL)
  		goto err_cachep;
  
  	pcache->nr_ids = nr_ids;
  	pcache->cachep = cachep;
  	list_add(&pcache->list, &pid_caches_lh);
  out:
  	mutex_unlock(&pid_caches_mutex);
  	return pcache->cachep;
  
  err_cachep:
  	kfree(pcache);
  err_alloc:
  	mutex_unlock(&pid_caches_mutex);
  	return NULL;
  }
0a01f2cc3   Eric W. Biederman   pidns: Make the p...
72
73
74
75
76
  /*
   * Work item callback installed by create_pid_namespace(): recovers the
   * namespace that embeds @work and hands it to pid_ns_release_proc().
   */
  static void proc_cleanup_work(struct work_struct *work)
  {
  	struct pid_namespace *pid_ns;
  
  	pid_ns = container_of(work, struct pid_namespace, proc_work);
  	pid_ns_release_proc(pid_ns);
  }
f23025057   Andrew Vagin   pidns: limit the ...
77
78
  /*
   * MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid'.
   * create_pid_namespace() fails with -EINVAL rather than nest deeper.
   */
  #define MAX_PID_NS_LEVEL 32
49f4d8b93   Eric W. Biederman   pidns: Capture th...
79
80
  static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
  	struct pid_namespace *parent_pid_ns)
74bd59bb3   Pavel Emelyanov   namespaces: clean...
81
82
  {
  	struct pid_namespace *ns;
ed469a63c   Alexey Dobriyan   pidns: make creat...
83
  	unsigned int level = parent_pid_ns->level + 1;
f23025057   Andrew Vagin   pidns: limit the ...
84
85
86
87
88
89
90
  	int i;
  	int err;
  
  	if (level > MAX_PID_NS_LEVEL) {
  		err = -EINVAL;
  		goto out;
  	}
74bd59bb3   Pavel Emelyanov   namespaces: clean...
91

f23025057   Andrew Vagin   pidns: limit the ...
92
  	err = -ENOMEM;
84406c153   Pavel Emelyanov   pidns: use kzallo...
93
  	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
94
95
96
97
98
99
100
101
102
103
  	if (ns == NULL)
  		goto out;
  
  	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!ns->pidmap[0].page)
  		goto out_free;
  
  	ns->pid_cachep = create_pid_cachep(level + 1);
  	if (ns->pid_cachep == NULL)
  		goto out_free_map;
98f842e67   Eric W. Biederman   proc: Usable inod...
104
105
106
  	err = proc_alloc_inum(&ns->proc_inum);
  	if (err)
  		goto out_free_map;
74bd59bb3   Pavel Emelyanov   namespaces: clean...
107
  	kref_init(&ns->kref);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
108
  	ns->level = level;
ed469a63c   Alexey Dobriyan   pidns: make creat...
109
  	ns->parent = get_pid_ns(parent_pid_ns);
49f4d8b93   Eric W. Biederman   pidns: Capture th...
110
  	ns->user_ns = get_user_ns(user_ns);
c876ad768   Eric W. Biederman   pidns: Stop pid a...
111
  	ns->nr_hashed = PIDNS_HASH_ADDING;
0a01f2cc3   Eric W. Biederman   pidns: Make the p...
112
  	INIT_WORK(&ns->proc_work, proc_cleanup_work);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
113
114
115
  
  	set_bit(0, ns->pidmap[0].page);
  	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
84406c153   Pavel Emelyanov   pidns: use kzallo...
116
  	for (i = 1; i < PIDMAP_ENTRIES; i++)
74bd59bb3   Pavel Emelyanov   namespaces: clean...
117
  		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
118
119
120
121
122
123
124
125
  
  	return ns;
  
  out_free_map:
  	kfree(ns->pidmap[0].page);
  out_free:
  	kmem_cache_free(pid_ns_cachep, ns);
  out:
4308eebbe   Eric W. Biederman   pidns: call pid_n...
126
  	return ERR_PTR(err);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
127
  }
1adfcb03e   Al Viro   pid_namespace: ma...
128
129
130
131
132
  /*
   * RCU callback queued by destroy_pid_namespace(): returns the namespace
   * that embeds @p to pid_ns_cachep.
   */
  static void delayed_free_pidns(struct rcu_head *p)
  {
  	struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);
  
  	kmem_cache_free(pid_ns_cachep, ns);
  }
74bd59bb3   Pavel Emelyanov   namespaces: clean...
133
134
135
  static void destroy_pid_namespace(struct pid_namespace *ns)
  {
  	int i;
98f842e67   Eric W. Biederman   proc: Usable inod...
136
  	proc_free_inum(ns->proc_inum);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
137
138
  	for (i = 0; i < PIDMAP_ENTRIES; i++)
  		kfree(ns->pidmap[i].page);
49f4d8b93   Eric W. Biederman   pidns: Capture th...
139
  	put_user_ns(ns->user_ns);
1adfcb03e   Al Viro   pid_namespace: ma...
140
  	call_rcu(&ns->rcu, delayed_free_pidns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
141
  }
49f4d8b93   Eric W. Biederman   pidns: Capture th...
142
143
  struct pid_namespace *copy_pid_ns(unsigned long flags,
  	struct user_namespace *user_ns, struct pid_namespace *old_ns)
74bd59bb3   Pavel Emelyanov   namespaces: clean...
144
  {
74bd59bb3   Pavel Emelyanov   namespaces: clean...
145
  	if (!(flags & CLONE_NEWPID))
dca4a9796   Alexey Dobriyan   pidns: rewrite co...
146
  		return get_pid_ns(old_ns);
225778d68   Eric W. Biederman   pidns: Deny stran...
147
148
  	if (task_active_pid_ns(current) != old_ns)
  		return ERR_PTR(-EINVAL);
49f4d8b93   Eric W. Biederman   pidns: Capture th...
149
  	return create_pid_namespace(user_ns, old_ns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
150
  }
bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
151
  static void free_pid_ns(struct kref *kref)
74bd59bb3   Pavel Emelyanov   namespaces: clean...
152
  {
bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
153
  	struct pid_namespace *ns;
74bd59bb3   Pavel Emelyanov   namespaces: clean...
154
155
  
  	ns = container_of(kref, struct pid_namespace, kref);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
156
  	destroy_pid_namespace(ns);
bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
157
  }
74bd59bb3   Pavel Emelyanov   namespaces: clean...
158

bbc2e3ef8   Cyrill Gorcunov   pidns: remove rec...
159
160
161
162
163
164
165
166
167
168
  /*
   * Drop a reference on @ns.
   *
   * When the drop releases the namespace (kref_put() ran free_pid_ns()),
   * the reference that namespace held on its parent must go too, so the
   * walk continues upwards iteratively.  It stops at the first namespace
   * that is still referenced, or on reaching init_pid_ns, which is never
   * put.
   */
  void put_pid_ns(struct pid_namespace *ns)
  {
  	struct pid_namespace *parent;
  
  	while (ns != &init_pid_ns) {
  		/* Read the parent first: ns may be gone after kref_put(). */
  		parent = ns->parent;
  		if (!kref_put(&ns->kref, free_pid_ns))
  			break;
  		ns = parent;
  	}
  }
  EXPORT_SYMBOL_GPL(put_pid_ns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
171
172
173
174
175
  
  void zap_pid_ns_processes(struct pid_namespace *pid_ns)
  {
  	int nr;
  	int rc;
00c10bc13   Eric W. Biederman   pidns: make kille...
176
  	struct task_struct *task, *me = current;
751c644b9   Eric W. Biederman   pid: Handle the e...
177
  	int init_pids = thread_group_leader(me) ? 1 : 2;
00c10bc13   Eric W. Biederman   pidns: make kille...
178

c876ad768   Eric W. Biederman   pidns: Stop pid a...
179
180
  	/* Don't allow any more processes into the pid namespace */
  	disable_pid_allocation(pid_ns);
00c10bc13   Eric W. Biederman   pidns: make kille...
181
182
183
184
  	/* Ignore SIGCHLD causing any terminated children to autoreap */
  	spin_lock_irq(&me->sighand->siglock);
  	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
  	spin_unlock_irq(&me->sighand->siglock);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
  
  	/*
  	 * The last thread in the cgroup-init thread group is terminating.
  	 * Find remaining pid_ts in the namespace, signal and wait for them
  	 * to exit.
  	 *
  	 * Note:  This signals each threads in the namespace - even those that
  	 * 	  belong to the same thread group, To avoid this, we would have
  	 * 	  to walk the entire tasklist looking a processes in this
  	 * 	  namespace, but that could be unnecessarily expensive if the
  	 * 	  pid namespace has just a few processes. Or we need to
  	 * 	  maintain a tasklist for each pid namespace.
  	 *
  	 */
  	read_lock(&tasklist_lock);
  	nr = next_pidmap(pid_ns, 1);
  	while (nr > 0) {
e4da026f9   Sukadev Bhattiprolu   signals: zap_pid_...
202
  		rcu_read_lock();
e4da026f9   Sukadev Bhattiprolu   signals: zap_pid_...
203
  		task = pid_task(find_vpid(nr), PIDTYPE_PID);
a02d6fd64   Oleg Nesterov   signal: zap_pid_n...
204
205
  		if (task && !__fatal_signal_pending(task))
  			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
e4da026f9   Sukadev Bhattiprolu   signals: zap_pid_...
206
207
  
  		rcu_read_unlock();
74bd59bb3   Pavel Emelyanov   namespaces: clean...
208
209
210
  		nr = next_pidmap(pid_ns, nr);
  	}
  	read_unlock(&tasklist_lock);
6347e9009   Eric W. Biederman   pidns: guarantee ...
211
  	/* Firstly reap the EXIT_ZOMBIE children we may have. */
74bd59bb3   Pavel Emelyanov   namespaces: clean...
212
213
214
215
  	do {
  		clear_thread_flag(TIF_SIGPENDING);
  		rc = sys_wait4(-1, NULL, __WALL, NULL);
  	} while (rc != -ECHILD);
6347e9009   Eric W. Biederman   pidns: guarantee ...
216
217
  	/*
  	 * sys_wait4() above can't reap the TASK_DEAD children.
af4b8a83a   Eric W. Biederman   pidns: Wait in za...
218
  	 * Make sure they all go away, see free_pid().
6347e9009   Eric W. Biederman   pidns: guarantee ...
219
220
  	 */
  	for (;;) {
af4b8a83a   Eric W. Biederman   pidns: Wait in za...
221
  		set_current_state(TASK_UNINTERRUPTIBLE);
751c644b9   Eric W. Biederman   pid: Handle the e...
222
  		if (pid_ns->nr_hashed == init_pids)
6347e9009   Eric W. Biederman   pidns: guarantee ...
223
224
225
  			break;
  		schedule();
  	}
af4b8a83a   Eric W. Biederman   pidns: Wait in za...
226
  	__set_current_state(TASK_RUNNING);
6347e9009   Eric W. Biederman   pidns: guarantee ...
227

cf3f89214   Daniel Lezcano   pidns: add reboot...
228
229
  	if (pid_ns->reboot)
  		current->signal->group_exit_code = pid_ns->reboot;
0b6b030fc   Pavel Emelyanov   bsdacct: switch f...
230
  	acct_exit_ns(pid_ns);
74bd59bb3   Pavel Emelyanov   namespaces: clean...
231
232
  	return;
  }
98ed57eef   Cyrill Gorcunov   sysctl: make kern...
233
  #ifdef CONFIG_CHECKPOINT_RESTORE
b8f566b04   Pavel Emelyanov   sysctl: add the k...
234
235
236
  /*
   * sysctl handler for "ns_last_pid": reads or writes the last_pid field
   * of the calling task's active pid namespace.
   *
   * Writes require CAP_SYS_ADMIN in the user namespace that owns that
   * pid namespace and fail with -EPERM otherwise.  The actual transfer
   * and range check are done by proc_dointvec_minmax() on a private copy
   * of @table whose data pointer is redirected at the namespace; its
   * return value is passed through.
   */
  static int pid_ns_ctl_handler(struct ctl_table *table, int write,
  		void __user *buffer, size_t *lenp, loff_t *ppos)
  {
  	struct pid_namespace *pid_ns = task_active_pid_ns(current);
  	struct ctl_table tmp = *table;
  
  	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
  		return -EPERM;
  
  	/*
  	 * Writing directly to ns' last_pid field is OK, since this field
  	 * is volatile in a living namespace anyway and a code writing to
  	 * it should synchronize its usage with external means.
  	 */
  	tmp.data = &pid_ns->last_pid;
  	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
  }
579035dc5   Andrew Vagin   pid-namespace: li...
250
251
  extern int pid_max;
  static int zero = 0;
b8f566b04   Pavel Emelyanov   sysctl: add the k...
252
253
254
255
256
257
  static struct ctl_table pid_ns_ctl_table[] = {
  	{
  		.procname = "ns_last_pid",
  		.maxlen = sizeof(int),
  		.mode = 0666, /* permissions are checked in the handler */
  		.proc_handler = pid_ns_ctl_handler,
579035dc5   Andrew Vagin   pid-namespace: li...
258
259
  		.extra1 = &zero,
  		.extra2 = &pid_max,
b8f566b04   Pavel Emelyanov   sysctl: add the k...
260
261
262
  	},
  	{ }
  };
b8f566b04   Pavel Emelyanov   sysctl: add the k...
263
  /* sysctl path ("kernel") under which pid_ns_ctl_table gets registered. */
  static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
98ed57eef   Cyrill Gorcunov   sysctl: make kern...
264
  #endif	/* CONFIG_CHECKPOINT_RESTORE */
b8f566b04   Pavel Emelyanov   sysctl: add the k...
265

cf3f89214   Daniel Lezcano   pidns: add reboot...
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
  /*
   * Handle a reboot request issued inside @pid_ns.
   *
   * For init_pid_ns nothing is done and 0 is returned.  For any other
   * namespace the command is translated into a signal number stored in
   * pid_ns->reboot (SIGHUP for the restart commands, SIGINT for halt and
   * power-off), the namespace's child reaper is sent SIGKILL and the
   * caller exits via do_exit(0).  An unrecognised @cmd yields -EINVAL
   * and leaves pid_ns->reboot untouched.
   */
  int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
  {
  	int exit_sig;
  
  	if (pid_ns == &init_pid_ns)
  		return 0;
  
  	switch (cmd) {
  	case LINUX_REBOOT_CMD_RESTART:
  	case LINUX_REBOOT_CMD_RESTART2:
  		exit_sig = SIGHUP;
  		break;
  	case LINUX_REBOOT_CMD_HALT:
  	case LINUX_REBOOT_CMD_POWER_OFF:
  		exit_sig = SIGINT;
  		break;
  	default:
  		return -EINVAL;
  	}
  	pid_ns->reboot = exit_sig;
  
  	read_lock(&tasklist_lock);
  	force_sig(SIGKILL, pid_ns->child_reaper);
  	read_unlock(&tasklist_lock);
  
  	do_exit(0);
  
  	/* Not reached */
  	return 0;
  }
57e8391d3   Eric W. Biederman   pidns: Add setns ...
294
295
296
297
298
  /*
   * proc_ns_operations .get hook: return @task's active pid namespace
   * with an extra reference taken, or NULL if the task has none.  The
   * lookup and the reference grab both happen inside one RCU read-side
   * section.
   */
  static void *pidns_get(struct task_struct *task)
  {
  	struct pid_namespace *ns;
  
  	rcu_read_lock();
  	ns = task_active_pid_ns(task);
  	if (ns)
  		get_pid_ns(ns);
  	rcu_read_unlock();
  
  	return ns;
  }
  
  /* proc_ns_operations .put hook: drop one reference on the pid namespace @ns. */
  static void pidns_put(void *ns)
  {
  	struct pid_namespace *pid_ns = ns;
  
  	put_pid_ns(pid_ns);
  }
  
  static int pidns_install(struct nsproxy *nsproxy, void *ns)
  {
  	struct pid_namespace *active = task_active_pid_ns(current);
  	struct pid_namespace *ancestor, *new = ns;
5e4a08476   Eric W. Biederman   userns: Require C...
316
  	if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
c7b96acf1   Eric W. Biederman   userns: Kill nso...
317
  	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
57e8391d3   Eric W. Biederman   pidns: Add setns ...
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
  		return -EPERM;
  
  	/*
  	 * Only allow entering the current active pid namespace
  	 * or a child of the current active pid namespace.
  	 *
  	 * This is required for fork to return a usable pid value and
  	 * this maintains the property that processes and their
  	 * children can not escape their current pid namespace.
  	 */
  	if (new->level < active->level)
  		return -EINVAL;
  
  	ancestor = new;
  	while (ancestor->level > active->level)
  		ancestor = ancestor->parent;
  	if (ancestor != active)
  		return -EINVAL;
c2b1df2eb   Andy Lutomirski   Rename nsproxy.pi...
336
337
  	put_pid_ns(nsproxy->pid_ns_for_children);
  	nsproxy->pid_ns_for_children = get_pid_ns(new);
57e8391d3   Eric W. Biederman   pidns: Add setns ...
338
339
  	return 0;
  }
98f842e67   Eric W. Biederman   proc: Usable inod...
340
341
342
343
344
  /* proc_ns_operations .inum hook: report the proc inode number of @ns. */
  static unsigned int pidns_inum(void *ns)
  {
  	return ((struct pid_namespace *)ns)->proc_inum;
  }
57e8391d3   Eric W. Biederman   pidns: Add setns ...
345
346
347
348
349
350
  const struct proc_ns_operations pidns_operations = {
  	.name		= "pid",
  	.type		= CLONE_NEWPID,
  	.get		= pidns_get,
  	.put		= pidns_put,
  	.install	= pidns_install,
98f842e67   Eric W. Biederman   proc: Usable inod...
351
  	.inum		= pidns_inum,
57e8391d3   Eric W. Biederman   pidns: Add setns ...
352
  };
74bd59bb3   Pavel Emelyanov   namespaces: clean...
353
354
355
  /*
   * Boot-time setup: create the kmem cache for struct pid_namespace
   * (SLAB_PANIC is passed, so no failure check follows) and, when
   * CONFIG_CHECKPOINT_RESTORE is enabled, register the ns_last_pid
   * sysctl table under kern_path.  Always returns 0.
   */
  static __init int pid_namespaces_init(void)
  {
  	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
  
  #ifdef CONFIG_CHECKPOINT_RESTORE
  	register_sysctl_paths(kern_path, pid_ns_ctl_table);
  #endif
  	return 0;
  }
  
  __initcall(pid_namespaces_init);