Commit 4a7215f13452bf2e8d271b2b9f09fddd990b4c04

Authored by Eric W. Biederman
Committed by Greg Kroah-Hartman
1 parent 54b8ced1ef

userns: Add a knob to disable setgroups on a per user namespace basis

commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 upstream.

- Expose the knob to user space through a proc file /proc/<pid>/setgroups

  A value of "deny" means the setgroups system call is disabled in the
  current process's user namespace and cannot be enabled in the
  future in this user namespace.

  A value of "allow" means the setgroups system call is enabled.

- Descendant user namespaces inherit the value of setgroups from
  their parents.

- A proc file is used (instead of a sysctl) as sysctls currently do
  not allow checking the permissions at open time.

- Writing to the proc file is restricted to before the gid_map
  for the user namespace is set.

  This ensures that disabling setgroups at a user namespace
  level will never remove the ability to call setgroups
  from a process that already has that ability.

  A process may opt in to the setgroups disable for itself by
  creating, entering and configuring a user namespace or by calling
  setns on an existing user namespace with setgroups disabled.
  Processes without privileges already can not call setgroups so this
  is a no-op.  Processes with privilege become processes without
  privilege when entering a user namespace and as with any other path
  to dropping privilege they would not have the ability to call
  setgroups.  So this remains within the bounds of what is possible
  without a knob to disable setgroups permanently in a user namespace.

Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Showing 4 changed files with 146 additions and 0 deletions Inline Diff

1 /* 1 /*
2 * linux/fs/proc/base.c 2 * linux/fs/proc/base.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * proc base directory handling functions 6 * proc base directory handling functions
7 * 7 *
8 * 1999, Al Viro. Rewritten. Now it covers the whole per-process part. 8 * 1999, Al Viro. Rewritten. Now it covers the whole per-process part.
9 * Instead of using magical inumbers to determine the kind of object 9 * Instead of using magical inumbers to determine the kind of object
10 * we allocate and fill in-core inodes upon lookup. They don't even 10 * we allocate and fill in-core inodes upon lookup. They don't even
11 * go into icache. We cache the reference to task_struct upon lookup too. 11 * go into icache. We cache the reference to task_struct upon lookup too.
12 * Eventually it should become a filesystem in its own. We don't use the 12 * Eventually it should become a filesystem in its own. We don't use the
13 * rest of procfs anymore. 13 * rest of procfs anymore.
14 * 14 *
15 * 15 *
16 * Changelog: 16 * Changelog:
17 * 17-Jan-2005 17 * 17-Jan-2005
18 * Allan Bezerra 18 * Allan Bezerra
19 * Bruna Moreira <bruna.moreira@indt.org.br> 19 * Bruna Moreira <bruna.moreira@indt.org.br>
20 * Edjard Mota <edjard.mota@indt.org.br> 20 * Edjard Mota <edjard.mota@indt.org.br>
21 * Ilias Biris <ilias.biris@indt.org.br> 21 * Ilias Biris <ilias.biris@indt.org.br>
22 * Mauricio Lin <mauricio.lin@indt.org.br> 22 * Mauricio Lin <mauricio.lin@indt.org.br>
23 * 23 *
24 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT 24 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
25 * 25 *
26 * A new process specific entry (smaps) included in /proc. It shows the 26 * A new process specific entry (smaps) included in /proc. It shows the
27 * size of rss for each memory area. The maps entry lacks information 27 * size of rss for each memory area. The maps entry lacks information
28 * about physical memory size (rss) for each mapped file, i.e., 28 * about physical memory size (rss) for each mapped file, i.e.,
29 * rss information for executables and library files. 29 * rss information for executables and library files.
30 * This additional information is useful for any tools that need to know 30 * This additional information is useful for any tools that need to know
31 * about physical memory consumption for a process specific library. 31 * about physical memory consumption for a process specific library.
32 * 32 *
33 * Changelog: 33 * Changelog:
34 * 21-Feb-2005 34 * 21-Feb-2005
35 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT 35 * Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
36 * Pud inclusion in the page table walking. 36 * Pud inclusion in the page table walking.
37 * 37 *
38 * ChangeLog: 38 * ChangeLog:
39 * 10-Mar-2005 39 * 10-Mar-2005
40 * 10LE Instituto Nokia de Tecnologia - INdT: 40 * 10LE Instituto Nokia de Tecnologia - INdT:
41 * A better way to walks through the page table as suggested by Hugh Dickins. 41 * A better way to walks through the page table as suggested by Hugh Dickins.
42 * 42 *
43 * Simo Piiroinen <simo.piiroinen@nokia.com>: 43 * Simo Piiroinen <simo.piiroinen@nokia.com>:
44 * Smaps information related to shared, private, clean and dirty pages. 44 * Smaps information related to shared, private, clean and dirty pages.
45 * 45 *
46 * Paul Mundt <paul.mundt@nokia.com>: 46 * Paul Mundt <paul.mundt@nokia.com>:
47 * Overall revision about smaps. 47 * Overall revision about smaps.
48 */ 48 */
49 49
50 #include <asm/uaccess.h> 50 #include <asm/uaccess.h>
51 51
52 #include <linux/errno.h> 52 #include <linux/errno.h>
53 #include <linux/time.h> 53 #include <linux/time.h>
54 #include <linux/proc_fs.h> 54 #include <linux/proc_fs.h>
55 #include <linux/stat.h> 55 #include <linux/stat.h>
56 #include <linux/task_io_accounting_ops.h> 56 #include <linux/task_io_accounting_ops.h>
57 #include <linux/init.h> 57 #include <linux/init.h>
58 #include <linux/capability.h> 58 #include <linux/capability.h>
59 #include <linux/file.h> 59 #include <linux/file.h>
60 #include <linux/fdtable.h> 60 #include <linux/fdtable.h>
61 #include <linux/string.h> 61 #include <linux/string.h>
62 #include <linux/seq_file.h> 62 #include <linux/seq_file.h>
63 #include <linux/namei.h> 63 #include <linux/namei.h>
64 #include <linux/mnt_namespace.h> 64 #include <linux/mnt_namespace.h>
65 #include <linux/mm.h> 65 #include <linux/mm.h>
66 #include <linux/swap.h> 66 #include <linux/swap.h>
67 #include <linux/rcupdate.h> 67 #include <linux/rcupdate.h>
68 #include <linux/kallsyms.h> 68 #include <linux/kallsyms.h>
69 #include <linux/stacktrace.h> 69 #include <linux/stacktrace.h>
70 #include <linux/resource.h> 70 #include <linux/resource.h>
71 #include <linux/module.h> 71 #include <linux/module.h>
72 #include <linux/mount.h> 72 #include <linux/mount.h>
73 #include <linux/security.h> 73 #include <linux/security.h>
74 #include <linux/ptrace.h> 74 #include <linux/ptrace.h>
75 #include <linux/tracehook.h> 75 #include <linux/tracehook.h>
76 #include <linux/printk.h> 76 #include <linux/printk.h>
77 #include <linux/cgroup.h> 77 #include <linux/cgroup.h>
78 #include <linux/cpuset.h> 78 #include <linux/cpuset.h>
79 #include <linux/audit.h> 79 #include <linux/audit.h>
80 #include <linux/poll.h> 80 #include <linux/poll.h>
81 #include <linux/nsproxy.h> 81 #include <linux/nsproxy.h>
82 #include <linux/oom.h> 82 #include <linux/oom.h>
83 #include <linux/elf.h> 83 #include <linux/elf.h>
84 #include <linux/pid_namespace.h> 84 #include <linux/pid_namespace.h>
85 #include <linux/user_namespace.h> 85 #include <linux/user_namespace.h>
86 #include <linux/fs_struct.h> 86 #include <linux/fs_struct.h>
87 #include <linux/slab.h> 87 #include <linux/slab.h>
88 #include <linux/flex_array.h> 88 #include <linux/flex_array.h>
89 #include <linux/posix-timers.h> 89 #include <linux/posix-timers.h>
90 #ifdef CONFIG_HARDWALL 90 #ifdef CONFIG_HARDWALL
91 #include <asm/hardwall.h> 91 #include <asm/hardwall.h>
92 #endif 92 #endif
93 #include <trace/events/oom.h> 93 #include <trace/events/oom.h>
94 #include "internal.h" 94 #include "internal.h"
95 #include "fd.h" 95 #include "fd.h"
96 96
97 /* NOTE: 97 /* NOTE:
98 * Implementing inode permission operations in /proc is almost 98 * Implementing inode permission operations in /proc is almost
99 * certainly an error. Permission checks need to happen during 99 * certainly an error. Permission checks need to happen during
100 * each system call not at open time. The reason is that most of 100 * each system call not at open time. The reason is that most of
101 * what we wish to check for permissions in /proc varies at runtime. 101 * what we wish to check for permissions in /proc varies at runtime.
102 * 102 *
103 * The classic example of a problem is opening file descriptors 103 * The classic example of a problem is opening file descriptors
104 * in /proc for a task before it execs a suid executable. 104 * in /proc for a task before it execs a suid executable.
105 */ 105 */
106 106
/* One entry in a /proc/<pid> directory table. */
struct pid_entry {
	const char *name;			/* file name within the pid directory */
	int len;				/* strlen(name), precomputed by NOD() */
	umode_t mode;				/* file type and permission bits */
	const struct inode_operations *iop;	/* inode ops, may be NULL */
	const struct file_operations *fop;	/* file ops, may be NULL */
	union proc_op op;			/* type-specific payload (show fn, link fn, ...) */
};
115 115
/*
 * Initializer helpers for pid_entry tables.  NOD() is the generic form;
 * DIR/LNK/REG/ONE specialize it for directories, symlinks, regular files
 * and single-value seq_file-backed files respectively.  The name length
 * is computed at compile time from the string literal.
 */
#define NOD(NAME, MODE, IOP, FOP, OP) {			\
	.name = (NAME),					\
	.len  = sizeof(NAME) - 1,			\
	.mode = MODE,					\
	.iop  = IOP,					\
	.fop  = FOP,					\
	.op   = OP,					\
}

/* Directory entry: MODE is OR-ed with S_IFDIR. */
#define DIR(NAME, MODE, iops, fops)				\
	NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
/* Symlink entry: resolved through proc_pid_link_inode_operations. */
#define LNK(NAME, get_link)					\
	NOD(NAME, (S_IFLNK|S_IRWXUGO),				\
		&proc_pid_link_inode_operations, NULL,		\
		{ .proc_get_link = get_link } )
/* Regular file entry with explicit file_operations. */
#define REG(NAME, MODE, fops)					\
	NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
/* One-value file backed by proc_single_file_operations + a show callback. */
#define ONE(NAME, MODE, show)					\
	NOD(NAME, (S_IFREG|(MODE)),				\
		NULL, &proc_single_file_operations,		\
		{ .proc_show = show } )
137 137
138 /* 138 /*
139 * Count the number of hardlinks for the pid_entry table, excluding the . 139 * Count the number of hardlinks for the pid_entry table, excluding the .
140 * and .. links. 140 * and .. links.
141 */ 141 */
142 static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, 142 static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
143 unsigned int n) 143 unsigned int n)
144 { 144 {
145 unsigned int i; 145 unsigned int i;
146 unsigned int count; 146 unsigned int count;
147 147
148 count = 0; 148 count = 0;
149 for (i = 0; i < n; ++i) { 149 for (i = 0; i < n; ++i) {
150 if (S_ISDIR(entries[i].mode)) 150 if (S_ISDIR(entries[i].mode))
151 ++count; 151 ++count;
152 } 152 }
153 153
154 return count; 154 return count;
155 } 155 }
156 156
/*
 * Copy @task's fs root into @root, taking a reference via get_fs_root().
 * Returns 0 on success or -ENOENT if the task has no fs_struct (e.g. it
 * is already exiting).  Caller must eventually path_put() on success.
 */
static int get_task_root(struct task_struct *task, struct path *root)
{
	int result = -ENOENT;

	task_lock(task);
	if (task->fs) {
		get_fs_root(task->fs, root);
		result = 0;
	}
	task_unlock(task);
	return result;
}
169 169
/*
 * Resolve /proc/<pid>/cwd: copy the task's current working directory
 * into @path.  Returns 0 on success, -ENOENT if the task is gone or
 * has no fs_struct.
 */
static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
	struct task_struct *task = get_proc_task(dentry->d_inode);
	int result = -ENOENT;

	if (task) {
		task_lock(task);
		if (task->fs) {
			get_fs_pwd(task->fs, path);
			result = 0;
		}
		task_unlock(task);
		put_task_struct(task);
	}
	return result;
}
186 186
/*
 * Resolve /proc/<pid>/root: copy the task's fs root into @path.
 * Returns 0 on success, -ENOENT if the task is gone or rootless.
 */
static int proc_root_link(struct dentry *dentry, struct path *path)
{
	struct task_struct *task = get_proc_task(dentry->d_inode);
	int result = -ENOENT;

	if (task) {
		result = get_task_root(task, path);
		put_task_struct(task);
	}
	return result;
}
198 198
/* /proc/<pid>/cmdline: dump the command line into the seq_file buffer. */
static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns,
			    struct pid *pid, struct task_struct *task)
{
	/*
	 * Rely on struct seq_operations::show() being called once
	 * per internal buffer allocation. See single_open(), traverse().
	 */
	BUG_ON(m->size < PAGE_SIZE);
	m->count += get_cmdline(task, m->buf, PAGE_SIZE);
	return 0;
}
210 210
/*
 * /proc/<pid>/auxv: dump the saved ELF auxiliary vector, up to and
 * including the terminating AT_NULL pair.  Requires ptrace-read access
 * to the target's mm (mm_access()).
 */
static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns,
			 struct pid *pid, struct task_struct *task)
{
	struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
	if (mm && !IS_ERR(mm)) {
		unsigned int nwords = 0;
		do {
			nwords += 2;	/* auxv entries are (type, value) pairs */
		} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
		seq_write(m, mm->saved_auxv, nwords * sizeof(mm->saved_auxv[0]));
		mmput(mm);
		return 0;
	} else
		/* NOTE(review): mm may be NULL here, in which case
		 * PTR_ERR(NULL) == 0 and we report success with empty
		 * output - presumably intentional for mm-less threads;
		 * confirm against mm_access() semantics. */
		return PTR_ERR(mm);
}
226 226
227 227
228 #ifdef CONFIG_KALLSYMS 228 #ifdef CONFIG_KALLSYMS
229 /* 229 /*
230 * Provides a wchan file via kallsyms in a proper one-value-per-file format. 230 * Provides a wchan file via kallsyms in a proper one-value-per-file format.
231 * Returns the resolved symbol. If that fails, simply return the address. 231 * Returns the resolved symbol. If that fails, simply return the address.
232 */ 232 */
233 static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, 233 static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
234 struct pid *pid, struct task_struct *task) 234 struct pid *pid, struct task_struct *task)
235 { 235 {
236 unsigned long wchan; 236 unsigned long wchan;
237 char symname[KSYM_NAME_LEN]; 237 char symname[KSYM_NAME_LEN];
238 238
239 wchan = get_wchan(task); 239 wchan = get_wchan(task);
240 240
241 if (lookup_symbol_name(wchan, symname) < 0) 241 if (lookup_symbol_name(wchan, symname) < 0)
242 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 242 if (!ptrace_may_access(task, PTRACE_MODE_READ))
243 return 0; 243 return 0;
244 else 244 else
245 return seq_printf(m, "%lu", wchan); 245 return seq_printf(m, "%lu", wchan);
246 else 246 else
247 return seq_printf(m, "%s", symname); 247 return seq_printf(m, "%s", symname);
248 } 248 }
249 #endif /* CONFIG_KALLSYMS */ 249 #endif /* CONFIG_KALLSYMS */
250 250
/*
 * Serialize against concurrent exec/cred changes and verify the caller
 * may ptrace-attach to @task.  Returns 0 with cred_guard_mutex held
 * (pair with unlock_trace()), or a negative errno with it released.
 */
static int lock_trace(struct task_struct *task)
{
	int err = mutex_lock_killable(&task->signal->cred_guard_mutex);
	if (err)
		return err;
	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
		mutex_unlock(&task->signal->cred_guard_mutex);
		return -EPERM;
	}
	return 0;
}
262 262
/* Drop the mutex taken by a successful lock_trace(). */
static void unlock_trace(struct task_struct *task)
{
	mutex_unlock(&task->signal->cred_guard_mutex);
}
267 267
268 #ifdef CONFIG_STACKTRACE 268 #ifdef CONFIG_STACKTRACE
269 269
270 #define MAX_STACK_TRACE_DEPTH 64 270 #define MAX_STACK_TRACE_DEPTH 64
271 271
/*
 * /proc/<pid>/stack: print the task's kernel stack trace, one
 * "[<addr>] symbol" line per frame, capped at MAX_STACK_TRACE_DEPTH.
 * Requires ptrace-attach rights (enforced by lock_trace()).
 */
static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
			  struct pid *pid, struct task_struct *task)
{
	struct stack_trace trace;
	unsigned long *entries;
	int err;
	int i;

	entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
	if (!entries)
		return -ENOMEM;

	trace.nr_entries = 0;
	trace.max_entries = MAX_STACK_TRACE_DEPTH;
	trace.entries = entries;
	trace.skip = 0;

	err = lock_trace(task);
	if (!err) {
		save_stack_trace_tsk(task, &trace);

		/* %pK hides addresses from unprivileged readers per kptr_restrict. */
		for (i = 0; i < trace.nr_entries; i++) {
			seq_printf(m, "[<%pK>] %pS\n",
				   (void *)entries[i], (void *)entries[i]);
		}
		unlock_trace(task);
	}
	kfree(entries);

	return err;
}
303 #endif 303 #endif
304 304
305 #ifdef CONFIG_SCHEDSTATS 305 #ifdef CONFIG_SCHEDSTATS
/*
 * Provides /proc/PID/schedstat: cumulative runtime, run-queue delay,
 * and number of timeslices on one line.
 */
static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
			      struct pid *pid, struct task_struct *task)
{
	return seq_printf(m, "%llu %llu %lu\n",
			  (unsigned long long)task->se.sum_exec_runtime,
			  (unsigned long long)task->sched_info.run_delay,
			  task->sched_info.pcount);
}
317 #endif 317 #endif
318 318
319 #ifdef CONFIG_LATENCYTOP 319 #ifdef CONFIG_LATENCYTOP
/*
 * seq_file show for /proc/<pid>/latency: one line per populated latency
 * record (count, total time, max), followed by its backtrace symbols.
 */
static int lstats_show_proc(struct seq_file *m, void *v)
{
	int i;
	struct inode *inode = m->private;
	struct task_struct *task = get_proc_task(inode);

	if (!task)
		return -ESRCH;
	seq_puts(m, "Latency Top version : v0.1\n");
	/* task->latency_record is scanned over 32 fixed slots. */
	for (i = 0; i < 32; i++) {
		struct latency_record *lr = &task->latency_record[i];
		if (lr->backtrace[0]) {
			int q;
			seq_printf(m, "%i %li %li",
				   lr->count, lr->time, lr->max);
			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
				unsigned long bt = lr->backtrace[q];
				/* 0 terminates; ULONG_MAX marks a truncated trace. */
				if (!bt)
					break;
				if (bt == ULONG_MAX)
					break;
				seq_printf(m, " %ps", (void *)bt);
			}
			seq_putc(m, '\n');
		}

	}
	put_task_struct(task);
	return 0;
}
350 350
/* open() for /proc/<pid>/latency: single_open with the inode as private data. */
static int lstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, lstats_show_proc, inode);
}
355 355
/*
 * write() for /proc/<pid>/latency: any write clears the task's latency
 * records (the data itself is ignored).  Returns @count or -ESRCH.
 */
static ssize_t lstats_write(struct file *file, const char __user *buf,
			    size_t count, loff_t *offs)
{
	struct task_struct *task = get_proc_task(file_inode(file));

	if (!task)
		return -ESRCH;
	clear_all_latency_tracing(task);
	put_task_struct(task);

	return count;
}
368 368
/* file_operations for /proc/<pid>/latency (CONFIG_LATENCYTOP). */
static const struct file_operations proc_lstats_operations = {
	.open		= lstats_open,
	.read		= seq_read,
	.write		= lstats_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};
376 376
377 #endif 377 #endif
378 378
/*
 * /proc/<pid>/oom_score: the task's OOM badness normalized to 0..1000
 * (per-mille of total RAM + swap).  0 if the task is already dead.
 */
static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
			  struct pid *pid, struct task_struct *task)
{
	unsigned long totalpages = totalram_pages + total_swap_pages;
	unsigned long points = 0;

	/* tasklist_lock keeps the task's thread group stable for oom_badness(). */
	read_lock(&tasklist_lock);
	if (pid_alive(task))
		points = oom_badness(task, NULL, NULL, totalpages) *
			 1000 / totalpages;
	read_unlock(&tasklist_lock);
	return seq_printf(m, "%lu\n", points);
}
392 392
/* Display name and unit string for one rlimit row in /proc/<pid>/limits. */
struct limit_names {
	const char *name;	/* human-readable limit name */
	const char *unit;	/* unit label, or NULL for unitless limits */
};
397 397
/* Table of names/units indexed by RLIMIT_* constants. */
static const struct limit_names lnames[RLIM_NLIMITS] = {
	[RLIMIT_CPU] = {"Max cpu time", "seconds"},
	[RLIMIT_FSIZE] = {"Max file size", "bytes"},
	[RLIMIT_DATA] = {"Max data size", "bytes"},
	[RLIMIT_STACK] = {"Max stack size", "bytes"},
	[RLIMIT_CORE] = {"Max core file size", "bytes"},
	[RLIMIT_RSS] = {"Max resident set", "bytes"},
	[RLIMIT_NPROC] = {"Max processes", "processes"},
	[RLIMIT_NOFILE] = {"Max open files", "files"},
	[RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
	[RLIMIT_AS] = {"Max address space", "bytes"},
	[RLIMIT_LOCKS] = {"Max file locks", "locks"},
	[RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
	[RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
	[RLIMIT_NICE] = {"Max nice priority", NULL},
	[RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
	[RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
};
416 416
/* Display limits for a process */
static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns,
			   struct pid *pid, struct task_struct *task)
{
	unsigned int i;
	unsigned long flags;

	struct rlimit rlim[RLIM_NLIMITS];

	/* Snapshot all rlimits under sighand lock, then format unlocked. */
	if (!lock_task_sighand(task, &flags))
		return 0;
	memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
	unlock_task_sighand(task, &flags);

	/*
	 * print the file header
	 */
	seq_printf(m, "%-25s %-20s %-20s %-10s\n",
		   "Limit", "Soft Limit", "Hard Limit", "Units");

	for (i = 0; i < RLIM_NLIMITS; i++) {
		if (rlim[i].rlim_cur == RLIM_INFINITY)
			seq_printf(m, "%-25s %-20s ",
				   lnames[i].name, "unlimited");
		else
			seq_printf(m, "%-25s %-20lu ",
				   lnames[i].name, rlim[i].rlim_cur);

		if (rlim[i].rlim_max == RLIM_INFINITY)
			seq_printf(m, "%-20s ", "unlimited");
		else
			seq_printf(m, "%-20lu ", rlim[i].rlim_max);

		/* Unit column is omitted for unitless limits (NULL in lnames). */
		if (lnames[i].unit)
			seq_printf(m, "%-10s\n", lnames[i].unit);
		else
			seq_putc(m, '\n');
	}

	return 0;
}
458 458
459 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK 459 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
/*
 * /proc/<pid>/syscall: report the syscall the task is blocked in
 * ("running" if none), its six arguments, stack pointer and program
 * counter.  Requires ptrace-attach rights (lock_trace()).
 */
static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns,
			    struct pid *pid, struct task_struct *task)
{
	long nr;
	unsigned long args[6], sp, pc;
	int res = lock_trace(task);
	if (res)
		return res;

	if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
		seq_puts(m, "running\n");
	else if (nr < 0)
		/* Negative nr: not in a syscall; print only sp and pc. */
		seq_printf(m, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
	else
		seq_printf(m,
			   "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
			   nr,
			   args[0], args[1], args[2], args[3], args[4], args[5],
			   sp, pc);
	unlock_trace(task);
	return res;
}
482 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ 482 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
483 483
484 /************************************************************************/ 484 /************************************************************************/
485 /* Here the fs part begins */ 485 /* Here the fs part begins */
486 /************************************************************************/ 486 /************************************************************************/
487 487
/* permission checks */
/*
 * Returns non-zero if the caller may access the task's file descriptor
 * information via /proc; zero otherwise (including when the task is gone).
 */
static int proc_fd_access_allowed(struct inode *inode)
{
	struct task_struct *task;
	int allowed = 0;
	/* Allow access to a task's file descriptors if it is us or we
	 * may use ptrace attach to the process and find out that
	 * information.
	 */
	task = get_proc_task(inode);
	if (task) {
		allowed = ptrace_may_access(task, PTRACE_MODE_READ);
		put_task_struct(task);
	}
	return allowed;
}
504 504
/*
 * Generic setattr for /proc inodes: allow ordinary attribute changes
 * but forbid chmod (ATTR_MODE), since /proc file modes are fixed by
 * the pid_entry tables.  Returns 0 or a negative errno.
 */
int proc_setattr(struct dentry *dentry, struct iattr *attr)
{
	int error;
	struct inode *inode = dentry->d_inode;

	if (attr->ia_valid & ATTR_MODE)
		return -EPERM;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	setattr_copy(inode, attr);
	mark_inode_dirty(inode);
	return 0;
}
521 521
/*
 * May current process learn task's sched/cmdline info (for hide_pid_min=1)
 * or euid/egid (for hide_pid_min=2)?
 */
static bool has_pid_permissions(struct pid_namespace *pid,
				struct task_struct *task,
				int hide_pid_min)
{
	/* hide_pid below the threshold means this info is not hidden at all. */
	if (pid->hide_pid < hide_pid_min)
		return true;
	/* Members of the procfs gid= mount group are always allowed. */
	if (in_group_p(pid->pid_gid))
		return true;
	return ptrace_may_access(task, PTRACE_MODE_READ);
}
536 536
537 537
/*
 * inode permission hook for /proc/<pid> directories, implementing the
 * hidepid mount option on top of the normal mode-bit check.
 */
static int proc_pid_permission(struct inode *inode, int mask)
{
	struct pid_namespace *pid = inode->i_sb->s_fs_info;
	struct task_struct *task;
	bool has_perms;

	task = get_proc_task(inode);
	if (!task)
		return -ESRCH;
	has_perms = has_pid_permissions(pid, task, 1);
	put_task_struct(task);

	if (!has_perms) {
		if (pid->hide_pid == 2) {
			/*
			 * Let's make getdents(), stat(), and open()
			 * consistent with each other. If a process
			 * may not stat() a file, it shouldn't be seen
			 * in procfs at all.
			 */
			return -ENOENT;
		}

		return -EPERM;
	}
	return generic_permission(inode, mask);
}
565 565
566 566
567 567
/* Default inode_operations for /proc entries: only setattr is overridden. */
static const struct inode_operations proc_def_inode_operations = {
	.setattr	= proc_setattr,
};
571 571
572 static int proc_single_show(struct seq_file *m, void *v) 572 static int proc_single_show(struct seq_file *m, void *v)
573 { 573 {
574 struct inode *inode = m->private; 574 struct inode *inode = m->private;
575 struct pid_namespace *ns; 575 struct pid_namespace *ns;
576 struct pid *pid; 576 struct pid *pid;
577 struct task_struct *task; 577 struct task_struct *task;
578 int ret; 578 int ret;
579 579
580 ns = inode->i_sb->s_fs_info; 580 ns = inode->i_sb->s_fs_info;
581 pid = proc_pid(inode); 581 pid = proc_pid(inode);
582 task = get_pid_task(pid, PIDTYPE_PID); 582 task = get_pid_task(pid, PIDTYPE_PID);
583 if (!task) 583 if (!task)
584 return -ESRCH; 584 return -ESRCH;
585 585
586 ret = PROC_I(inode)->op.proc_show(m, ns, pid, task); 586 ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
587 587
588 put_task_struct(task); 588 put_task_struct(task);
589 return ret; 589 return ret;
590 } 590 }
591 591
/* Open helper wiring proc_single_show into the seq_file machinery. */
static int proc_single_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, proc_single_show, inode);
}
596 596
/* file_operations shared by all single-value seq_file /proc entries. */
static const struct file_operations proc_single_file_operations = {
	.open		= proc_single_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
603 603
604 604
/*
 * Resolve the task behind a /proc inode and obtain its mm_struct after an
 * mm_access() permission check in @mode.
 *
 * Returns ERR_PTR(-ESRCH) if the task is gone, an ERR_PTR propagated from
 * mm_access() on permission failure, NULL when mm_access() yields no mm,
 * or an mm pinned via mm_count (the structure is kept alive, but its
 * address space memory is not — note the mmput() below).
 */
struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode)
{
	struct task_struct *task = get_proc_task(inode);
	struct mm_struct *mm = ERR_PTR(-ESRCH);

	if (task) {
		mm = mm_access(task, mode);
		put_task_struct(task);

		if (!IS_ERR_OR_NULL(mm)) {
			/* ensure this mm_struct can't be freed */
			atomic_inc(&mm->mm_count);
			/* but do not pin its memory */
			mmput(mm);
		}
	}

	return mm;
}
624 624
/*
 * Common open path for mem/environ files: stash the permission-checked
 * mm (possibly NULL) in file->private_data for later reads/writes.
 */
static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
{
	struct mm_struct *mm = proc_mem_open(inode, mode);

	if (IS_ERR(mm))
		return PTR_ERR(mm);

	file->private_data = mm;
	return 0;
}
635 635
/* Open /proc/<pid>/mem with full ptrace-attach level access checking. */
static int mem_open(struct inode *inode, struct file *file)
{
	int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);

	/* OK to pass negative loff_t, we can catch out-of-range */
	file->f_mode |= FMODE_UNSIGNED_OFFSET;

	return ret;
}
645 645
/*
 * Common read/write engine for /proc/<pid>/mem: shuttle up to @count bytes
 * between @buf and the target address space at *ppos, one page at a time
 * through a kernel bounce page.
 *
 * Returns bytes transferred, 0 if there is no mm, or a negative errno.
 * Partial progress takes precedence over a later access failure.
 */
static ssize_t mem_rw(struct file *file, char __user *buf,
			size_t count, loff_t *ppos, int write)
{
	struct mm_struct *mm = file->private_data;
	unsigned long addr = *ppos;
	ssize_t copied;
	char *page;

	if (!mm)
		return 0;

	/* Bounce buffer between userspace and the remote mm. */
	page = (char *)__get_free_page(GFP_TEMPORARY);
	if (!page)
		return -ENOMEM;

	copied = 0;
	/* open() only pinned mm_count; take a real user ref or bail out. */
	if (!atomic_inc_not_zero(&mm->mm_users))
		goto free;

	while (count > 0) {
		int this_len = min_t(int, count, PAGE_SIZE);

		if (write && copy_from_user(page, buf, this_len)) {
			copied = -EFAULT;
			break;
		}

		this_len = access_remote_vm(mm, addr, page, this_len, write);
		if (!this_len) {
			/* Report -EIO only when nothing was transferred. */
			if (!copied)
				copied = -EIO;
			break;
		}

		if (!write && copy_to_user(buf, page, this_len)) {
			copied = -EFAULT;
			break;
		}

		buf += this_len;
		addr += this_len;
		copied += this_len;
		count -= this_len;
	}
	/* Advance the file position by however far we actually got. */
	*ppos = addr;

	mmput(mm);
free:
	free_page((unsigned long) page);
	return copied;
}
697 697
/* read() entry for /proc/<pid>/mem — thin wrapper over mem_rw(). */
static ssize_t mem_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	return mem_rw(file, buf, count, ppos, 0);
}
703 703
/* write() entry for /proc/<pid>/mem — casts away const for the shared path. */
static ssize_t mem_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos)
{
	return mem_rw(file, (char __user*)buf, count, ppos, 1);
}
709 709
710 loff_t mem_lseek(struct file *file, loff_t offset, int orig) 710 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
711 { 711 {
712 switch (orig) { 712 switch (orig) {
713 case 0: 713 case 0:
714 file->f_pos = offset; 714 file->f_pos = offset;
715 break; 715 break;
716 case 1: 716 case 1:
717 file->f_pos += offset; 717 file->f_pos += offset;
718 break; 718 break;
719 default: 719 default:
720 return -EINVAL; 720 return -EINVAL;
721 } 721 }
722 force_successful_syscall_return(); 722 force_successful_syscall_return();
723 return file->f_pos; 723 return file->f_pos;
724 } 724 }
725 725
/* Drop the mm_count reference taken by proc_mem_open() at open time. */
static int mem_release(struct inode *inode, struct file *file)
{
	struct mm_struct *mm = file->private_data;
	/* private_data may be NULL if the target had no mm at open. */
	if (mm)
		mmdrop(mm);
	return 0;
}
733 733
/* file_operations for /proc/<pid>/mem. */
static const struct file_operations proc_mem_operations = {
	.llseek		= mem_lseek,
	.read		= mem_read,
	.write		= mem_write,
	.open		= mem_open,
	.release	= mem_release,
};
741 741
/* Open /proc/<pid>/environ; read-level ptrace access is sufficient. */
static int environ_open(struct inode *inode, struct file *file)
{
	return __mem_open(inode, file, PTRACE_MODE_READ);
}
746 746
/*
 * Read the target's environment block.  *ppos is an offset relative to
 * mm->env_start; reads are clamped to the [env_start, env_end) window
 * and copied out through a kernel bounce page.
 */
static ssize_t environ_read(struct file *file, char __user *buf,
			size_t count, loff_t *ppos)
{
	char *page;
	unsigned long src = *ppos;
	int ret = 0;
	struct mm_struct *mm = file->private_data;

	if (!mm)
		return 0;

	page = (char *)__get_free_page(GFP_TEMPORARY);
	if (!page)
		return -ENOMEM;

	ret = 0;
	/* open() only pinned mm_count; take a real user ref or bail out. */
	if (!atomic_inc_not_zero(&mm->mm_users))
		goto free;
	while (count > 0) {
		size_t this_len, max_len;
		int retval;

		/* EOF once the offset passes the end of the environment. */
		if (src >= (mm->env_end - mm->env_start))
			break;

		this_len = mm->env_end - (mm->env_start + src);

		/* Transfer at most one page, and no more than requested. */
		max_len = min_t(size_t, PAGE_SIZE, count);
		this_len = min(max_len, this_len);

		retval = access_remote_vm(mm, (mm->env_start + src),
			page, this_len, 0);

		if (retval <= 0) {
			ret = retval;
			break;
		}

		if (copy_to_user(buf, page, retval)) {
			ret = -EFAULT;
			break;
		}

		ret += retval;
		src += retval;
		buf += retval;
		count -= retval;
	}
	*ppos = src;
	mmput(mm);

free:
	free_page((unsigned long) page);
	return ret;
}
802 802
/* file_operations for /proc/<pid>/environ. */
static const struct file_operations proc_environ_operations = {
	.open		= environ_open,
	.read		= environ_read,
	.llseek		= generic_file_llseek,
	.release	= mem_release,
};
809 809
/*
 * Read the legacy /proc/<pid>/oom_adj value, derived by scaling the
 * task's oom_score_adj back into the old [-17, 15] oom_adj range.
 */
static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[PROC_NUMBUF];
	int oom_adj = OOM_ADJUST_MIN;
	size_t len;
	unsigned long flags;

	if (!task)
		return -ESRCH;
	/* signal->oom_score_adj is protected by the sighand lock. */
	if (lock_task_sighand(task, &flags)) {
		if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
			oom_adj = OOM_ADJUST_MAX;
		else
			oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
				  OOM_SCORE_ADJ_MAX;
		unlock_task_sighand(task, &flags);
	}
	put_task_struct(task);
	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
	return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
833 833
/*
 * Write handler for the legacy /proc/<pid>/oom_adj knob.  The value is
 * parsed, range-checked against [OOM_ADJUST_MIN, OOM_ADJUST_MAX] (or
 * OOM_DISABLE), scaled into oom_score_adj units and stored.  Lowering
 * another task's score requires CAP_SYS_RESOURCE.
 */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF];
	int oom_adj;
	unsigned long flags;
	int err;

	memset(buffer, 0, sizeof(buffer));
	/* Clamp so the copied bytes always leave room for the NUL. */
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count)) {
		err = -EFAULT;
		goto out;
	}

	err = kstrtoint(strstrip(buffer), 0, &oom_adj);
	if (err)
		goto out;
	if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
	     oom_adj != OOM_DISABLE) {
		err = -EINVAL;
		goto out;
	}

	task = get_proc_task(file_inode(file));
	if (!task) {
		err = -ESRCH;
		goto out;
	}

	/* task_lock() stabilizes task->mm; kernel threads are rejected. */
	task_lock(task);
	if (!task->mm) {
		err = -EINVAL;
		goto err_task_lock;
	}

	if (!lock_task_sighand(task, &flags)) {
		err = -ESRCH;
		goto err_task_lock;
	}

	/*
	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
	 * value is always attainable.
	 */
	if (oom_adj == OOM_ADJUST_MAX)
		oom_adj = OOM_SCORE_ADJ_MAX;
	else
		oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;

	if (oom_adj < task->signal->oom_score_adj &&
	    !capable(CAP_SYS_RESOURCE)) {
		err = -EACCES;
		goto err_sighand;
	}

	/*
	 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
	 * /proc/pid/oom_score_adj instead.
	 */
	pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
		     current->comm, task_pid_nr(current), task_pid_nr(task),
		     task_pid_nr(task));

	task->signal->oom_score_adj = oom_adj;
	trace_oom_score_adj_update(task);
err_sighand:
	unlock_task_sighand(task, &flags);
err_task_lock:
	task_unlock(task);
	put_task_struct(task);
out:
	/* err == 0 means success: report the whole count as consumed. */
	return err < 0 ? err : count;
}
910 910
/* file_operations for the legacy /proc/<pid>/oom_adj. */
static const struct file_operations proc_oom_adj_operations = {
	.read		= oom_adj_read,
	.write		= oom_adj_write,
	.llseek		= generic_file_llseek,
};
916 916
/* Read /proc/<pid>/oom_score_adj under the sighand lock. */
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
					size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[PROC_NUMBUF];
	/* Default reported if the sighand lock cannot be taken (task exiting). */
	short oom_score_adj = OOM_SCORE_ADJ_MIN;
	unsigned long flags;
	size_t len;

	if (!task)
		return -ESRCH;
	if (lock_task_sighand(task, &flags)) {
		oom_score_adj = task->signal->oom_score_adj;
		unlock_task_sighand(task, &flags);
	}
	put_task_struct(task);
	len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
	return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
936 936
/*
 * Write /proc/<pid>/oom_score_adj: parse, range-check against
 * [OOM_SCORE_ADJ_MIN, OOM_SCORE_ADJ_MAX] and store.  Lowering another
 * task's value below its oom_score_adj_min requires CAP_SYS_RESOURCE;
 * privileged writers also move the floor itself.
 */
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
					size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF];
	unsigned long flags;
	int oom_score_adj;
	int err;

	memset(buffer, 0, sizeof(buffer));
	/* Clamp so the copied bytes always leave room for the NUL. */
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count)) {
		err = -EFAULT;
		goto out;
	}

	err = kstrtoint(strstrip(buffer), 0, &oom_score_adj);
	if (err)
		goto out;
	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
			oom_score_adj > OOM_SCORE_ADJ_MAX) {
		err = -EINVAL;
		goto out;
	}

	task = get_proc_task(file_inode(file));
	if (!task) {
		err = -ESRCH;
		goto out;
	}

	/* task_lock() stabilizes task->mm; kernel threads are rejected. */
	task_lock(task);
	if (!task->mm) {
		err = -EINVAL;
		goto err_task_lock;
	}

	if (!lock_task_sighand(task, &flags)) {
		err = -ESRCH;
		goto err_task_lock;
	}

	if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
			!capable(CAP_SYS_RESOURCE)) {
		err = -EACCES;
		goto err_sighand;
	}

	task->signal->oom_score_adj = (short)oom_score_adj;
	if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
		task->signal->oom_score_adj_min = (short)oom_score_adj;
	trace_oom_score_adj_update(task);

err_sighand:
	unlock_task_sighand(task, &flags);
err_task_lock:
	task_unlock(task);
	put_task_struct(task);
out:
	/* err == 0 means success: report the whole count as consumed. */
	return err < 0 ? err : count;
}
999 999
/* file_operations for /proc/<pid>/oom_score_adj. */
static const struct file_operations proc_oom_score_adj_operations = {
	.read		= oom_score_adj_read,
	.write		= oom_score_adj_write,
	.llseek		= default_llseek,
};
1005 1005
1006 #ifdef CONFIG_AUDITSYSCALL 1006 #ifdef CONFIG_AUDITSYSCALL
1007 #define TMPBUFLEN 21 1007 #define TMPBUFLEN 21
/*
 * Read /proc/<pid>/loginuid, translated into the opener's user
 * namespace (file->f_cred captured at open time).
 */
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
				  size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	struct task_struct *task = get_proc_task(inode);
	ssize_t length;
	char tmpbuf[TMPBUFLEN];

	if (!task)
		return -ESRCH;
	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
			   from_kuid(file->f_cred->user_ns,
				     audit_get_loginuid(task)));
	put_task_struct(task);
	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}
1024 1024
/*
 * Write /proc/<pid>/loginuid.  Only the task itself may set its own
 * loginuid; partial writes are rejected.  The special value
 * AUDIT_UID_UNSET maps to INVALID_UID (explicit unset); anything else
 * must be a uid valid in the opener's user namespace.
 */
static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
				   size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	char *page, *tmp;
	ssize_t length;
	uid_t loginuid;
	kuid_t kloginuid;

	/* Only the task the file refers to may write its loginuid. */
	rcu_read_lock();
	if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
		rcu_read_unlock();
		return -EPERM;
	}
	rcu_read_unlock();

	/* Leave room for the terminating NUL added below. */
	if (count >= PAGE_SIZE)
		count = PAGE_SIZE - 1;

	if (*ppos != 0) {
		/* No partial writes. */
		return -EINVAL;
	}
	page = (char*)__get_free_page(GFP_TEMPORARY);
	if (!page)
		return -ENOMEM;
	length = -EFAULT;
	if (copy_from_user(page, buf, count))
		goto out_free_page;

	page[count] = '\0';
	loginuid = simple_strtoul(page, &tmp, 10);
	if (tmp == page) {
		/* No digits were consumed: not a number. */
		length = -EINVAL;
		goto out_free_page;

	}

	/* is userspace tring to explicitly UNSET the loginuid? */
	if (loginuid == AUDIT_UID_UNSET) {
		kloginuid = INVALID_UID;
	} else {
		kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
		if (!uid_valid(kloginuid)) {
			length = -EINVAL;
			goto out_free_page;
		}
	}

	length = audit_set_loginuid(kloginuid);
	if (likely(length == 0))
		length = count;

out_free_page:
	free_page((unsigned long) page);
	return length;
}
1082 1082
/* file_operations for /proc/<pid>/loginuid (CONFIG_AUDITSYSCALL). */
static const struct file_operations proc_loginuid_operations = {
	.read		= proc_loginuid_read,
	.write		= proc_loginuid_write,
	.llseek		= generic_file_llseek,
};
1088 1088
/* Read-only /proc/<pid>/sessionid: the task's audit session id. */
static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
				   size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	struct task_struct *task = get_proc_task(inode);
	ssize_t length;
	char tmpbuf[TMPBUFLEN];

	if (!task)
		return -ESRCH;
	length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
			   audit_get_sessionid(task));
	put_task_struct(task);
	return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}
1104 1104
/* file_operations for /proc/<pid>/sessionid (read-only). */
static const struct file_operations proc_sessionid_operations = {
	.read		= proc_sessionid_read,
	.llseek		= generic_file_llseek,
};
1109 #endif 1109 #endif
1110 1110
1111 #ifdef CONFIG_FAULT_INJECTION 1111 #ifdef CONFIG_FAULT_INJECTION
/* Report the task's fault-injection flag (/proc/<pid>/make-it-fail). */
static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
				      size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	char buffer[PROC_NUMBUF];
	size_t len;
	int make_it_fail;

	if (!task)
		return -ESRCH;
	make_it_fail = task->make_it_fail;
	put_task_struct(task);

	len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);

	return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
1129 1129
/*
 * Set the task's fault-injection flag.  Requires CAP_SYS_RESOURCE and
 * accepts only the values 0 and 1.
 */
static ssize_t proc_fault_inject_write(struct file * file,
			const char __user * buf, size_t count, loff_t *ppos)
{
	struct task_struct *task;
	char buffer[PROC_NUMBUF], *end;
	int make_it_fail;

	if (!capable(CAP_SYS_RESOURCE))
		return -EPERM;
	memset(buffer, 0, sizeof(buffer));
	/* Clamp so the copied bytes always leave room for the NUL. */
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		return -EFAULT;
	make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
	/* Reject trailing garbage after the number. */
	if (*end)
		return -EINVAL;
	if (make_it_fail < 0 || make_it_fail > 1)
		return -EINVAL;

	task = get_proc_task(file_inode(file));
	if (!task)
		return -ESRCH;
	task->make_it_fail = make_it_fail;
	put_task_struct(task);

	return count;
}
1158 1158
/* file_operations for /proc/<pid>/make-it-fail (CONFIG_FAULT_INJECTION). */
static const struct file_operations proc_fault_inject_operations = {
	.read		= proc_fault_inject_read,
	.write		= proc_fault_inject_write,
	.llseek		= generic_file_llseek,
};
1164 #endif 1164 #endif
1165 1165
1166 1166
1167 #ifdef CONFIG_SCHED_DEBUG 1167 #ifdef CONFIG_SCHED_DEBUG
1168 /* 1168 /*
1169 * Print out various scheduling related per-task fields: 1169 * Print out various scheduling related per-task fields:
1170 */ 1170 */
1171 static int sched_show(struct seq_file *m, void *v) 1171 static int sched_show(struct seq_file *m, void *v)
1172 { 1172 {
1173 struct inode *inode = m->private; 1173 struct inode *inode = m->private;
1174 struct task_struct *p; 1174 struct task_struct *p;
1175 1175
1176 p = get_proc_task(inode); 1176 p = get_proc_task(inode);
1177 if (!p) 1177 if (!p)
1178 return -ESRCH; 1178 return -ESRCH;
1179 proc_sched_show_task(p, m); 1179 proc_sched_show_task(p, m);
1180 1180
1181 put_task_struct(p); 1181 put_task_struct(p);
1182 1182
1183 return 0; 1183 return 0;
1184 } 1184 }
1185 1185
1186 static ssize_t 1186 static ssize_t
1187 sched_write(struct file *file, const char __user *buf, 1187 sched_write(struct file *file, const char __user *buf,
1188 size_t count, loff_t *offset) 1188 size_t count, loff_t *offset)
1189 { 1189 {
1190 struct inode *inode = file_inode(file); 1190 struct inode *inode = file_inode(file);
1191 struct task_struct *p; 1191 struct task_struct *p;
1192 1192
1193 p = get_proc_task(inode); 1193 p = get_proc_task(inode);
1194 if (!p) 1194 if (!p)
1195 return -ESRCH; 1195 return -ESRCH;
1196 proc_sched_set_task(p); 1196 proc_sched_set_task(p);
1197 1197
1198 put_task_struct(p); 1198 put_task_struct(p);
1199 1199
1200 return count; 1200 return count;
1201 } 1201 }
1202 1202
/* Open /proc/<pid>/sched: hook sched_show() up through single_open(). */
static int sched_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_show, inode);
}
1207 1207
/* File operations for /proc/<pid>/sched (CONFIG_SCHED_DEBUG only). */
static const struct file_operations proc_pid_sched_operations = {
	.open		= sched_open,
	.read		= seq_read,
	.write		= sched_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};
1215 1215
1216 #endif 1216 #endif
1217 1217
1218 #ifdef CONFIG_SCHED_AUTOGROUP 1218 #ifdef CONFIG_SCHED_AUTOGROUP
1219 /* 1219 /*
1220 * Print out autogroup related information: 1220 * Print out autogroup related information:
1221 */ 1221 */
1222 static int sched_autogroup_show(struct seq_file *m, void *v) 1222 static int sched_autogroup_show(struct seq_file *m, void *v)
1223 { 1223 {
1224 struct inode *inode = m->private; 1224 struct inode *inode = m->private;
1225 struct task_struct *p; 1225 struct task_struct *p;
1226 1226
1227 p = get_proc_task(inode); 1227 p = get_proc_task(inode);
1228 if (!p) 1228 if (!p)
1229 return -ESRCH; 1229 return -ESRCH;
1230 proc_sched_autogroup_show_task(p, m); 1230 proc_sched_autogroup_show_task(p, m);
1231 1231
1232 put_task_struct(p); 1232 put_task_struct(p);
1233 1233
1234 return 0; 1234 return 0;
1235 } 1235 }
1236 1236
1237 static ssize_t 1237 static ssize_t
1238 sched_autogroup_write(struct file *file, const char __user *buf, 1238 sched_autogroup_write(struct file *file, const char __user *buf,
1239 size_t count, loff_t *offset) 1239 size_t count, loff_t *offset)
1240 { 1240 {
1241 struct inode *inode = file_inode(file); 1241 struct inode *inode = file_inode(file);
1242 struct task_struct *p; 1242 struct task_struct *p;
1243 char buffer[PROC_NUMBUF]; 1243 char buffer[PROC_NUMBUF];
1244 int nice; 1244 int nice;
1245 int err; 1245 int err;
1246 1246
1247 memset(buffer, 0, sizeof(buffer)); 1247 memset(buffer, 0, sizeof(buffer));
1248 if (count > sizeof(buffer) - 1) 1248 if (count > sizeof(buffer) - 1)
1249 count = sizeof(buffer) - 1; 1249 count = sizeof(buffer) - 1;
1250 if (copy_from_user(buffer, buf, count)) 1250 if (copy_from_user(buffer, buf, count))
1251 return -EFAULT; 1251 return -EFAULT;
1252 1252
1253 err = kstrtoint(strstrip(buffer), 0, &nice); 1253 err = kstrtoint(strstrip(buffer), 0, &nice);
1254 if (err < 0) 1254 if (err < 0)
1255 return err; 1255 return err;
1256 1256
1257 p = get_proc_task(inode); 1257 p = get_proc_task(inode);
1258 if (!p) 1258 if (!p)
1259 return -ESRCH; 1259 return -ESRCH;
1260 1260
1261 err = proc_sched_autogroup_set_nice(p, nice); 1261 err = proc_sched_autogroup_set_nice(p, nice);
1262 if (err) 1262 if (err)
1263 count = err; 1263 count = err;
1264 1264
1265 put_task_struct(p); 1265 put_task_struct(p);
1266 1266
1267 return count; 1267 return count;
1268 } 1268 }
1269 1269
/*
 * Open /proc/<pid>/autogroup.  single_open() is deliberately passed
 * NULL so that the inode can be stashed in m->private afterwards;
 * sched_autogroup_show() reads it back from there.
 */
static int sched_autogroup_open(struct inode *inode, struct file *filp)
{
	int ret;

	ret = single_open(filp, sched_autogroup_show, NULL);
	if (!ret) {
		struct seq_file *m = filp->private_data;

		m->private = inode;
	}
	return ret;
}
1282 1282
/* File operations for /proc/<pid>/autogroup (CONFIG_SCHED_AUTOGROUP only). */
static const struct file_operations proc_pid_sched_autogroup_operations = {
	.open		= sched_autogroup_open,
	.read		= seq_read,
	.write		= sched_autogroup_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};
1290 1290
1291 #endif /* CONFIG_SCHED_AUTOGROUP */ 1291 #endif /* CONFIG_SCHED_AUTOGROUP */
1292 1292
1293 static ssize_t comm_write(struct file *file, const char __user *buf, 1293 static ssize_t comm_write(struct file *file, const char __user *buf,
1294 size_t count, loff_t *offset) 1294 size_t count, loff_t *offset)
1295 { 1295 {
1296 struct inode *inode = file_inode(file); 1296 struct inode *inode = file_inode(file);
1297 struct task_struct *p; 1297 struct task_struct *p;
1298 char buffer[TASK_COMM_LEN]; 1298 char buffer[TASK_COMM_LEN];
1299 const size_t maxlen = sizeof(buffer) - 1; 1299 const size_t maxlen = sizeof(buffer) - 1;
1300 1300
1301 memset(buffer, 0, sizeof(buffer)); 1301 memset(buffer, 0, sizeof(buffer));
1302 if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count)) 1302 if (copy_from_user(buffer, buf, count > maxlen ? maxlen : count))
1303 return -EFAULT; 1303 return -EFAULT;
1304 1304
1305 p = get_proc_task(inode); 1305 p = get_proc_task(inode);
1306 if (!p) 1306 if (!p)
1307 return -ESRCH; 1307 return -ESRCH;
1308 1308
1309 if (same_thread_group(current, p)) 1309 if (same_thread_group(current, p))
1310 set_task_comm(p, buffer); 1310 set_task_comm(p, buffer);
1311 else 1311 else
1312 count = -EINVAL; 1312 count = -EINVAL;
1313 1313
1314 put_task_struct(p); 1314 put_task_struct(p);
1315 1315
1316 return count; 1316 return count;
1317 } 1317 }
1318 1318
1319 static int comm_show(struct seq_file *m, void *v) 1319 static int comm_show(struct seq_file *m, void *v)
1320 { 1320 {
1321 struct inode *inode = m->private; 1321 struct inode *inode = m->private;
1322 struct task_struct *p; 1322 struct task_struct *p;
1323 1323
1324 p = get_proc_task(inode); 1324 p = get_proc_task(inode);
1325 if (!p) 1325 if (!p)
1326 return -ESRCH; 1326 return -ESRCH;
1327 1327
1328 task_lock(p); 1328 task_lock(p);
1329 seq_printf(m, "%s\n", p->comm); 1329 seq_printf(m, "%s\n", p->comm);
1330 task_unlock(p); 1330 task_unlock(p);
1331 1331
1332 put_task_struct(p); 1332 put_task_struct(p);
1333 1333
1334 return 0; 1334 return 0;
1335 } 1335 }
1336 1336
/* Open /proc/<pid>/comm: hook comm_show() up through single_open(). */
static int comm_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, comm_show, inode);
}
1341 1341
/* File operations for /proc/<pid>/comm (read and rename a task). */
static const struct file_operations proc_pid_set_comm_operations = {
	.open		= comm_open,
	.read		= seq_read,
	.write		= comm_write,
	.llseek		= seq_lseek,
	.release	= single_release,
};
1349 1349
1350 static int proc_exe_link(struct dentry *dentry, struct path *exe_path) 1350 static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
1351 { 1351 {
1352 struct task_struct *task; 1352 struct task_struct *task;
1353 struct mm_struct *mm; 1353 struct mm_struct *mm;
1354 struct file *exe_file; 1354 struct file *exe_file;
1355 1355
1356 task = get_proc_task(dentry->d_inode); 1356 task = get_proc_task(dentry->d_inode);
1357 if (!task) 1357 if (!task)
1358 return -ENOENT; 1358 return -ENOENT;
1359 mm = get_task_mm(task); 1359 mm = get_task_mm(task);
1360 put_task_struct(task); 1360 put_task_struct(task);
1361 if (!mm) 1361 if (!mm)
1362 return -ENOENT; 1362 return -ENOENT;
1363 exe_file = get_mm_exe_file(mm); 1363 exe_file = get_mm_exe_file(mm);
1364 mmput(mm); 1364 mmput(mm);
1365 if (exe_file) { 1365 if (exe_file) {
1366 *exe_path = exe_file->f_path; 1366 *exe_path = exe_file->f_path;
1367 path_get(&exe_file->f_path); 1367 path_get(&exe_file->f_path);
1368 fput(exe_file); 1368 fput(exe_file);
1369 return 0; 1369 return 0;
1370 } else 1370 } else
1371 return -ENOENT; 1371 return -ENOENT;
1372 } 1372 }
1373 1373
1374 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) 1374 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1375 { 1375 {
1376 struct inode *inode = dentry->d_inode; 1376 struct inode *inode = dentry->d_inode;
1377 struct path path; 1377 struct path path;
1378 int error = -EACCES; 1378 int error = -EACCES;
1379 1379
1380 /* Are we allowed to snoop on the tasks file descriptors? */ 1380 /* Are we allowed to snoop on the tasks file descriptors? */
1381 if (!proc_fd_access_allowed(inode)) 1381 if (!proc_fd_access_allowed(inode))
1382 goto out; 1382 goto out;
1383 1383
1384 error = PROC_I(inode)->op.proc_get_link(dentry, &path); 1384 error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1385 if (error) 1385 if (error)
1386 goto out; 1386 goto out;
1387 1387
1388 nd_jump_link(nd, &path); 1388 nd_jump_link(nd, &path);
1389 return NULL; 1389 return NULL;
1390 out: 1390 out:
1391 return ERR_PTR(error); 1391 return ERR_PTR(error);
1392 } 1392 }
1393 1393
/*
 * Render @path as a pathname and copy it to the user buffer.
 *
 * d_path() builds the string at the *end* of the temporary page, so
 * the length is derived from the distance to the page's last byte
 * (which holds the terminating NUL).  The NUL itself is not copied;
 * the result is silently truncated to @buflen bytes.
 *
 * Returns the number of bytes copied, or a negative errno
 * (-ENOMEM, -EFAULT, or whatever d_path() failed with).
 */
static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
{
	char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
	char *pathname;
	int len;

	if (!tmp)
		return -ENOMEM;

	pathname = d_path(path, tmp, PAGE_SIZE);
	len = PTR_ERR(pathname);
	if (IS_ERR(pathname))
		goto out;
	/* String occupies [pathname, tmp + PAGE_SIZE - 1); len == strlen. */
	len = tmp + PAGE_SIZE - 1 - pathname;

	if (len > buflen)
		len = buflen;
	if (copy_to_user(buffer, pathname, len))
		len = -EFAULT;
 out:
	free_page((unsigned long)tmp);
	return len;
}
1417 1417
1418 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) 1418 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1419 { 1419 {
1420 int error = -EACCES; 1420 int error = -EACCES;
1421 struct inode *inode = dentry->d_inode; 1421 struct inode *inode = dentry->d_inode;
1422 struct path path; 1422 struct path path;
1423 1423
1424 /* Are we allowed to snoop on the tasks file descriptors? */ 1424 /* Are we allowed to snoop on the tasks file descriptors? */
1425 if (!proc_fd_access_allowed(inode)) 1425 if (!proc_fd_access_allowed(inode))
1426 goto out; 1426 goto out;
1427 1427
1428 error = PROC_I(inode)->op.proc_get_link(dentry, &path); 1428 error = PROC_I(inode)->op.proc_get_link(dentry, &path);
1429 if (error) 1429 if (error)
1430 goto out; 1430 goto out;
1431 1431
1432 error = do_proc_readlink(&path, buffer, buflen); 1432 error = do_proc_readlink(&path, buffer, buflen);
1433 path_put(&path); 1433 path_put(&path);
1434 out: 1434 out:
1435 return error; 1435 return error;
1436 } 1436 }
1437 1437
/* Shared inode operations for all /proc/<pid> symlink entries. */
const struct inode_operations proc_pid_link_inode_operations = {
	.readlink	= proc_pid_readlink,
	.follow_link	= proc_pid_follow_link,
	.setattr	= proc_setattr,
};
1443 1443
1444 1444
1445 /* building an inode */ 1445 /* building an inode */
1446 1446
/*
 * Allocate and initialize an inode for a /proc/<pid>-style entry.
 *
 * Pins a reference to @task's struct pid in the proc inode so the
 * task can be re-found on later accesses.  Dumpable tasks expose
 * their effective uid/gid on the inode; otherwise the inode keeps
 * the root ownership set by new_inode().
 *
 * Returns the new inode, or NULL on allocation failure or when
 * @task has already exited (get_task_pid() returned NULL).
 */
struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
{
	struct inode * inode;
	struct proc_inode *ei;
	const struct cred *cred;

	/* We need a new inode */

	inode = new_inode(sb);
	if (!inode)
		goto out;

	/* Common stuff */
	ei = PROC_I(inode);
	inode->i_ino = get_next_ino();
	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
	inode->i_op = &proc_def_inode_operations;

	/*
	 * grab the reference to task.
	 */
	ei->pid = get_task_pid(task, PIDTYPE_PID);
	if (!ei->pid)
		goto out_unlock;

	if (task_dumpable(task)) {
		rcu_read_lock();
		cred = __task_cred(task);
		inode->i_uid = cred->euid;
		inode->i_gid = cred->egid;
		rcu_read_unlock();
	}
	security_task_to_inode(task, inode);

out:
	return inode;

/* NOTE(review): label name is historical - nothing is locked here. */
out_unlock:
	iput(inode);
	return NULL;
}
1488 1488
/*
 * getattr for /proc/<pid> entries.
 *
 * Ownership defaults to global root; if the task still exists and is
 * dumpable (or the entry is a world-readable+executable directory),
 * the task's effective uid/gid are reported instead.  Fails with
 * -ENOENT when hidepid-style permission checks deny the caller, to
 * stay consistent with readdir() filtering.
 *
 * The RCU read lock keeps both the task and its creds stable while
 * they are examined.
 */
int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	struct task_struct *task;
	const struct cred *cred;
	struct pid_namespace *pid = dentry->d_sb->s_fs_info;

	generic_fillattr(inode, stat);

	rcu_read_lock();
	stat->uid = GLOBAL_ROOT_UID;
	stat->gid = GLOBAL_ROOT_GID;
	task = pid_task(proc_pid(inode), PIDTYPE_PID);
	if (task) {
		if (!has_pid_permissions(pid, task, 2)) {
			rcu_read_unlock();
			/*
			 * This doesn't prevent learning whether PID exists,
			 * it only makes getattr() consistent with readdir().
			 */
			return -ENOENT;
		}
		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
		    task_dumpable(task)) {
			cred = __task_cred(task);
			stat->uid = cred->euid;
			stat->gid = cred->egid;
		}
	}
	rcu_read_unlock();
	return 0;
}
1521 1521
1522 /* dentry stuff */ 1522 /* dentry stuff */
1523 1523
/*
 * Exceptional case: normally we are not allowed to unhash a busy
 * directory. In this case, however, we can do it - no aliasing problems
 * due to the way we treat inodes.
 *
 * Rewrite the inode's ownerships here because the owning task may have
 * performed a setuid(), etc.
 *
 * Before the /proc/pid/status file was created the only way to read
 * the effective uid of a process was to stat /proc/pid.  Reading
 * /proc/pid/status is slow enough that procps and other packages
 * kept stating /proc/pid.  To keep the rules in /proc simple I have
 * made this apply to all per process world readable and executable
 * directories.
 */
/*
 * d_revalidate for /proc/<pid> dentries: returns 1 (valid) while the
 * task lives, refreshing the inode's uid/gid from the task's current
 * creds; returns 0 so dead tasks' dentries get dropped.  Bails with
 * -ECHILD under RCU-walk since it needs to take references.
 */
int pid_revalidate(struct dentry *dentry, unsigned int flags)
{
	struct inode *inode;
	struct task_struct *task;
	const struct cred *cred;

	if (flags & LOOKUP_RCU)
		return -ECHILD;

	inode = dentry->d_inode;
	task = get_proc_task(inode);

	if (task) {
		if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
		    task_dumpable(task)) {
			rcu_read_lock();
			cred = __task_cred(task);
			inode->i_uid = cred->euid;
			inode->i_gid = cred->egid;
			rcu_read_unlock();
		} else {
			inode->i_uid = GLOBAL_ROOT_UID;
			inode->i_gid = GLOBAL_ROOT_GID;
		}
		/* Never leave stale setuid/setgid bits on a proc inode. */
		inode->i_mode &= ~(S_ISUID | S_ISGID);
		security_task_to_inode(task, inode);
		put_task_struct(task);
		return 1;
	}
	return 0;
}
1570 1570
1571 static inline bool proc_inode_is_dead(struct inode *inode) 1571 static inline bool proc_inode_is_dead(struct inode *inode)
1572 { 1572 {
1573 return !proc_pid(inode)->tasks[PIDTYPE_PID].first; 1573 return !proc_pid(inode)->tasks[PIDTYPE_PID].first;
1574 } 1574 }
1575 1575
1576 int pid_delete_dentry(const struct dentry *dentry) 1576 int pid_delete_dentry(const struct dentry *dentry)
1577 { 1577 {
1578 /* Is the task we represent dead? 1578 /* Is the task we represent dead?
1579 * If so, then don't put the dentry on the lru list, 1579 * If so, then don't put the dentry on the lru list,
1580 * kill it immediately. 1580 * kill it immediately.
1581 */ 1581 */
1582 return proc_inode_is_dead(dentry->d_inode); 1582 return proc_inode_is_dead(dentry->d_inode);
1583 } 1583 }
1584 1584
/* Dentry operations shared by /proc/<pid> entries. */
const struct dentry_operations pid_dentry_operations =
{
	.d_revalidate	= pid_revalidate,
	.d_delete	= pid_delete_dentry,
};
1590 1590
1591 /* Lookups */ 1591 /* Lookups */
1592 1592
/*
 * Fill a directory entry.
 *
 * If possible create the dcache entry and derive our inode number and
 * file type from dcache entry.
 *
 * Since all of the proc inode numbers are dynamically generated, the inode
 * numbers do not exist until the inode is cached. This means creating
 * the dcache entry in readdir is necessary to keep the inode numbers
 * reported by readdir in sync with the inode numbers reported
 * by stat.
 *
 * Returns the result of dir_emit(): false stops the enclosing readdir
 * loop.  If the dentry cannot be created/instantiated, the entry is
 * still emitted with a dummy inode number (1) and DT_UNKNOWN.
 */
bool proc_fill_cache(struct file *file, struct dir_context *ctx,
	const char *name, int len,
	instantiate_t instantiate, struct task_struct *task, const void *ptr)
{
	struct dentry *child, *dir = file->f_path.dentry;
	struct qstr qname = QSTR_INIT(name, len);
	struct inode *inode;
	unsigned type;
	ino_t ino;

	child = d_hash_and_lookup(dir, &qname);
	if (!child) {
		/* Not cached yet: allocate and instantiate a fresh dentry. */
		child = d_alloc(dir, &qname);
		if (!child)
			goto end_instantiate;
		if (instantiate(dir->d_inode, child, task, ptr) < 0) {
			dput(child);
			goto end_instantiate;
		}
	}
	inode = child->d_inode;
	ino = inode->i_ino;
	type = inode->i_mode >> 12;	/* S_IFMT bits -> DT_* value */
	dput(child);
	return dir_emit(ctx, name, len, ino, type);

end_instantiate:
	return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
}
1634 1634
1635 #ifdef CONFIG_CHECKPOINT_RESTORE 1635 #ifdef CONFIG_CHECKPOINT_RESTORE
1636 1636
1637 /* 1637 /*
1638 * dname_to_vma_addr - maps a dentry name into two unsigned longs 1638 * dname_to_vma_addr - maps a dentry name into two unsigned longs
1639 * which represent vma start and end addresses. 1639 * which represent vma start and end addresses.
1640 */ 1640 */
1641 static int dname_to_vma_addr(struct dentry *dentry, 1641 static int dname_to_vma_addr(struct dentry *dentry,
1642 unsigned long *start, unsigned long *end) 1642 unsigned long *start, unsigned long *end)
1643 { 1643 {
1644 if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) 1644 if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
1645 return -EINVAL; 1645 return -EINVAL;
1646 1646
1647 return 0; 1647 return 0;
1648 } 1648 }
1649 1649
/*
 * d_revalidate for /proc/<pid>/map_files/<start-end> entries.
 *
 * The entry stays valid (returns 1) only while a vma with exactly the
 * encoded start/end still exists in the task's mm; otherwise 0 so the
 * dcache drops it.  Access is gated on CAP_SYS_ADMIN and ptrace-read
 * rights (via mm_access).  On success the inode's uid/gid are
 * refreshed from the task's creds, mirroring pid_revalidate().
 */
static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
{
	unsigned long vm_start, vm_end;
	bool exact_vma_exists = false;
	struct mm_struct *mm = NULL;
	struct task_struct *task;
	const struct cred *cred;
	struct inode *inode;
	int status = 0;

	/* Taking references is not allowed in RCU-walk mode. */
	if (flags & LOOKUP_RCU)
		return -ECHILD;

	if (!capable(CAP_SYS_ADMIN)) {
		status = -EPERM;
		goto out_notask;
	}

	inode = dentry->d_inode;
	task = get_proc_task(inode);
	if (!task)
		goto out_notask;

	mm = mm_access(task, PTRACE_MODE_READ);
	if (IS_ERR_OR_NULL(mm))
		goto out;

	if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
		down_read(&mm->mmap_sem);
		exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
		up_read(&mm->mmap_sem);
	}

	mmput(mm);

	if (exact_vma_exists) {
		if (task_dumpable(task)) {
			rcu_read_lock();
			cred = __task_cred(task);
			inode->i_uid = cred->euid;
			inode->i_gid = cred->egid;
			rcu_read_unlock();
		} else {
			inode->i_uid = GLOBAL_ROOT_UID;
			inode->i_gid = GLOBAL_ROOT_GID;
		}
		security_task_to_inode(task, inode);
		status = 1;
	}

out:
	put_task_struct(task);

out_notask:
	return status;
}
1706 1706
/* Dentry operations for /proc/<pid>/map_files/* entries. */
static const struct dentry_operations tid_map_files_dentry_operations = {
	.d_revalidate	= map_files_d_revalidate,
	.d_delete	= pid_delete_dentry,
};
1711 1711
/*
 * proc_get_link callback for map_files symlinks: resolve the entry's
 * start-end range to the backing vma's file and hand back its path
 * (with a reference; caller must path_put()).
 *
 * Returns 0 on success, -ENOENT when the task/mm/vma is gone or the
 * vma has no file, or -EINVAL for a malformed dentry name.
 */
static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
{
	unsigned long vm_start, vm_end;
	struct vm_area_struct *vma;
	struct task_struct *task;
	struct mm_struct *mm;
	int rc;

	rc = -ENOENT;
	task = get_proc_task(dentry->d_inode);
	if (!task)
		goto out;

	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
	if (rc)
		goto out_mmput;

	rc = -ENOENT;
	/* mmap_sem keeps the vma (and its file pointer) stable. */
	down_read(&mm->mmap_sem);
	vma = find_exact_vma(mm, vm_start, vm_end);
	if (vma && vma->vm_file) {
		*path = vma->vm_file->f_path;
		path_get(path);
		rc = 0;
	}
	up_read(&mm->mmap_sem);

out_mmput:
	mmput(mm);
out:
	return rc;
}
1749 1749
/*
 * Per-entry snapshot used when listing /proc/<pid>/map_files
 * (presumably filled by the readdir path so entries can be emitted
 * after dropping mmap_sem - confirm against proc_map_files_readdir).
 */
struct map_files_info {
	fmode_t		mode;			/* f_mode of the mapped file */
	unsigned long	len;			/* used length of name[] */
	unsigned char	name[4*sizeof(long)+2];	/* max: %lx-%lx\0 */
};
1755 1755
/*
 * instantiate_t callback for map_files entries: build a symlink inode
 * whose readability mirrors the mapped file's FMODE_READ/FMODE_WRITE
 * bits (@ptr smuggles the fmode_t as an unsigned long).
 *
 * Returns 0 on success, -ENOENT if the proc inode cannot be created.
 */
static int
proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
			   struct task_struct *task, const void *ptr)
{
	fmode_t mode = (fmode_t)(unsigned long)ptr;
	struct proc_inode *ei;
	struct inode *inode;

	inode = proc_pid_make_inode(dir->i_sb, task);
	if (!inode)
		return -ENOENT;

	ei = PROC_I(inode);
	ei->op.proc_get_link = proc_map_files_get_link;

	inode->i_op = &proc_pid_link_inode_operations;
	inode->i_size = 64;
	inode->i_mode = S_IFLNK;

	/* Owner-only permissions, matching the underlying file's mode. */
	if (mode & FMODE_READ)
		inode->i_mode |= S_IRUSR;
	if (mode & FMODE_WRITE)
		inode->i_mode |= S_IWUSR;

	d_set_d_op(dentry, &tid_map_files_dentry_operations);
	d_add(dentry, inode);

	return 0;
}
1785 1785
/*
 * lookup for /proc/<pid>/map_files/<start-end>.
 *
 * Requires CAP_SYS_ADMIN and ptrace-read access to the target task.
 * The dentry name is parsed into a start/end pair and must match an
 * existing file-backed vma exactly, in which case the entry is
 * instantiated with the vma file's access mode.
 *
 * Returns NULL (success; dentry instantiated) or an ERR_PTR:
 * -EPERM, -EACCES, or -ENOENT.
 */
static struct dentry *proc_map_files_lookup(struct inode *dir,
		struct dentry *dentry, unsigned int flags)
{
	unsigned long vm_start, vm_end;
	struct vm_area_struct *vma;
	struct task_struct *task;
	int result;
	struct mm_struct *mm;

	result = -EPERM;
	if (!capable(CAP_SYS_ADMIN))
		goto out;

	result = -ENOENT;
	task = get_proc_task(dir);
	if (!task)
		goto out;

	result = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ))
		goto out_put_task;

	result = -ENOENT;
	if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
		goto out_put_task;

	mm = get_task_mm(task);
	if (!mm)
		goto out_put_task;

	/* mmap_sem held across lookup + instantiate so the vma can't vanish. */
	down_read(&mm->mmap_sem);
	vma = find_exact_vma(mm, vm_start, vm_end);
	if (!vma)
		goto out_no_vma;

	if (vma->vm_file)
		result = proc_map_files_instantiate(dir, dentry, task,
				(void *)(unsigned long)vma->vm_file->f_mode);

out_no_vma:
	up_read(&mm->mmap_sem);
	mmput(mm);
out_put_task:
	put_task_struct(task);
out:
	return ERR_PTR(result);
}
1833 1833
/* Directory inode ops for /proc/<pid>/map_files. */
static const struct inode_operations proc_map_files_inode_operations = {
	.lookup		= proc_map_files_lookup,
	.permission	= proc_fd_permission,
	.setattr	= proc_setattr,
};
1839 1839
/*
 * readdir for /proc/<pid>/map_files: emit one "<start>-<end>" entry per
 * file-backed VMA of the target task.  CAP_SYS_ADMIN plus a ptrace
 * access check are required, mirroring proc_map_files_lookup().
 */
static int
proc_map_files_readdir(struct file *file, struct dir_context *ctx)
{
	struct vm_area_struct *vma;
	struct task_struct *task;
	struct mm_struct *mm;
	unsigned long nr_files, pos, i;
	struct flex_array *fa = NULL;
	struct map_files_info info;
	struct map_files_info *p;
	int ret;

	ret = -EPERM;
	if (!capable(CAP_SYS_ADMIN))
		goto out;

	ret = -ENOENT;
	task = get_proc_task(file_inode(file));
	if (!task)
		goto out;

	ret = -EACCES;
	if (!ptrace_may_access(task, PTRACE_MODE_READ))
		goto out_put_task;

	ret = 0;
	if (!dir_emit_dots(file, ctx))
		goto out_put_task;

	mm = get_task_mm(task);
	if (!mm)
		goto out_put_task;
	down_read(&mm->mmap_sem);

	nr_files = 0;

	/*
	 * We need two passes here:
	 *
	 * 1) Collect vmas of mapped files with mmap_sem taken
	 * 2) Release mmap_sem and instantiate entries
	 *
	 * otherwise we get lockdep complained, since filldir()
	 * routine might require mmap_sem taken in might_fault().
	 */

	/* Pass 1a: count entries past ctx->pos ("." and ".." occupy 0/1). */
	for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
		if (vma->vm_file && ++pos > ctx->pos)
			nr_files++;
	}

	if (nr_files) {
		fa = flex_array_alloc(sizeof(info), nr_files,
					GFP_KERNEL);
		if (!fa || flex_array_prealloc(fa, 0, nr_files,
						GFP_KERNEL)) {
			ret = -ENOMEM;
			if (fa)
				flex_array_free(fa);
			up_read(&mm->mmap_sem);
			mmput(mm);
			goto out_put_task;
		}
		/* Pass 1b: snapshot name + f_mode for each such vma. */
		for (i = 0, vma = mm->mmap, pos = 2; vma;
				vma = vma->vm_next) {
			if (!vma->vm_file)
				continue;
			if (++pos <= ctx->pos)
				continue;

			info.mode = vma->vm_file->f_mode;
			info.len = snprintf(info.name,
					sizeof(info.name), "%lx-%lx",
					vma->vm_start, vma->vm_end);
			/* Prealloc above guarantees the put cannot fail. */
			if (flex_array_put(fa, i++, &info, GFP_KERNEL))
				BUG();
		}
	}
	up_read(&mm->mmap_sem);

	/* Pass 2: emit the snapshotted entries without mmap_sem held. */
	for (i = 0; i < nr_files; i++) {
		p = flex_array_get(fa, i);
		if (!proc_fill_cache(file, ctx,
				      p->name, p->len,
				      proc_map_files_instantiate,
				      task,
				      (void *)(unsigned long)p->mode))
			break;
		ctx->pos++;
	}
	if (fa)
		flex_array_free(fa);
	mmput(mm);

out_put_task:
	put_task_struct(task);
out:
	return ret;
}
1939 1939
/* File ops for the /proc/<pid>/map_files directory itself. */
static const struct file_operations proc_map_files_operations = {
	.read		= generic_read_dir,
	.iterate	= proc_map_files_readdir,
	.llseek		= default_llseek,
};
1945 1945
/* Per-open state for the /proc/<pid>/timers seq_file. */
struct timers_private {
	struct pid *pid;		/* target process, pinned at open */
	struct task_struct *task;	/* resolved in timers_start() */
	struct sighand_struct *sighand;	/* held locked across a sequence pass */
	struct pid_namespace *ns;	/* for translating timer pids */
	unsigned long flags;		/* saved irq flags for sighand lock */
};
1953 1953
/*
 * seq_file ->start: pin the task and take its sighand lock so the
 * posix_timers list cannot change while we walk it.  Both references
 * are released in timers_stop(), which the seq_file core calls even
 * when we return an ERR_PTR here.
 */
static void *timers_start(struct seq_file *m, loff_t *pos)
{
	struct timers_private *tp = m->private;

	tp->task = get_pid_task(tp->pid, PIDTYPE_PID);
	if (!tp->task)
		return ERR_PTR(-ESRCH);

	tp->sighand = lock_task_sighand(tp->task, &tp->flags);
	if (!tp->sighand)
		return ERR_PTR(-ESRCH);

	return seq_list_start(&tp->task->signal->posix_timers, *pos);
}
1968 1968
1969 static void *timers_next(struct seq_file *m, void *v, loff_t *pos) 1969 static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
1970 { 1970 {
1971 struct timers_private *tp = m->private; 1971 struct timers_private *tp = m->private;
1972 return seq_list_next(v, &tp->task->signal->posix_timers, pos); 1972 return seq_list_next(v, &tp->task->signal->posix_timers, pos);
1973 } 1973 }
1974 1974
/*
 * seq_file ->stop: undo timers_start().  Either step may not have
 * happened (start can fail part-way), so both teardowns are guarded
 * and the pointers NULLed to keep the op idempotent.
 */
static void timers_stop(struct seq_file *m, void *v)
{
	struct timers_private *tp = m->private;

	if (tp->sighand) {
		unlock_task_sighand(tp->task, &tp->flags);
		tp->sighand = NULL;
	}

	if (tp->task) {
		put_task_struct(tp->task);
		tp->task = NULL;
	}
}
1989 1989
/*
 * seq_file ->show: print one posix timer (id, signal, notify method and
 * target pid translated into the reader's pid namespace, clock id).
 */
static int show_timer(struct seq_file *m, void *v)
{
	struct k_itimer *timer;
	struct timers_private *tp = m->private;
	int notify;
	/* Indexed by SIGEV_* value with the THREAD_ID flag masked off. */
	static const char * const nstr[] = {
		[SIGEV_SIGNAL] = "signal",
		[SIGEV_NONE] = "none",
		[SIGEV_THREAD] = "thread",
	};

	timer = list_entry((struct list_head *)v, struct k_itimer, list);
	notify = timer->it_sigev_notify;

	seq_printf(m, "ID: %d\n", timer->it_id);
	seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo,
			timer->sigq->info.si_value.sival_ptr);
	seq_printf(m, "notify: %s/%s.%d\n",
		nstr[notify & ~SIGEV_THREAD_ID],
		(notify & SIGEV_THREAD_ID) ? "tid" : "pid",
		pid_nr_ns(timer->it_pid, tp->ns));
	seq_printf(m, "ClockID: %d\n", timer->it_clock);

	return 0;
}
2015 2015
/* seq_file iterator over a task's posix timers. */
static const struct seq_operations proc_timers_seq_ops = {
	.start	= timers_start,
	.next	= timers_next,
	.stop	= timers_stop,
	.show	= show_timer,
};
2022 2022
2023 static int proc_timers_open(struct inode *inode, struct file *file) 2023 static int proc_timers_open(struct inode *inode, struct file *file)
2024 { 2024 {
2025 struct timers_private *tp; 2025 struct timers_private *tp;
2026 2026
2027 tp = __seq_open_private(file, &proc_timers_seq_ops, 2027 tp = __seq_open_private(file, &proc_timers_seq_ops,
2028 sizeof(struct timers_private)); 2028 sizeof(struct timers_private));
2029 if (!tp) 2029 if (!tp)
2030 return -ENOMEM; 2030 return -ENOMEM;
2031 2031
2032 tp->pid = proc_pid(inode); 2032 tp->pid = proc_pid(inode);
2033 tp->ns = inode->i_sb->s_fs_info; 2033 tp->ns = inode->i_sb->s_fs_info;
2034 return 0; 2034 return 0;
2035 } 2035 }
2036 2036
/* File ops for /proc/<pid>/timers. */
static const struct file_operations proc_timers_operations = {
	.open		= proc_timers_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,
};
2043 #endif /* CONFIG_CHECKPOINT_RESTORE */ 2043 #endif /* CONFIG_CHECKPOINT_RESTORE */
2044 2044
/*
 * Instantiate one fixed /proc/<pid>/ entry described by a pid_entry
 * (mode, inode ops, file ops, read/show callback).  Returns 0 on
 * success, -ENOENT on failure or if the task died meanwhile.
 */
static int proc_pident_instantiate(struct inode *dir,
	struct dentry *dentry, struct task_struct *task, const void *ptr)
{
	const struct pid_entry *p = ptr;
	struct inode *inode;
	struct proc_inode *ei;

	inode = proc_pid_make_inode(dir->i_sb, task);
	if (!inode)
		goto out;

	ei = PROC_I(inode);
	inode->i_mode = p->mode;
	if (S_ISDIR(inode->i_mode))
		set_nlink(inode, 2);	/* Use getattr to fix if necessary */
	if (p->iop)
		inode->i_op = p->iop;
	if (p->fop)
		inode->i_fop = p->fop;
	ei->op = p->op;
	d_set_d_op(dentry, &pid_dentry_operations);
	d_add(dentry, inode);
	/* Close the race of the process dying before we return the dentry */
	if (pid_revalidate(dentry, 0))
		return 0;
out:
	return -ENOENT;
}
2073 2073
/*
 * Look up a name in a fixed pid_entry table (shared by the tgid, tid
 * and attr directories).  Linear scan on (len, name); instantiates the
 * dentry on a hit, returns ERR_PTR(-ENOENT) otherwise.
 */
static struct dentry *proc_pident_lookup(struct inode *dir,
					 struct dentry *dentry,
					 const struct pid_entry *ents,
					 unsigned int nents)
{
	int error;
	struct task_struct *task = get_proc_task(dir);
	const struct pid_entry *p, *last;

	error = -ENOENT;

	if (!task)
		goto out_no_task;

	/*
	 * Yes, it does not scale. And it should not. Don't add
	 * new entries into /proc/<tgid>/ without very good reasons.
	 */
	last = &ents[nents - 1];
	for (p = ents; p <= last; p++) {
		if (p->len != dentry->d_name.len)
			continue;
		if (!memcmp(dentry->d_name.name, p->name, p->len))
			break;
	}
	if (p > last)
		goto out;

	error = proc_pident_instantiate(dir, dentry, task, p);
out:
	put_task_struct(task);
out_no_task:
	return ERR_PTR(error);
}
2108 2108
/*
 * readdir over a fixed pid_entry table.  Positions 0 and 1 are "." and
 * ".."; table entry i maps to position i + 2.  Always returns 0 once
 * the task reference was obtained (partial emission just stops early).
 */
static int proc_pident_readdir(struct file *file, struct dir_context *ctx,
		const struct pid_entry *ents, unsigned int nents)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	const struct pid_entry *p;

	if (!task)
		return -ENOENT;

	if (!dir_emit_dots(file, ctx))
		goto out;

	if (ctx->pos >= nents + 2)
		goto out;

	for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) {
		if (!proc_fill_cache(file, ctx, p->name, p->len,
				proc_pident_instantiate, task, p))
			break;
		ctx->pos++;
	}
out:
	put_task_struct(task);
	return 0;
}
2134 2134
2135 #ifdef CONFIG_SECURITY 2135 #ifdef CONFIG_SECURITY
/*
 * Read a /proc/<pid>/attr/<name> file.  The LSM chooses the attribute
 * by the dentry name and allocates the result buffer, which we copy to
 * userspace and free.
 */
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
				  size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	char *p = NULL;
	ssize_t length;
	struct task_struct *task = get_proc_task(inode);

	if (!task)
		return -ESRCH;

	length = security_getprocattr(task,
				      (char*)file->f_path.dentry->d_name.name,
				      &p);
	put_task_struct(task);
	if (length > 0)
		length = simple_read_from_buffer(buf, count, ppos, p, length);
	/* kfree(NULL) is fine when the LSM allocated nothing. */
	kfree(p);
	return length;
}
2156 2156
/*
 * Write a /proc/<pid>/attr/<name> file.  The value (at most one page,
 * no partial writes) is handed to the LSM under cred_guard_mutex so a
 * concurrent ptrace/exec cannot race with the credential change.
 */
static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
				   size_t count, loff_t *ppos)
{
	struct inode * inode = file_inode(file);
	char *page;
	ssize_t length;
	struct task_struct *task = get_proc_task(inode);

	length = -ESRCH;
	if (!task)
		goto out_no_task;
	if (count > PAGE_SIZE)
		count = PAGE_SIZE;

	/* No partial writes. */
	length = -EINVAL;
	if (*ppos != 0)
		goto out;

	length = -ENOMEM;
	page = (char*)__get_free_page(GFP_TEMPORARY);
	if (!page)
		goto out;

	length = -EFAULT;
	if (copy_from_user(page, buf, count))
		goto out_free;

	/* Guard against adverse ptrace interaction */
	length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
	if (length < 0)
		goto out_free;

	length = security_setprocattr(task,
				      (char*)file->f_path.dentry->d_name.name,
				      (void*)page, count);
	mutex_unlock(&task->signal->cred_guard_mutex);
out_free:
	free_page((unsigned long) page);
out:
	put_task_struct(task);
out_no_task:
	return length;
}
2201 2201
/* File ops shared by every /proc/<pid>/attr/<name> entry. */
static const struct file_operations proc_pid_attr_operations = {
	.read		= proc_pid_attr_read,
	.write		= proc_pid_attr_write,
	.llseek		= generic_file_llseek,
};
2207 2207
/* Entries of /proc/<pid>/attr; actual access control is up to the LSM. */
static const struct pid_entry attr_dir_stuff[] = {
	REG("current",    S_IRUGO|S_IWUGO, proc_pid_attr_operations),
	REG("prev",       S_IRUGO,	   proc_pid_attr_operations),
	REG("exec",       S_IRUGO|S_IWUGO, proc_pid_attr_operations),
	REG("fscreate",   S_IRUGO|S_IWUGO, proc_pid_attr_operations),
	REG("keycreate",  S_IRUGO|S_IWUGO, proc_pid_attr_operations),
	REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
};
2216 2216
2217 static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx) 2217 static int proc_attr_dir_readdir(struct file *file, struct dir_context *ctx)
2218 { 2218 {
2219 return proc_pident_readdir(file, ctx, 2219 return proc_pident_readdir(file, ctx,
2220 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); 2220 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2221 } 2221 }
2222 2222
/* File ops for the /proc/<pid>/attr directory itself. */
static const struct file_operations proc_attr_dir_operations = {
	.read		= generic_read_dir,
	.iterate	= proc_attr_dir_readdir,
	.llseek		= default_llseek,
};
2228 2228
2229 static struct dentry *proc_attr_dir_lookup(struct inode *dir, 2229 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
2230 struct dentry *dentry, unsigned int flags) 2230 struct dentry *dentry, unsigned int flags)
2231 { 2231 {
2232 return proc_pident_lookup(dir, dentry, 2232 return proc_pident_lookup(dir, dentry,
2233 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); 2233 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2234 } 2234 }
2235 2235
/* Inode ops for the /proc/<pid>/attr directory. */
static const struct inode_operations proc_attr_dir_inode_operations = {
	.lookup		= proc_attr_dir_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
};
2241 2241
2242 #endif 2242 #endif
2243 2243
2244 #ifdef CONFIG_ELF_CORE 2244 #ifdef CONFIG_ELF_CORE
/*
 * Read /proc/<pid>/coredump_filter: report the MMF_DUMP_FILTER bits of
 * the target's mm as a zero-padded hex word.  A task without an mm
 * (kernel thread, exiting) reads as empty (returns 0).
 */
static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
					 size_t count, loff_t *ppos)
{
	struct task_struct *task = get_proc_task(file_inode(file));
	struct mm_struct *mm;
	char buffer[PROC_NUMBUF];
	size_t len;
	int ret;

	if (!task)
		return -ESRCH;

	ret = 0;
	mm = get_task_mm(task);
	if (mm) {
		len = snprintf(buffer, sizeof(buffer), "%08lx\n",
			       ((mm->flags & MMF_DUMP_FILTER_MASK) >>
				MMF_DUMP_FILTER_SHIFT));
		mmput(mm);
		ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
	}

	put_task_struct(task);

	return ret;
}
2271 2271
/*
 * Write /proc/<pid>/coredump_filter: parse an unsigned value (any base
 * simple_strtoul accepts, optional trailing newline) and copy its low
 * MMF_DUMP_FILTER_BITS into the target mm's flags bit by bit.  Returns
 * the number of characters consumed on success.
 */
static ssize_t proc_coredump_filter_write(struct file *file,
					  const char __user *buf,
					  size_t count,
					  loff_t *ppos)
{
	struct task_struct *task;
	struct mm_struct *mm;
	char buffer[PROC_NUMBUF], *end;
	unsigned int val;
	int ret;
	int i;
	unsigned long mask;

	ret = -EFAULT;
	memset(buffer, 0, sizeof(buffer));
	if (count > sizeof(buffer) - 1)
		count = sizeof(buffer) - 1;
	if (copy_from_user(buffer, buf, count))
		goto out_no_task;

	ret = -EINVAL;
	val = (unsigned int)simple_strtoul(buffer, &end, 0);
	if (*end == '\n')
		end++;
	if (end - buffer == 0)
		goto out_no_task;

	ret = -ESRCH;
	task = get_proc_task(file_inode(file));
	if (!task)
		goto out_no_task;

	/* From here on, success returns the consumed length. */
	ret = end - buffer;
	mm = get_task_mm(task);
	if (!mm)
		goto out_no_mm;

	/* set_bit/clear_bit are atomic; no lock on mm->flags needed. */
	for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
		if (val & mask)
			set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
		else
			clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
	}

	mmput(mm);
out_no_mm:
	put_task_struct(task);
out_no_task:
	return ret;
}
2322 2322
/* File ops for /proc/<pid>/coredump_filter. */
static const struct file_operations proc_coredump_filter_operations = {
	.read		= proc_coredump_filter_read,
	.write		= proc_coredump_filter_write,
	.llseek		= generic_file_llseek,
};
2328 #endif 2328 #endif
2329 2329
2330 #ifdef CONFIG_TASK_IO_ACCOUNTING 2330 #ifdef CONFIG_TASK_IO_ACCOUNTING
/*
 * Emit I/O accounting for /proc/<pid>/io.  @whole selects per-thread
 * (0) vs whole-thread-group (1) totals; in the latter case the signal
 * struct's aggregate of dead threads plus every live thread is added
 * under sighand lock.  cred_guard_mutex serializes with exec so the
 * ptrace check cannot be raced into revealing post-exec counters.
 */
static int do_io_accounting(struct task_struct *task, struct seq_file *m, int whole)
{
	struct task_io_accounting acct = task->ioac;
	unsigned long flags;
	int result;

	result = mutex_lock_killable(&task->signal->cred_guard_mutex);
	if (result)
		return result;

	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
		result = -EACCES;
		goto out_unlock;
	}

	if (whole && lock_task_sighand(task, &flags)) {
		struct task_struct *t = task;

		task_io_accounting_add(&acct, &task->signal->ioac);
		while_each_thread(task, t)
			task_io_accounting_add(&acct, &t->ioac);

		unlock_task_sighand(task, &flags);
	}
	result = seq_printf(m,
			"rchar: %llu\n"
			"wchar: %llu\n"
			"syscr: %llu\n"
			"syscw: %llu\n"
			"read_bytes: %llu\n"
			"write_bytes: %llu\n"
			"cancelled_write_bytes: %llu\n",
			(unsigned long long)acct.rchar,
			(unsigned long long)acct.wchar,
			(unsigned long long)acct.syscr,
			(unsigned long long)acct.syscw,
			(unsigned long long)acct.read_bytes,
			(unsigned long long)acct.write_bytes,
			(unsigned long long)acct.cancelled_write_bytes);
out_unlock:
	mutex_unlock(&task->signal->cred_guard_mutex);
	return result;
}
2374 2374
/* /proc/<pid>/task/<tid>/io: single-thread counters only. */
static int proc_tid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
				  struct pid *pid, struct task_struct *task)
{
	return do_io_accounting(task, m, /*whole=*/0);
}
2380 2380
/* /proc/<pid>/io: counters aggregated over the whole thread group. */
static int proc_tgid_io_accounting(struct seq_file *m, struct pid_namespace *ns,
				   struct pid *pid, struct task_struct *task)
{
	return do_io_accounting(task, m, /*whole=*/1);
}
2386 #endif /* CONFIG_TASK_IO_ACCOUNTING */ 2386 #endif /* CONFIG_TASK_IO_ACCOUNTING */
2387 2387
2388 #ifdef CONFIG_USER_NS 2388 #ifdef CONFIG_USER_NS
/*
 * Common open for the uid_map/gid_map/projid_map files: pin the target
 * task's user namespace and stash it as the seq_file private pointer
 * (dropped again in proc_id_map_release()).
 */
static int proc_id_map_open(struct inode *inode, struct file *file,
	const struct seq_operations *seq_ops)
{
	struct user_namespace *ns = NULL;
	struct task_struct *task;
	struct seq_file *seq;
	int ret = -EINVAL;

	task = get_proc_task(inode);
	if (task) {
		rcu_read_lock();
		ns = get_user_ns(task_cred_xxx(task, user_ns));
		rcu_read_unlock();
		put_task_struct(task);
	}
	if (!ns)
		goto err;

	ret = seq_open(file, seq_ops);
	if (ret)
		goto err_put_ns;

	seq = file->private_data;
	seq->private = ns;

	return 0;
err_put_ns:
	put_user_ns(ns);
err:
	return ret;
}
2420 2420
/*
 * Release for the id-map files: drop the user-namespace reference taken in
 * proc_id_map_open() before tearing down the seq_file.
 */
static int proc_id_map_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	put_user_ns(ns);
	return seq_release(inode, file);
}
2428 2428
/* Open /proc/<pid>/uid_map with the uid-map seq operations. */
static int proc_uid_map_open(struct inode *inode, struct file *file)
{
	return proc_id_map_open(inode, file, &proc_uid_seq_operations);
}
2433 2433
/* Open /proc/<pid>/gid_map with the gid-map seq operations. */
static int proc_gid_map_open(struct inode *inode, struct file *file)
{
	return proc_id_map_open(inode, file, &proc_gid_seq_operations);
}
2438 2438
/* Open /proc/<pid>/projid_map with the projid-map seq operations. */
static int proc_projid_map_open(struct inode *inode, struct file *file)
{
	return proc_id_map_open(inode, file, &proc_projid_seq_operations);
}
2443 2443
/* File operations for /proc/<pid>/uid_map (read via seq_file, writable). */
static const struct file_operations proc_uid_map_operations = {
	.open		= proc_uid_map_open,
	.write		= proc_uid_map_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_id_map_release,
};
2451 2451
/* File operations for /proc/<pid>/gid_map (read via seq_file, writable). */
static const struct file_operations proc_gid_map_operations = {
	.open		= proc_gid_map_open,
	.write		= proc_gid_map_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_id_map_release,
};
2459 2459
/* File operations for /proc/<pid>/projid_map (read via seq_file, writable). */
static const struct file_operations proc_projid_map_operations = {
	.open		= proc_projid_map_open,
	.write		= proc_projid_map_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_id_map_release,
};
2467
/*
 * Open /proc/<pid>/setgroups, the per-user-namespace knob that shows and
 * controls whether setgroups(2) is permitted in the target task's user
 * namespace.  A reference on that namespace is taken here and handed to
 * single_open() as seq private data; proc_setgroups_release() drops it.
 * Returns 0, -ESRCH if the task is gone, -EACCES if opened for write
 * without privilege, or the error from single_open().
 */
static int proc_setgroups_open(struct inode *inode, struct file *file)
{
	struct user_namespace *ns = NULL;
	struct task_struct *task;
	int ret;

	ret = -ESRCH;
	task = get_proc_task(inode);
	if (task) {
		/* Pin the task's user namespace under RCU. */
		rcu_read_lock();
		ns = get_user_ns(task_cred_xxx(task, user_ns));
		rcu_read_unlock();
		put_task_struct(task);
	}
	if (!ns)
		goto err;

	/*
	 * The privilege check happens at open time (this is why a proc file
	 * is used rather than a sysctl): writing the knob requires
	 * CAP_SYS_ADMIN over the user namespace being modified.
	 */
	if (file->f_mode & FMODE_WRITE) {
		ret = -EACCES;
		if (!ns_capable(ns, CAP_SYS_ADMIN))
			goto err_put_ns;
	}

	ret = single_open(file, &proc_setgroups_show, ns);
	if (ret)
		goto err_put_ns;

	return 0;
err_put_ns:
	put_user_ns(ns);
err:
	return ret;
}
2501
/*
 * Release /proc/<pid>/setgroups.  The namespace pointer is read out before
 * single_release() frees the seq_file, then the reference taken at open
 * time is dropped.
 */
static int proc_setgroups_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	int ret = single_release(inode, file);
	put_user_ns(ns);
	return ret;
}
2510
/* File operations for /proc/<pid>/setgroups ("allow"/"deny" knob). */
static const struct file_operations proc_setgroups_operations = {
	.open		= proc_setgroups_open,
	.write		= proc_setgroups_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= proc_setgroups_release,
};
2467 #endif /* CONFIG_USER_NS */ 2518 #endif /* CONFIG_USER_NS */
2468 2519
/*
 * /proc/<pid>/personality: print the task's personality word in hex.
 * Access is gated by lock_trace() (presumably a ptrace-style permission
 * check — confirm against lock_trace()'s definition); its error is
 * returned unchanged on failure.
 */
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
				struct pid *pid, struct task_struct *task)
{
	int err = lock_trace(task);
	if (!err) {
		seq_printf(m, "%08x\n", task->personality);
		unlock_trace(task);
	}
	return err;
}
2479 2530
2480 /* 2531 /*
2481 * Thread groups 2532 * Thread groups
2482 */ 2533 */
2483 static const struct file_operations proc_task_operations; 2534 static const struct file_operations proc_task_operations;
2484 static const struct inode_operations proc_task_inode_operations; 2535 static const struct inode_operations proc_task_inode_operations;
2485 2536
/*
 * Entries of every /proc/<tgid>/ directory: name, mode and handler for
 * each top-level per-process file.  Conditional entries are compiled in
 * per CONFIG option.
 */
static const struct pid_entry tgid_base_stuff[] = {
	DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
	DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
#ifdef CONFIG_CHECKPOINT_RESTORE
	DIR("map_files",  S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
#endif
	DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
	DIR("ns",	  S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
	DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
#endif
	REG("environ",    S_IRUSR, proc_environ_operations),
	ONE("auxv",       S_IRUSR, proc_pid_auxv),
	ONE("status",     S_IRUGO, proc_pid_status),
	ONE("personality", S_IRUSR, proc_pid_personality),
	ONE("limits",	  S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
	REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
#endif
#ifdef CONFIG_SCHED_AUTOGROUP
	REG("autogroup",  S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations),
#endif
	REG("comm",      S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
	ONE("syscall",    S_IRUSR, proc_pid_syscall),
#endif
	ONE("cmdline",    S_IRUGO, proc_pid_cmdline),
	ONE("stat",       S_IRUGO, proc_tgid_stat),
	ONE("statm",      S_IRUGO, proc_pid_statm),
	REG("maps",       S_IRUGO, proc_pid_maps_operations),
#ifdef CONFIG_NUMA
	REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
#endif
	REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
	LNK("cwd",        proc_cwd_link),
	LNK("root",       proc_root_link),
	LNK("exe",        proc_exe_link),
	REG("mounts",     S_IRUGO, proc_mounts_operations),
	REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
	REG("mountstats", S_IRUSR, proc_mountstats_operations),
#ifdef CONFIG_PROC_PAGE_MONITOR
	REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
#endif
#ifdef CONFIG_SECURITY
	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
#endif
#ifdef CONFIG_KALLSYMS
	ONE("wchan",      S_IRUGO, proc_pid_wchan),
#endif
#ifdef CONFIG_STACKTRACE
	ONE("stack",      S_IRUSR, proc_pid_stack),
#endif
#ifdef CONFIG_SCHEDSTATS
	ONE("schedstat",  S_IRUGO, proc_pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
	REG("latency",    S_IRUGO, proc_lstats_operations),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
	ONE("cpuset",     S_IRUGO, proc_cpuset_show),
#endif
#ifdef CONFIG_CGROUPS
	ONE("cgroup",     S_IRUGO, proc_cgroup_show),
#endif
	ONE("oom_score",  S_IRUGO, proc_oom_score),
	REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adj_operations),
	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
	REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
	REG("sessionid",  S_IRUGO, proc_sessionid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
	REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
#endif
#ifdef CONFIG_ELF_CORE
	REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
	ONE("io",         S_IRUSR, proc_tgid_io_accounting),
#endif
#ifdef CONFIG_HARDWALL
	ONE("hardwall",   S_IRUGO, proc_pid_hardwall),
#endif
#ifdef CONFIG_USER_NS
	REG("uid_map",    S_IRUGO|S_IWUSR, proc_uid_map_operations),
	REG("gid_map",    S_IRUGO|S_IWUSR, proc_gid_map_operations),
	REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
	REG("setgroups",  S_IRUGO|S_IWUSR, proc_setgroups_operations),
#endif
#ifdef CONFIG_CHECKPOINT_RESTORE
	REG("timers",	  S_IRUGO, proc_timers_operations),
#endif
};
2580 2632
/* readdir for /proc/<tgid>/: iterate the static tgid_base_stuff table. */
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
{
	return proc_pident_readdir(file, ctx,
				   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
2586 2638
/* File operations for the /proc/<tgid>/ directory itself. */
static const struct file_operations proc_tgid_base_operations = {
	.read		= generic_read_dir,
	.iterate	= proc_tgid_base_readdir,
	.llseek		= default_llseek,
};
2592 2644
/* Lookup in /proc/<tgid>/: resolve a name against tgid_base_stuff. */
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	return proc_pident_lookup(dir, dentry,
				  tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
}
2598 2650
/*
 * Inode operations for /proc/<tgid>/.  .permission hooks in
 * proc_pid_permission, which restricts visibility of other tasks'
 * directories (presumably the hidepid policy — confirm at its definition).
 */
static const struct inode_operations proc_tgid_base_inode_operations = {
	.lookup		= proc_tgid_base_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
	.permission	= proc_pid_permission,
};
2605 2657
/*
 * Flush the cached dentries for one task from a single proc mount:
 * /proc/<pid> and, underneath the thread-group leader, /proc/<tgid>/task/<pid>.
 * Only dentries already in the dcache are touched (d_hash_and_lookup()
 * does not instantiate anything new).
 */
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
{
	struct dentry *dentry, *leader, *dir;
	char buf[PROC_NUMBUF];
	struct qstr name;

	/* Drop /proc/<pid> if it is cached. */
	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%d", pid);
	/* no ->d_hash() rejects on procfs */
	dentry = d_hash_and_lookup(mnt->mnt_root, &name);
	if (dentry) {
		d_invalidate(dentry);
		dput(dentry);
	}

	/* Find the thread-group leader dir /proc/<tgid>. */
	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%d", tgid);
	leader = d_hash_and_lookup(mnt->mnt_root, &name);
	if (!leader)
		goto out;

	/* ... then its "task" subdirectory ... */
	name.name = "task";
	name.len = strlen(name.name);
	dir = d_hash_and_lookup(leader, &name);
	if (!dir)
		goto out_put_leader;

	/* ... and drop /proc/<tgid>/task/<pid> if it is cached. */
	name.name = buf;
	name.len = snprintf(buf, sizeof(buf), "%d", pid);
	dentry = d_hash_and_lookup(dir, &name);
	if (dentry) {
		d_invalidate(dentry);
		dput(dentry);
	}

	dput(dir);
out_put_leader:
	dput(leader);
out:
	return;
}
2647 2699
/**
 * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
 * @task: task that should be flushed.
 *
 * When flushing dentries from proc, one needs to flush them from global
 * proc (proc_mnt) and from all the namespaces' procs this task was seen
 * in. This call is supposed to do all of this job.
 *
 * Looks in the dcache for
 * /proc/@pid
 * /proc/@tgid/task/@pid
 * if either directory is present flushes it and all of its children
 * from the dcache.
 *
 * It is safe and reasonable to cache /proc entries for a task until
 * that task exits.  After that they just clog up the dcache with
 * useless entries, possibly causing useful dcache entries to be
 * flushed instead.  This routine is provided to flush those useless
 * dcache entries at process exit time.
 *
 * NOTE: This routine is just an optimization so it does not guarantee
 *       that no dcache entries will exist at process exit time it
 *       just makes it very unlikely that any will persist.
 */
2672 2724
/* See the kernel-doc comment above: flush this task's /proc dentries. */
void proc_flush_task(struct task_struct *task)
{
	int i;
	struct pid *pid, *tgid;
	struct upid *upid;

	pid = task_pid(task);
	tgid = task_tgid(task);

	/* One iteration per pid namespace level this task is visible in;
	 * each level has its own proc mount and numeric pid. */
	for (i = 0; i <= pid->level; i++) {
		upid = &pid->numbers[i];
		proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
				    tgid->numbers[i].nr);
	}
}
2688 2740
/*
 * Instantiate the dentry/inode pair for a /proc/<tgid>/ directory.
 * Returns 0 on success, -ENOENT if the inode cannot be made or the task
 * died before the dentry could be validated.
 */
static int proc_pid_instantiate(struct inode *dir,
				   struct dentry * dentry,
				   struct task_struct *task, const void *ptr)
{
	struct inode *inode;

	inode = proc_pid_make_inode(dir->i_sb, task);
	if (!inode)
		goto out;

	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
	inode->i_op = &proc_tgid_base_inode_operations;
	inode->i_fop = &proc_tgid_base_operations;
	inode->i_flags|=S_IMMUTABLE;

	/* nlink = "." + ".." + one per subdirectory entry in the table. */
	set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff,
						  ARRAY_SIZE(tgid_base_stuff)));

	d_set_d_op(dentry, &pid_dentry_operations);

	d_add(dentry, inode);
	/* Close the race of the process dying before we return the dentry */
	if (pid_revalidate(dentry, 0))
		return 0;
out:
	return -ENOENT;
}
2716 2768
/*
 * Lookup of a numeric name directly under /proc: resolve "<tgid>" to the
 * corresponding process directory in this mount's pid namespace.
 * Returns ERR_PTR(-ENOENT) for non-numeric names or dead/absent tasks.
 */
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
	int result = -ENOENT;
	struct task_struct *task;
	unsigned tgid;
	struct pid_namespace *ns;

	/* name_to_int() yields ~0U when the name is not a plain number. */
	tgid = name_to_int(&dentry->d_name);
	if (tgid == ~0U)
		goto out;

	ns = dentry->d_sb->s_fs_info;
	rcu_read_lock();
	task = find_task_by_pid_ns(tgid, ns);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();
	if (!task)
		goto out;

	result = proc_pid_instantiate(dir, dentry, task, NULL);
	put_task_struct(task);
out:
	return ERR_PTR(result);
}
2742 2794
/*
 * Find the first task with tgid >= tgid
 *
 * Iterator state for walking thread-group leaders in pid order; .task
 * (when non-NULL) holds a task reference owned by the iterator.
 */
struct tgid_iter {
	unsigned int tgid;
	struct task_struct *task;
};
/*
 * Advance the iterator to the first thread-group leader with
 * pid >= iter.tgid in @ns.  Drops the previous task reference and takes
 * one on the returned task; iter.task == NULL signals end of iteration.
 */
static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
{
	struct pid *pid;

	if (iter.task)
		put_task_struct(iter.task);
	rcu_read_lock();
retry:
	iter.task = NULL;
	pid = find_ge_pid(iter.tgid, ns);
	if (pid) {
		iter.tgid = pid_nr_ns(pid, ns);
		iter.task = pid_task(pid, PIDTYPE_PID);
		/* What we want to know is if the pid we have found is the
		 * pid of a thread_group_leader.  Testing for task
		 * being a thread_group_leader is the obvious thing
		 * todo but there is a window when it fails, due to
		 * the pid transfer logic in de_thread.
		 *
		 * So we perform the straight forward test of seeing
		 * if the pid we have found is the pid of a thread
		 * group leader, and don't worry if the task we have
		 * found doesn't happen to be a thread group leader.
		 * As we don't care in the case of readdir.
		 */
		if (!iter.task || !has_group_leader_pid(iter.task)) {
			iter.tgid += 1;
			goto retry;
		}
		get_task_struct(iter.task);
	}
	rcu_read_unlock();
	return iter;
}
2785 2837
/* Directory position of the first <tgid> entry; the two slots just below
 * it hold the "self" and "thread-self" symlinks emitted below. */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + 2)

/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file *file, struct dir_context *ctx)
{
	struct tgid_iter iter;
	struct pid_namespace *ns = file->f_dentry->d_sb->s_fs_info;
	loff_t pos = ctx->pos;

	if (pos >= PID_MAX_LIMIT + TGID_OFFSET)
		return 0;

	/* Emit the "self" symlink at its fixed slot. */
	if (pos == TGID_OFFSET - 2) {
		struct inode *inode = ns->proc_self->d_inode;
		if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK))
			return 0;
		ctx->pos = pos = pos + 1;
	}
	/* Emit the "thread-self" symlink at its fixed slot. */
	if (pos == TGID_OFFSET - 1) {
		struct inode *inode = ns->proc_thread_self->d_inode;
		if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK))
			return 0;
		ctx->pos = pos = pos + 1;
	}
	/* Walk the thread-group leaders from the current position on. */
	iter.tgid = pos - TGID_OFFSET;
	iter.task = NULL;
	for (iter = next_tgid(ns, iter);
	     iter.task;
	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
		char name[PROC_NUMBUF];
		int len;
		/* Skip tasks the opener may not see (presumably the hidepid
		 * policy — confirm at has_pid_permissions()). */
		if (!has_pid_permissions(ns, iter.task, 2))
			continue;

		len = snprintf(name, sizeof(name), "%d", iter.tgid);
		ctx->pos = iter.tgid + TGID_OFFSET;
		if (!proc_fill_cache(file, ctx, name, len,
				     proc_pid_instantiate, iter.task, NULL)) {
			/* dir_emit stopped early: drop the iterator's ref. */
			put_task_struct(iter.task);
			return 0;
		}
	}
	ctx->pos = PID_MAX_LIMIT + TGID_OFFSET;
	return 0;
}
2831 2883
2832 /* 2884 /*
2833 * Tasks 2885 * Tasks
2834 */ 2886 */
2835 static const struct pid_entry tid_base_stuff[] = { 2887 static const struct pid_entry tid_base_stuff[] = {
2836 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2888 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2837 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2889 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2838 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 2890 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2839 #ifdef CONFIG_NET 2891 #ifdef CONFIG_NET
2840 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), 2892 DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2841 #endif 2893 #endif
2842 REG("environ", S_IRUSR, proc_environ_operations), 2894 REG("environ", S_IRUSR, proc_environ_operations),
2843 ONE("auxv", S_IRUSR, proc_pid_auxv), 2895 ONE("auxv", S_IRUSR, proc_pid_auxv),
2844 ONE("status", S_IRUGO, proc_pid_status), 2896 ONE("status", S_IRUGO, proc_pid_status),
2845 ONE("personality", S_IRUSR, proc_pid_personality), 2897 ONE("personality", S_IRUSR, proc_pid_personality),
2846 ONE("limits", S_IRUGO, proc_pid_limits), 2898 ONE("limits", S_IRUGO, proc_pid_limits),
2847 #ifdef CONFIG_SCHED_DEBUG 2899 #ifdef CONFIG_SCHED_DEBUG
2848 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), 2900 REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2849 #endif 2901 #endif
2850 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations), 2902 REG("comm", S_IRUGO|S_IWUSR, proc_pid_set_comm_operations),
2851 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK 2903 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2852 ONE("syscall", S_IRUSR, proc_pid_syscall), 2904 ONE("syscall", S_IRUSR, proc_pid_syscall),
2853 #endif 2905 #endif
2854 ONE("cmdline", S_IRUGO, proc_pid_cmdline), 2906 ONE("cmdline", S_IRUGO, proc_pid_cmdline),
2855 ONE("stat", S_IRUGO, proc_tid_stat), 2907 ONE("stat", S_IRUGO, proc_tid_stat),
2856 ONE("statm", S_IRUGO, proc_pid_statm), 2908 ONE("statm", S_IRUGO, proc_pid_statm),
2857 REG("maps", S_IRUGO, proc_tid_maps_operations), 2909 REG("maps", S_IRUGO, proc_tid_maps_operations),
2858 #ifdef CONFIG_CHECKPOINT_RESTORE 2910 #ifdef CONFIG_CHECKPOINT_RESTORE
2859 REG("children", S_IRUGO, proc_tid_children_operations), 2911 REG("children", S_IRUGO, proc_tid_children_operations),
2860 #endif 2912 #endif
2861 #ifdef CONFIG_NUMA 2913 #ifdef CONFIG_NUMA
2862 REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), 2914 REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations),
2863 #endif 2915 #endif
2864 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), 2916 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2865 LNK("cwd", proc_cwd_link), 2917 LNK("cwd", proc_cwd_link),
2866 LNK("root", proc_root_link), 2918 LNK("root", proc_root_link),
2867 LNK("exe", proc_exe_link), 2919 LNK("exe", proc_exe_link),
2868 REG("mounts", S_IRUGO, proc_mounts_operations), 2920 REG("mounts", S_IRUGO, proc_mounts_operations),
2869 REG("mountinfo", S_IRUGO, proc_mountinfo_operations), 2921 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2870 #ifdef CONFIG_PROC_PAGE_MONITOR 2922 #ifdef CONFIG_PROC_PAGE_MONITOR
2871 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2923 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2872 REG("smaps", S_IRUGO, proc_tid_smaps_operations), 2924 REG("smaps", S_IRUGO, proc_tid_smaps_operations),
2873 REG("pagemap", S_IRUSR, proc_pagemap_operations), 2925 REG("pagemap", S_IRUSR, proc_pagemap_operations),
2874 #endif 2926 #endif
2875 #ifdef CONFIG_SECURITY 2927 #ifdef CONFIG_SECURITY
2876 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), 2928 DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2877 #endif 2929 #endif
2878 #ifdef CONFIG_KALLSYMS 2930 #ifdef CONFIG_KALLSYMS
2879 ONE("wchan", S_IRUGO, proc_pid_wchan), 2931 ONE("wchan", S_IRUGO, proc_pid_wchan),
2880 #endif 2932 #endif
2881 #ifdef CONFIG_STACKTRACE 2933 #ifdef CONFIG_STACKTRACE
2882 ONE("stack", S_IRUSR, proc_pid_stack), 2934 ONE("stack", S_IRUSR, proc_pid_stack),
2883 #endif 2935 #endif
2884 #ifdef CONFIG_SCHEDSTATS 2936 #ifdef CONFIG_SCHEDSTATS
2885 ONE("schedstat", S_IRUGO, proc_pid_schedstat), 2937 ONE("schedstat", S_IRUGO, proc_pid_schedstat),
2886 #endif 2938 #endif
2887 #ifdef CONFIG_LATENCYTOP 2939 #ifdef CONFIG_LATENCYTOP
2888 REG("latency", S_IRUGO, proc_lstats_operations), 2940 REG("latency", S_IRUGO, proc_lstats_operations),
2889 #endif 2941 #endif
2890 #ifdef CONFIG_PROC_PID_CPUSET 2942 #ifdef CONFIG_PROC_PID_CPUSET
2891 ONE("cpuset", S_IRUGO, proc_cpuset_show), 2943 ONE("cpuset", S_IRUGO, proc_cpuset_show),
2892 #endif 2944 #endif
2893 #ifdef CONFIG_CGROUPS 2945 #ifdef CONFIG_CGROUPS
2894 ONE("cgroup", S_IRUGO, proc_cgroup_show), 2946 ONE("cgroup", S_IRUGO, proc_cgroup_show),
2895 #endif 2947 #endif
2896 ONE("oom_score", S_IRUGO, proc_oom_score), 2948 ONE("oom_score", S_IRUGO, proc_oom_score),
2897 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2949 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
2898 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2950 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2899 #ifdef CONFIG_AUDITSYSCALL 2951 #ifdef CONFIG_AUDITSYSCALL
2900 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2952 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
2901 REG("sessionid", S_IRUGO, proc_sessionid_operations), 2953 REG("sessionid", S_IRUGO, proc_sessionid_operations),
2902 #endif 2954 #endif
2903 #ifdef CONFIG_FAULT_INJECTION 2955 #ifdef CONFIG_FAULT_INJECTION
2904 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), 2956 REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2905 #endif 2957 #endif
2906 #ifdef CONFIG_TASK_IO_ACCOUNTING 2958 #ifdef CONFIG_TASK_IO_ACCOUNTING
2907 ONE("io", S_IRUSR, proc_tid_io_accounting), 2959 ONE("io", S_IRUSR, proc_tid_io_accounting),
2908 #endif 2960 #endif
2909 #ifdef CONFIG_HARDWALL 2961 #ifdef CONFIG_HARDWALL
2910 ONE("hardwall", S_IRUGO, proc_pid_hardwall), 2962 ONE("hardwall", S_IRUGO, proc_pid_hardwall),
2911 #endif 2963 #endif
2912 #ifdef CONFIG_USER_NS 2964 #ifdef CONFIG_USER_NS
2913 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2965 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
2914 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), 2966 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
2915 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), 2967 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
2968 REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
2916 #endif 2969 #endif
2917 }; 2970 };
2918 2971
2919 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) 2972 static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
2920 { 2973 {
2921 return proc_pident_readdir(file, ctx, 2974 return proc_pident_readdir(file, ctx,
2922 tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); 2975 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
2923 } 2976 }
2924 2977
2925 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2978 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
2926 { 2979 {
2927 return proc_pident_lookup(dir, dentry, 2980 return proc_pident_lookup(dir, dentry,
2928 tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); 2981 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
2929 } 2982 }
2930 2983
/* File operations for a /proc/<tgid>/task/<tid> directory inode. */
static const struct file_operations proc_tid_base_operations = {
	.read		= generic_read_dir,
	.iterate	= proc_tid_base_readdir,
	.llseek		= default_llseek,
};
2936 2989
/* Inode operations for a /proc/<tgid>/task/<tid> directory inode. */
static const struct inode_operations proc_tid_base_inode_operations = {
	.lookup		= proc_tid_base_lookup,
	.getattr	= pid_getattr,
	.setattr	= proc_setattr,
};
2942 2995
/*
 * Create the inode + dentry for one /proc/<tgid>/task/<tid> directory.
 * Returns 0 on success, -ENOENT if the inode could not be made or the
 * task died before the dentry became valid.
 */
static int proc_task_instantiate(struct inode *dir,
	struct dentry *dentry, struct task_struct *task, const void *ptr)
{
	struct inode *inode;
	inode = proc_pid_make_inode(dir->i_sb, task);

	if (!inode)
		goto out;
	/* World-readable, world-searchable directory; contents come from tid_base_stuff. */
	inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
	inode->i_op = &proc_tid_base_inode_operations;
	inode->i_fop = &proc_tid_base_operations;
	inode->i_flags|=S_IMMUTABLE;

	/* "." and ".." plus one link per subdirectory entry in the table. */
	set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff,
						  ARRAY_SIZE(tid_base_stuff)));

	d_set_d_op(dentry, &pid_dentry_operations);

	d_add(dentry, inode);
	/* Close the race of the process dying before we return the dentry */
	if (pid_revalidate(dentry, 0))
		return 0;
out:
	return -ENOENT;
}
2968 3021
/*
 * Look up a /proc/<tgid>/task/<tid> entry by name.  The name must be a
 * decimal tid belonging to the same thread group as the directory's
 * leader; otherwise -ENOENT.
 */
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags)
{
	int result = -ENOENT;
	struct task_struct *task;
	struct task_struct *leader = get_proc_task(dir);
	unsigned tid;
	struct pid_namespace *ns;

	if (!leader)
		goto out_no_task;

	tid = name_to_int(&dentry->d_name);
	if (tid == ~0U)		/* not a valid decimal number */
		goto out;

	ns = dentry->d_sb->s_fs_info;
	/* Take a task reference under RCU so it stays valid after unlock. */
	rcu_read_lock();
	task = find_task_by_pid_ns(tid, ns);
	if (task)
		get_task_struct(task);
	rcu_read_unlock();
	if (!task)
		goto out;
	/* Only threads of this tgid are visible under task/. */
	if (!same_thread_group(leader, task))
		goto out_drop_task;

	result = proc_task_instantiate(dir, dentry, task, NULL);
out_drop_task:
	put_task_struct(task);
out:
	put_task_struct(leader);
out_no_task:
	return ERR_PTR(result);
}
3003 3056
/*
 * Find the first tid of a thread group to return to user space.
 *
 * Usually this is just the thread group leader, but if the users
 * buffer was too small or there was a seek into the middle of the
 * directory we have more work to do.
 *
 * In the case of a short read we start with find_task_by_pid_ns.
 *
 * In the case of a seek we start with the leader and walk nr
 * threads past it.
 */
static struct task_struct *first_tid(struct pid *pid, int tid, loff_t f_pos,
					struct pid_namespace *ns)
{
	struct task_struct *pos, *task;
	unsigned long nr = f_pos;

	if (nr != f_pos)	/* 32bit overflow? */
		return NULL;

	rcu_read_lock();
	task = pid_task(pid, PIDTYPE_PID);
	if (!task)
		goto fail;

	/* Attempt to start with the tid of a thread */
	if (tid && nr) {
		pos = find_task_by_pid_ns(tid, ns);
		/* The cached tid is only usable if it is still in this group. */
		if (pos && same_thread_group(pos, task))
			goto found;
	}

	/* If nr exceeds the number of threads there is nothing to do */
	if (nr >= get_nr_threads(task))
		goto fail;

	/* If we haven't found our starting place yet start
	 * with the leader and walk nr threads forward.
	 */
	pos = task = task->group_leader;
	do {
		if (!nr--)
			goto found;
	} while_each_thread(task, pos);
fail:
	pos = NULL;
	goto out;
found:
	/* Pin the task before dropping RCU so the caller may use it. */
	get_task_struct(pos);
out:
	rcu_read_unlock();
	return pos;
}
3058 3111
/*
 * Find the next thread in the thread list.
 * Return NULL if there is an error or no next thread.
 *
 * The reference to the input task_struct is released.
 */
static struct task_struct *next_tid(struct task_struct *start)
{
	struct task_struct *pos = NULL;
	rcu_read_lock();
	if (pid_alive(start)) {
		pos = next_thread(start);
		/* Wrapping back to the group leader means we walked every thread. */
		if (thread_group_leader(pos))
			pos = NULL;
		else
			get_task_struct(pos);
	}
	rcu_read_unlock();
	put_task_struct(start);
	return pos;
}
3080 3133
/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct task_struct *task;
	struct pid_namespace *ns;
	int tid;

	if (proc_inode_is_dead(inode))
		return -ENOENT;

	if (!dir_emit_dots(file, ctx))
		return 0;

	/* f_version caches the tgid value that the last readdir call couldn't
	 * return. lseek aka telldir automagically resets f_version to 0.
	 */
	ns = file->f_dentry->d_sb->s_fs_info;
	tid = (int)file->f_version;
	file->f_version = 0;
	/* ctx->pos - 2 accounts for the "." and ".." entries emitted above. */
	for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
	     task;
	     task = next_tid(task), ctx->pos++) {
		char name[PROC_NUMBUF];
		int len;
		tid = task_pid_nr_ns(task, ns);
		len = snprintf(name, sizeof(name), "%d", tid);
		if (!proc_fill_cache(file, ctx, name, len,
				proc_task_instantiate, task, NULL)) {
			/* returning this tid failed, save it as the first
			 * pid for the next readdir call */
			file->f_version = (u64)tid;
			put_task_struct(task);
			break;
		}
	}

	return 0;
}
3120 3173
3121 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) 3174 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
3122 { 3175 {
3123 struct inode *inode = dentry->d_inode; 3176 struct inode *inode = dentry->d_inode;
3124 struct task_struct *p = get_proc_task(inode); 3177 struct task_struct *p = get_proc_task(inode);
3125 generic_fillattr(inode, stat); 3178 generic_fillattr(inode, stat);
3126 3179
3127 if (p) { 3180 if (p) {
3128 stat->nlink += get_nr_threads(p); 3181 stat->nlink += get_nr_threads(p);
3129 put_task_struct(p); 3182 put_task_struct(p);
3130 } 3183 }
3131 3184
3132 return 0; 3185 return 0;
3133 } 3186 }
3134 3187
/* Inode operations for the /proc/<tgid>/task directory. */
static const struct inode_operations proc_task_inode_operations = {
	.lookup		= proc_task_lookup,
	.getattr	= proc_task_getattr,
	.setattr	= proc_setattr,
	.permission	= proc_pid_permission,
};
3141 3194
/* File operations for the /proc/<tgid>/task directory. */
static const struct file_operations proc_task_operations = {
	.read		= generic_read_dir,
	.iterate	= proc_task_readdir,
	.llseek		= default_llseek,
};
3147 3200
include/linux/user_namespace.h
1 #ifndef _LINUX_USER_NAMESPACE_H 1 #ifndef _LINUX_USER_NAMESPACE_H
2 #define _LINUX_USER_NAMESPACE_H 2 #define _LINUX_USER_NAMESPACE_H
3 3
4 #include <linux/kref.h> 4 #include <linux/kref.h>
5 #include <linux/nsproxy.h> 5 #include <linux/nsproxy.h>
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/err.h> 7 #include <linux/err.h>
8 8
9 #define UID_GID_MAP_MAX_EXTENTS 5 9 #define UID_GID_MAP_MAX_EXTENTS 5
10 10
11 struct uid_gid_map { /* 64 bytes -- 1 cache line */ 11 struct uid_gid_map { /* 64 bytes -- 1 cache line */
12 u32 nr_extents; 12 u32 nr_extents;
13 struct uid_gid_extent { 13 struct uid_gid_extent {
14 u32 first; 14 u32 first;
15 u32 lower_first; 15 u32 lower_first;
16 u32 count; 16 u32 count;
17 } extent[UID_GID_MAP_MAX_EXTENTS]; 17 } extent[UID_GID_MAP_MAX_EXTENTS];
18 }; 18 };
19 19
20 #define USERNS_SETGROUPS_ALLOWED 1UL
21
22 #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
23
20 struct user_namespace { 24 struct user_namespace {
21 struct uid_gid_map uid_map; 25 struct uid_gid_map uid_map;
22 struct uid_gid_map gid_map; 26 struct uid_gid_map gid_map;
23 struct uid_gid_map projid_map; 27 struct uid_gid_map projid_map;
24 atomic_t count; 28 atomic_t count;
25 struct user_namespace *parent; 29 struct user_namespace *parent;
26 int level; 30 int level;
27 kuid_t owner; 31 kuid_t owner;
28 kgid_t group; 32 kgid_t group;
29 unsigned int proc_inum; 33 unsigned int proc_inum;
34 unsigned long flags;
30 35
31 /* Register of per-UID persistent keyrings for this namespace */ 36 /* Register of per-UID persistent keyrings for this namespace */
32 #ifdef CONFIG_PERSISTENT_KEYRINGS 37 #ifdef CONFIG_PERSISTENT_KEYRINGS
33 struct key *persistent_keyring_register; 38 struct key *persistent_keyring_register;
34 struct rw_semaphore persistent_keyring_register_sem; 39 struct rw_semaphore persistent_keyring_register_sem;
35 #endif 40 #endif
36 }; 41 };
37 42
38 extern struct user_namespace init_user_ns; 43 extern struct user_namespace init_user_ns;
39 44
40 #ifdef CONFIG_USER_NS 45 #ifdef CONFIG_USER_NS
41 46
42 static inline struct user_namespace *get_user_ns(struct user_namespace *ns) 47 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
43 { 48 {
44 if (ns) 49 if (ns)
45 atomic_inc(&ns->count); 50 atomic_inc(&ns->count);
46 return ns; 51 return ns;
47 } 52 }
48 53
49 extern int create_user_ns(struct cred *new); 54 extern int create_user_ns(struct cred *new);
50 extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); 55 extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
51 extern void free_user_ns(struct user_namespace *ns); 56 extern void free_user_ns(struct user_namespace *ns);
52 57
53 static inline void put_user_ns(struct user_namespace *ns) 58 static inline void put_user_ns(struct user_namespace *ns)
54 { 59 {
55 if (ns && atomic_dec_and_test(&ns->count)) 60 if (ns && atomic_dec_and_test(&ns->count))
56 free_user_ns(ns); 61 free_user_ns(ns);
57 } 62 }
58 63
59 struct seq_operations; 64 struct seq_operations;
60 extern const struct seq_operations proc_uid_seq_operations; 65 extern const struct seq_operations proc_uid_seq_operations;
61 extern const struct seq_operations proc_gid_seq_operations; 66 extern const struct seq_operations proc_gid_seq_operations;
62 extern const struct seq_operations proc_projid_seq_operations; 67 extern const struct seq_operations proc_projid_seq_operations;
63 extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); 68 extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
64 extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); 69 extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
65 extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); 70 extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
71 extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
72 extern int proc_setgroups_show(struct seq_file *m, void *v);
66 extern bool userns_may_setgroups(const struct user_namespace *ns); 73 extern bool userns_may_setgroups(const struct user_namespace *ns);
67 #else 74 #else
68 75
69 static inline struct user_namespace *get_user_ns(struct user_namespace *ns) 76 static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
70 { 77 {
71 return &init_user_ns; 78 return &init_user_ns;
72 } 79 }
73 80
74 static inline int create_user_ns(struct cred *new) 81 static inline int create_user_ns(struct cred *new)
75 { 82 {
76 return -EINVAL; 83 return -EINVAL;
77 } 84 }
78 85
79 static inline int unshare_userns(unsigned long unshare_flags, 86 static inline int unshare_userns(unsigned long unshare_flags,
80 struct cred **new_cred) 87 struct cred **new_cred)
81 { 88 {
82 if (unshare_flags & CLONE_NEWUSER) 89 if (unshare_flags & CLONE_NEWUSER)
83 return -EINVAL; 90 return -EINVAL;
84 return 0; 91 return 0;
85 } 92 }
86 93
87 static inline void put_user_ns(struct user_namespace *ns) 94 static inline void put_user_ns(struct user_namespace *ns)
88 { 95 {
89 } 96 }
90 97
91 static inline bool userns_may_setgroups(const struct user_namespace *ns) 98 static inline bool userns_may_setgroups(const struct user_namespace *ns)
92 { 99 {
93 return true; 100 return true;
94 } 101 }
95 #endif 102 #endif
96 103
97 #endif /* _LINUX_USER_H */ 104 #endif /* _LINUX_USER_H */
98 105
1 /* 1 /*
2 * The "user cache". 2 * The "user cache".
3 * 3 *
4 * (C) Copyright 1991-2000 Linus Torvalds 4 * (C) Copyright 1991-2000 Linus Torvalds
5 * 5 *
6 * We have a per-user structure to keep track of how many 6 * We have a per-user structure to keep track of how many
7 * processes, files etc the user has claimed, in order to be 7 * processes, files etc the user has claimed, in order to be
8 * able to have per-user limits for system resources. 8 * able to have per-user limits for system resources.
9 */ 9 */
10 10
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/sched.h> 12 #include <linux/sched.h>
13 #include <linux/slab.h> 13 #include <linux/slab.h>
14 #include <linux/bitops.h> 14 #include <linux/bitops.h>
15 #include <linux/key.h> 15 #include <linux/key.h>
16 #include <linux/interrupt.h> 16 #include <linux/interrupt.h>
17 #include <linux/export.h> 17 #include <linux/export.h>
18 #include <linux/user_namespace.h> 18 #include <linux/user_namespace.h>
19 #include <linux/proc_ns.h> 19 #include <linux/proc_ns.h>
20 20
/*
 * userns count is 1 for root user, 1 for init_uts_ns,
 * and 1 for... ?
 */
struct user_namespace init_user_ns = {
	/* Identity mappings covering the whole 32-bit id space. */
	.uid_map = {
		.nr_extents = 1,
		.extent[0] = {
			.first = 0,
			.lower_first = 0,
			.count = 4294967295U,
		},
	},
	.gid_map = {
		.nr_extents = 1,
		.extent[0] = {
			.first = 0,
			.lower_first = 0,
			.count = 4294967295U,
		},
	},
	.projid_map = {
		.nr_extents = 1,
		.extent[0] = {
			.first = 0,
			.lower_first = 0,
			.count = 4294967295U,
		},
	},
	.count = ATOMIC_INIT(3),
	.owner = GLOBAL_ROOT_UID,
	.group = GLOBAL_ROOT_GID,
	.proc_inum = PROC_USER_INIT_INO,
	/* setgroups starts out allowed in the initial namespace. */
	.flags = USERNS_INIT_FLAGS,
#ifdef CONFIG_PERSISTENT_KEYRINGS
	.persistent_keyring_register_sem =
	__RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
#endif
};
EXPORT_SYMBOL_GPL(init_user_ns);
60 61
/*
 * UID task count cache, to get fast user lookup in "alloc_uid"
 * when changing user ID's (ie setuid() and friends).
 */

#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 7)
#define UIDHASH_SZ	(1 << UIDHASH_BITS)
#define UIDHASH_MASK	(UIDHASH_SZ - 1)
/* Fold the upper hash bits into the lower ones before masking. */
#define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
#define uidhashentry(uid)	(uidhash_table + __uidhashfn((__kuid_val(uid))))

static struct kmem_cache *uid_cachep;
struct hlist_head uidhash_table[UIDHASH_SZ];

/*
 * The uidhash_lock is mostly taken from process context, but it is
 * occasionally also taken from softirq/tasklet context, when
 * task-structs get RCU-freed. Hence all locking must be softirq-safe.
 * But free_uid() is also called with local interrupts disabled, and running
 * local_bh_enable() with local interrupts disabled is an error - we'll run
 * softirq callbacks, and they can unconditionally enable interrupts, and
 * the caller of free_uid() didn't expect that..
 */
static DEFINE_SPINLOCK(uidhash_lock);
85 86
/* root_user.__count is 1, for init task cred */
struct user_struct root_user = {
	.__count	= ATOMIC_INIT(1),
	.processes	= ATOMIC_INIT(1),
	.sigpending	= ATOMIC_INIT(0),
	.locked_shm	= 0,
	.uid		= GLOBAL_ROOT_UID,
};
94 95
/*
 * These routines must be called with the uidhash spinlock held!
 */
static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
{
	hlist_add_head(&up->uidhash_node, hashent);
}
102 103
/* Unlink @up from the uid hash; caller holds uidhash_lock. */
static void uid_hash_remove(struct user_struct *up)
{
	hlist_del_init(&up->uidhash_node);
}
107 108
/*
 * Look up @uid in one hash bucket; caller holds uidhash_lock.
 * On a hit a reference is taken before returning the user_struct.
 */
static struct user_struct *uid_hash_find(kuid_t uid, struct hlist_head *hashent)
{
	struct user_struct *user;

	hlist_for_each_entry(user, hashent, uidhash_node) {
		if (uid_eq(user->uid, uid)) {
			atomic_inc(&user->__count);
			return user;
		}
	}

	return NULL;
}
121 122
/* IRQs are disabled and uidhash_lock is held upon function entry.
 * IRQ state (as stored in flags) is restored and uidhash_lock released
 * upon function exit.
 */
static void free_user(struct user_struct *up, unsigned long flags)
	__releases(&uidhash_lock)
{
	uid_hash_remove(up);
	spin_unlock_irqrestore(&uidhash_lock, flags);
	/* Keyring puts must happen outside the spinlock. */
	key_put(up->uid_keyring);
	key_put(up->session_keyring);
	kmem_cache_free(uid_cachep, up);
}
135 136
/*
 * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
 * caller must undo that ref with free_uid().
 *
 * If the user_struct could not be found, return NULL.
 */
struct user_struct *find_user(kuid_t uid)
{
	struct user_struct *ret;
	unsigned long flags;

	spin_lock_irqsave(&uidhash_lock, flags);
	ret = uid_hash_find(uid, uidhashentry(uid));
	spin_unlock_irqrestore(&uidhash_lock, flags);
	return ret;
}
152 153
/*
 * Drop one reference on @up; on the last put, the struct is unhashed and
 * freed.  atomic_dec_and_lock() only takes uidhash_lock when the count
 * actually hits zero, keeping the common path lock-free.
 */
void free_uid(struct user_struct *up)
{
	unsigned long flags;

	if (!up)
		return;

	local_irq_save(flags);
	if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
		free_user(up, flags);	/* releases uidhash_lock + restores IRQs */
	else
		local_irq_restore(flags);
}
166 167
/*
 * Find or create the user_struct for @uid, returning it with a reference
 * held, or NULL on allocation failure.
 */
struct user_struct *alloc_uid(kuid_t uid)
{
	struct hlist_head *hashent = uidhashentry(uid);
	struct user_struct *up, *new;

	spin_lock_irq(&uidhash_lock);
	up = uid_hash_find(uid, hashent);
	spin_unlock_irq(&uidhash_lock);

	if (!up) {
		new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL);
		if (!new)
			goto out_unlock;

		new->uid = uid;
		atomic_set(&new->__count, 1);

		/*
		 * Before adding this, check whether we raced
		 * on adding the same user already..
		 */
		spin_lock_irq(&uidhash_lock);
		up = uid_hash_find(uid, hashent);
		if (up) {
			/* Lost the race: discard our copy, use the winner's. */
			key_put(new->uid_keyring);
			key_put(new->session_keyring);
			kmem_cache_free(uid_cachep, new);
		} else {
			uid_hash_insert(new, hashent);
			up = new;
		}
		spin_unlock_irq(&uidhash_lock);
	}

	return up;

	/* NOTE(review): no lock is held here; the label name looks historical. */
out_unlock:
	return NULL;
}
206 207
/*
 * Boot-time setup: create the user_struct slab cache, initialise the uid
 * hash table, and insert root_user so the root user is always present.
 */
static int __init uid_cache_init(void)
{
	int n;

	/* SLAB_PANIC: cache creation failure is fatal at boot. */
	uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	for(n = 0; n < UIDHASH_SZ; ++n)
		INIT_HLIST_HEAD(uidhash_table + n);

	/* Insert the root user immediately (init already runs as root) */
	spin_lock_irq(&uidhash_lock);
	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
	spin_unlock_irq(&uidhash_lock);

	return 0;
}
subsys_initcall(uid_cache_init);
225 226
kernel/user_namespace.c
1 /* 1 /*
2 * This program is free software; you can redistribute it and/or 2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License as 3 * modify it under the terms of the GNU General Public License as
4 * published by the Free Software Foundation, version 2 of the 4 * published by the Free Software Foundation, version 2 of the
5 * License. 5 * License.
6 */ 6 */
7 7
8 #include <linux/export.h> 8 #include <linux/export.h>
9 #include <linux/nsproxy.h> 9 #include <linux/nsproxy.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/user_namespace.h> 11 #include <linux/user_namespace.h>
12 #include <linux/proc_ns.h> 12 #include <linux/proc_ns.h>
13 #include <linux/highuid.h> 13 #include <linux/highuid.h>
14 #include <linux/cred.h> 14 #include <linux/cred.h>
15 #include <linux/securebits.h> 15 #include <linux/securebits.h>
16 #include <linux/keyctl.h> 16 #include <linux/keyctl.h>
17 #include <linux/key-type.h> 17 #include <linux/key-type.h>
18 #include <keys/user-type.h> 18 #include <keys/user-type.h>
19 #include <linux/seq_file.h> 19 #include <linux/seq_file.h>
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/uaccess.h> 21 #include <linux/uaccess.h>
22 #include <linux/ctype.h> 22 #include <linux/ctype.h>
23 #include <linux/projid.h> 23 #include <linux/projid.h>
24 #include <linux/fs_struct.h> 24 #include <linux/fs_struct.h>
25 25
26 static struct kmem_cache *user_ns_cachep __read_mostly; 26 static struct kmem_cache *user_ns_cachep __read_mostly;
27 static DEFINE_MUTEX(userns_state_mutex); 27 static DEFINE_MUTEX(userns_state_mutex);
28 28
29 static bool new_idmap_permitted(const struct file *file, 29 static bool new_idmap_permitted(const struct file *file,
30 struct user_namespace *ns, int cap_setid, 30 struct user_namespace *ns, int cap_setid,
31 struct uid_gid_map *map); 31 struct uid_gid_map *map);
32 32
/*
 * Switch @cred into @user_ns with the default credential state for a
 * namespace entry: default securebits, full (but namespace-bound)
 * capability sets, and no inherited request-key authorisation.
 */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
	/* Start with the same capabilities as init but useless for doing
	 * anything as the capabilities are bound to the new user namespace.
	 */
	cred->securebits = SECUREBITS_DEFAULT;
	cred->cap_inheritable = CAP_EMPTY_SET;
	cred->cap_permitted = CAP_FULL_SET;
	cred->cap_effective = CAP_FULL_SET;
	cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
	/* Drop any request_key() authorisation carried over from the caller. */
	key_put(cred->request_key_auth);
	cred->request_key_auth = NULL;
#endif
	/* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
	cred->user_ns = user_ns;
}
50 50
/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials.
 *
 * Returns 0 on success or a negative errno (-EUSERS, -EPERM, -ENOMEM, or
 * the proc inode-number allocation error).
 */
int create_user_ns(struct cred *new)
{
	struct user_namespace *ns, *parent_ns = new->user_ns;
	kuid_t owner = new->euid;
	kgid_t group = new->egid;
	int ret;

	/* Cap user namespace nesting depth. */
	if (parent_ns->level > 32)
		return -EUSERS;

	/*
	 * Verify that we can not violate the policy of which files
	 * may be accessed that is specified by the root directory,
	 * by verifing that the root directory is at the root of the
	 * mount namespace which allows all files to be accessed.
	 */
	if (current_chrooted())
		return -EPERM;

	/* The creator needs a mapping in the parent user namespace
	 * or else we won't be able to reasonably tell userspace who
	 * created a user_namespace.
	 */
	if (!kuid_has_mapping(parent_ns, owner) ||
	    !kgid_has_mapping(parent_ns, group))
		return -EPERM;

	ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
	if (!ns)
		return -ENOMEM;

	ret = proc_alloc_inum(&ns->proc_inum);
	if (ret) {
		kmem_cache_free(user_ns_cachep, ns);
		return ret;
	}

	atomic_set(&ns->count, 1);
	/* Leave the new->user_ns reference with the new user namespace. */
	ns->parent = parent_ns;
	ns->level = parent_ns->level + 1;
	ns->owner = owner;
	ns->group = group;

	/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
	mutex_lock(&userns_state_mutex);
	ns->flags = parent_ns->flags;
	mutex_unlock(&userns_state_mutex);

	set_cred_user_ns(new, ns);

#ifdef CONFIG_PERSISTENT_KEYRINGS
	init_rwsem(&ns->persistent_keyring_register_sem);
#endif
	return 0;
}
110 115
111 int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) 116 int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
112 { 117 {
113 struct cred *cred; 118 struct cred *cred;
114 int err = -ENOMEM; 119 int err = -ENOMEM;
115 120
116 if (!(unshare_flags & CLONE_NEWUSER)) 121 if (!(unshare_flags & CLONE_NEWUSER))
117 return 0; 122 return 0;
118 123
119 cred = prepare_creds(); 124 cred = prepare_creds();
120 if (cred) { 125 if (cred) {
121 err = create_user_ns(cred); 126 err = create_user_ns(cred);
122 if (err) 127 if (err)
123 put_cred(cred); 128 put_cred(cred);
124 else 129 else
125 *new_cred = cred; 130 *new_cred = cred;
126 } 131 }
127 132
128 return err; 133 return err;
129 } 134 }
130 135
/*
 * Free a user namespace and walk up the parent chain, freeing each
 * ancestor whose refcount drops to zero along the way.
 *
 * NOTE(review): the first iteration frees @ns without touching its own
 * count — the caller is presumed to have already dropped the last
 * reference; confirm against put_user_ns().
 */
void free_user_ns(struct user_namespace *ns)
{
	struct user_namespace *parent;

	do {
		parent = ns->parent;
#ifdef CONFIG_PERSISTENT_KEYRINGS
		key_put(ns->persistent_keyring_register);
#endif
		proc_free_inum(ns->proc_inum);
		kmem_cache_free(user_ns_cachep, ns);
		ns = parent;
	} while (atomic_dec_and_test(&parent->count));
}
EXPORT_SYMBOL(free_user_ns);
146 151
/*
 * Map the id range [@id, @id + @count - 1] down through @map.  The whole
 * range must lie within a single extent; otherwise (u32)-1 is returned.
 */
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
	unsigned idx, extents;
	u32 first, last, id2;

	id2 = id + count - 1;

	/* Find the matching extent */
	extents = map->nr_extents;
	/* Order the nr_extents load before the extent array reads —
	 * presumably paired with a write barrier where the map is
	 * installed; TODO confirm against the map-writing path. */
	smp_rmb();
	for (idx = 0; idx < extents; idx++) {
		first = map->extent[idx].first;
		last = first + map->extent[idx].count - 1;
		if (id >= first && id <= last &&
		    (id2 >= first && id2 <= last))
			break;
	}
	/* Map the id or note failure */
	if (idx < extents)
		id = (id - first) + map->extent[idx].lower_first;
	else
		id = (u32) -1;

	return id;
}
172 177
173 static u32 map_id_down(struct uid_gid_map *map, u32 id) 178 static u32 map_id_down(struct uid_gid_map *map, u32 id)
174 { 179 {
175 unsigned idx, extents; 180 unsigned idx, extents;
176 u32 first, last; 181 u32 first, last;
177 182
178 /* Find the matching extent */ 183 /* Find the matching extent */
179 extents = map->nr_extents; 184 extents = map->nr_extents;
180 smp_rmb(); 185 smp_rmb();
181 for (idx = 0; idx < extents; idx++) { 186 for (idx = 0; idx < extents; idx++) {
182 first = map->extent[idx].first; 187 first = map->extent[idx].first;
183 last = first + map->extent[idx].count - 1; 188 last = first + map->extent[idx].count - 1;
184 if (id >= first && id <= last) 189 if (id >= first && id <= last)
185 break; 190 break;
186 } 191 }
187 /* Map the id or note failure */ 192 /* Map the id or note failure */
188 if (idx < extents) 193 if (idx < extents)
189 id = (id - first) + map->extent[idx].lower_first; 194 id = (id - first) + map->extent[idx].lower_first;
190 else 195 else
191 id = (u32) -1; 196 id = (u32) -1;
192 197
193 return id; 198 return id;
194 } 199 }
195 200
196 static u32 map_id_up(struct uid_gid_map *map, u32 id) 201 static u32 map_id_up(struct uid_gid_map *map, u32 id)
197 { 202 {
198 unsigned idx, extents; 203 unsigned idx, extents;
199 u32 first, last; 204 u32 first, last;
200 205
201 /* Find the matching extent */ 206 /* Find the matching extent */
202 extents = map->nr_extents; 207 extents = map->nr_extents;
203 smp_rmb(); 208 smp_rmb();
204 for (idx = 0; idx < extents; idx++) { 209 for (idx = 0; idx < extents; idx++) {
205 first = map->extent[idx].lower_first; 210 first = map->extent[idx].lower_first;
206 last = first + map->extent[idx].count - 1; 211 last = first + map->extent[idx].count - 1;
207 if (id >= first && id <= last) 212 if (id >= first && id <= last)
208 break; 213 break;
209 } 214 }
210 /* Map the id or note failure */ 215 /* Map the id or note failure */
211 if (idx < extents) 216 if (idx < extents)
212 id = (id - first) + map->extent[idx].first; 217 id = (id - first) + map->extent[idx].first;
213 else 218 else
214 id = (u32) -1; 219 id = (u32) -1;
215 220
216 return id; 221 return id;
217 } 222 }
218 223
/**
 * make_kuid - Map a user-namespace uid pair into a kuid.
 * @ns:  User namespace that the uid is in
 * @uid: User identifier
 *
 * Maps a user-namespace uid pair into a kernel internal kuid,
 * and returns that kuid.
 *
 * When there is no mapping defined for the user-namespace uid
 * pair INVALID_UID is returned.  Callers are expected to test
 * for and handle INVALID_UID being returned.  INVALID_UID
 * may be tested for using uid_valid().
 */
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
{
	/* Map the uid to a global kernel uid */
	return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
}
EXPORT_SYMBOL(make_kuid);
238 243
/**
 * from_kuid - Create a uid from a kuid user-namespace pair.
 * @targ: The user namespace we want a uid in.
 * @kuid: The kernel internal uid to start with.
 *
 * Map @kuid into the user-namespace specified by @targ and
 * return the resulting uid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kuid has no mapping in @targ (uid_t)-1 is returned.
 */
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
	/* Map the uid from a global kernel uid */
	return map_id_up(&targ->uid_map, __kuid_val(kuid));
}
EXPORT_SYMBOL(from_kuid);
257 262
258 /** 263 /**
259 * from_kuid_munged - Create a uid from a kuid user-namespace pair. 264 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
260 * @targ: The user namespace we want a uid in. 265 * @targ: The user namespace we want a uid in.
261 * @kuid: The kernel internal uid to start with. 266 * @kuid: The kernel internal uid to start with.
262 * 267 *
263 * Map @kuid into the user-namespace specified by @targ and 268 * Map @kuid into the user-namespace specified by @targ and
264 * return the resulting uid. 269 * return the resulting uid.
265 * 270 *
266 * There is always a mapping into the initial user_namespace. 271 * There is always a mapping into the initial user_namespace.
267 * 272 *
268 * Unlike from_kuid from_kuid_munged never fails and always 273 * Unlike from_kuid from_kuid_munged never fails and always
269 * returns a valid uid. This makes from_kuid_munged appropriate 274 * returns a valid uid. This makes from_kuid_munged appropriate
270 * for use in syscalls like stat and getuid where failing the 275 * for use in syscalls like stat and getuid where failing the
271 * system call and failing to provide a valid uid are not an 276 * system call and failing to provide a valid uid are not an
272 * options. 277 * options.
273 * 278 *
274 * If @kuid has no mapping in @targ overflowuid is returned. 279 * If @kuid has no mapping in @targ overflowuid is returned.
275 */ 280 */
276 uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) 281 uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
277 { 282 {
278 uid_t uid; 283 uid_t uid;
279 uid = from_kuid(targ, kuid); 284 uid = from_kuid(targ, kuid);
280 285
281 if (uid == (uid_t) -1) 286 if (uid == (uid_t) -1)
282 uid = overflowuid; 287 uid = overflowuid;
283 return uid; 288 return uid;
284 } 289 }
285 EXPORT_SYMBOL(from_kuid_munged); 290 EXPORT_SYMBOL(from_kuid_munged);
286 291
/**
 * make_kgid - Map a user-namespace gid pair into a kgid.
 * @ns:  User namespace that the gid is in
 * @gid: group identifier
 *
 * Maps a user-namespace gid pair into a kernel internal kgid,
 * and returns that kgid.
 *
 * When there is no mapping defined for the user-namespace gid
 * pair INVALID_GID is returned.  Callers are expected to test
 * for and handle INVALID_GID being returned.  INVALID_GID may be
 * tested for using gid_valid().
 */
kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
	/* Map the gid to a global kernel gid */
	return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
}
EXPORT_SYMBOL(make_kgid);
306 311
/**
 * from_kgid - Create a gid from a kgid user-namespace pair.
 * @targ: The user namespace we want a gid in.
 * @kgid: The kernel internal gid to start with.
 *
 * Map @kgid into the user-namespace specified by @targ and
 * return the resulting gid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kgid has no mapping in @targ (gid_t)-1 is returned.
 */
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
	/* Map the gid from a global kernel gid */
	return map_id_up(&targ->gid_map, __kgid_val(kgid));
}
EXPORT_SYMBOL(from_kgid);
325 330
326 /** 331 /**
327 * from_kgid_munged - Create a gid from a kgid user-namespace pair. 332 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
328 * @targ: The user namespace we want a gid in. 333 * @targ: The user namespace we want a gid in.
329 * @kgid: The kernel internal gid to start with. 334 * @kgid: The kernel internal gid to start with.
330 * 335 *
331 * Map @kgid into the user-namespace specified by @targ and 336 * Map @kgid into the user-namespace specified by @targ and
332 * return the resulting gid. 337 * return the resulting gid.
333 * 338 *
334 * There is always a mapping into the initial user_namespace. 339 * There is always a mapping into the initial user_namespace.
335 * 340 *
336 * Unlike from_kgid from_kgid_munged never fails and always 341 * Unlike from_kgid from_kgid_munged never fails and always
337 * returns a valid gid. This makes from_kgid_munged appropriate 342 * returns a valid gid. This makes from_kgid_munged appropriate
338 * for use in syscalls like stat and getgid where failing the 343 * for use in syscalls like stat and getgid where failing the
339 * system call and failing to provide a valid gid are not options. 344 * system call and failing to provide a valid gid are not options.
340 * 345 *
341 * If @kgid has no mapping in @targ overflowgid is returned. 346 * If @kgid has no mapping in @targ overflowgid is returned.
342 */ 347 */
343 gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) 348 gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
344 { 349 {
345 gid_t gid; 350 gid_t gid;
346 gid = from_kgid(targ, kgid); 351 gid = from_kgid(targ, kgid);
347 352
348 if (gid == (gid_t) -1) 353 if (gid == (gid_t) -1)
349 gid = overflowgid; 354 gid = overflowgid;
350 return gid; 355 return gid;
351 } 356 }
352 EXPORT_SYMBOL(from_kgid_munged); 357 EXPORT_SYMBOL(from_kgid_munged);
353 358
/**
 * make_kprojid - Map a user-namespace projid pair into a kprojid.
 * @ns:     User namespace that the projid is in
 * @projid: Project identifier
 *
 * Maps a user-namespace projid pair into a kernel internal kprojid,
 * and returns that kprojid.
 *
 * When there is no mapping defined for the user-namespace projid
 * pair INVALID_PROJID is returned.  Callers are expected to test
 * for and handle INVALID_PROJID being returned.  INVALID_PROJID
 * may be tested for using projid_valid().
 */
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
{
	/* Map the projid to a global kernel projid */
	return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid));
}
EXPORT_SYMBOL(make_kprojid);
373 378
/**
 * from_kprojid - Create a projid from a kprojid user-namespace pair.
 * @targ:    The user namespace we want a projid in.
 * @kprojid: The kernel internal project identifier to start with.
 *
 * Map @kprojid into the user-namespace specified by @targ and
 * return the resulting projid.
 *
 * There is always a mapping into the initial user_namespace.
 *
 * If @kprojid has no mapping in @targ (projid_t)-1 is returned.
 */
projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
{
	/* Map the projid from a global kernel projid */
	return map_id_up(&targ->projid_map, __kprojid_val(kprojid));
}
EXPORT_SYMBOL(from_kprojid);
392 397
393 /** 398 /**
394 * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. 399 * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair.
395 * @targ: The user namespace we want a projid in. 400 * @targ: The user namespace we want a projid in.
396 * @kprojid: The kernel internal projid to start with. 401 * @kprojid: The kernel internal projid to start with.
397 * 402 *
398 * Map @kprojid into the user-namespace specified by @targ and 403 * Map @kprojid into the user-namespace specified by @targ and
399 * return the resulting projid. 404 * return the resulting projid.
400 * 405 *
401 * There is always a mapping into the initial user_namespace. 406 * There is always a mapping into the initial user_namespace.
402 * 407 *
403 * Unlike from_kprojid from_kprojid_munged never fails and always 408 * Unlike from_kprojid from_kprojid_munged never fails and always
404 * returns a valid projid. This makes from_kprojid_munged 409 * returns a valid projid. This makes from_kprojid_munged
405 * appropriate for use in syscalls like stat and where 410 * appropriate for use in syscalls like stat and where
406 * failing the system call and failing to provide a valid projid are 411 * failing the system call and failing to provide a valid projid are
407 * not an options. 412 * not an options.
408 * 413 *
409 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. 414 * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned.
410 */ 415 */
411 projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) 416 projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
412 { 417 {
413 projid_t projid; 418 projid_t projid;
414 projid = from_kprojid(targ, kprojid); 419 projid = from_kprojid(targ, kprojid);
415 420
416 if (projid == (projid_t) -1) 421 if (projid == (projid_t) -1)
417 projid = OVERFLOW_PROJID; 422 projid = OVERFLOW_PROJID;
418 return projid; 423 return projid;
419 } 424 }
420 EXPORT_SYMBOL(from_kprojid_munged); 425 EXPORT_SYMBOL(from_kprojid_munged);
421 426
422 427
423 static int uid_m_show(struct seq_file *seq, void *v) 428 static int uid_m_show(struct seq_file *seq, void *v)
424 { 429 {
425 struct user_namespace *ns = seq->private; 430 struct user_namespace *ns = seq->private;
426 struct uid_gid_extent *extent = v; 431 struct uid_gid_extent *extent = v;
427 struct user_namespace *lower_ns; 432 struct user_namespace *lower_ns;
428 uid_t lower; 433 uid_t lower;
429 434
430 lower_ns = seq_user_ns(seq); 435 lower_ns = seq_user_ns(seq);
431 if ((lower_ns == ns) && lower_ns->parent) 436 if ((lower_ns == ns) && lower_ns->parent)
432 lower_ns = lower_ns->parent; 437 lower_ns = lower_ns->parent;
433 438
434 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); 439 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
435 440
436 seq_printf(seq, "%10u %10u %10u\n", 441 seq_printf(seq, "%10u %10u %10u\n",
437 extent->first, 442 extent->first,
438 lower, 443 lower,
439 extent->count); 444 extent->count);
440 445
441 return 0; 446 return 0;
442 } 447 }
443 448
444 static int gid_m_show(struct seq_file *seq, void *v) 449 static int gid_m_show(struct seq_file *seq, void *v)
445 { 450 {
446 struct user_namespace *ns = seq->private; 451 struct user_namespace *ns = seq->private;
447 struct uid_gid_extent *extent = v; 452 struct uid_gid_extent *extent = v;
448 struct user_namespace *lower_ns; 453 struct user_namespace *lower_ns;
449 gid_t lower; 454 gid_t lower;
450 455
451 lower_ns = seq_user_ns(seq); 456 lower_ns = seq_user_ns(seq);
452 if ((lower_ns == ns) && lower_ns->parent) 457 if ((lower_ns == ns) && lower_ns->parent)
453 lower_ns = lower_ns->parent; 458 lower_ns = lower_ns->parent;
454 459
455 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); 460 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
456 461
457 seq_printf(seq, "%10u %10u %10u\n", 462 seq_printf(seq, "%10u %10u %10u\n",
458 extent->first, 463 extent->first,
459 lower, 464 lower,
460 extent->count); 465 extent->count);
461 466
462 return 0; 467 return 0;
463 } 468 }
464 469
465 static int projid_m_show(struct seq_file *seq, void *v) 470 static int projid_m_show(struct seq_file *seq, void *v)
466 { 471 {
467 struct user_namespace *ns = seq->private; 472 struct user_namespace *ns = seq->private;
468 struct uid_gid_extent *extent = v; 473 struct uid_gid_extent *extent = v;
469 struct user_namespace *lower_ns; 474 struct user_namespace *lower_ns;
470 projid_t lower; 475 projid_t lower;
471 476
472 lower_ns = seq_user_ns(seq); 477 lower_ns = seq_user_ns(seq);
473 if ((lower_ns == ns) && lower_ns->parent) 478 if ((lower_ns == ns) && lower_ns->parent)
474 lower_ns = lower_ns->parent; 479 lower_ns = lower_ns->parent;
475 480
476 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); 481 lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));
477 482
478 seq_printf(seq, "%10u %10u %10u\n", 483 seq_printf(seq, "%10u %10u %10u\n",
479 extent->first, 484 extent->first,
480 lower, 485 lower,
481 extent->count); 486 extent->count);
482 487
483 return 0; 488 return 0;
484 } 489 }
485 490
486 static void *m_start(struct seq_file *seq, loff_t *ppos, 491 static void *m_start(struct seq_file *seq, loff_t *ppos,
487 struct uid_gid_map *map) 492 struct uid_gid_map *map)
488 { 493 {
489 struct uid_gid_extent *extent = NULL; 494 struct uid_gid_extent *extent = NULL;
490 loff_t pos = *ppos; 495 loff_t pos = *ppos;
491 496
492 if (pos < map->nr_extents) 497 if (pos < map->nr_extents)
493 extent = &map->extent[pos]; 498 extent = &map->extent[pos];
494 499
495 return extent; 500 return extent;
496 } 501 }
497 502
498 static void *uid_m_start(struct seq_file *seq, loff_t *ppos) 503 static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
499 { 504 {
500 struct user_namespace *ns = seq->private; 505 struct user_namespace *ns = seq->private;
501 506
502 return m_start(seq, ppos, &ns->uid_map); 507 return m_start(seq, ppos, &ns->uid_map);
503 } 508 }
504 509
505 static void *gid_m_start(struct seq_file *seq, loff_t *ppos) 510 static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
506 { 511 {
507 struct user_namespace *ns = seq->private; 512 struct user_namespace *ns = seq->private;
508 513
509 return m_start(seq, ppos, &ns->gid_map); 514 return m_start(seq, ppos, &ns->gid_map);
510 } 515 }
511 516
512 static void *projid_m_start(struct seq_file *seq, loff_t *ppos) 517 static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
513 { 518 {
514 struct user_namespace *ns = seq->private; 519 struct user_namespace *ns = seq->private;
515 520
516 return m_start(seq, ppos, &ns->projid_map); 521 return m_start(seq, ppos, &ns->projid_map);
517 } 522 }
518 523
519 static void *m_next(struct seq_file *seq, void *v, loff_t *pos) 524 static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
520 { 525 {
521 (*pos)++; 526 (*pos)++;
522 return seq->op->start(seq, pos); 527 return seq->op->start(seq, pos);
523 } 528 }
524 529
/* seq_file ->stop: nothing was pinned by ->start, so nothing to release. */
static void m_stop(struct seq_file *seq, void *v)
{
}
529 534
530 const struct seq_operations proc_uid_seq_operations = { 535 const struct seq_operations proc_uid_seq_operations = {
531 .start = uid_m_start, 536 .start = uid_m_start,
532 .stop = m_stop, 537 .stop = m_stop,
533 .next = m_next, 538 .next = m_next,
534 .show = uid_m_show, 539 .show = uid_m_show,
535 }; 540 };
536 541
537 const struct seq_operations proc_gid_seq_operations = { 542 const struct seq_operations proc_gid_seq_operations = {
538 .start = gid_m_start, 543 .start = gid_m_start,
539 .stop = m_stop, 544 .stop = m_stop,
540 .next = m_next, 545 .next = m_next,
541 .show = gid_m_show, 546 .show = gid_m_show,
542 }; 547 };
543 548
544 const struct seq_operations proc_projid_seq_operations = { 549 const struct seq_operations proc_projid_seq_operations = {
545 .start = projid_m_start, 550 .start = projid_m_start,
546 .stop = m_stop, 551 .stop = m_stop,
547 .next = m_next, 552 .next = m_next,
548 .show = projid_m_show, 553 .show = projid_m_show,
549 }; 554 };
550 555
551 static bool mappings_overlap(struct uid_gid_map *new_map, 556 static bool mappings_overlap(struct uid_gid_map *new_map,
552 struct uid_gid_extent *extent) 557 struct uid_gid_extent *extent)
553 { 558 {
554 u32 upper_first, lower_first, upper_last, lower_last; 559 u32 upper_first, lower_first, upper_last, lower_last;
555 unsigned idx; 560 unsigned idx;
556 561
557 upper_first = extent->first; 562 upper_first = extent->first;
558 lower_first = extent->lower_first; 563 lower_first = extent->lower_first;
559 upper_last = upper_first + extent->count - 1; 564 upper_last = upper_first + extent->count - 1;
560 lower_last = lower_first + extent->count - 1; 565 lower_last = lower_first + extent->count - 1;
561 566
562 for (idx = 0; idx < new_map->nr_extents; idx++) { 567 for (idx = 0; idx < new_map->nr_extents; idx++) {
563 u32 prev_upper_first, prev_lower_first; 568 u32 prev_upper_first, prev_lower_first;
564 u32 prev_upper_last, prev_lower_last; 569 u32 prev_upper_last, prev_lower_last;
565 struct uid_gid_extent *prev; 570 struct uid_gid_extent *prev;
566 571
567 prev = &new_map->extent[idx]; 572 prev = &new_map->extent[idx];
568 573
569 prev_upper_first = prev->first; 574 prev_upper_first = prev->first;
570 prev_lower_first = prev->lower_first; 575 prev_lower_first = prev->lower_first;
571 prev_upper_last = prev_upper_first + prev->count - 1; 576 prev_upper_last = prev_upper_first + prev->count - 1;
572 prev_lower_last = prev_lower_first + prev->count - 1; 577 prev_lower_last = prev_lower_first + prev->count - 1;
573 578
574 /* Does the upper range intersect a previous extent? */ 579 /* Does the upper range intersect a previous extent? */
575 if ((prev_upper_first <= upper_last) && 580 if ((prev_upper_first <= upper_last) &&
576 (prev_upper_last >= upper_first)) 581 (prev_upper_last >= upper_first))
577 return true; 582 return true;
578 583
579 /* Does the lower range intersect a previous extent? */ 584 /* Does the lower range intersect a previous extent? */
580 if ((prev_lower_first <= lower_last) && 585 if ((prev_lower_first <= lower_last) &&
581 (prev_lower_last >= lower_first)) 586 (prev_lower_last >= lower_first))
582 return true; 587 return true;
583 } 588 }
584 return false; 589 return false;
585 } 590 }
586 591
/*
 * Parse and install an id map written to /proc/<pid>/{uid,gid,projid}_map.
 *
 * The file contents are lines of "<first> <lower_first> <count>".  Each
 * extent maps ids [first, first+count) in @ns onto
 * [lower_first, lower_first+count) in the parent namespace; the lower ids
 * are then translated through @parent_map into the kernel global id space
 * before being stored in @map.
 *
 * Returns the number of bytes consumed on success, or a negative errno.
 * A map may only ever be written once, and (for uid/gid) only by an opener
 * holding @cap_setid as validated by new_idmap_permitted().
 */
static ssize_t map_write(struct file *file, const char __user *buf,
			 size_t count, loff_t *ppos,
			 int cap_setid,
			 struct uid_gid_map *map,
			 struct uid_gid_map *parent_map)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	struct uid_gid_map new_map;
	unsigned idx;
	struct uid_gid_extent *extent = NULL;
	unsigned long page = 0;
	char *kbuf, *pos, *next_line;
	ssize_t ret = -EINVAL;

	/*
	 * The userns_state_mutex serializes all writes to any given map.
	 *
	 * Any map is only ever written once.
	 *
	 * An id map fits within 1 cache line on most architectures.
	 *
	 * On read nothing needs to be done unless you are on an
	 * architecture with a crazy cache coherency model like alpha.
	 *
	 * There is a one time data dependency between reading the
	 * count of the extents and the values of the extents.  The
	 * desired behavior is to see the values of the extents that
	 * were written before the count of the extents.
	 *
	 * To achieve this smp_wmb() is used to guarantee the write
	 * order and smp_rmb() is guaranteed that we don't have crazy
	 * architectures returning stale data.
	 */
	mutex_lock(&userns_state_mutex);

	ret = -EPERM;
	/* Only allow one successful write to the map */
	if (map->nr_extents != 0)
		goto out;

	/*
	 * Adjusting namespace settings requires capabilities on the target.
	 */
	if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
		goto out;

	/* Get a buffer */
	ret = -ENOMEM;
	page = __get_free_page(GFP_TEMPORARY);
	kbuf = (char *) page;
	if (!page)
		goto out;

	/* Only allow <= page size writes at the beginning of the file */
	ret = -EINVAL;
	if ((*ppos != 0) || (count >= PAGE_SIZE))
		goto out;

	/* Slurp in the user data */
	ret = -EFAULT;
	if (copy_from_user(kbuf, buf, count))
		goto out;
	/* count < PAGE_SIZE, so there is room for the terminator. */
	kbuf[count] = '\0';

	/* Parse the user data */
	ret = -EINVAL;
	pos = kbuf;
	new_map.nr_extents = 0;
	for (; pos; pos = next_line) {
		extent = &new_map.extent[new_map.nr_extents];

		/* Find the end of line and ensure I don't look past it */
		next_line = strchr(pos, '\n');
		if (next_line) {
			*next_line = '\0';
			next_line++;
			if (*next_line == '\0')
				next_line = NULL;
		}

		pos = skip_spaces(pos);
		extent->first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent->lower_first = simple_strtoul(pos, &pos, 10);
		if (!isspace(*pos))
			goto out;

		pos = skip_spaces(pos);
		extent->count = simple_strtoul(pos, &pos, 10);
		if (*pos && !isspace(*pos))
			goto out;

		/* Verify there is not trailing junk on the line */
		pos = skip_spaces(pos);
		if (*pos != '\0')
			goto out;

		/* Verify we have been given valid starting values;
		 * (u32)-1 is the invalid-id sentinel.
		 */
		if ((extent->first == (u32) -1) ||
		    (extent->lower_first == (u32) -1))
			goto out;

		/* Verify count is not zero and does not cause the
		 * extent to wrap
		 */
		if ((extent->first + extent->count) <= extent->first)
			goto out;
		if ((extent->lower_first + extent->count) <=
		     extent->lower_first)
			goto out;

		/* Do the ranges in extent overlap any previous extents? */
		if (mappings_overlap(&new_map, extent))
			goto out;

		new_map.nr_extents++;

		/* Fail if the file contains too many extents */
		if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
		    (next_line != NULL))
			goto out;
	}
	/* Be very certain the new map actually exists */
	if (new_map.nr_extents == 0)
		goto out;

	ret = -EPERM;
	/* Validate the user is allowed to use user id's mapped to. */
	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
		goto out;

	/* Map the lower ids from the parent user namespace to the
	 * kernel global id space.
	 */
	for (idx = 0; idx < new_map.nr_extents; idx++) {
		u32 lower_first;
		extent = &new_map.extent[idx];

		lower_first = map_id_range_down(parent_map,
						extent->lower_first,
						extent->count);

		/* Fail if we can not map the specified extent to
		 * the kernel global id space.
		 */
		if (lower_first == (u32) -1)
			goto out;

		extent->lower_first = lower_first;
	}

	/* Install the map: extents first, then the count, so that readers
	 * who observe a nonzero nr_extents see fully written extents.
	 */
	memcpy(map->extent, new_map.extent,
	       new_map.nr_extents*sizeof(new_map.extent[0]));
	smp_wmb();
	map->nr_extents = new_map.nr_extents;

	*ppos = count;
	ret = count;
out:
	mutex_unlock(&userns_state_mutex);
	if (page)
		free_page(page);
	return ret;
}
756 761
757 ssize_t proc_uid_map_write(struct file *file, const char __user *buf, 762 ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
758 size_t size, loff_t *ppos) 763 size_t size, loff_t *ppos)
759 { 764 {
760 struct seq_file *seq = file->private_data; 765 struct seq_file *seq = file->private_data;
761 struct user_namespace *ns = seq->private; 766 struct user_namespace *ns = seq->private;
762 struct user_namespace *seq_ns = seq_user_ns(seq); 767 struct user_namespace *seq_ns = seq_user_ns(seq);
763 768
764 if (!ns->parent) 769 if (!ns->parent)
765 return -EPERM; 770 return -EPERM;
766 771
767 if ((seq_ns != ns) && (seq_ns != ns->parent)) 772 if ((seq_ns != ns) && (seq_ns != ns->parent))
768 return -EPERM; 773 return -EPERM;
769 774
770 return map_write(file, buf, size, ppos, CAP_SETUID, 775 return map_write(file, buf, size, ppos, CAP_SETUID,
771 &ns->uid_map, &ns->parent->uid_map); 776 &ns->uid_map, &ns->parent->uid_map);
772 } 777 }
773 778
774 ssize_t proc_gid_map_write(struct file *file, const char __user *buf, 779 ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
775 size_t size, loff_t *ppos) 780 size_t size, loff_t *ppos)
776 { 781 {
777 struct seq_file *seq = file->private_data; 782 struct seq_file *seq = file->private_data;
778 struct user_namespace *ns = seq->private; 783 struct user_namespace *ns = seq->private;
779 struct user_namespace *seq_ns = seq_user_ns(seq); 784 struct user_namespace *seq_ns = seq_user_ns(seq);
780 785
781 if (!ns->parent) 786 if (!ns->parent)
782 return -EPERM; 787 return -EPERM;
783 788
784 if ((seq_ns != ns) && (seq_ns != ns->parent)) 789 if ((seq_ns != ns) && (seq_ns != ns->parent))
785 return -EPERM; 790 return -EPERM;
786 791
787 return map_write(file, buf, size, ppos, CAP_SETGID, 792 return map_write(file, buf, size, ppos, CAP_SETGID,
788 &ns->gid_map, &ns->parent->gid_map); 793 &ns->gid_map, &ns->parent->gid_map);
789 } 794 }
790 795
791 ssize_t proc_projid_map_write(struct file *file, const char __user *buf, 796 ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
792 size_t size, loff_t *ppos) 797 size_t size, loff_t *ppos)
793 { 798 {
794 struct seq_file *seq = file->private_data; 799 struct seq_file *seq = file->private_data;
795 struct user_namespace *ns = seq->private; 800 struct user_namespace *ns = seq->private;
796 struct user_namespace *seq_ns = seq_user_ns(seq); 801 struct user_namespace *seq_ns = seq_user_ns(seq);
797 802
798 if (!ns->parent) 803 if (!ns->parent)
799 return -EPERM; 804 return -EPERM;
800 805
801 if ((seq_ns != ns) && (seq_ns != ns->parent)) 806 if ((seq_ns != ns) && (seq_ns != ns->parent))
802 return -EPERM; 807 return -EPERM;
803 808
804 /* Anyone can set any valid project id no capability needed */ 809 /* Anyone can set any valid project id no capability needed */
805 return map_write(file, buf, size, ppos, -1, 810 return map_write(file, buf, size, ppos, -1,
806 &ns->projid_map, &ns->parent->projid_map); 811 &ns->projid_map, &ns->parent->projid_map);
807 } 812 }
808 813
/*
 * May the opener of @file install @new_map for @ns?
 *
 * Three paths grant permission:
 *  1. A single one-id extent mapping the opener's own euid, written by the
 *     namespace owner (only for uid maps) — no capability needed.
 *  2. The map requires no capability at all (projid maps, cap_setid == -1).
 *  3. Both the current task and the file opener hold @cap_setid over the
 *     parent namespace.
 */
static bool new_idmap_permitted(const struct file *file,
				struct user_namespace *ns, int cap_setid,
				struct uid_gid_map *new_map)
{
	const struct cred *cred = file->f_cred;
	/* Don't allow mappings that would allow anything that wouldn't
	 * be allowed without the establishment of unprivileged mappings.
	 */
	if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
	    uid_eq(ns->owner, cred->euid)) {
		u32 id = new_map->extent[0].lower_first;
		if (cap_setid == CAP_SETUID) {
			kuid_t uid = make_kuid(ns->parent, id);
			if (uid_eq(uid, cred->euid))
				return true;
		}
	}

	/* Allow anyone to set a mapping that doesn't require privilege */
	if (!cap_valid(cap_setid))
		return true;

	/* Allow the specified ids if we have the appropriate capability
	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
	 * And the opener of the id file also had the appropriate capability.
	 */
	if (ns_capable(ns->parent, cap_setid) &&
	    file_ns_capable(file, ns->parent, cap_setid))
		return true;

	return false;
}
841 846
847 int proc_setgroups_show(struct seq_file *seq, void *v)
848 {
849 struct user_namespace *ns = seq->private;
850 unsigned long userns_flags = ACCESS_ONCE(ns->flags);
851
852 seq_printf(seq, "%s\n",
853 (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
854 "allow" : "deny");
855 return 0;
856 }
857
/*
 * Write handler for /proc/<pid>/setgroups.
 *
 * Accepts exactly "allow" or "deny" (optionally followed by whitespace).
 * "deny" permanently disables setgroups for the namespace, and is only
 * permitted before gid_map has been written — once a gid mapping exists
 * processes may already have used setgroups, so the ability must not be
 * revoked after the fact.  "allow" succeeds only while the flag is still
 * set; re-enabling after a deny is forbidden.  Both checks and the flag
 * update happen under userns_state_mutex, the same lock map_write() holds,
 * so the deny-before-gid_map ordering cannot race.
 *
 * Returns the byte count written on success, -EINVAL for malformed input,
 * -EFAULT on copy failure, or -EPERM for a forbidden transition.
 */
ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct seq_file *seq = file->private_data;
	struct user_namespace *ns = seq->private;
	char kbuf[8], *pos;
	bool setgroups_allowed;
	ssize_t ret;

	/* Only allow a very narrow range of strings to be written */
	ret = -EINVAL;
	if ((*ppos != 0) || (count >= sizeof(kbuf)))
		goto out;

	/* What was written? */
	ret = -EFAULT;
	if (copy_from_user(kbuf, buf, count))
		goto out;
	kbuf[count] = '\0';
	pos = kbuf;

	/* What is being requested? */
	ret = -EINVAL;
	if (strncmp(pos, "allow", 5) == 0) {
		pos += 5;
		setgroups_allowed = true;
	}
	else if (strncmp(pos, "deny", 4) == 0) {
		pos += 4;
		setgroups_allowed = false;
	}
	else
		goto out;

	/* Verify there is not trailing junk on the line */
	pos = skip_spaces(pos);
	if (*pos != '\0')
		goto out;

	ret = -EPERM;
	mutex_lock(&userns_state_mutex);
	if (setgroups_allowed) {
		/* Enabling setgroups after setgroups has been disabled
		 * is not allowed.  (If the flag is still set this write
		 * is a successful no-op.)
		 */
		if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
			goto out_unlock;
	} else {
		/* Permanently disabling setgroups after setgroups has
		 * been enabled by writing the gid_map is not allowed.
		 */
		if (ns->gid_map.nr_extents != 0)
			goto out_unlock;
		ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
	}
	mutex_unlock(&userns_state_mutex);

	/* Report a successful write */
	*ppos = count;
	ret = count;
out:
	return ret;
out_unlock:
	mutex_unlock(&userns_state_mutex);
	goto out;
}
924
842 bool userns_may_setgroups(const struct user_namespace *ns) 925 bool userns_may_setgroups(const struct user_namespace *ns)
843 { 926 {
844 bool allowed; 927 bool allowed;
845 928
846 mutex_lock(&userns_state_mutex); 929 mutex_lock(&userns_state_mutex);
847 /* It is not safe to use setgroups until a gid mapping in 930 /* It is not safe to use setgroups until a gid mapping in
848 * the user namespace has been established. 931 * the user namespace has been established.
849 */ 932 */
850 allowed = ns->gid_map.nr_extents != 0; 933 allowed = ns->gid_map.nr_extents != 0;
934 /* Is setgroups allowed? */
935 allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
851 mutex_unlock(&userns_state_mutex); 936 mutex_unlock(&userns_state_mutex);
852 937
853 return allowed; 938 return allowed;
854 } 939 }
855 940
856 static void *userns_get(struct task_struct *task) 941 static void *userns_get(struct task_struct *task)
857 { 942 {
858 struct user_namespace *user_ns; 943 struct user_namespace *user_ns;
859 944
860 rcu_read_lock(); 945 rcu_read_lock();
861 user_ns = get_user_ns(__task_cred(task)->user_ns); 946 user_ns = get_user_ns(__task_cred(task)->user_ns);
862 rcu_read_unlock(); 947 rcu_read_unlock();
863 948
864 return user_ns; 949 return user_ns;
865 } 950 }
866 951
/* proc_ns ->put: drop the reference taken by userns_get(). */
static void userns_put(void *ns)
{
	put_user_ns(ns);
}
871 956
/*
 * proc_ns ->install: make @ns the user namespace of the current task
 * (the setns(CLONE_NEWUSER) path).
 *
 * Returns 0 on success; -EINVAL if the target is the current namespace or
 * the task shares its mm/fs with others, -EPERM without CAP_SYS_ADMIN
 * over the target namespace, -ENOMEM if cred allocation fails.
 */
static int userns_install(struct nsproxy *nsproxy, void *ns)
{
	struct user_namespace *user_ns = ns;
	struct cred *cred;

	/* Don't allow gaining capabilities by reentering
	 * the same user namespace.
	 */
	if (user_ns == current_user_ns())
		return -EINVAL;

	/* Threaded processes may not enter a different user namespace */
	if (atomic_read(&current->mm->mm_users) > 1)
		return -EINVAL;

	/* Nor may tasks sharing fs state (CLONE_FS) with another task. */
	if (current->fs->users != 1)
		return -EINVAL;

	/* Entering a namespace requires CAP_SYS_ADMIN over it. */
	if (!ns_capable(user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	cred = prepare_creds();
	if (!cred)
		return -ENOMEM;

	/* Swap the cred's namespace reference and recompute capabilities. */
	put_user_ns(cred->user_ns);
	set_cred_user_ns(cred, get_user_ns(user_ns));

	return commit_creds(cred);
}
902 987
903 static unsigned int userns_inum(void *ns) 988 static unsigned int userns_inum(void *ns)
904 { 989 {
905 struct user_namespace *user_ns = ns; 990 struct user_namespace *user_ns = ns;
906 return user_ns->proc_inum; 991 return user_ns->proc_inum;
907 } 992 }
908 993
/* proc_ns operations backing /proc/<pid>/ns/user. */
const struct proc_ns_operations userns_operations = {
	.name		= "user",
	.type		= CLONE_NEWUSER,
	.get		= userns_get,
	.put		= userns_put,
	.install	= userns_install,
	.inum		= userns_inum,
};
917 1002
/* Create the slab cache for struct user_namespace; panics on failure. */
static __init int user_namespaces_init(void)
{
	user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC);
	return 0;
}
subsys_initcall(user_namespaces_init);
924 1009