Blame view

kernel/cpuset.c 73.8 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
  /*
   *  kernel/cpuset.c
   *
   *  Processor and Memory placement constraints for sets of tasks.
   *
   *  Copyright (C) 2003 BULL SA.
029190c51   Paul Jackson   cpuset sched_load...
7
   *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
8793d854e   Paul Menage   Task Control Grou...
8
   *  Copyright (C) 2006 Google, Inc
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
9
10
11
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
   *
825a46af5   Paul Jackson   [PATCH] cpuset me...
13
   *  2003-10-10 Written by Simon Derr.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
14
   *  2003-10-22 Updates by Stephen Hemminger.
825a46af5   Paul Jackson   [PATCH] cpuset me...
15
   *  2004 May-July Rework by Paul Jackson.
8793d854e   Paul Menage   Task Control Grou...
16
   *  2006 Rework by Paul Menage to use generic cgroups
cf417141c   Max Krasnyansky   sched, cpuset: re...
17
18
   *  2008 Rework of the scheduler domains and CPU hotplug handling
   *       by Max Krasnyansky
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
19
20
21
22
23
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
24
25
26
27
28
29
30
31
32
33
34
35
  #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/cpuset.h>
  #include <linux/err.h>
  #include <linux/errno.h>
  #include <linux/file.h>
  #include <linux/fs.h>
  #include <linux/init.h>
  #include <linux/interrupt.h>
  #include <linux/kernel.h>
  #include <linux/kmod.h>
  #include <linux/list.h>
68860ec10   Paul Jackson   [PATCH] cpusets: ...
36
  #include <linux/mempolicy.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
37
  #include <linux/mm.h>
f481891fd   Miao Xie   cpuset: update to...
38
  #include <linux/memory.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
39
40
41
42
43
  #include <linux/module.h>
  #include <linux/mount.h>
  #include <linux/namei.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
6b9c2603c   Paul Jackson   [PATCH] cpuset: u...
44
  #include <linux/rcupdate.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
45
46
  #include <linux/sched.h>
  #include <linux/seq_file.h>
22fb52dd7   David Quigley   [PATCH] SELinux: ...
47
  #include <linux/security.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
  #include <linux/slab.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
49
50
51
52
53
54
55
56
57
  #include <linux/spinlock.h>
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/time.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
  
  #include <asm/uaccess.h>
  #include <asm/atomic.h>
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
58
  #include <linux/mutex.h>
956db3ca0   Cliff Wickman   hotplug cpu: move...
59
60
  #include <linux/workqueue.h>
  #include <linux/cgroup.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
61

202f72d5d   Paul Jackson   [PATCH] cpuset: n...
62
  /*
f90d4118b   Miao Xie   cpuset: fix possi...
63
64
65
66
67
68
69
70
   * Workqueue for cpuset related tasks.
   *
   * Using kevent workqueue may cause deadlock when memory_migrate
   * is set. So we create a separate workqueue thread for cpuset.
   */
  static struct workqueue_struct *cpuset_wq;
  
  /*
202f72d5d   Paul Jackson   [PATCH] cpuset: n...
71
72
73
74
   * Tracks how many cpusets are currently defined in system.
   * When there is only one cpuset (the root cpuset) we can
   * short circuit some hooks.
   */
7edc59628   Paul Jackson   [PATCH] cpuset: m...
75
  int number_of_cpusets __read_mostly;
202f72d5d   Paul Jackson   [PATCH] cpuset: n...
76

2df167a30   Paul Menage   cgroups: update c...
77
  /* Forward declare cgroup structures */
8793d854e   Paul Menage   Task Control Grou...
78
79
  struct cgroup_subsys cpuset_subsys;
  struct cpuset;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
80
81
82
83
84
85
86
87
/*
 * Frequency meter state; see the "Frequency meter" comments, below,
 * for how cnt/val/time are combined into a decaying event rate.
 */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
88
/*
 * One node in the cpuset hierarchy.  Embedded in the cgroup subsystem
 * state (css); see cgroup_cs()/task_cs() for the container_of mapping.
 */
struct cpuset {
	struct cgroup_subsys_state css;	/* embedded cgroup subsystem state */
	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
	struct cpuset *parent;		/* my parent */

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking a cpuset hierarchy */
	struct list_head stack_list;
};
8793d854e   Paul Menage   Task Control Grou...
105
106
107
108
109
110
111
112
113
114
115
116
117
  /* Retrieve the cpuset for a cgroup */
  static inline struct cpuset *cgroup_cs(struct cgroup *cont)
  {
  	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
  			    struct cpuset, css);
  }
  
  /* Retrieve the cpuset for a task */
  static inline struct cpuset *task_cs(struct task_struct *task)
  {
  	return container_of(task_subsys_state(task, cpuset_subsys_id),
  			    struct cpuset, css);
  }
8793d854e   Paul Menage   Task Control Grou...
118

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
119
120
121
122
/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,	/* cpus may not overlap exclusive siblings */
	CS_MEM_EXCLUSIVE,	/* mems may not overlap exclusive siblings */
	CS_MEM_HARDWALL,	/* tested via is_mem_hardwall() */
	CS_MEMORY_MIGRATE,	/* tested via is_memory_migrate() */
	CS_SCHED_LOAD_BALANCE,	/* load balance CPUs in this cpuset */
	CS_SPREAD_PAGE,		/* propagated to tasks as PF_SPREAD_PAGE */
	CS_SPREAD_SLAB,		/* propagated to tasks as PF_SPREAD_SLAB */
} cpuset_flagbits_t;
  
  /* convenient tests for these bits */
  static inline int is_cpu_exclusive(const struct cpuset *cs)
  {
7b5b9ef0e   Paul Jackson   [PATCH] cpuset cl...
133
  	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
134
135
136
137
  }
  
  static inline int is_mem_exclusive(const struct cpuset *cs)
  {
7b5b9ef0e   Paul Jackson   [PATCH] cpuset cl...
138
  	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
139
  }
786083667   Paul Menage   Cpuset hardwall f...
140
141
142
143
  static inline int is_mem_hardwall(const struct cpuset *cs)
  {
  	return test_bit(CS_MEM_HARDWALL, &cs->flags);
  }
029190c51   Paul Jackson   cpuset sched_load...
144
145
146
147
  static inline int is_sched_load_balance(const struct cpuset *cs)
  {
  	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
  }
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
148
149
  static inline int is_memory_migrate(const struct cpuset *cs)
  {
7b5b9ef0e   Paul Jackson   [PATCH] cpuset cl...
150
  	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
151
  }
825a46af5   Paul Jackson   [PATCH] cpuset me...
152
153
154
155
156
157
158
159
160
  static inline int is_spread_page(const struct cpuset *cs)
  {
  	return test_bit(CS_SPREAD_PAGE, &cs->flags);
  }
  
  static inline int is_spread_slab(const struct cpuset *cs)
  {
  	return test_bit(CS_SPREAD_SLAB, &cs->flags);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
161
162
/*
 * The root of the cpuset hierarchy, statically marked cpu- and
 * mem-exclusive.  NOTE(review): cpus_allowed/mems_allowed appear to be
 * filled in elsewhere (not in this initializer) — confirm at init code.
 */
static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
164
  /*
2df167a30   Paul Menage   cgroups: update c...
165
166
167
168
169
170
171
   * There are two global mutexes guarding cpuset structures.  The first
   * is the main control groups cgroup_mutex, accessed via
   * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
   * callback_mutex, below. They can nest.  It is ok to first take
   * cgroup_mutex, then nest callback_mutex.  We also require taking
   * task_lock() when dereferencing a task's cpuset pointer.  See "The
   * task_lock() exception", at the end of this comment.
053199edf   Paul Jackson   [PATCH] cpusets: ...
172
   *
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
173
   * A task must hold both mutexes to modify cpusets.  If a task
2df167a30   Paul Menage   cgroups: update c...
174
   * holds cgroup_mutex, then it blocks others wanting that mutex,
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
175
   * ensuring that it is the only task able to also acquire callback_mutex
053199edf   Paul Jackson   [PATCH] cpusets: ...
176
177
   * and be able to modify cpusets.  It can perform various checks on
   * the cpuset structure first, knowing nothing will change.  It can
2df167a30   Paul Menage   cgroups: update c...
178
   * also allocate memory while just holding cgroup_mutex.  While it is
053199edf   Paul Jackson   [PATCH] cpusets: ...
179
   * performing these checks, various callback routines can briefly
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
180
181
   * acquire callback_mutex to query cpusets.  Once it is ready to make
   * the changes, it takes callback_mutex, blocking everyone else.
053199edf   Paul Jackson   [PATCH] cpusets: ...
182
183
   *
   * Calls to the kernel memory allocator can not be made while holding
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
184
   * callback_mutex, as that would risk double tripping on callback_mutex
053199edf   Paul Jackson   [PATCH] cpusets: ...
185
186
187
   * from one of the callbacks into the cpuset code from within
   * __alloc_pages().
   *
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
188
   * If a task is only holding callback_mutex, then it has read-only
053199edf   Paul Jackson   [PATCH] cpusets: ...
189
190
   * access to cpusets.
   *
58568d2a8   Miao Xie   cpuset,mm: update...
191
192
193
   * Now, the task_struct fields mems_allowed and mempolicy may be changed
   * by other task, we use alloc_lock in the task_struct fields to protect
   * them.
053199edf   Paul Jackson   [PATCH] cpusets: ...
194
   *
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
195
   * The cpuset_common_file_read() handlers only hold callback_mutex across
053199edf   Paul Jackson   [PATCH] cpusets: ...
196
197
198
   * small pieces of code, such as when reading out possibly multi-word
   * cpumasks and nodemasks.
   *
2df167a30   Paul Menage   cgroups: update c...
199
200
   * Accessing a task's cpuset should be done in accordance with the
   * guidelines for accessing subsystem state in kernel/cgroup.c
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
201
   */
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
202
  static DEFINE_MUTEX(callback_mutex);
4247bdc60   Paul Jackson   [PATCH] cpuset se...
203

cf417141c   Max Krasnyansky   sched, cpuset: re...
204
  /*
75aa19941   David Rientjes   oom: print trigge...
205
206
207
208
209
210
211
212
213
214
215
   * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
   * buffers.  They are statically allocated to prevent using excess stack
   * when calling cpuset_print_task_mems_allowed().
   */
  #define CPUSET_NAME_LEN		(128)
  #define	CPUSET_NODELIST_LEN	(256)
  static char cpuset_name[CPUSET_NAME_LEN];
  static char cpuset_nodelist[CPUSET_NODELIST_LEN];
  static DEFINE_SPINLOCK(cpuset_buffer_lock);
  
  /*
cf417141c   Max Krasnyansky   sched, cpuset: re...
216
   * This is ugly, but preserves the userspace API for existing cpuset
8793d854e   Paul Menage   Task Control Grou...
217
   * users. If someone tries to mount the "cpuset" filesystem, we
cf417141c   Max Krasnyansky   sched, cpuset: re...
218
219
   * silently switch it to mount "cgroup" instead
   */
f7e835710   Al Viro   convert cgroup an...
220
221
  static struct dentry *cpuset_mount(struct file_system_type *fs_type,
  			 int flags, const char *unused_dev_name, void *data)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
222
  {
8793d854e   Paul Menage   Task Control Grou...
223
  	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
f7e835710   Al Viro   convert cgroup an...
224
  	struct dentry *ret = ERR_PTR(-ENODEV);
8793d854e   Paul Menage   Task Control Grou...
225
226
227
228
  	if (cgroup_fs) {
  		char mountopts[] =
  			"cpuset,noprefix,"
  			"release_agent=/sbin/cpuset_release_agent";
f7e835710   Al Viro   convert cgroup an...
229
230
  		ret = cgroup_fs->mount(cgroup_fs, flags,
  					   unused_dev_name, mountopts);
8793d854e   Paul Menage   Task Control Grou...
231
232
233
  		put_filesystem(cgroup_fs);
  	}
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
234
235
236
237
  }
  
/*
 * Legacy "cpuset" filesystem type; mounting it transparently mounts
 * "cgroup" instead (see cpuset_mount() above).
 */
static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
240
  /*
300ed6cbb   Li Zefan   cpuset: convert c...
241
   * Return in pmask the portion of a cpusets's cpus_allowed that
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
242
243
244
245
246
247
248
249
250
   * are online.  If none are online, walk up the cpuset hierarchy
   * until we find one that does have some online cpus.  If we get
   * all the way to the top and still haven't found any online cpus,
   * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
   * task, return cpu_online_map.
   *
   * One way or another, we guarantee to return some non-empty subset
   * of cpu_online_map.
   *
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
251
   * Call with callback_mutex held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
252
   */
6af866af3   Li Zefan   cpuset: remove re...
253
254
/*
 * Fill *pmask with the online portion of cs->cpus_allowed, walking up
 * the hierarchy (and finally falling back to cpu_online_mask) so the
 * result is never empty.  Call with callback_mutex held.
 */
static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	/* climb until some ancestor has at least one online CPU */
	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = cs->parent;
	if (cs)
		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
	else
		/* NULL cs (exiting task) or no ancestor qualified */
		cpumask_copy(pmask, cpu_online_mask);
	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}
  
  /*
   * Return in *pmask the portion of a cpusets's mems_allowed that
0e1e7c7a7   Christoph Lameter   Memoryless nodes:...
267
268
269
270
   * are online, with memory.  If none are online with memory, walk
   * up the cpuset hierarchy until we find one that does have some
   * online mems.  If we get all the way to the top and still haven't
   * found any online mems, return node_states[N_HIGH_MEMORY].
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
271
272
   *
   * One way or another, we guarantee to return some non-empty subset
0e1e7c7a7   Christoph Lameter   Memoryless nodes:...
273
   * of node_states[N_HIGH_MEMORY].
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
274
   *
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
275
   * Call with callback_mutex held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
276
277
278
279
   */
  
/*
 * Fill *pmask with the portion of cs->mems_allowed that is online with
 * memory, walking up the hierarchy (and finally falling back to
 * node_states[N_HIGH_MEMORY]) so the result is never empty.  Call with
 * callback_mutex held.
 */
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	/* climb until some ancestor has at least one online mem node */
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
					node_states[N_HIGH_MEMORY]);
	else
		/* NULL cs or no ancestor qualified */
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}
f3b39d47e   Miao Xie   cpusets: restruct...
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
  /*
   * update task's spread flag if cpuset's page/slab spread flag is set
   *
   * Called with callback_mutex/cgroup_mutex held
   */
  static void cpuset_update_task_spread_flag(struct cpuset *cs,
  					struct task_struct *tsk)
  {
  	if (is_spread_page(cs))
  		tsk->flags |= PF_SPREAD_PAGE;
  	else
  		tsk->flags &= ~PF_SPREAD_PAGE;
  	if (is_spread_slab(cs))
  		tsk->flags |= PF_SPREAD_SLAB;
  	else
  		tsk->flags &= ~PF_SPREAD_SLAB;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
307
308
309
310
311
  /*
   * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
   *
   * One cpuset is a subset of another if all its allowed CPUs and
   * Memory Nodes are a subset of the other, and its exclusive flags
2df167a30   Paul Menage   cgroups: update c...
312
   * are only set if the other's are set.  Call holding cgroup_mutex.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
313
314
315
316
   */
  
  static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  {
300ed6cbb   Li Zefan   cpuset: convert c...
317
  	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
318
319
320
321
  		nodes_subset(p->mems_allowed, q->mems_allowed) &&
  		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
  		is_mem_exclusive(p) <= is_mem_exclusive(q);
  }
645fcc9d2   Li Zefan   cpuset: don't all...
322
323
324
325
326
327
  /**
   * alloc_trial_cpuset - allocate a trial cpuset
   * @cs: the cpuset that the trial cpuset duplicates
   */
  static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
  {
300ed6cbb   Li Zefan   cpuset: convert c...
328
329
330
331
332
333
334
335
336
337
338
339
340
  	struct cpuset *trial;
  
  	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
  	if (!trial)
  		return NULL;
  
  	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
  		kfree(trial);
  		return NULL;
  	}
  	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
  
  	return trial;
645fcc9d2   Li Zefan   cpuset: don't all...
341
342
343
344
345
346
347
348
  }
  
/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	/* release the separately-allocated mask before the struct itself */
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
352
353
354
355
356
357
358
  /*
   * validate_change() - Used to validate that any proposed cpuset change
   *		       follows the structural rules for cpusets.
   *
   * If we replaced the flag and mask values of the current cpuset
   * (cur) with those values in the trial cpuset (trial), would
   * our various subset and exclusive rules still be valid?  Presumes
2df167a30   Paul Menage   cgroups: update c...
359
   * cgroup_mutex held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
360
361
362
363
364
365
366
367
368
369
370
371
372
373
   *
   * 'cur' is the address of an actual, in-use cpuset.  Operations
   * such as list traversal that depend on the actual address of the
   * cpuset in the list must use cur below, not trial.
   *
   * 'trial' is the address of bulk structure copy of cur, with
   * perhaps one or more of the fields cpus_allowed, mems_allowed,
   * or flags changed to new, trial values.
   *
   * Return 0 if valid, -errno if not.
   */
  
/*
 * Validate that replacing *cur's masks/flags with *trial's would keep
 * the cpuset hierarchy's subset and exclusivity invariants.  Presumes
 * cgroup_mutex held.  Returns 0 if valid, -errno if not.
 */
static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
		if (!is_cpuset_subset(cgroup_cs(cont), trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
		c = cgroup_cs(cont);
		/* exclusivity is checked against the *trial* flags */
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
	if (cgroup_task_count(cur->css.cgroup)) {
		if (cpumask_empty(trial->cpus_allowed) ||
		    nodes_empty(trial->mems_allowed)) {
			return -ENOSPC;
		}
	}

	return 0;
}
db7f47cf4   Paul Menage   cpusets: allow cp...
414
  #ifdef CONFIG_SMP
85d7b9498   Dinakar Guniguntala   [PATCH] Dynamic s...
415
  /*
cf417141c   Max Krasnyansky   sched, cpuset: re...
416
   * Helper routine for generate_sched_domains().
029190c51   Paul Jackson   cpuset sched_load...
417
418
   * Do cpusets a, b have overlapping cpus_allowed masks?
   */
029190c51   Paul Jackson   cpuset sched_load...
419
420
  static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
  {
300ed6cbb   Li Zefan   cpuset: convert c...
421
  	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
029190c51   Paul Jackson   cpuset sched_load...
422
  }
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
423
424
425
  static void
  update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  {
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
426
427
428
429
  	if (dattr->relax_domain_level < c->relax_domain_level)
  		dattr->relax_domain_level = c->relax_domain_level;
  	return;
  }
f5393693e   Lai Jiangshan   cpuset: speed up ...
430
431
432
433
434
435
436
437
438
439
440
441
442
/*
 * Apply update_domain_attr() to every load-balanced cpuset in the
 * subtree rooted at @c, using a breadth-first walk via stack_list.
 * Subtrees below a cpuset with an empty cpus_allowed are skipped
 * (their children are never queued).
 */
static void
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
{
	LIST_HEAD(q);

	list_add(&c->stack_list, &q);
	while (!list_empty(&q)) {
		struct cpuset *cp;
		struct cgroup *cont;
		struct cpuset *child;

		cp = list_first_entry(&q, struct cpuset, stack_list);
		/* q.next is cp's stack_list entry; dequeue it */
		list_del(q.next);
		if (cpumask_empty(cp->cpus_allowed))
			continue;

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}
}
029190c51   Paul Jackson   cpuset sched_load...
455
  /*
cf417141c   Max Krasnyansky   sched, cpuset: re...
456
457
458
459
460
461
462
463
464
   * generate_sched_domains()
   *
   * This function builds a partial partition of the systems CPUs
   * A 'partial partition' is a set of non-overlapping subsets whose
   * union is a subset of that set.
   * The output of this function needs to be passed to kernel/sched.c
   * partition_sched_domains() routine, which will rebuild the scheduler's
   * load balancing domains (sched domains) as specified by that partial
   * partition.
029190c51   Paul Jackson   cpuset sched_load...
465
   *
45ce80fb6   Li Zefan   cgroups: consolid...
466
   * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
029190c51   Paul Jackson   cpuset sched_load...
467
468
469
470
471
472
473
   * for a background explanation of this.
   *
   * Does not return errors, on the theory that the callers of this
   * routine would rather not worry about failures to rebuild sched
   * domains when operating in the severe memory shortage situations
   * that could cause allocation failures below.
   *
cf417141c   Max Krasnyansky   sched, cpuset: re...
474
   * Must be called with cgroup_lock held.
029190c51   Paul Jackson   cpuset sched_load...
475
476
   *
   * The three key local variables below are:
aeed68242   Li Zefan   cpuset: clean up ...
477
   *    q  - a linked-list queue of cpuset pointers, used to implement a
029190c51   Paul Jackson   cpuset sched_load...
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
   *	   top-down scan of all cpusets.  This scan loads a pointer
   *	   to each cpuset marked is_sched_load_balance into the
   *	   array 'csa'.  For our purposes, rebuilding the schedulers
   *	   sched domains, we can ignore !is_sched_load_balance cpusets.
   *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
   *	   that need to be load balanced, for convenient iterative
   *	   access by the subsequent code that finds the best partition,
   *	   i.e the set of domains (subsets) of CPUs such that the
   *	   cpus_allowed of every cpuset marked is_sched_load_balance
   *	   is a subset of one of these domains, while there are as
   *	   many such domains as possible, each as small as possible.
   * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
   *	   the kernel/sched.c routine partition_sched_domains() in a
   *	   convenient format, that can be easily compared to the prior
   *	   value to determine what partition elements (sched domains)
   *	   were changed (added or removed.)
   *
   * Finding the best partition (set of domains):
   *	The triple nested loops below over i, j, k scan over the
   *	load balanced cpusets (using the array of cpuset pointers in
   *	csa[]) looking for pairs of cpusets that have overlapping
   *	cpus_allowed, but which don't have the same 'pn' partition
   *	number and gives them in the same partition number.  It keeps
   *	looping on the 'restart' label until it can no longer find
   *	any such pairs.
   *
   *	The union of the cpus_allowed masks from the set of
   *	all cpusets having the same 'pn' value then form the one
   *	element of the partition (one sched domain) to be passed to
   *	partition_sched_domains().
   */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
509
  static int generate_sched_domains(cpumask_var_t **domains,
cf417141c   Max Krasnyansky   sched, cpuset: re...
510
  			struct sched_domain_attr **attributes)
029190c51   Paul Jackson   cpuset sched_load...
511
  {
cf417141c   Max Krasnyansky   sched, cpuset: re...
512
  	LIST_HEAD(q);		/* queue of cpusets to be scanned */
029190c51   Paul Jackson   cpuset sched_load...
513
514
515
516
  	struct cpuset *cp;	/* scans q */
  	struct cpuset **csa;	/* array of all cpuset ptrs */
  	int csn;		/* how many cpuset ptrs in csa so far */
  	int i, j, k;		/* indices for partition finding loops */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
517
  	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
518
  	struct sched_domain_attr *dattr;  /* attributes for custom domains */
1583715dd   Ingo Molnar   sched, cpusets: f...
519
  	int ndoms = 0;		/* number of sched domains in result */
6af866af3   Li Zefan   cpuset: remove re...
520
  	int nslot;		/* next empty doms[] struct cpumask slot */
029190c51   Paul Jackson   cpuset sched_load...
521

029190c51   Paul Jackson   cpuset sched_load...
522
  	doms = NULL;
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
523
  	dattr = NULL;
cf417141c   Max Krasnyansky   sched, cpuset: re...
524
  	csa = NULL;
029190c51   Paul Jackson   cpuset sched_load...
525
526
527
  
  	/* Special case for the 99% of systems with one, full, sched domain */
  	if (is_sched_load_balance(&top_cpuset)) {
acc3f5d7c   Rusty Russell   cpumask: Partitio...
528
529
  		ndoms = 1;
  		doms = alloc_sched_domains(ndoms);
029190c51   Paul Jackson   cpuset sched_load...
530
  		if (!doms)
cf417141c   Max Krasnyansky   sched, cpuset: re...
531
  			goto done;
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
532
533
534
  		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
  		if (dattr) {
  			*dattr = SD_ATTR_INIT;
93a655755   Li Zefan   cpuset: fix wrong...
535
  			update_domain_attr_tree(dattr, &top_cpuset);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
536
  		}
acc3f5d7c   Rusty Russell   cpumask: Partitio...
537
  		cpumask_copy(doms[0], top_cpuset.cpus_allowed);
cf417141c   Max Krasnyansky   sched, cpuset: re...
538

cf417141c   Max Krasnyansky   sched, cpuset: re...
539
  		goto done;
029190c51   Paul Jackson   cpuset sched_load...
540
  	}
029190c51   Paul Jackson   cpuset sched_load...
541
542
543
544
  	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
  	if (!csa)
  		goto done;
  	csn = 0;
aeed68242   Li Zefan   cpuset: clean up ...
545
546
  	list_add(&top_cpuset.stack_list, &q);
  	while (!list_empty(&q)) {
029190c51   Paul Jackson   cpuset sched_load...
547
548
  		struct cgroup *cont;
  		struct cpuset *child;   /* scans child cpusets of cp */
489a5393a   Lai Jiangshan   cpuset: don't pas...
549

aeed68242   Li Zefan   cpuset: clean up ...
550
551
  		cp = list_first_entry(&q, struct cpuset, stack_list);
  		list_del(q.next);
300ed6cbb   Li Zefan   cpuset: convert c...
552
  		if (cpumask_empty(cp->cpus_allowed))
489a5393a   Lai Jiangshan   cpuset: don't pas...
553
  			continue;
f5393693e   Lai Jiangshan   cpuset: speed up ...
554
555
556
557
558
559
560
  		/*
  		 * All child cpusets contain a subset of the parent's cpus, so
  		 * just skip them, and then we call update_domain_attr_tree()
  		 * to calc relax_domain_level of the corresponding sched
  		 * domain.
  		 */
  		if (is_sched_load_balance(cp)) {
029190c51   Paul Jackson   cpuset sched_load...
561
  			csa[csn++] = cp;
f5393693e   Lai Jiangshan   cpuset: speed up ...
562
563
  			continue;
  		}
489a5393a   Lai Jiangshan   cpuset: don't pas...
564

029190c51   Paul Jackson   cpuset sched_load...
565
566
  		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
  			child = cgroup_cs(cont);
aeed68242   Li Zefan   cpuset: clean up ...
567
  			list_add_tail(&child->stack_list, &q);
029190c51   Paul Jackson   cpuset sched_load...
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
  		}
    	}
  
  	for (i = 0; i < csn; i++)
  		csa[i]->pn = i;
  	ndoms = csn;
  
  restart:
  	/* Find the best partition (set of sched domains) */
  	for (i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		int apn = a->pn;
  
  		for (j = 0; j < csn; j++) {
  			struct cpuset *b = csa[j];
  			int bpn = b->pn;
  
  			if (apn != bpn && cpusets_overlap(a, b)) {
  				for (k = 0; k < csn; k++) {
  					struct cpuset *c = csa[k];
  
  					if (c->pn == bpn)
  						c->pn = apn;
  				}
  				ndoms--;	/* one less element */
  				goto restart;
  			}
  		}
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
597
598
599
600
  	/*
  	 * Now we know how many domains to create.
  	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
  	 */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
601
  	doms = alloc_sched_domains(ndoms);
700018e0a   Li Zefan   cpuset: fix regre...
602
  	if (!doms)
cf417141c   Max Krasnyansky   sched, cpuset: re...
603
  		goto done;
cf417141c   Max Krasnyansky   sched, cpuset: re...
604
605
606
607
608
  
  	/*
  	 * The rest of the code, including the scheduler, can deal with
  	 * dattr==NULL case. No need to abort if alloc fails.
  	 */
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
609
  	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
029190c51   Paul Jackson   cpuset sched_load...
610
611
612
  
  	for (nslot = 0, i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
6af866af3   Li Zefan   cpuset: remove re...
613
  		struct cpumask *dp;
029190c51   Paul Jackson   cpuset sched_load...
614
  		int apn = a->pn;
cf417141c   Max Krasnyansky   sched, cpuset: re...
615
616
617
618
  		if (apn < 0) {
  			/* Skip completed partitions */
  			continue;
  		}
acc3f5d7c   Rusty Russell   cpumask: Partitio...
619
  		dp = doms[nslot];
cf417141c   Max Krasnyansky   sched, cpuset: re...
620
621
622
623
624
625
626
627
628
629
630
  
  		if (nslot == ndoms) {
  			static int warnings = 10;
  			if (warnings) {
  				printk(KERN_WARNING
  				 "rebuild_sched_domains confused:"
  				  " nslot %d, ndoms %d, csn %d, i %d,"
  				  " apn %d
  ",
  				  nslot, ndoms, csn, i, apn);
  				warnings--;
029190c51   Paul Jackson   cpuset sched_load...
631
  			}
cf417141c   Max Krasnyansky   sched, cpuset: re...
632
633
  			continue;
  		}
029190c51   Paul Jackson   cpuset sched_load...
634

6af866af3   Li Zefan   cpuset: remove re...
635
  		cpumask_clear(dp);
cf417141c   Max Krasnyansky   sched, cpuset: re...
636
637
638
639
640
641
  		if (dattr)
  			*(dattr + nslot) = SD_ATTR_INIT;
  		for (j = i; j < csn; j++) {
  			struct cpuset *b = csa[j];
  
  			if (apn == b->pn) {
300ed6cbb   Li Zefan   cpuset: convert c...
642
  				cpumask_or(dp, dp, b->cpus_allowed);
cf417141c   Max Krasnyansky   sched, cpuset: re...
643
644
645
646
647
  				if (dattr)
  					update_domain_attr_tree(dattr + nslot, b);
  
  				/* Done with this partition */
  				b->pn = -1;
029190c51   Paul Jackson   cpuset sched_load...
648
  			}
029190c51   Paul Jackson   cpuset sched_load...
649
  		}
cf417141c   Max Krasnyansky   sched, cpuset: re...
650
  		nslot++;
029190c51   Paul Jackson   cpuset sched_load...
651
652
  	}
  	BUG_ON(nslot != ndoms);
cf417141c   Max Krasnyansky   sched, cpuset: re...
653
654
  done:
  	kfree(csa);
700018e0a   Li Zefan   cpuset: fix regre...
655
656
657
658
659
660
  	/*
  	 * Fallback to the default domain if kmalloc() failed.
  	 * See comments in partition_sched_domains().
  	 */
  	if (doms == NULL)
  		ndoms = 1;
cf417141c   Max Krasnyansky   sched, cpuset: re...
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
  	*domains    = doms;
  	*attributes = dattr;
  	return ndoms;
  }
  
/*
 * Rebuild scheduler domains.
 *
 * Call with neither cgroup_mutex held nor within get_online_cpus().
 * Takes both cgroup_mutex and get_online_cpus().
 *
 * Cannot be directly called from cpuset code handling changes
 * to the cpuset pseudo-filesystem, because it cannot be called
 * from code that already holds cgroup_mutex.
 */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	/*
	 * Hold CPUs stable for the whole rebuild; note the lock order:
	 * get_online_cpus() outside, cgroup_lock() inside.  User-initiated
	 * cpuset changes take these the other way round, which is why they
	 * reach this function only via the workqueue (see
	 * async_rebuild_sched_domains()).
	 */
	get_online_cpus();

	/* Generate domain masks and attrs */
	cgroup_lock();
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	put_online_cpus();
}
db7f47cf4   Paul Menage   cpusets: allow cp...
692
693
694
695
  #else /* !CONFIG_SMP */
  static void do_rebuild_sched_domains(struct work_struct *unused)
  {
  }
e1b8090bd   Geert Uytterhoeven   cpumask: Fix gene...
696
  static int generate_sched_domains(cpumask_var_t **domains,
db7f47cf4   Paul Menage   cpusets: allow cp...
697
698
699
700
701
702
  			struct sched_domain_attr **attributes)
  {
  	*domains = NULL;
  	return 1;
  }
  #endif /* CONFIG_SMP */
029190c51   Paul Jackson   cpuset sched_load...
703

cf417141c   Max Krasnyansky   sched, cpuset: re...
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);

/*
 * Rebuild scheduler domains, asynchronously via workqueue.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * The rebuild_sched_domains() and partition_sched_domains()
 * routines must nest cgroup_lock() inside get_online_cpus(),
 * but such cpuset changes as these must nest that locking the
 * other way, holding cgroup_lock() for much of the code.
 *
 * So in order to avoid an ABBA deadlock, the cpuset code handling
 * these user changes delegates the actual sched domain rebuilding
 * to a separate workqueue thread, which ends up processing the
 * above do_rebuild_sched_domains() function.
 */
static void async_rebuild_sched_domains(void)
{
	/*
	 * Queue on cpuset_wq; the work item retakes the locks in the
	 * deadlock-free order (cpus first, then cgroup).
	 */
	queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
  
/*
 * Accomplishes the same scheduler domain rebuild as the above
 * async_rebuild_sched_domains(), however it directly calls the
 * rebuild routine synchronously rather than calling it via an
 * asynchronous work thread.
 *
 * This can only be called from code that is not holding
 * cgroup_mutex (not nested in a cgroup_lock() call.)
 */
void rebuild_sched_domains(void)
{
	do_rebuild_sched_domains(NULL);
}
58f4790b7   Cliff Wickman   cpusets: update_c...
743
744
745
746
747
  /**
   * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
   * @tsk: task to test
   * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
   *
2df167a30   Paul Menage   cgroups: update c...
748
   * Call with cgroup_mutex held.  May take callback_mutex during call.
58f4790b7   Cliff Wickman   cpusets: update_c...
749
750
751
   * Called for each task in a cgroup by cgroup_scan_tasks().
   * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
   * words, if its mask is not equal to its cpuset's mask).
053199edf   Paul Jackson   [PATCH] cpusets: ...
752
   */
9e0c914ca   Adrian Bunk   kernel/cpuset.c: ...
753
754
  static int cpuset_test_cpumask(struct task_struct *tsk,
  			       struct cgroup_scanner *scan)
58f4790b7   Cliff Wickman   cpusets: update_c...
755
  {
300ed6cbb   Li Zefan   cpuset: convert c...
756
  	return !cpumask_equal(&tsk->cpus_allowed,
58f4790b7   Cliff Wickman   cpusets: update_c...
757
758
  			(cgroup_cs(scan->cg))->cpus_allowed);
  }
053199edf   Paul Jackson   [PATCH] cpusets: ...
759

58f4790b7   Cliff Wickman   cpusets: update_c...
760
761
762
763
764
765
766
767
768
769
770
  /**
   * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
   * @tsk: task to test
   * @scan: struct cgroup_scanner containing the cgroup of the task
   *
   * Called by cgroup_scan_tasks() for each task in a cgroup whose
   * cpus_allowed mask needs to be changed.
   *
   * We don't need to re-check for the cgroup/cpuset membership, since we're
   * holding cgroup_lock() at this point.
   */
9e0c914ca   Adrian Bunk   kernel/cpuset.c: ...
771
772
  static void cpuset_change_cpumask(struct task_struct *tsk,
  				  struct cgroup_scanner *scan)
58f4790b7   Cliff Wickman   cpusets: update_c...
773
  {
300ed6cbb   Li Zefan   cpuset: convert c...
774
  	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
58f4790b7   Cliff Wickman   cpusets: update_c...
775
776
777
  }
  
  /**
0b2f630a2   Miao Xie   cpusets: restruct...
778
779
   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
4e74339af   Li Zefan   cpuset: avoid cha...
780
   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
0b2f630a2   Miao Xie   cpusets: restruct...
781
782
783
784
785
786
   *
   * Called with cgroup_mutex held
   *
   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
   * calling callback functions for each.
   *
4e74339af   Li Zefan   cpuset: avoid cha...
787
788
   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
   * if @heap != NULL.
0b2f630a2   Miao Xie   cpusets: restruct...
789
   */
4e74339af   Li Zefan   cpuset: avoid cha...
790
  static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
0b2f630a2   Miao Xie   cpusets: restruct...
791
792
  {
  	struct cgroup_scanner scan;
0b2f630a2   Miao Xie   cpusets: restruct...
793
794
795
796
  
  	scan.cg = cs->css.cgroup;
  	scan.test_task = cpuset_test_cpumask;
  	scan.process_task = cpuset_change_cpumask;
4e74339af   Li Zefan   cpuset: avoid cha...
797
798
  	scan.heap = heap;
  	cgroup_scan_tasks(&scan);
0b2f630a2   Miao Xie   cpusets: restruct...
799
800
801
  }
  
/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset; its cpus_allowed is overwritten with the parse of @buf
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * Returns 0 on success, -errno on failure.  Call with cgroup_mutex held.
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	struct ptr_heap heap;
	int retval;
	int is_load_balanced;

	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		/* Only currently-schedulable cpus may be requested. */
		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
			return -EINVAL;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	/* Pre-allocate the scan heap so cgroup_scan_tasks() can't fail. */
	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	/* Sample the flag from trialcs before cs->flags can change again. */
	is_load_balanced = is_sched_load_balance(trialcs);

	mutex_lock(&callback_mutex);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	/*
	 * Scan tasks in the cpuset, and update the cpumasks of any
	 * that need an update.
	 */
	update_tasks_cpumask(cs, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		async_rebuild_sched_domains();
	return 0;
}
053199edf   Paul Jackson   [PATCH] cpusets: ...
861
/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    Call holding cgroup_mutex, so current's cpuset won't change
 *    during this call, as manage_mutex holds off any cpuset_attach()
 *    calls.  Therefore we don't need to take task_lock around the
 *    call to guarantee_online_mems(), as we know no one is changing
 *    our task's cpuset.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	/* Widen current's allowed mems so migration can allocate on @to. */
	tsk->mems_allowed = *to;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	/* Restore current's mems_allowed to what its cpuset permits. */
	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
}
3b6766fe6   Li Zefan   cpuset: rewrite u...
890
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing newly
 * disallowed ones.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
repeat:
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return;

	task_lock(tsk);
	/* Step 1: grow the mask — old | new, so no node vanishes yet. */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	/*
	 * ensure checking ->mems_allowed_change_disable after setting all new
	 * allowed nodes.
	 *
	 * the read-side task can see an nodemask with new allowed nodes and
	 * old allowed nodes. and if it allocates page when cpuset clears newly
	 * disallowed ones continuous, it can see the new allowed bits.
	 *
	 * And if setting all new allowed nodes is after the checking, setting
	 * all new allowed nodes and clearing newly disallowed ones will be done
	 * continuous, and the read-side task may find no node to alloc page.
	 */
	smp_mb();

	/*
	 * Allocation of memory is very fast, we needn't sleep when waiting
	 * for the read-side.
	 */
	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
		task_unlock(tsk);
		if (!task_curr(tsk))
			yield();
		goto repeat;
	}

	/*
	 * ensure checking ->mems_allowed_change_disable before clearing all new
	 * disallowed nodes.
	 *
	 * if clearing newly disallowed bits before the checking, the read-side
	 * task may find no node to alloc page.
	 */
	smp_mb();

	/* Step 2: now it is safe to drop the newly-disallowed nodes. */
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;
	task_unlock(tsk);
}
  
/*
 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
 * memory_migrate flag is set. Called with cgroup_mutex held.
 */
static void cpuset_change_nodemask(struct task_struct *p,
				   struct cgroup_scanner *scan)
{
	struct mm_struct *mm;
	struct cpuset *cs;
	int migrate;
	const nodemask_t *oldmem = scan->data;	/* old mems, stashed by caller */
	static nodemask_t newmems;	/* protected by cgroup_mutex */

	cs = cgroup_cs(scan->cg);
	guarantee_online_mems(cs, &newmems);

	cpuset_change_task_nodemask(p, &newmems);

	/* Kernel threads (and exited tasks) have no mm to rebind. */
	mm = get_task_mm(p);
	if (!mm)
		return;

	migrate = is_memory_migrate(cs);

	mpol_rebind_mm(mm, &cs->mems_allowed);
	if (migrate)
		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
	mmput(mm);
}
8793d854e   Paul Menage   Task Control Grou...
985
/* Cpuset whose mems are currently being rebound, or NULL; see mpol_dup(). */
static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
				 struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;			/* process every task */
	scan.process_task = cpuset_change_nodemask;
	scan.heap = heap;
	scan.data = (nodemask_t *)oldmem;	/* retrieved by the callback */

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cgroup_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	cgroup_scan_tasks(&scan);

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}
0b2f630a2   Miao Xie   cpusets: restruct...
1024
1025
1026
/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
	int retval;
	struct ptr_heap heap;

	if (!oldmem)
		return -ENOMEM;

	/*
	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;

		/* Only nodes that actually have memory may be requested. */
		if (!nodes_subset(trialcs->mems_allowed,
				node_states[N_HIGH_MEMORY])) {
			retval =  -EINVAL;
			goto done;
		}
	}
	*oldmem = cs->mems_allowed;
	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	/* Pre-allocate the scan heap so cgroup_scan_tasks() can't fail. */
	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs->mems_allowed;
	mutex_unlock(&callback_mutex);

	update_tasks_nodemask(cs, oldmem, &heap);

	heap_free(&heap);
done:
	NODEMASK_FREE(oldmem);
	return retval;
}
8793d854e   Paul Menage   Task Control Grou...
1094
1095
1096
1097
  int current_cpuset_is_being_rebound(void)
  {
  	return task_cs(current) == cpuset_being_rebound;
  }
5be7a4792   Paul Menage   Fix cpuset sched_...
1098
  static int update_relax_domain_level(struct cpuset *cs, s64 val)
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1099
  {
db7f47cf4   Paul Menage   cpusets: allow cp...
1100
  #ifdef CONFIG_SMP
60495e776   Peter Zijlstra   sched: Dynamic sc...
1101
  	if (val < -1 || val >= sched_domain_level_max)
30e0e1781   Li Zefan   cpuset: limit the...
1102
  		return -EINVAL;
db7f47cf4   Paul Menage   cpusets: allow cp...
1103
  #endif
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1104
1105
1106
  
  	if (val != cs->relax_domain_level) {
  		cs->relax_domain_level = val;
300ed6cbb   Li Zefan   cpuset: convert c...
1107
1108
  		if (!cpumask_empty(cs->cpus_allowed) &&
  		    is_sched_load_balance(cs))
cf417141c   Max Krasnyansky   sched, cpuset: re...
1109
  			async_rebuild_sched_domains();
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1110
1111
1112
1113
  	}
  
  	return 0;
  }
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1114
/*
 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
 * @tsk: task to be updated
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
static void cpuset_change_flag(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
}
  
  /*
   * update_tasks_flags - update the spread flags of tasks in the cpuset.
   * @cs: the cpuset in which each task's spread flags needs to be changed
   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
   *
   * Called with cgroup_mutex held
   *
   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
   * calling callback functions for each.
   *
   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
   * if @heap != NULL.
   */
  static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
  {
  	struct cgroup_scanner scan;
  
  	scan.cg = cs->css.cgroup;
  	scan.test_task = NULL;
  	scan.process_task = cpuset_change_flag;
  	scan.heap = heap;
  	cgroup_scan_tasks(&scan);
  }
  
/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on: 	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	struct ptr_heap heap;
	int err;

	/* Work on a trial copy so validation can't corrupt the real cpuset. */
	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	/* Pre-allocate the scan heap so cgroup_scan_tasks() can't fail. */
	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (err < 0)
		goto out;

	/* Record which derived properties change before committing flags. */
	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs->flags;
	mutex_unlock(&callback_mutex);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		async_rebuild_sched_domains();

	if (spread_flag_changed)
		update_tasks_flags(cs, &heap);
	heap_free(&heap);
out:
	free_trial_cpuset(trialcs);
	return err;
}
053199edf   Paul Jackson   [PATCH] cpusets: ...
1205
/*
 * Frequency meter - How fast is some event occurring?
   *
   * These routines manage a digitally filtered, constant time based,
   * event frequency meter.  There are four routines:
   *   fmeter_init() - initialize a frequency meter.
   *   fmeter_markevent() - called each time the event happens.
   *   fmeter_getrate() - returns the recent rate of such events.
   *   fmeter_update() - internal routine used to update fmeter.
   *
   * A common data structure is passed to each of these routines,
   * which is used to keep track of the state required to manage the
   * frequency meter and its digital filter.
   *
   * The filter works on the number of events marked per unit time.
   * The filter is single-pole low-pass recursive (IIR).  The time unit
   * is 1 second.  Arithmetic is done using 32-bit integers scaled to
   * simulate 3 decimal digits of precision (multiplied by 1000).
   *
   * With an FM_COEF of 933, and a time base of 1 second, the filter
   * has a half-life of 10 seconds, meaning that if the events quit
   * happening, then the rate returned from the fmeter_getrate()
   * will be cut in half each 10 seconds, until it converges to zero.
   *
   * It is not worth doing a real infinitely recursive filter.  If more
   * than FM_MAXTICKS ticks have elapsed since the last filter event,
   * just compute FM_MAXTICKS ticks worth, by which point the level
   * will be stable.
   *
   * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
   * arithmetic overflow in the fmeter_update() routine.
   *
   * Given the simple 32 bit integer arithmetic used, this meter works
   * best for reporting rates between one per millisecond (msec) and
   * one per 32 (approx) seconds.  At constant rates faster than one
   * per msec it maxes out at values just under 1,000,000.  At constant
   * rates between one per msec, and one per second it will stabilize
   * to a value N*1000, where N is the rate of events per second.
   * At constant rates between one per second and one per 32 seconds,
   * it will be choppy, moving up on the seconds that have an event,
   * and then decaying until the next event.  At rates slower than
   * about one in 32 seconds, it decays all the way back to zero between
   * each event.
   */
  
  #define FM_COEF 933		/* coefficient for half-life of 10 secs */
  #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
  #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
  #define FM_SCALE 1000		/* faux fixed point scale */
  
  /* Initialize a frequency meter */
  static void fmeter_init(struct fmeter *fmp)
  {
  	fmp->cnt = 0;
  	fmp->val = 0;
  	fmp->time = 0;
  	spin_lock_init(&fmp->lock);
  }
  
/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	/* Filter steps at most once per second; nothing to do yet. */
	if (ticks == 0)
		return;

	/*
	 * Apply one decay step per elapsed second, capped at FM_MAXTICKS
	 * (beyond that the level has decayed to a stable value anyway).
	 */
	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	/* Fold the pending (scaled) event count into the filtered level. */
	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}
  
/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	/* Clamp at FM_MAXCNT so bursts can't overflow the filter math. */
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}
  
/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);	/* decay first so the reading is current */
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}
2341d1b65   Li Zefan   cpuset: convert c...
1302
1303
  /* Protected by cgroup_lock */
  static cpumask_var_t cpus_attach;
2df167a30   Paul Menage   cgroups: update c...
1304
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			     struct task_struct *tsk, bool threadgroup)
{
	int ret;
	struct cpuset *cs = cgroup_cs(cont);

	/* A cpuset with no cpus or no mems cannot host any task. */
	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;

	/*
	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
	 * cannot change their cpu affinity and isolating such threads by their
	 * set of allowed nodes is unnecessary.  Thus, cpusets are not
	 * applicable for such threads.  This prevents checking for success of
	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
	 * be changed.
	 */
	if (tsk->flags & PF_THREAD_BOUND)
		return -EINVAL;

	ret = security_task_setscheduler(tsk);
	if (ret)
		return ret;
	if (threadgroup) {
		struct task_struct *c;

		/*
		 * Moving a whole thread group: every thread must pass the
		 * security check, walked under RCU.
		 */
		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			ret = security_task_setscheduler(c);
			if (ret) {
				rcu_read_unlock();
				return ret;
			}
		}
		rcu_read_unlock();
	}
	return 0;
}
  
/* Rebind one task's cpu affinity, mempolicy nodes and spread flags to @cs. */
static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
			       struct cpuset *cs)
{
	int err;
	/*
	 * can_attach beforehand should guarantee that this doesn't fail.
	 * TODO: have a better way to handle failure here
	 */
	err = set_cpus_allowed_ptr(tsk, cpus_attach);
	WARN_ON_ONCE(err);

	cpuset_change_task_nodemask(tsk, to);
	cpuset_update_task_spread_flag(cs, tsk);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1357

be367d099   Ben Blum   cgroups: let ss->...
1358
1359
1360
/* Attach @tsk (and optionally its whole thread group) to cpuset @cont. */
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			  struct cgroup *oldcont, struct task_struct *tsk,
			  bool threadgroup)
{
	struct mm_struct *mm;
	struct cpuset *cs = cgroup_cs(cont);
	struct cpuset *oldcs = cgroup_cs(oldcont);
	static nodemask_t to;		/* protected by cgroup_mutex */

	/* Compute the cpu and mem masks the moved task(s) will get. */
	if (cs == &top_cpuset) {
		cpumask_copy(cpus_attach, cpu_possible_mask);
	} else {
		guarantee_online_cpus(cs, cpus_attach);
	}
	guarantee_online_mems(cs, &to);

	/* do per-task migration stuff possibly for each in the threadgroup */
	cpuset_attach_task(tsk, &to, cs);
	if (threadgroup) {
		struct task_struct *c;
		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			cpuset_attach_task(c, &to, cs);
		}
		rcu_read_unlock();
	}

	/* change mm; only needs to be done once even if threadgroup */
	to = cs->mems_allowed;
	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, &to);
		/* Optionally migrate the mm's pages from the old nodes too. */
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
		mmput(mm);
	}
}
  
/* The various types of files and directories in a cpuset file system */

/* Stored in cftype.private to select behavior in the common handlers. */
typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;
700fe1ab9   Paul Menage   CGroup API files:...
1412
1413
1414
1415
1416
/*
 * Write handler for the boolean (0/1, u64) cpuset control files;
 * cft->private selects which flag to update.
 */
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	/* Fails if the cgroup is being removed. */
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		/* Per-cpuset memory_pressure is read-only. */
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}
5be7a4792   Paul Menage   Fix cpuset sched_...
1455
1456
1457
1458
1459
  static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
  {
  	int retval = 0;
  	struct cpuset *cs = cgroup_cs(cgrp);
  	cpuset_filetype_t type = cft->private;
e37123953   Paul Menage   cgroup files: rem...
1460
  	if (!cgroup_lock_live_group(cgrp))
5be7a4792   Paul Menage   Fix cpuset sched_...
1461
  		return -ENODEV;
e37123953   Paul Menage   cgroup files: rem...
1462

5be7a4792   Paul Menage   Fix cpuset sched_...
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		retval = update_relax_domain_level(cs, val);
  		break;
  	default:
  		retval = -EINVAL;
  		break;
  	}
  	cgroup_unlock();
  	return retval;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1474
/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
				const char *buf)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *trialcs;

	/* Fails if the cgroup is being removed. */
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	/* Stage the update in a trial copy of the cpuset. */
	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out;
	}

	switch (cft->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out:
	cgroup_unlock();
	return retval;
}
  
/*
   * These ascii lists should be read in a single call, by using a user
   * buffer large enough to hold the entire map.  If read in smaller
   * chunks, there is no guarantee of atomicity.  Since the display format
   * used, list of ranges of sequential numbers, is variable length,
   * and since these maps can change value dynamically, one could read
   * gibberish by doing partial reads while a list was changing.
   * A single large read to a buffer that crosses a page boundary is
   * ok, because the result being copied to user land is not recomputed
   * across a page fault.
   */
9303e0c48   Li Zefan   cpuset: remove un...
1521
  static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1522
  {
9303e0c48   Li Zefan   cpuset: remove un...
1523
  	size_t count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1524

3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1525
  	mutex_lock(&callback_mutex);
9303e0c48   Li Zefan   cpuset: remove un...
1526
  	count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1527
  	mutex_unlock(&callback_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1528

9303e0c48   Li Zefan   cpuset: remove un...
1529
  	return count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1530
  }
9303e0c48   Li Zefan   cpuset: remove un...
1531
  static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1532
  {
9303e0c48   Li Zefan   cpuset: remove un...
1533
  	size_t count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1534

3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1535
  	mutex_lock(&callback_mutex);
9303e0c48   Li Zefan   cpuset: remove un...
1536
  	count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1537
  	mutex_unlock(&callback_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1538

9303e0c48   Li Zefan   cpuset: remove un...
1539
  	return count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1540
  }
8793d854e   Paul Menage   Task Control Grou...
1541
1542
1543
1544
1545
/*
 * Read handler for the "cpus" and "mems" files: format the requested
 * mask into a freshly allocated page and copy it to user space.
 */
static ssize_t cpuset_common_file_read(struct cgroup *cont,
				       struct cftype *cft,
				       struct file *file,
				       char __user *buf,
				       size_t nbytes, loff_t *ppos)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	char *page;
	ssize_t retval = 0;
	char *s;

	if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
		return -ENOMEM;

	s = page;

	switch (type) {
	case FILE_CPULIST:
		s += cpuset_sprintf_cpulist(s, cs);
		break;
	case FILE_MEMLIST:
		s += cpuset_sprintf_memlist(s, cs);
		break;
	default:
		retval = -EINVAL;
		goto out;
	}
	*s++ = '\n';

	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
	free_page((unsigned long)page);
	return retval;
}
700fe1ab9   Paul Menage   CGroup API files:...
1577
1578
1579
1580
1581
1582
1583
1584
1585
/*
 * Read handler for the u64 cpuset control files; cft->private selects
 * which flag or value to report.
 */
static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1607

5be7a4792   Paul Menage   Fix cpuset sched_...
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
/*
 * Read handler for the signed (s64) cpuset control files; only the
 * sched_relax_domain_level file is routed here.
 */
static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1622
1623
1624
1625
  
/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype files[] = {
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		/* room for an ascii range list covering every cpu */
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		/* room for an ascii range list covering every node */
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		/* read-only: cpuset_write_u64() rejects writes with -EACCES */
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},
};
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1707
1708
/* Added only to the root cpuset directory (see cpuset_populate()). */
static struct cftype cft_memory_pressure_enabled = {
	.name = "memory_pressure_enabled",
	.read_u64 = cpuset_read_u64,
	.write_u64 = cpuset_write_u64,
	.private = FILE_MEMORY_PRESSURE_ENABLED,
};
8793d854e   Paul Menage   Task Control Grou...
1713
/* Populate a new cpuset cgroup directory with its control files. */
static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	int err;

	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
	if (err)
		return err;
	/* memory_pressure_enabled is in root cpuset only */
	if (!cont->parent)
		err = cgroup_add_file(cont, ss,
				      &cft_memory_pressure_enabled);
	return err;
}
  
/*
 * post_clone() is called at the end of cgroup_clone().
 * 'cgroup' was just created automatically as a result of
 * a cgroup_clone(), and the current task is about to
 * be moved into 'cgroup'.
 *
 * Currently we refuse to set up the cgroup - thereby
 * refusing the task to be entered, and as a result refusing
 * the sys_unshare() or clone() which initiated it - if any
 * sibling cpusets have exclusive cpus or mem.
 *
 * If this becomes a problem for some users who wish to
 * allow that scenario, then cpuset_post_clone() could be
 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
 * held.
 */
static void cpuset_post_clone(struct cgroup_subsys *ss,
			      struct cgroup *cgroup)
{
	struct cgroup *parent, *child;
	struct cpuset *cs, *parent_cs;

	parent = cgroup->parent;
	/* Refuse setup if any sibling cpuset is cpu- or mem-exclusive. */
	list_for_each_entry(child, &parent->children, sibling) {
		cs = cgroup_cs(child);
		if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
			return;
	}
	cs = cgroup_cs(cgroup);
	parent_cs = cgroup_cs(parent);

	/* callback_mutex guards readers of the masks being copied. */
	mutex_lock(&callback_mutex);
	cs->mems_allowed = parent_cs->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
	mutex_unlock(&callback_mutex);
	return;
}
  
/*
 *	cpuset_create - create a cpuset
 *	ss:	cpuset cgroup subsystem
 *	cont:	control group that the new cpuset will be part of
 */
static struct cgroup_subsys_state *cpuset_create(
	struct cgroup_subsys *ss,
	struct cgroup *cont)
{
	struct cpuset *cs;
	struct cpuset *parent;

	if (!cont->parent) {
		/* The root cgroup maps to the statically allocated top_cpuset. */
		return &top_cpuset.css;
	}
	parent = cgroup_cs(cont->parent);
	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);
	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	cs->flags = 0;
	/* New cpusets inherit the parent's memory-spread settings. */
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);
	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	/* cpus and mems start empty; a later write populates them. */
	cpumask_clear(cs->cpus_allowed);
	nodes_clear(cs->mems_allowed);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	cs->parent = parent;
	number_of_cpusets++;
	return &cs->css ;
}
029190c51   Paul Jackson   cpuset sched_load...
1803
/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call async_rebuild_sched_domains().
 */
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct cpuset *cs = cgroup_cs(cont);

	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	number_of_cpusets--;
	free_cpumask_var(cs->cpus_allowed);
	kfree(cs);
}
8793d854e   Paul Menage   Task Control Grou...
1819
1820
1821
/*
 * cgroup subsystem registration for cpusets: hooks cpuset creation,
 * destruction, task attach and control-file population into the
 * generic cgroup core.  early_init is set so the subsystem is brought
 * up early in boot (see cpuset_init()).
 */
struct cgroup_subsys cpuset_subsys = {
	.name = "cpuset",
	.create = cpuset_create,
	.destroy = cpuset_destroy,
	.can_attach = cpuset_can_attach,
	.attach = cpuset_attach,
	.populate = cpuset_populate,
	.post_clone = cpuset_post_clone,
	.subsys_id = cpuset_subsys_id,
	.early_init = 1,
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1830
1831
1832
1833
1834
1835
1836
1837
/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system.
 *
 * Returns 0 on success, or the error from register_filesystem().
 **/

int __init cpuset_init(void)
{
	int err = 0;

	/* Boot-time allocation failure is fatal: cpusets cannot work without it */
	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
		BUG();
	/* top_cpuset starts with every possible cpu and node allowed */
	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	/* scratch mask used by cpuset_attach(); allocated once here */
	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
		BUG();
	number_of_cpusets = 1;	/* top_cpuset is the only cpuset so far */
	return 0;
}
956db3ca0   Cliff Wickman   hotplug cpu: move...
1857
1858
1859
1860
1861
1862
1863
1864
  /**
   * cpuset_do_move_task - move a given task to another cpuset
   * @tsk: pointer to task_struct the task to move
   * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
   *
   * Called by cgroup_scan_tasks() for each task in a cgroup.
   * Return nonzero to stop the walk through the tasks.
   */
9e0c914ca   Adrian Bunk   kernel/cpuset.c: ...
1865
1866
  static void cpuset_do_move_task(struct task_struct *tsk,
  				struct cgroup_scanner *scan)
956db3ca0   Cliff Wickman   hotplug cpu: move...
1867
  {
7f81b1ae1   Li Zefan   cpuset: remove st...
1868
  	struct cgroup *new_cgroup = scan->data;
956db3ca0   Cliff Wickman   hotplug cpu: move...
1869

7f81b1ae1   Li Zefan   cpuset: remove st...
1870
  	cgroup_attach_task(new_cgroup, tsk);
956db3ca0   Cliff Wickman   hotplug cpu: move...
1871
1872
1873
1874
1875
1876
1877
  }
  
  /**
   * move_member_tasks_to_cpuset - move tasks from one cpuset to another
   * @from: cpuset in which the tasks currently reside
   * @to: cpuset to which the tasks will be moved
   *
c8d9c90c7   Paul Jackson   hotplug cpu: move...
1878
1879
   * Called with cgroup_mutex held
   * callback_mutex must not be held, as cpuset_attach() will take it.
956db3ca0   Cliff Wickman   hotplug cpu: move...
1880
1881
1882
1883
1884
1885
   *
   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
   * calling callback functions for each.
   */
  static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
  {
7f81b1ae1   Li Zefan   cpuset: remove st...
1886
  	struct cgroup_scanner scan;
956db3ca0   Cliff Wickman   hotplug cpu: move...
1887

7f81b1ae1   Li Zefan   cpuset: remove st...
1888
1889
1890
1891
1892
  	scan.cg = from->css.cgroup;
  	scan.test_task = NULL; /* select all tasks in cgroup */
  	scan.process_task = cpuset_do_move_task;
  	scan.heap = NULL;
  	scan.data = to->css.cgroup;
956db3ca0   Cliff Wickman   hotplug cpu: move...
1893

7f81b1ae1   Li Zefan   cpuset: remove st...
1894
  	if (cgroup_scan_tasks(&scan))
956db3ca0   Cliff Wickman   hotplug cpu: move...
1895
1896
1897
1898
  		printk(KERN_ERR "move_member_tasks_to_cpuset: "
  				"cgroup_scan_tasks failed
  ");
  }
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1899
/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * The cgroup's css_sets list is in use if there are tasks
	 * in the cpuset; the list is empty if there are none;
	 * the cs->css.refcnt seems always 0.
	 */
	if (list_empty(&cs->css.cgroup->css_sets))
		return;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).  The loop therefore
	 * always terminates at or before top_cpuset.
	 */
	parent = cs->parent;
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent->parent;

	move_member_tasks_to_cpuset(cs, parent);
}
  
/*
 * Walk the specified cpuset subtree and look for empty cpusets.
 * The tasks of such cpuset must be moved to a parent cpuset.
 *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
 * cpus_allowed and mems_allowed.
 *
 * This walk processes the tree from top to bottom, completing one layer
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 *
 * For now, since we lack memory hot unplug, we'll never see a cpuset
 * that has tasks along with an empty 'mems'.  But if we did see such
 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
 */
static void scan_for_empty_cpusets(struct cpuset *root)
{
	LIST_HEAD(queue);		/* breadth-first work queue */
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset *child;	/* scans child cpusets of cp */
	struct cgroup *cont;
	static nodemask_t oldmems;	/* protected by cgroup_mutex */

	/* Seed the queue with the subtree root; each cpuset carries its
	 * own stack_list link, so no per-node allocation is needed. */
	list_add_tail((struct list_head *)&root->stack_list, &queue);

	while (!list_empty(&queue)) {
		cp = list_first_entry(&queue, struct cpuset, stack_list);
		list_del(queue.next);
		/* Enqueue children so they are visited after this layer */
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &queue);
		}

		/* Continue past cpusets with all cpus, mems online */
		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
			continue;

		/* Remember the pre-shrink nodemask for update_tasks_nodemask() */
		oldmems = cp->mems_allowed;

		/* Remove offline cpus and mems from this cpuset. */
		mutex_lock(&callback_mutex);
		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
			    cpu_active_mask);
		nodes_and(cp->mems_allowed, cp->mems_allowed,
						node_states[N_HIGH_MEMORY]);
		mutex_unlock(&callback_mutex);

		/* Move tasks from the empty cpuset to a parent */
		if (cpumask_empty(cp->cpus_allowed) ||
		     nodes_empty(cp->mems_allowed))
			remove_tasks_in_empty_cpuset(cp);
		else {
			/* Still non-empty: rebind its tasks to the new masks */
			update_tasks_cpumask(cp, NULL);
			update_tasks_nodemask(cp, &oldmems, NULL);
		}
	}
}
  
/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no effect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_active_mask on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 */
void cpuset_update_active_cpus(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	cgroup_lock();
	/* callback_mutex guards readers of top_cpuset.cpus_allowed */
	mutex_lock(&callback_mutex);
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	mutex_unlock(&callback_mutex);
	/* Prune now-empty cpusets, then recompute the sched domains */
	scan_for_empty_cpusets(&top_cpuset);
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
}
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2017

b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2018
  #ifdef CONFIG_MEMORY_HOTPLUG
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2019
  /*
0e1e7c7a7   Christoph Lameter   Memoryless nodes:...
2020
   * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
cf417141c   Max Krasnyansky   sched, cpuset: re...
2021
2022
   * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
   * See also the previous routine cpuset_track_online_cpus().
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2023
   */
f481891fd   Miao Xie   cpuset: update to...
2024
2025
  static int cpuset_track_online_nodes(struct notifier_block *self,
  				unsigned long action, void *arg)
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2026
  {
ee24d3797   Li Zefan   cpuset: fix unche...
2027
  	static nodemask_t oldmems;	/* protected by cgroup_mutex */
5ab116c93   Miao Xie   cpuset: fix the p...
2028

cf417141c   Max Krasnyansky   sched, cpuset: re...
2029
  	cgroup_lock();
f481891fd   Miao Xie   cpuset: update to...
2030
2031
  	switch (action) {
  	case MEM_ONLINE:
ee24d3797   Li Zefan   cpuset: fix unche...
2032
  		oldmems = top_cpuset.mems_allowed;
0b4217b3f   Li Zefan   cpuset: fix possi...
2033
  		mutex_lock(&callback_mutex);
f481891fd   Miao Xie   cpuset: update to...
2034
  		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
0b4217b3f   Li Zefan   cpuset: fix possi...
2035
  		mutex_unlock(&callback_mutex);
ee24d3797   Li Zefan   cpuset: fix unche...
2036
  		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
5ab116c93   Miao Xie   cpuset: fix the p...
2037
2038
2039
2040
2041
2042
2043
  		break;
  	case MEM_OFFLINE:
  		/*
  		 * needn't update top_cpuset.mems_allowed explicitly because
  		 * scan_for_empty_cpusets() will update it.
  		 */
  		scan_for_empty_cpusets(&top_cpuset);
f481891fd   Miao Xie   cpuset: update to...
2044
2045
2046
2047
  		break;
  	default:
  		break;
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
2048
  	cgroup_unlock();
53feb2976   Miao Xie   cpuset: alloc nod...
2049

f481891fd   Miao Xie   cpuset: update to...
2050
  	return NOTIFY_OK;
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2051
2052
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2053
2054
2055
2056
2057
2058
2059
2060
/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 **/

void __init cpuset_init_smp(void)
{
	/* Replace the boot-time "everything allowed" masks with reality */
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

	/* priority 10: run after core mm memory-hotplug callbacks */
	hotplug_memory_notifier(cpuset_track_online_nodes, 10);

	cpuset_wq = create_singlethread_workqueue("cpuset");
	BUG_ON(!cpuset_wq);
}
  
/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_map, even if this means going outside the
 * tasks cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	/* callback_mutex stabilizes the masks; task_lock pins tsk's cpuset */
	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);
}
9084bb824   Oleg Nesterov   sched: Make selec...
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
/**
 * cpuset_cpus_allowed_fallback - reload tsk->cpus_allowed from its cpuset
 * @tsk: task whose cpus_allowed needs repairing
 *
 * Locklessly copies the task's cpuset cpus_allowed into
 * tsk->cpus_allowed and returns one active CPU from the result,
 * falling back to cpu_possible_mask if none is active.  See the
 * comments below for why the races here are benign.
 */
int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpuset *cs;
	int cpu;

	rcu_read_lock();
	cs = task_cs(tsk);
	if (cs)
		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed. However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 */

	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
	if (cpu >= nr_cpu_ids) {
		/*
		 * Either tsk->cpus_allowed is wrong (see above) or it
		 * is actually empty. The latter case is only possible
		 * if we are racing with remove_tasks_in_empty_cpuset().
		 * Like above we can temporary set any mask and rely on
		 * set_cpus_allowed_ptr() as synchronization point.
		 */
		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
		cpu = cpumask_any(cpu_active_mask);
	}

	return cpu;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2129
2130
/*
 * cpuset_init_current_mems_allowed - allow current to use every node.
 * Used during early boot before the real nodemask is established.
 */
void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2133
/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;

	/* Same locking discipline as cpuset_cpus_allowed() above */
	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_mems(task_cs(tsk), &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}
  
/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2165
  /*
786083667   Paul Menage   Cpuset hardwall f...
2166
2167
2168
2169
   * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
   * mem_hardwall ancestor to the specified cpuset.  Call holding
   * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
   * (an unusual configuration), then returns the root cpuset.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2170
   */
786083667   Paul Menage   Cpuset hardwall f...
2171
  static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2172
  {
786083667   Paul Menage   Cpuset hardwall f...
2173
  	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2174
2175
2176
  		cs = cs->parent;
  	return cs;
  }
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2177
/**
 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
 * set, yes, we can always allocate.  If node is in our task's mems_allowed,
 * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
 * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
 * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
 * flag, yes.
 * Otherwise, no.
 *
 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
 * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
 * might sleep, and might allow a node from an enclosing cpuset.
 *
 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
 * cpusets, and never sleeps.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed and is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_mutex
 * mutex.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	TIF_MEMDIE   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
 *
 * Rule:
 *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
 *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
 *    the code that might scan up ancestor cpusets and sleep.
 */
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
	const struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;			/* is allocation on @node allowed? */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	mutex_lock(&callback_mutex);

	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2268
  /*
a1bc5a4ee   David Rientjes   cpusets: replace ...
2269
2270
   * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
   * @node: is this an allowed node?
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2271
2272
   * @gfp_mask: memory allocation flags
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2273
2274
2275
2276
2277
   * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
   * set, yes, we can always allocate.  If node is in our task's mems_allowed,
   * yes.  If the task has been OOM killed and has access to memory reserves as
   * specified by the TIF_MEMDIE flag, yes.
   * Otherwise, no.
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2278
2279
2280
2281
2282
2283
2284
   *
   * The __GFP_THISNODE placement logic is really handled elsewhere,
   * by forcibly using a zonelist starting at a specified node, and by
   * (in get_page_from_freelist()) refusing to consider the zones for
   * any node on the zonelist except the first.  By the time any such
   * calls get to this routine, we should just shut up and say 'yes'.
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2285
2286
   * Unlike the cpuset_node_allowed_softwall() variant, above,
   * this variant requires that the node be in the current task's
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2287
2288
2289
2290
   * mems_allowed or that we're in interrupt.  It does not scan up the
   * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
   * It never sleeps.
   */
a1bc5a4ee   David Rientjes   cpusets: replace ...
2291
  int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2292
  {
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2293
2294
  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
  		return 1;
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2295
2296
  	if (node_isset(node, current->mems_allowed))
  		return 1;
dedf8b79e   Daniel Walker   whitespace fixes:...
2297
2298
2299
2300
2301
2302
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return 1;
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2303
2304
  	return 0;
  }
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2305
/**
 * cpuset_unlock - release lock on cpuset changes
 *
 * Undo the lock taken in a previous cpuset_lock() call.
 */

void cpuset_unlock(void)
{
	mutex_unlock(&callback_mutex);
}
  
  /**
6adef3ebe   Jack Steiner   cpusets: new roun...
2317
2318
   * cpuset_mem_spread_node() - On which node to begin search for a file page
   * cpuset_slab_spread_node() - On which node to begin search for a slab page
825a46af5   Paul Jackson   [PATCH] cpuset me...
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
   *
   * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
   * tasks in a cpuset with is_spread_page or is_spread_slab set),
   * and if the memory allocation used cpuset_mem_spread_node()
   * to determine on which node to start looking, as it will for
   * certain page cache or slab cache pages such as used for file
   * system buffers and inode caches, then instead of starting on the
   * local node to look for a free page, rather spread the starting
 * node around the task's mems_allowed nodes.
   *
   * We don't have to worry about the returned node being offline
   * because "it can't happen", and even if it did, it would be ok.
   *
   * The routines calling guarantee_online_mems() are careful to
   * only set nodes in task->mems_allowed that are online.  So it
   * should not be possible for the following code to return an
   * offline node.  But if it did, that would be ok, as this routine
   * is not returning the node where the allocation must be, only
   * the node where the search should start.  The zonelist passed to
   * __alloc_pages() will include all nodes.  If the slab allocator
   * is passed an offline node, it will fall back to the local node.
   * See kmem_cache_alloc_node().
   */
6adef3ebe   Jack Steiner   cpusets: new roun...
2342
  static int cpuset_spread_node(int *rotor)
825a46af5   Paul Jackson   [PATCH] cpuset me...
2343
2344
  {
  	int node;
6adef3ebe   Jack Steiner   cpusets: new roun...
2345
  	node = next_node(*rotor, current->mems_allowed);
825a46af5   Paul Jackson   [PATCH] cpuset me...
2346
2347
  	if (node == MAX_NUMNODES)
  		node = first_node(current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2348
  	*rotor = node;
825a46af5   Paul Jackson   [PATCH] cpuset me...
2349
2350
  	return node;
  }
6adef3ebe   Jack Steiner   cpusets: new roun...
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
  
/* Pick the node to begin the search for a file (page cache) page. */
int cpuset_mem_spread_node(void)
{
	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
  
/* Pick the node to begin the search for a slab page (per-task rotor). */
int cpuset_slab_spread_node(void)
{
	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
}

EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
  
/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/

int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	/* Plain reads of both tasks' cached nodemasks; no lock taken here. */
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
75aa19941   David Rientjes   oom: print trigge...
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
  /**
   * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
   * @task: pointer to task_struct of some task.
   *
   * Description: Prints @task's name, cpuset name, and cached copy of its
   * mems_allowed to the kernel log.  Must hold task_lock(task) to allow
   * dereferencing task_cs(task).
   */
  void cpuset_print_task_mems_allowed(struct task_struct *tsk)
  {
  	struct dentry *dentry;
  
  	dentry = task_cs(tsk)->css.cgroup->dentry;
  	spin_lock(&cpuset_buffer_lock);
  	snprintf(cpuset_name, CPUSET_NAME_LEN,
  		 dentry ? (const char *)dentry->d_name.name : "/");
  	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
  			   tsk->mems_allowed);
  	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s
  ",
  	       tsk->comm, cpuset_name, cpuset_nodelist);
  	spin_unlock(&cpuset_buffer_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2401
/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 *
 * NOTE(review): presumably tested by a wrapper before calling
 * __cpuset_memory_pressure_bump() — the caller is not visible in
 * this chunk; confirm against cpuset.h.
 */
int cpuset_memory_pressure_enabled __read_mostly;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
  
/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/

void __cpuset_memory_pressure_bump(void)
{
	/* task_lock() keeps current's cpuset stable while task_cs() is used. */
	task_lock(current);
	fmeter_markevent(&task_cs(current)->fmeter);
	task_unlock(current);
}
8793d854e   Paul Menage   Task Control Grou...
2432
  #ifdef CONFIG_PROC_PID_CPUSET
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2433
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2434
2435
2436
   * proc_cpuset_show()
   *  - Print tasks cpuset path into seq_file.
   *  - Used for /proc/<pid>/cpuset.
053199edf   Paul Jackson   [PATCH] cpusets: ...
2437
2438
   *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
   *    doesn't really matter if tsk->cpuset changes after we read it,
c8d9c90c7   Paul Jackson   hotplug cpu: move...
2439
   *    and we take cgroup_mutex, keeping cpuset_attach() from changing it
2df167a30   Paul Menage   cgroups: update c...
2440
   *    anyway.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2441
   */
029190c51   Paul Jackson   cpuset sched_load...
2442
  static int proc_cpuset_show(struct seq_file *m, void *unused_v)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2443
  {
13b41b094   Eric W. Biederman   [PATCH] proc: Use...
2444
  	struct pid *pid;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2445
2446
  	struct task_struct *tsk;
  	char *buf;
8793d854e   Paul Menage   Task Control Grou...
2447
  	struct cgroup_subsys_state *css;
99f895518   Eric W. Biederman   [PATCH] proc: don...
2448
  	int retval;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2449

99f895518   Eric W. Biederman   [PATCH] proc: don...
2450
  	retval = -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2451
2452
  	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!buf)
99f895518   Eric W. Biederman   [PATCH] proc: don...
2453
2454
2455
  		goto out;
  
  	retval = -ESRCH;
13b41b094   Eric W. Biederman   [PATCH] proc: Use...
2456
2457
  	pid = m->private;
  	tsk = get_pid_task(pid, PIDTYPE_PID);
99f895518   Eric W. Biederman   [PATCH] proc: don...
2458
2459
  	if (!tsk)
  		goto out_free;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2460

99f895518   Eric W. Biederman   [PATCH] proc: don...
2461
  	retval = -EINVAL;
8793d854e   Paul Menage   Task Control Grou...
2462
2463
2464
  	cgroup_lock();
  	css = task_subsys_state(tsk, cpuset_subsys_id);
  	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2465
  	if (retval < 0)
99f895518   Eric W. Biederman   [PATCH] proc: don...
2466
  		goto out_unlock;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2467
2468
2469
  	seq_puts(m, buf);
  	seq_putc(m, '
  ');
99f895518   Eric W. Biederman   [PATCH] proc: don...
2470
  out_unlock:
8793d854e   Paul Menage   Task Control Grou...
2471
  	cgroup_unlock();
99f895518   Eric W. Biederman   [PATCH] proc: don...
2472
2473
  	put_task_struct(tsk);
  out_free:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2474
  	kfree(buf);
99f895518   Eric W. Biederman   [PATCH] proc: don...
2475
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2476
2477
2478
2479
2480
  	return retval;
  }
  
  static int cpuset_open(struct inode *inode, struct file *file)
  {
13b41b094   Eric W. Biederman   [PATCH] proc: Use...
2481
2482
  	struct pid *pid = PROC_I(inode)->pid;
  	return single_open(file, proc_cpuset_show, pid);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2483
  }
9a32144e9   Arjan van de Ven   [PATCH] mark stru...
2484
/* File operations for /proc/<pid>/cpuset (single_open/seq_file based). */
const struct file_operations proc_cpuset_operations = {
	.open		= cpuset_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
8793d854e   Paul Menage   Task Control Grou...
2490
  #endif /* CONFIG_PROC_PID_CPUSET */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2491

d01d48278   Heiko Carstens   sched: Always sho...
2492
  /* Display task mems_allowed in /proc/<pid>/status file. */
df5f8314c   Eric W. Biederman   proc: seqfile con...
2493
2494
  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
  {
df5f8314c   Eric W. Biederman   proc: seqfile con...
2495
  	seq_printf(m, "Mems_allowed:\t");
30e8e1360   Lai Jiangshan   cpuset: use seq_*...
2496
  	seq_nodemask(m, &task->mems_allowed);
df5f8314c   Eric W. Biederman   proc: seqfile con...
2497
2498
  	seq_printf(m, "
  ");
39106dcf8   Mike Travis   cpumask: use new ...
2499
  	seq_printf(m, "Mems_allowed_list:\t");
30e8e1360   Lai Jiangshan   cpuset: use seq_*...
2500
  	seq_nodemask_list(m, &task->mems_allowed);
39106dcf8   Mike Travis   cpumask: use new ...
2501
2502
  	seq_printf(m, "
  ");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2503
  }