  /*
   *  kernel/cpuset.c
   *
   *  Processor and Memory placement constraints for sets of tasks.
   *
   *  Copyright (C) 2003 BULL SA.
   *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
   *  Copyright (C) 2006 Google, Inc
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
   *
   *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
   *  2004 May-July Rework by Paul Jackson.
   *  2006 Rework by Paul Menage to use generic cgroups
   *  2008 Rework of the scheduler domains and CPU hotplug handling
   *       by Max Krasnyansky
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/cpuset.h>
  #include <linux/err.h>
  #include <linux/errno.h>
  #include <linux/file.h>
  #include <linux/fs.h>
  #include <linux/init.h>
  #include <linux/interrupt.h>
  #include <linux/kernel.h>
  #include <linux/kmod.h>
  #include <linux/list.h>
  #include <linux/mempolicy.h>
  #include <linux/mm.h>
  #include <linux/memory.h>
  #include <linux/export.h>
  #include <linux/mount.h>
  #include <linux/namei.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
  #include <linux/seq_file.h>
  #include <linux/security.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/time.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
  
  #include <asm/uaccess.h>
  #include <linux/atomic.h>
  #include <linux/mutex.h>
  #include <linux/workqueue.h>
  #include <linux/cgroup.h>
  #include <linux/wait.h>

  /*
   * Tracks how many cpusets are currently defined in system.
   * When there is only one cpuset (the root cpuset) we can
   * short circuit some hooks.
   */
  int number_of_cpusets __read_mostly;

  /* See "Frequency meter" comments, below. */
  
  struct fmeter {
  	int cnt;		/* unprocessed events count */
  	int val;		/* most recent output value */
  	time_t time;		/* clock (secs) when val computed */
  	spinlock_t lock;	/* guards read or write of above */
  };
  struct cpuset {
  	struct cgroup_subsys_state css;
  	unsigned long flags;		/* "unsigned long" so bitops work */
  	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
  	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
  	/*
  	 * This is old Memory Nodes tasks took on.
  	 *
  	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
  	 * - A new cpuset's old_mems_allowed is initialized when some
  	 *   task is moved into it.
  	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
  	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
  	 *   then old_mems_allowed is updated to mems_allowed.
  	 */
  	nodemask_t old_mems_allowed;
  	struct fmeter fmeter;		/* memory_pressure filter */

  	/*
  	 * Tasks are being attached to this cpuset.  Used to prevent
  	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
  	 */
  	int attach_in_progress;
  	/* partition number for rebuild_sched_domains() */
  	int pn;

  	/* for custom sched domain */
  	int relax_domain_level;
  };
  static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
  {
  	return css ? container_of(css, struct cpuset, css) : NULL;
  }
  
  /* Retrieve the cpuset for a task */
  static inline struct cpuset *task_cs(struct task_struct *task)
  {
  	return css_cs(task_css(task, cpuset_cgrp_id));
  }

  static inline struct cpuset *parent_cs(struct cpuset *cs)
  {
  	return css_cs(css_parent(&cs->css));
  }
  #ifdef CONFIG_NUMA
  static inline bool task_has_mempolicy(struct task_struct *task)
  {
  	return task->mempolicy;
  }
  #else
  static inline bool task_has_mempolicy(struct task_struct *task)
  {
  	return false;
  }
  #endif
  /* bits in struct cpuset flags field */
  typedef enum {
  	CS_ONLINE,
  	CS_CPU_EXCLUSIVE,
  	CS_MEM_EXCLUSIVE,
  	CS_MEM_HARDWALL,
  	CS_MEMORY_MIGRATE,
  	CS_SCHED_LOAD_BALANCE,
  	CS_SPREAD_PAGE,
  	CS_SPREAD_SLAB,
  } cpuset_flagbits_t;
  
  /* convenient tests for these bits */
  static inline bool is_cpuset_online(const struct cpuset *cs)
  {
  	return test_bit(CS_ONLINE, &cs->flags);
  }
  static inline int is_cpu_exclusive(const struct cpuset *cs)
  {
  	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
  }
  
  static inline int is_mem_exclusive(const struct cpuset *cs)
  {
  	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
  }
  static inline int is_mem_hardwall(const struct cpuset *cs)
  {
  	return test_bit(CS_MEM_HARDWALL, &cs->flags);
  }
  static inline int is_sched_load_balance(const struct cpuset *cs)
  {
  	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
  }
  static inline int is_memory_migrate(const struct cpuset *cs)
  {
  	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
  }
  static inline int is_spread_page(const struct cpuset *cs)
  {
  	return test_bit(CS_SPREAD_PAGE, &cs->flags);
  }
  
  static inline int is_spread_slab(const struct cpuset *cs)
  {
  	return test_bit(CS_SPREAD_SLAB, &cs->flags);
  }
  static struct cpuset top_cpuset = {
  	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
  		  (1 << CS_MEM_EXCLUSIVE)),
  };
  /**
   * cpuset_for_each_child - traverse online children of a cpuset
   * @child_cs: loop cursor pointing to the current child
   * @pos_css: used for iteration
   * @parent_cs: target cpuset to walk children of
   *
   * Walk @child_cs through the online children of @parent_cs.  Must be used
   * with RCU read locked.
   */
  #define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
  	css_for_each_child((pos_css), &(parent_cs)->css)		\
  		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
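
  /*
   * Example (this is the pattern validate_change() uses below):
   *
   *	rcu_read_lock();
   *	cpuset_for_each_child(c, css, parent)
   *		if (!is_cpuset_subset(c, trial))
   *			goto out;
   *	rcu_read_unlock();
   */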

  /**
   * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
   * @des_cs: loop cursor pointing to the current descendant
   * @pos_css: used for iteration
   * @root_cs: target cpuset to walk descendants of
   *
   * Walk @des_cs through the online descendants of @root_cs.  Must be used
   * with RCU read locked.  The caller may modify @pos_css by calling
   * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
   * iteration and the first node to be visited.
   */
  #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
  	css_for_each_descendant_pre((pos_css), &(root_cs)->css)		\
  		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
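
  /*
   * Example (as in update_domain_attr_tree() below), including skipping an
   * entire subtree once it is known to be irrelevant:
   *
   *	rcu_read_lock();
   *	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
   *		if (cpumask_empty(cp->cpus_allowed)) {
   *			pos_css = css_rightmost_descendant(pos_css);
   *			continue;
   *		}
   *		...
   *	}
   *	rcu_read_unlock();
   */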

  /*
   * There are two global mutexes guarding cpuset structures - cpuset_mutex
   * and callback_mutex.  The latter may nest inside the former.  We also
   * require taking task_lock() when dereferencing a task's cpuset pointer.
   * See "The task_lock() exception", at the end of this comment.
   *
   * A task must hold both mutexes to modify cpusets.  If a task holds
   * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
   * is the only task able to also acquire callback_mutex and be able to
   * modify cpusets.  It can perform various checks on the cpuset structure
   * first, knowing nothing will change.  It can also allocate memory while
   * just holding cpuset_mutex.  While it is performing these checks, various
   * callback routines can briefly acquire callback_mutex to query cpusets.
   * Once it is ready to make the changes, it takes callback_mutex, blocking
   * everyone else.
   *
   * Calls to the kernel memory allocator can not be made while holding
   * callback_mutex, as that would risk double tripping on callback_mutex
   * from one of the callbacks into the cpuset code from within
   * __alloc_pages().
   *
   * If a task is only holding callback_mutex, then it has read-only
   * access to cpusets.
   *
   * Now, the task_struct fields mems_allowed and mempolicy may be changed
   * by another task, so we use alloc_lock in the task_struct to protect
   * them.
   *
   * The cpuset_common_file_read() handlers only hold callback_mutex across
   * small pieces of code, such as when reading out possibly multi-word
   * cpumasks and nodemasks.
   *
   * Accessing a task's cpuset should be done in accordance with the
   * guidelines for accessing subsystem state in kernel/cgroup.c
   */
  static DEFINE_MUTEX(cpuset_mutex);
  static DEFINE_MUTEX(callback_mutex);
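
  /*
   * A sketch of the write side under the rules above (update_cpumask() below
   * follows this shape; cpuset_mutex is already held by its caller):
   *
   *	mutex_lock(&cpuset_mutex);
   *	... validate_change(), allocate trial cpuset ...
   *	mutex_lock(&callback_mutex);
   *	... publish the new cpus/mems_allowed ...
   *	mutex_unlock(&callback_mutex);
   *	... update tasks, rebuild sched domains ...
   *	mutex_unlock(&cpuset_mutex);
   */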

  /*
   * CPU / memory hotplug is handled asynchronously.
   */
  static void cpuset_hotplug_workfn(struct work_struct *work);
  static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
  static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
  /*
   * This is ugly, but preserves the userspace API for existing cpuset
   * users. If someone tries to mount the "cpuset" filesystem, we
   * silently switch it to mount "cgroup" instead
   */
  static struct dentry *cpuset_mount(struct file_system_type *fs_type,
  			 int flags, const char *unused_dev_name, void *data)
  {
  	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
  	struct dentry *ret = ERR_PTR(-ENODEV);
  	if (cgroup_fs) {
  		char mountopts[] =
  			"cpuset,noprefix,"
  			"release_agent=/sbin/cpuset_release_agent";
  		ret = cgroup_fs->mount(cgroup_fs, flags,
  					   unused_dev_name, mountopts);
  		put_filesystem(cgroup_fs);
  	}
  	return ret;
  }
  
  static struct file_system_type cpuset_fs_type = {
  	.name = "cpuset",
  	.mount = cpuset_mount,
  };
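
  /*
   * In other words, a legacy mount request like
   *
   *	mount -t cpuset cpuset /dev/cpuset
   *
   * is handled as if the caller had asked for
   *
   *	mount -t cgroup -o cpuset,noprefix,release_agent=/sbin/cpuset_release_agent cgroup /dev/cpuset
   */
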
  /*
   * Return in pmask the portion of a cpuset's cpus_allowed that
   * are online.  If none are online, walk up the cpuset hierarchy
   * until we find one that does have some online cpus.  The top
   * cpuset always has some cpus online.
   *
   * One way or another, we guarantee to return some non-empty subset
   * of cpu_online_mask.
   *
   * Call with callback_mutex held.
   */
  static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  {
  	while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
  		cs = parent_cs(cs);
  	cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
  }
  
  /*
   * Return in *pmask the portion of a cpuset's mems_allowed that
   * are online, with memory.  If none are online with memory, walk
   * up the cpuset hierarchy until we find one that does have some
   * online mems.  The top cpuset always has some mems online.
   *
   * One way or another, we guarantee to return some non-empty subset
   * of node_states[N_MEMORY].
   *
   * Call with callback_mutex held.
   */
  static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
  {
  	while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
  		cs = parent_cs(cs);
  	nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
  }
  /*
   * update task's spread flag if cpuset's page/slab spread flag is set
   *
   * Called with callback_mutex/cpuset_mutex held
   */
  static void cpuset_update_task_spread_flag(struct cpuset *cs,
  					struct task_struct *tsk)
  {
  	if (is_spread_page(cs))
  		tsk->flags |= PF_SPREAD_PAGE;
  	else
  		tsk->flags &= ~PF_SPREAD_PAGE;
  	if (is_spread_slab(cs))
  		tsk->flags |= PF_SPREAD_SLAB;
  	else
  		tsk->flags &= ~PF_SPREAD_SLAB;
  }
  /*
   * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
   *
   * One cpuset is a subset of another if all its allowed CPUs and
   * Memory Nodes are a subset of the other, and its exclusive flags
   * are only set if the other's are set.  Call holding cpuset_mutex.
   */
  
  static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  {
  	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
  		nodes_subset(p->mems_allowed, q->mems_allowed) &&
  		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
  		is_mem_exclusive(p) <= is_mem_exclusive(q);
  }
  /**
   * alloc_trial_cpuset - allocate a trial cpuset
   * @cs: the cpuset that the trial cpuset duplicates
   */
  static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
  {
  	struct cpuset *trial;
  
  	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
  	if (!trial)
  		return NULL;
  
  	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
  		kfree(trial);
  		return NULL;
  	}
  	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
  
  	return trial;
  }
  
  /**
   * free_trial_cpuset - free the trial cpuset
   * @trial: the trial cpuset to be freed
   */
  static void free_trial_cpuset(struct cpuset *trial)
  {
  	free_cpumask_var(trial->cpus_allowed);
  	kfree(trial);
  }
  /*
   * validate_change() - Used to validate that any proposed cpuset change
   *		       follows the structural rules for cpusets.
   *
   * If we replaced the flag and mask values of the current cpuset
   * (cur) with those values in the trial cpuset (trial), would
   * our various subset and exclusive rules still be valid?  Presumes
   * cpuset_mutex held.
   *
   * 'cur' is the address of an actual, in-use cpuset.  Operations
   * such as list traversal that depend on the actual address of the
   * cpuset in the list must use cur below, not trial.
   *
   * 'trial' is the address of bulk structure copy of cur, with
   * perhaps one or more of the fields cpus_allowed, mems_allowed,
   * or flags changed to new, trial values.
   *
   * Return 0 if valid, -errno if not.
   */
  static int validate_change(struct cpuset *cur, struct cpuset *trial)
  {
  	struct cgroup_subsys_state *css;
  	struct cpuset *c, *par;
  	int ret;
  
  	rcu_read_lock();
  
  	/* Each of our child cpusets must be a subset of us */
  	ret = -EBUSY;
  	cpuset_for_each_child(c, css, cur)
  		if (!is_cpuset_subset(c, trial))
  			goto out;
  
  	/* Remaining checks don't apply to root cpuset */
  	ret = 0;
  	if (cur == &top_cpuset)
  		goto out;

  	par = parent_cs(cur);

  	/* We must be a subset of our parent cpuset */
  	ret = -EACCES;
  	if (!is_cpuset_subset(trial, par))
  		goto out;

  	/*
  	 * If either I or some sibling (!= me) is exclusive, we can't
  	 * overlap
  	 */
  	ret = -EINVAL;
  	cpuset_for_each_child(c, css, par) {
  		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
  		    c != cur &&
  		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
  			goto out;
  		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
  		    c != cur &&
  		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
  			goto out;
  	}
  	/*
  	 * Cpusets with tasks - existing or newly being attached - can't
  	 * be changed to have empty cpus_allowed or mems_allowed.
  	 */
  	ret = -ENOSPC;
  	if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
  		if (!cpumask_empty(cur->cpus_allowed) &&
  		    cpumask_empty(trial->cpus_allowed))
  			goto out;
  		if (!nodes_empty(cur->mems_allowed) &&
  		    nodes_empty(trial->mems_allowed))
  			goto out;
  	}

  	ret = 0;
  out:
  	rcu_read_unlock();
  	return ret;
  }
  #ifdef CONFIG_SMP
  /*
   * Helper routine for generate_sched_domains().
   * Do cpusets a, b have overlapping cpus_allowed masks?
   */
  static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
  {
  	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
  }
  static void
  update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  {
  	if (dattr->relax_domain_level < c->relax_domain_level)
  		dattr->relax_domain_level = c->relax_domain_level;
  	return;
  }
  static void update_domain_attr_tree(struct sched_domain_attr *dattr,
  				    struct cpuset *root_cs)
  {
  	struct cpuset *cp;
  	struct cgroup_subsys_state *pos_css;

  	rcu_read_lock();
  	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
  		if (cp == root_cs)
  			continue;
  		/* skip the whole subtree if @cp doesn't have any CPU */
  		if (cpumask_empty(cp->cpus_allowed)) {
  			pos_css = css_rightmost_descendant(pos_css);
  			continue;
  		}
  
  		if (is_sched_load_balance(cp))
  			update_domain_attr(dattr, cp);
  	}
  	rcu_read_unlock();
  }
  /*
   * generate_sched_domains()
   *
   * This function builds a partial partition of the system's CPUs.
   * A 'partial partition' is a set of non-overlapping subsets whose
   * union is a subset of that set.
   * The output of this function needs to be passed to kernel/sched/core.c
   * partition_sched_domains() routine, which will rebuild the scheduler's
   * load balancing domains (sched domains) as specified by that partial
   * partition.
   *
   * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
   * for a background explanation of this.
   *
   * Does not return errors, on the theory that the callers of this
   * routine would rather not worry about failures to rebuild sched
   * domains when operating in the severe memory shortage situations
   * that could cause allocation failures below.
   *
   * Must be called with cpuset_mutex held.
   *
   * The three key local variables below are:
   *    q  - a linked-list queue of cpuset pointers, used to implement a
   *	   top-down scan of all cpusets.  This scan loads a pointer
   *	   to each cpuset marked is_sched_load_balance into the
   *	   array 'csa'.  For our purposes, rebuilding the schedulers
   *	   sched domains, we can ignore !is_sched_load_balance cpusets.
   *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
   *	   that need to be load balanced, for convenient iterative
   *	   access by the subsequent code that finds the best partition,
   *	   i.e the set of domains (subsets) of CPUs such that the
   *	   cpus_allowed of every cpuset marked is_sched_load_balance
   *	   is a subset of one of these domains, while there are as
   *	   many such domains as possible, each as small as possible.
   * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
   *	   the kernel/sched/core.c routine partition_sched_domains() in a
   *	   convenient format, that can be easily compared to the prior
   *	   value to determine what partition elements (sched domains)
   *	   were changed (added or removed.)
   *
   * Finding the best partition (set of domains):
   *	The triple nested loops below over i, j, k scan over the
   *	load balanced cpusets (using the array of cpuset pointers in
   *	csa[]) looking for pairs of cpusets that have overlapping
   *	cpus_allowed, but which don't have the same 'pn' partition
   *	number, and gives them the same partition number.  It keeps
   *	looping on the 'restart' label until it can no longer find
   *	any such pairs.
   *
   *	The union of the cpus_allowed masks from the set of
   *	all cpusets having the same 'pn' value then form the one
   *	element of the partition (one sched domain) to be passed to
   *	partition_sched_domains().
   */
  static int generate_sched_domains(cpumask_var_t **domains,
  			struct sched_domain_attr **attributes)
  {
  	struct cpuset *cp;	/* scans q */
  	struct cpuset **csa;	/* array of all cpuset ptrs */
  	int csn;		/* how many cpuset ptrs in csa so far */
  	int i, j, k;		/* indices for partition finding loops */
  	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
  	struct sched_domain_attr *dattr;  /* attributes for custom domains */
  	int ndoms = 0;		/* number of sched domains in result */
  	int nslot;		/* next empty doms[] struct cpumask slot */
  	struct cgroup_subsys_state *pos_css;

  	doms = NULL;
  	dattr = NULL;
  	csa = NULL;
  
  	/* Special case for the 99% of systems with one, full, sched domain */
  	if (is_sched_load_balance(&top_cpuset)) {
  		ndoms = 1;
  		doms = alloc_sched_domains(ndoms);
  		if (!doms)
  			goto done;
  		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
  		if (dattr) {
  			*dattr = SD_ATTR_INIT;
  			update_domain_attr_tree(dattr, &top_cpuset);
  		}
  		cpumask_copy(doms[0], top_cpuset.cpus_allowed);

  		goto done;
  	}
  	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
  	if (!csa)
  		goto done;
  	csn = 0;
  	rcu_read_lock();
  	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
  		if (cp == &top_cpuset)
  			continue;
  		/*
  		 * Continue traversing beyond @cp iff @cp has some CPUs and
  		 * isn't load balancing.  The former is obvious.  The
  		 * latter: All child cpusets contain a subset of the
  		 * parent's cpus, so just skip them, and then we call
  		 * update_domain_attr_tree() to calc relax_domain_level of
  		 * the corresponding sched domain.
  		 */
  		if (!cpumask_empty(cp->cpus_allowed) &&
  		    !is_sched_load_balance(cp))
  			continue;

  		if (is_sched_load_balance(cp))
  			csa[csn++] = cp;
  
  		/* skip @cp's subtree */
  		pos_css = css_rightmost_descendant(pos_css);
  	}
  	rcu_read_unlock();
  
  	for (i = 0; i < csn; i++)
  		csa[i]->pn = i;
  	ndoms = csn;
  
  restart:
  	/* Find the best partition (set of sched domains) */
  	for (i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		int apn = a->pn;
  
  		for (j = 0; j < csn; j++) {
  			struct cpuset *b = csa[j];
  			int bpn = b->pn;
  
  			if (apn != bpn && cpusets_overlap(a, b)) {
  				for (k = 0; k < csn; k++) {
  					struct cpuset *c = csa[k];
  
  					if (c->pn == bpn)
  						c->pn = apn;
  				}
  				ndoms--;	/* one less element */
  				goto restart;
  			}
  		}
  	}
  	/*
  	 * Now we know how many domains to create.
  	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
  	 */
  	doms = alloc_sched_domains(ndoms);
  	if (!doms)
  		goto done;
  
  	/*
  	 * The rest of the code, including the scheduler, can deal with
  	 * dattr==NULL case. No need to abort if alloc fails.
  	 */
  	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
  
  	for (nslot = 0, i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		struct cpumask *dp;
  		int apn = a->pn;
  		if (apn < 0) {
  			/* Skip completed partitions */
  			continue;
  		}
  		dp = doms[nslot];
  
  		if (nslot == ndoms) {
  			static int warnings = 10;
  			if (warnings) {
  				printk(KERN_WARNING
  				 "rebuild_sched_domains confused:"
  				  " nslot %d, ndoms %d, csn %d, i %d,"
  				  " apn %d\n",
  				  nslot, ndoms, csn, i, apn);
  				warnings--;
  			}
  			continue;
  		}

  		cpumask_clear(dp);
  		if (dattr)
  			*(dattr + nslot) = SD_ATTR_INIT;
  		for (j = i; j < csn; j++) {
  			struct cpuset *b = csa[j];
  
  			if (apn == b->pn) {
  				cpumask_or(dp, dp, b->cpus_allowed);
  				if (dattr)
  					update_domain_attr_tree(dattr + nslot, b);
  
  				/* Done with this partition */
  				b->pn = -1;
  			}
  		}
  		nslot++;
  	}
  	BUG_ON(nslot != ndoms);
  done:
  	kfree(csa);
  	/*
  	 * Fallback to the default domain if kmalloc() failed.
  	 * See comments in partition_sched_domains().
  	 */
  	if (doms == NULL)
  		ndoms = 1;
  	*domains    = doms;
  	*attributes = dattr;
  	return ndoms;
  }
  
  /*
   * Rebuild scheduler domains.
   *
   * If the flag 'sched_load_balance' of any cpuset with non-empty
   * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
   * which has that flag enabled, or if any cpuset with a non-empty
   * 'cpus' is removed, then call this routine to rebuild the
   * scheduler's dynamic sched domains.
   *
   * Call with cpuset_mutex held.  Takes get_online_cpus().
   */
  static void rebuild_sched_domains_locked(void)
  {
  	struct sched_domain_attr *attr;
  	cpumask_var_t *doms;
  	int ndoms;
  	lockdep_assert_held(&cpuset_mutex);
  	get_online_cpus();

  	/*
  	 * We have raced with CPU hotplug. Don't do anything to avoid
  	 * passing doms with offlined cpu to partition_sched_domains().
  	 * Anyways, hotplug work item will rebuild sched domains.
  	 */
  	if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
  		goto out;
  	/* Generate domain masks and attrs */
  	ndoms = generate_sched_domains(&doms, &attr);
  
  	/* Have scheduler rebuild the domains */
  	partition_sched_domains(ndoms, doms, attr);
  out:
  	put_online_cpus();
  }
  #else /* !CONFIG_SMP */
  static void rebuild_sched_domains_locked(void)
  {
  }
  #endif /* CONFIG_SMP */

  void rebuild_sched_domains(void)
  {
  	mutex_lock(&cpuset_mutex);
  	rebuild_sched_domains_locked();
  	mutex_unlock(&cpuset_mutex);
  }
  /*
   * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
   * @cs: the cpuset in interest
   *
   * A cpuset's effective cpumask is the cpumask of the nearest ancestor
   * with non-empty cpus. We use effective cpumask whenever:
   * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
   *   if the cpuset they reside in has no cpus)
   * - we want to retrieve task_cs(tsk)'s cpus_allowed.
   *
   * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
   * exception. See comments there.
   */
  static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
  {
  	while (cpumask_empty(cs->cpus_allowed))
  		cs = parent_cs(cs);
  	return cs;
  }
  
  /*
   * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
   * @cs: the cpuset in interest
   *
   * A cpuset's effective nodemask is the nodemask of the nearest ancestor
   * with non-empty mems. We use effective nodemask whenever:
   * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
   *   if the cpuset they reside in has no mems)
   * - we want to retrieve task_cs(tsk)'s mems_allowed.
   *
   * Called with cpuset_mutex held.
   */
  static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
  {
  	while (nodes_empty(cs->mems_allowed))
  		cs = parent_cs(cs);
  	return cs;
  }

  /**
   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
   *
   * Iterate through each task of @cs updating its cpus_allowed to the
   * effective cpuset's.  As this function is called with cpuset_mutex held,
   * cpuset membership stays stable.
   */
  static void update_tasks_cpumask(struct cpuset *cs)
  {
  	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
  	struct css_task_iter it;
  	struct task_struct *task;
  
  	css_task_iter_start(&cs->css, &it);
  	while ((task = css_task_iter_next(&it)))
  		set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
  	css_task_iter_end(&it);
  }
  /*
   * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
   * @root_cs: the root cpuset of the hierarchy
   * @update_root: update root cpuset or not?
   *
   * This will update cpumasks of tasks in @root_cs and all other empty cpusets
   * which take on cpumask of @root_cs.
   *
   * Called with cpuset_mutex held
   */
  static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
  {
  	struct cpuset *cp;
  	struct cgroup_subsys_state *pos_css;
  
  	rcu_read_lock();
  	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
  		if (cp == root_cs) {
  			if (!update_root)
  				continue;
  		} else {
  			/* skip the whole subtree if @cp have some CPU */
  			if (!cpumask_empty(cp->cpus_allowed)) {
  				pos_css = css_rightmost_descendant(pos_css);
  				continue;
  			}
  		}
  		if (!css_tryget(&cp->css))
  			continue;
  		rcu_read_unlock();
  		update_tasks_cpumask(cp);
  
  		rcu_read_lock();
  		css_put(&cp->css);
  	}
  	rcu_read_unlock();
  }
  /**
   * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
   * @cs: the cpuset to consider
   * @buf: buffer of cpu numbers written to this cpuset
   */
  static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  			  const char *buf)
  {
  	int retval;
  	int is_load_balanced;

  	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
  	if (cs == &top_cpuset)
  		return -EACCES;
  	/*
  	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
  	 * Since cpulist_parse() fails on an empty mask, we special case
  	 * that parsing.  The validate_change() call ensures that cpusets
  	 * with tasks have cpus.
  	 */
  	if (!*buf) {
  		cpumask_clear(trialcs->cpus_allowed);
  	} else {
  		retval = cpulist_parse(buf, trialcs->cpus_allowed);
  		if (retval < 0)
  			return retval;

  		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
  			return -EINVAL;
  	}

  	/* Nothing to do if the cpus didn't change */
  	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
  		return 0;

  	retval = validate_change(cs, trialcs);
  	if (retval < 0)
  		return retval;
  	is_load_balanced = is_sched_load_balance(trialcs);

  	mutex_lock(&callback_mutex);
  	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
  	mutex_unlock(&callback_mutex);

  	update_tasks_cpumask_hier(cs, true);

  	if (is_load_balanced)
  		rebuild_sched_domains_locked();
  	return 0;
  }
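
  /*
   * This is reached, for example, when userspace writes a cpu list to the
   * cpuset's "cpus" control file:
   *
   *	echo 0-3,6 > cpus
   *
   * cpulist_parse() accepts the usual comma-separated range syntax, and an
   * empty write clears cpus_allowed (only allowed if the cpuset has no tasks).
   */
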
  /*
   * cpuset_migrate_mm
   *
   *    Migrate memory region from one set of nodes to another.
   *
   *    Temporarily set the task's mems_allowed to target nodes of migration,
   *    so that the migration code can allocate pages on these nodes.
   *
   *    While the mm_struct we are migrating is typically from some
   *    other task, the task_struct mems_allowed that we are hacking
   *    is for our current task, which must allocate new pages for that
   *    migrating memory region.
   */
  
  static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
  							const nodemask_t *to)
  {
  	struct task_struct *tsk = current;
  	struct cpuset *mems_cs;

  	tsk->mems_allowed = *to;
  
  	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
  	rcu_read_lock();
  	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
  	guarantee_online_mems(mems_cs, &tsk->mems_allowed);
  	rcu_read_unlock();
  }
  /*
   * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
   * @tsk: the task to change
   * @newmems: new nodes that the task will be set
   *
   * In order to avoid seeing no nodes if the old and new nodes are disjoint,
   * we structure updates as setting all new allowed nodes, then clearing newly
   * disallowed ones.
   */
  static void cpuset_change_task_nodemask(struct task_struct *tsk,
  					nodemask_t *newmems)
  {
  	bool need_loop;

  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return;
  	if (current->flags & PF_EXITING) /* Let dying task have memory */
  		return;
  
  	task_lock(tsk);
  	/*
  	 * Determine if a loop is necessary if another thread is doing
  	 * read_mems_allowed_begin().  If at least one node remains unchanged and
  	 * tsk does not have a mempolicy, then an empty nodemask will not be
  	 * possible when mems_allowed is larger than a word.
  	 */
  	need_loop = task_has_mempolicy(tsk) ||
  			!nodes_intersects(*newmems, tsk->mems_allowed);

  	if (need_loop) {
  		local_irq_disable();
  		write_seqcount_begin(&tsk->mems_allowed_seq);
  	}

  	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
  
  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
  	tsk->mems_allowed = *newmems;

  	if (need_loop) {
  		write_seqcount_end(&tsk->mems_allowed_seq);
  		local_irq_enable();
  	}

  	task_unlock(tsk);
  }
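
  /*
   * Lockless readers of current->mems_allowed pair with the seqcount above
   * roughly as follows (the read_mems_allowed_begin()/retry() pattern):
   *
   *	do {
   *		seq = read_mems_allowed_begin();
   *		nodes = current->mems_allowed;
   *		... use nodes ...
   *	} while (read_mems_allowed_retry(seq));
   */
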
  static void *cpuset_being_rebound;
  /**
   * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
   *
   * Iterate through each task of @cs updating its mems_allowed to the
   * effective cpuset's.  As this function is called with cpuset_mutex held,
   * cpuset membership stays stable.
   */
  static void update_tasks_nodemask(struct cpuset *cs)
  {
  	static nodemask_t newmems;	/* protected by cpuset_mutex */
  	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
  	struct css_task_iter it;
  	struct task_struct *task;
59dac16fb   Paul Jackson   [PATCH] cpuset: u...
988

846a16bf0   Lee Schermerhorn   mempolicy: rename...
989
  	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
4225399a6   Paul Jackson   [PATCH] cpuset: r...
990

070b57fca   Li Zefan   cpuset: introduce...
991
  	guarantee_online_mems(mems_cs, &newmems);
33ad801df   Li Zefan   cpuset: record ol...
992

4225399a6   Paul Jackson   [PATCH] cpuset: r...
993
  	/*
3b6766fe6   Li Zefan   cpuset: rewrite u...
994
995
996
997
  	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
  	 * take while holding tasklist_lock.  Forks can happen - the
  	 * mpol_dup() cpuset_being_rebound check will catch such forks,
  	 * and rebind their vma mempolicies too.  Because we still hold
5d21cc2db   Tejun Heo   cpuset: replace c...
998
  	 * the global cpuset_mutex, we know that no other rebind effort
3b6766fe6   Li Zefan   cpuset: rewrite u...
999
  	 * will be contending for the global variable cpuset_being_rebound.
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1000
  	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
04c19fa6f   Paul Jackson   [PATCH] cpuset: m...
1001
  	 * is idempotent.  Also migrate pages in each mm to new nodes.
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1002
  	 */
d66393e54   Tejun Heo   cpuset: use css_t...
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
  	css_task_iter_start(&cs->css, &it);
  	while ((task = css_task_iter_next(&it))) {
  		struct mm_struct *mm;
  		bool migrate;
  
  		cpuset_change_task_nodemask(task, &newmems);
  
  		mm = get_task_mm(task);
  		if (!mm)
  			continue;
  
  		migrate = is_memory_migrate(cs);
  
  		mpol_rebind_mm(mm, &cs->mems_allowed);
  		if (migrate)
  			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
  		mmput(mm);
  	}
  	css_task_iter_end(&it);
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1022

33ad801df   Li Zefan   cpuset: record ol...
1023
1024
1025
1026
1027
  	/*
  	 * All the tasks' nodemasks have been updated, update
  	 * cs->old_mems_allowed.
  	 */
  	cs->old_mems_allowed = newmems;
2df167a30   Paul Menage   cgroups: update c...
1028
  	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
8793d854e   Paul Menage   Task Control Grou...
1029
  	cpuset_being_rebound = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1030
  }
0b2f630a2   Miao Xie   cpusets: restruct...
1031
  /*
5c5cc6232   Li Zefan   cpuset: allow to ...
1032
1033
1034
   * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
 * @root_cs: the root cpuset of the hierarchy
 * @update_root: whether to also update the root cpuset itself
5c5cc6232   Li Zefan   cpuset: allow to ...
1035
1036
1037
1038
1039
1040
   *
 * This will update the nodemasks of tasks in @root_cs and in all other empty
 * cpusets which take on the nodemask of @root_cs.
   *
   * Called with cpuset_mutex held
   */
d66393e54   Tejun Heo   cpuset: use css_t...
1041
  static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
5c5cc6232   Li Zefan   cpuset: allow to ...
1042
1043
  {
  	struct cpuset *cp;
492eb21b9   Tejun Heo   cgroup: make hier...
1044
  	struct cgroup_subsys_state *pos_css;
5c5cc6232   Li Zefan   cpuset: allow to ...
1045
1046
  
  	rcu_read_lock();
492eb21b9   Tejun Heo   cgroup: make hier...
1047
  	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
bd8815a6d   Tejun Heo   cgroup: make css_...
1048
1049
1050
1051
1052
1053
1054
1055
1056
  		if (cp == root_cs) {
  			if (!update_root)
  				continue;
  		} else {
			/* skip the whole subtree if @cp has some memory */
  			if (!nodes_empty(cp->mems_allowed)) {
  				pos_css = css_rightmost_descendant(pos_css);
  				continue;
  			}
5c5cc6232   Li Zefan   cpuset: allow to ...
1057
1058
1059
1060
  		}
  		if (!css_tryget(&cp->css))
  			continue;
  		rcu_read_unlock();
d66393e54   Tejun Heo   cpuset: use css_t...
1061
  		update_tasks_nodemask(cp);
5c5cc6232   Li Zefan   cpuset: allow to ...
1062
1063
1064
1065
1066
1067
1068
1069
  
  		rcu_read_lock();
  		css_put(&cp->css);
  	}
  	rcu_read_unlock();
  }
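
As a rough, fictitious analogue of the walk above (plain C, no cgroup infrastructure): a pre-order traversal that visits only nodes with an empty mask and prunes the whole subtree of any node that has its own mask, mirroring the css_rightmost_descendant() skip.

/* fictitious pre-order walk illustrating the subtree-pruning pattern */
#include <stdbool.h>
#include <stdio.h>

struct cs_node {
	const char *name;
	bool mems_empty;		/* analogue of nodes_empty(cp->mems_allowed) */
	struct cs_node *child, *sibling;
};

static void walk(struct cs_node *n)
{
	if (!n)
		return;
	if (n->mems_empty) {
		printf("update_tasks_nodemask(%s)\n", n->name);
		walk(n->child);		/* empty cpusets inherit, so descend */
	}
	/* a non-empty cpuset keeps its own mems: its whole subtree is skipped */
	walk(n->sibling);
}

int main(void)
{
	struct cs_node c = { "grandchild", true,  NULL, NULL };
	struct cs_node b = { "child-b",    false, &c,   NULL };	/* has own mems */
	struct cs_node a = { "child-a",    true,  NULL, &b   };
	struct cs_node root = { "root",    true,  &a,   NULL };

	walk(&root);	/* visits root and child-a; skips child-b and its subtree */
	return 0;
}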
  
  /*
0b2f630a2   Miao Xie   cpusets: restruct...
1070
1071
   * Handle user request to change the 'mems' memory placement
   * of a cpuset.  Needs to validate the request, update the
58568d2a8   Miao Xie   cpuset,mm: update...
1072
1073
1074
1075
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies, and if the cpuset is marked 'memory_migrate',
 * migrate the task's pages to the new memory.
0b2f630a2   Miao Xie   cpusets: restruct...
1076
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
1077
   * Call with cpuset_mutex held.  May take callback_mutex during call.
0b2f630a2   Miao Xie   cpusets: restruct...
1078
1079
1080
1081
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vmas and rebind
 * their mempolicies to the cpuset's new mems_allowed.
   */
645fcc9d2   Li Zefan   cpuset: don't all...
1082
1083
  static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
  			   const char *buf)
0b2f630a2   Miao Xie   cpusets: restruct...
1084
  {
0b2f630a2   Miao Xie   cpusets: restruct...
1085
1086
1087
  	int retval;
  
  	/*
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
1088
	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
0b2f630a2   Miao Xie   cpusets: restruct...
1089
1090
  	 * it's read-only
  	 */
53feb2976   Miao Xie   cpuset: alloc nod...
1091
1092
1093
1094
  	if (cs == &top_cpuset) {
  		retval = -EACCES;
  		goto done;
  	}
0b2f630a2   Miao Xie   cpusets: restruct...
1095

0b2f630a2   Miao Xie   cpusets: restruct...
1096
1097
1098
1099
1100
1101
1102
  	/*
  	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
  	 * Since nodelist_parse() fails on an empty mask, we special case
  	 * that parsing.  The validate_change() call ensures that cpusets
  	 * with tasks have memory.
  	 */
  	if (!*buf) {
645fcc9d2   Li Zefan   cpuset: don't all...
1103
  		nodes_clear(trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1104
  	} else {
645fcc9d2   Li Zefan   cpuset: don't all...
1105
  		retval = nodelist_parse(buf, trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1106
1107
  		if (retval < 0)
  			goto done;
645fcc9d2   Li Zefan   cpuset: don't all...
1108
  		if (!nodes_subset(trialcs->mems_allowed,
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
1109
  				node_states[N_MEMORY])) {
53feb2976   Miao Xie   cpuset: alloc nod...
1110
1111
1112
			retval = -EINVAL;
  			goto done;
  		}
0b2f630a2   Miao Xie   cpusets: restruct...
1113
  	}
33ad801df   Li Zefan   cpuset: record ol...
1114
1115
  
  	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
0b2f630a2   Miao Xie   cpusets: restruct...
1116
1117
1118
  		retval = 0;		/* Too easy - nothing to do */
  		goto done;
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1119
  	retval = validate_change(cs, trialcs);
0b2f630a2   Miao Xie   cpusets: restruct...
1120
1121
1122
1123
  	if (retval < 0)
  		goto done;
  
  	mutex_lock(&callback_mutex);
645fcc9d2   Li Zefan   cpuset: don't all...
1124
  	cs->mems_allowed = trialcs->mems_allowed;
0b2f630a2   Miao Xie   cpusets: restruct...
1125
  	mutex_unlock(&callback_mutex);
d66393e54   Tejun Heo   cpuset: use css_t...
1126
  	update_tasks_nodemask_hier(cs, true);
0b2f630a2   Miao Xie   cpusets: restruct...
1127
1128
1129
  done:
  	return retval;
  }
8793d854e   Paul Menage   Task Control Grou...
1130
1131
1132
1133
  int current_cpuset_is_being_rebound(void)
  {
  	return task_cs(current) == cpuset_being_rebound;
  }
5be7a4792   Paul Menage   Fix cpuset sched_...
1134
  static int update_relax_domain_level(struct cpuset *cs, s64 val)
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1135
  {
db7f47cf4   Paul Menage   cpusets: allow cp...
1136
  #ifdef CONFIG_SMP
60495e776   Peter Zijlstra   sched: Dynamic sc...
1137
  	if (val < -1 || val >= sched_domain_level_max)
30e0e1781   Li Zefan   cpuset: limit the...
1138
  		return -EINVAL;
db7f47cf4   Paul Menage   cpusets: allow cp...
1139
  #endif
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1140
1141
1142
  
  	if (val != cs->relax_domain_level) {
  		cs->relax_domain_level = val;
300ed6cbb   Li Zefan   cpuset: convert c...
1143
1144
  		if (!cpumask_empty(cs->cpus_allowed) &&
  		    is_sched_load_balance(cs))
699140ba8   Tejun Heo   cpuset: drop asyn...
1145
  			rebuild_sched_domains_locked();
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1146
1147
1148
1149
  	}
  
  	return 0;
  }
72ec70299   Tejun Heo   cgroup: make task...
1150
  /**
950592f7b   Miao Xie   cpusets: update t...
1151
1152
   * update_tasks_flags - update the spread flags of tasks in the cpuset.
   * @cs: the cpuset in which each task's spread flags needs to be changed
950592f7b   Miao Xie   cpusets: update t...
1153
   *
d66393e54   Tejun Heo   cpuset: use css_t...
1154
1155
1156
   * Iterate through each task of @cs updating its spread flags.  As this
   * function is called with cpuset_mutex held, cpuset membership stays
   * stable.
950592f7b   Miao Xie   cpusets: update t...
1157
   */
d66393e54   Tejun Heo   cpuset: use css_t...
1158
  static void update_tasks_flags(struct cpuset *cs)
950592f7b   Miao Xie   cpusets: update t...
1159
  {
d66393e54   Tejun Heo   cpuset: use css_t...
1160
1161
1162
1163
1164
1165
1166
  	struct css_task_iter it;
  	struct task_struct *task;
  
  	css_task_iter_start(&cs->css, &it);
  	while ((task = css_task_iter_next(&it)))
  		cpuset_update_task_spread_flag(cs, task);
  	css_task_iter_end(&it);
950592f7b   Miao Xie   cpusets: update t...
1167
1168
1169
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1170
   * update_flag - read a 0 or a 1 in a file and update associated flag
786083667   Paul Menage   Cpuset hardwall f...
1171
1172
1173
   * bit:		the bit to update (see cpuset_flagbits_t)
   * cs:		the cpuset to update
   * turning_on: 	whether the flag is being set or cleared
053199edf   Paul Jackson   [PATCH] cpusets: ...
1174
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
1175
   * Call with cpuset_mutex held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1176
   */
700fe1ab9   Paul Menage   CGroup API files:...
1177
1178
  static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
  		       int turning_on)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1179
  {
645fcc9d2   Li Zefan   cpuset: don't all...
1180
  	struct cpuset *trialcs;
40b6a7623   Rakib Mullick   cpuset.c: remove ...
1181
  	int balance_flag_changed;
950592f7b   Miao Xie   cpusets: update t...
1182
  	int spread_flag_changed;
950592f7b   Miao Xie   cpusets: update t...
1183
  	int err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1184

645fcc9d2   Li Zefan   cpuset: don't all...
1185
1186
1187
  	trialcs = alloc_trial_cpuset(cs);
  	if (!trialcs)
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1188
  	if (turning_on)
645fcc9d2   Li Zefan   cpuset: don't all...
1189
  		set_bit(bit, &trialcs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1190
  	else
645fcc9d2   Li Zefan   cpuset: don't all...
1191
  		clear_bit(bit, &trialcs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1192

645fcc9d2   Li Zefan   cpuset: don't all...
1193
  	err = validate_change(cs, trialcs);
85d7b9498   Dinakar Guniguntala   [PATCH] Dynamic s...
1194
  	if (err < 0)
645fcc9d2   Li Zefan   cpuset: don't all...
1195
  		goto out;
029190c51   Paul Jackson   cpuset sched_load...
1196

029190c51   Paul Jackson   cpuset sched_load...
1197
  	balance_flag_changed = (is_sched_load_balance(cs) !=
645fcc9d2   Li Zefan   cpuset: don't all...
1198
  				is_sched_load_balance(trialcs));
029190c51   Paul Jackson   cpuset sched_load...
1199

950592f7b   Miao Xie   cpusets: update t...
1200
1201
  	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
  			|| (is_spread_page(cs) != is_spread_page(trialcs)));
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1202
  	mutex_lock(&callback_mutex);
645fcc9d2   Li Zefan   cpuset: don't all...
1203
  	cs->flags = trialcs->flags;
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1204
  	mutex_unlock(&callback_mutex);
85d7b9498   Dinakar Guniguntala   [PATCH] Dynamic s...
1205

300ed6cbb   Li Zefan   cpuset: convert c...
1206
  	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
699140ba8   Tejun Heo   cpuset: drop asyn...
1207
  		rebuild_sched_domains_locked();
029190c51   Paul Jackson   cpuset sched_load...
1208

950592f7b   Miao Xie   cpusets: update t...
1209
  	if (spread_flag_changed)
d66393e54   Tejun Heo   cpuset: use css_t...
1210
  		update_tasks_flags(cs);
645fcc9d2   Li Zefan   cpuset: don't all...
1211
1212
1213
  out:
  	free_trial_cpuset(trialcs);
  	return err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1214
  }
053199edf   Paul Jackson   [PATCH] cpusets: ...
1215
  /*
80f7228b5   Adrian Bunk   typo fixes: occur...
1216
   * Frequency meter - How fast is some event occurring?
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
   *
   * These routines manage a digitally filtered, constant time based,
   * event frequency meter.  There are four routines:
   *   fmeter_init() - initialize a frequency meter.
   *   fmeter_markevent() - called each time the event happens.
   *   fmeter_getrate() - returns the recent rate of such events.
   *   fmeter_update() - internal routine used to update fmeter.
   *
   * A common data structure is passed to each of these routines,
   * which is used to keep track of the state required to manage the
   * frequency meter and its digital filter.
   *
   * The filter works on the number of events marked per unit time.
   * The filter is single-pole low-pass recursive (IIR).  The time unit
   * is 1 second.  Arithmetic is done using 32-bit integers scaled to
   * simulate 3 decimal digits of precision (multiplied by 1000).
   *
   * With an FM_COEF of 933, and a time base of 1 second, the filter
   * has a half-life of 10 seconds, meaning that if the events quit
   * happening, then the rate returned from the fmeter_getrate()
   * will be cut in half each 10 seconds, until it converges to zero.
   *
   * It is not worth doing a real infinitely recursive filter.  If more
   * than FM_MAXTICKS ticks have elapsed since the last filter event,
   * just compute FM_MAXTICKS ticks worth, by which point the level
   * will be stable.
   *
   * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
   * arithmetic overflow in the fmeter_update() routine.
   *
   * Given the simple 32 bit integer arithmetic used, this meter works
   * best for reporting rates between one per millisecond (msec) and
   * one per 32 (approx) seconds.  At constant rates faster than one
   * per msec it maxes out at values just under 1,000,000.  At constant
   * rates between one per msec, and one per second it will stabilize
   * to a value N*1000, where N is the rate of events per second.
   * At constant rates between one per second and one per 32 seconds,
   * it will be choppy, moving up on the seconds that have an event,
   * and then decaying until the next event.  At rates slower than
   * about one in 32 seconds, it decays all the way back to zero between
   * each event.
   */
  
  #define FM_COEF 933		/* coefficient for half-life of 10 secs */
  #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
  #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
  #define FM_SCALE 1000		/* faux fixed point scale */
  
  /* Initialize a frequency meter */
  static void fmeter_init(struct fmeter *fmp)
  {
  	fmp->cnt = 0;
  	fmp->val = 0;
  	fmp->time = 0;
  	spin_lock_init(&fmp->lock);
  }
  
  /* Internal meter update - process cnt events and update value */
  static void fmeter_update(struct fmeter *fmp)
  {
  	time_t now = get_seconds();
  	time_t ticks = now - fmp->time;
  
  	if (ticks == 0)
  		return;
  
  	ticks = min(FM_MAXTICKS, ticks);
  	while (ticks-- > 0)
  		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
  	fmp->time = now;
  
  	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
  	fmp->cnt = 0;
  }
  
  /* Process any previous ticks, then bump cnt by one (times scale). */
  static void fmeter_markevent(struct fmeter *fmp)
  {
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
  	spin_unlock(&fmp->lock);
  }
  
  /* Process any previous ticks, then return current value. */
  static int fmeter_getrate(struct fmeter *fmp)
  {
  	int val;
  
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	val = fmp->val;
  	spin_unlock(&fmp->lock);
  	return val;
  }
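
As an illustration of the filter arithmetic above, this standalone userspace sketch (not part of the kernel file) replays fmeter_update()'s recurrence on a synthetic event stream: with FM_COEF = 933 the rate settles near events-per-second * 1000 while events arrive, and decays by roughly half every 10 ticks once they stop.

/* userspace demo of the single-pole IIR filter used by the fmeter */
#include <stdio.h>

#define FM_COEF  933	/* coefficient for half-life of 10 secs */
#define FM_SCALE 1000	/* faux fixed point scale */

int main(void)
{
	int val = 0;
	int second;

	for (second = 1; second <= 40; second++) {
		/* 5 events/sec for 20 seconds, then silence */
		int cnt = (second <= 20) ? 5 * FM_SCALE : 0;

		/* one tick of fmeter_update()'s recurrence */
		val = (FM_COEF * val) / FM_SCALE;
		val += ((FM_SCALE - FM_COEF) * cnt) / FM_SCALE;

		if (second % 10 == 0)
			printf("t=%2ds  rate=%d\n", second, val);
	}
	return 0;
}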
57fce0a68   Tejun Heo   cpuset: don't use...
1312
  static struct cpuset *cpuset_attach_old_cs;
5d21cc2db   Tejun Heo   cpuset: replace c...
1313
  /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
eb95419b0   Tejun Heo   cgroup: pass arou...
1314
1315
  static int cpuset_can_attach(struct cgroup_subsys_state *css,
  			     struct cgroup_taskset *tset)
f780bdb7c   Ben Blum   cgroups: add per-...
1316
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1317
  	struct cpuset *cs = css_cs(css);
bb9d97b6d   Tejun Heo   cgroup: don't use...
1318
1319
  	struct task_struct *task;
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1320

57fce0a68   Tejun Heo   cpuset: don't use...
1321
1322
  	/* used later by cpuset_attach() */
  	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
5d21cc2db   Tejun Heo   cpuset: replace c...
1323
  	mutex_lock(&cpuset_mutex);
88fa523bf   Li Zefan   cpuset: allow to ...
1324
1325
1326
1327
  	/*
	 * We allow moving tasks into an empty cpuset if the sane_behavior
	 * flag is set.
  	 */
5d21cc2db   Tejun Heo   cpuset: replace c...
1328
  	ret = -ENOSPC;
eb95419b0   Tejun Heo   cgroup: pass arou...
1329
  	if (!cgroup_sane_behavior(css->cgroup) &&
88fa523bf   Li Zefan   cpuset: allow to ...
1330
  	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
5d21cc2db   Tejun Heo   cpuset: replace c...
1331
  		goto out_unlock;
9985b0bab   David Rientjes   sched: prevent bo...
1332

924f0d9a2   Tejun Heo   cgroup: drop @ski...
1333
  	cgroup_taskset_for_each(task, tset) {
bb9d97b6d   Tejun Heo   cgroup: don't use...
1334
  		/*
14a40ffcc   Tejun Heo   sched: replace PF...
1335
1336
1337
1338
1339
1340
1341
  		 * Kthreads which disallow setaffinity shouldn't be moved
  		 * to a new cpuset; we don't want to change their cpu
  		 * affinity and isolating such threads by their set of
  		 * allowed nodes is unnecessary.  Thus, cpusets are not
  		 * applicable for such threads.  This prevents checking for
  		 * success of set_cpus_allowed_ptr() on all attached tasks
  		 * before cpus_allowed may be changed.
bb9d97b6d   Tejun Heo   cgroup: don't use...
1342
  		 */
5d21cc2db   Tejun Heo   cpuset: replace c...
1343
  		ret = -EINVAL;
14a40ffcc   Tejun Heo   sched: replace PF...
1344
  		if (task->flags & PF_NO_SETAFFINITY)
5d21cc2db   Tejun Heo   cpuset: replace c...
1345
1346
1347
1348
  			goto out_unlock;
  		ret = security_task_setscheduler(task);
  		if (ret)
  			goto out_unlock;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1349
  	}
f780bdb7c   Ben Blum   cgroups: add per-...
1350

452477fa6   Tejun Heo   cpuset: pin down ...
1351
1352
1353
1354
1355
  	/*
  	 * Mark attach is in progress.  This makes validate_change() fail
  	 * changes which zero cpus/mems_allowed.
  	 */
  	cs->attach_in_progress++;
5d21cc2db   Tejun Heo   cpuset: replace c...
1356
1357
1358
1359
  	ret = 0;
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
  	return ret;
8793d854e   Paul Menage   Task Control Grou...
1360
  }
f780bdb7c   Ben Blum   cgroups: add per-...
1361

eb95419b0   Tejun Heo   cgroup: pass arou...
1362
  static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
452477fa6   Tejun Heo   cpuset: pin down ...
1363
1364
  				 struct cgroup_taskset *tset)
  {
5d21cc2db   Tejun Heo   cpuset: replace c...
1365
  	mutex_lock(&cpuset_mutex);
eb95419b0   Tejun Heo   cgroup: pass arou...
1366
  	css_cs(css)->attach_in_progress--;
5d21cc2db   Tejun Heo   cpuset: replace c...
1367
  	mutex_unlock(&cpuset_mutex);
8793d854e   Paul Menage   Task Control Grou...
1368
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1369

4e4c9a140   Tejun Heo   cpuset: cleanup c...
1370
  /*
5d21cc2db   Tejun Heo   cpuset: replace c...
1371
   * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1372
1373
1374
1375
 * but we can't allocate it dynamically there.  Define it as a global
 * and allocate it from cpuset_init().
   */
  static cpumask_var_t cpus_attach;
eb95419b0   Tejun Heo   cgroup: pass arou...
1376
1377
  static void cpuset_attach(struct cgroup_subsys_state *css,
  			  struct cgroup_taskset *tset)
8793d854e   Paul Menage   Task Control Grou...
1378
  {
67bd2c598   Li Zefan   cpuset: remove un...
1379
  	/* static buf protected by cpuset_mutex */
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1380
  	static nodemask_t cpuset_attach_nodemask_to;
8793d854e   Paul Menage   Task Control Grou...
1381
  	struct mm_struct *mm;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1382
1383
  	struct task_struct *task;
  	struct task_struct *leader = cgroup_taskset_first(tset);
eb95419b0   Tejun Heo   cgroup: pass arou...
1384
  	struct cpuset *cs = css_cs(css);
57fce0a68   Tejun Heo   cpuset: don't use...
1385
  	struct cpuset *oldcs = cpuset_attach_old_cs;
070b57fca   Li Zefan   cpuset: introduce...
1386
1387
  	struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
  	struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
22fb52dd7   David Quigley   [PATCH] SELinux: ...
1388

5d21cc2db   Tejun Heo   cpuset: replace c...
1389
  	mutex_lock(&cpuset_mutex);
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1390
1391
1392
1393
  	/* prepare for attach */
  	if (cs == &top_cpuset)
  		cpumask_copy(cpus_attach, cpu_possible_mask);
  	else
070b57fca   Li Zefan   cpuset: introduce...
1394
  		guarantee_online_cpus(cpus_cs, cpus_attach);
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1395

070b57fca   Li Zefan   cpuset: introduce...
1396
  	guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1397

924f0d9a2   Tejun Heo   cgroup: drop @ski...
1398
  	cgroup_taskset_for_each(task, tset) {
bb9d97b6d   Tejun Heo   cgroup: don't use...
1399
1400
1401
1402
1403
1404
1405
1406
1407
  		/*
  		 * can_attach beforehand should guarantee that this doesn't
  		 * fail.  TODO: have a better way to handle failure here
  		 */
  		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
  
  		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
  		cpuset_update_task_spread_flag(cs, task);
  	}
22fb52dd7   David Quigley   [PATCH] SELinux: ...
1408

f780bdb7c   Ben Blum   cgroups: add per-...
1409
1410
1411
1412
  	/*
  	 * Change mm, possibly for multiple threads in a threadgroup. This is
  	 * expensive and may sleep.
  	 */
f780bdb7c   Ben Blum   cgroups: add per-...
1413
  	cpuset_attach_nodemask_to = cs->mems_allowed;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1414
  	mm = get_task_mm(leader);
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1415
  	if (mm) {
070b57fca   Li Zefan   cpuset: introduce...
1416
  		struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
f780bdb7c   Ben Blum   cgroups: add per-...
1417
  		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
f047cecf2   Li Zefan   cpuset: fix to mi...
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
  
  		/*
		 * old_mems_allowed is the same as mems_allowed here, except
		 * if this task is being moved automatically due to hotplug.
		 * In that case @mems_allowed has been updated and is empty,
		 * so @old_mems_allowed is the right nodemask that we migrate
		 * mm from.
  		 */
  		if (is_memory_migrate(cs)) {
  			cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
f780bdb7c   Ben Blum   cgroups: add per-...
1428
  					  &cpuset_attach_nodemask_to);
f047cecf2   Li Zefan   cpuset: fix to mi...
1429
  		}
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1430
1431
  		mmput(mm);
  	}
452477fa6   Tejun Heo   cpuset: pin down ...
1432

33ad801df   Li Zefan   cpuset: record ol...
1433
  	cs->old_mems_allowed = cpuset_attach_nodemask_to;
02bb58637   Tejun Heo   cpuset: schedule ...
1434

452477fa6   Tejun Heo   cpuset: pin down ...
1435
  	cs->attach_in_progress--;
e44193d39   Li Zefan   cpuset: let hotpl...
1436
1437
  	if (!cs->attach_in_progress)
  		wake_up(&cpuset_attach_wq);
5d21cc2db   Tejun Heo   cpuset: replace c...
1438
1439
  
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1440
1441
1442
1443
1444
  }
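
The attach path above runs when a PID is written into a cpuset cgroup's tasks file. A hedged userspace sketch, assuming a legacy (v1) cpuset hierarchy mounted at /sys/fs/cgroup/cpuset and an already-configured cgroup named "demo" (both assumptions, not taken from this file):

/* move the calling process into a hypothetical, pre-configured cpuset */
#include <stdio.h>
#include <unistd.h>

static int attach_self(const char *cgroup_dir)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/tasks", cgroup_dir);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* the kernel runs cpuset_can_attach()/cpuset_attach() on this write */
	fprintf(f, "%d\n", (int)getpid());
	return fclose(f) ? -1 : 0;
}

int main(void)
{
	/* path assumes a v1 cpuset mount; "demo" must have cpus and mems set */
	return attach_self("/sys/fs/cgroup/cpuset/demo") ? 1 : 0;
}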
  
  /* The various types of files and directories in a cpuset file system */
  
  typedef enum {
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1445
  	FILE_MEMORY_MIGRATE,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1446
1447
1448
1449
  	FILE_CPULIST,
  	FILE_MEMLIST,
  	FILE_CPU_EXCLUSIVE,
  	FILE_MEM_EXCLUSIVE,
786083667   Paul Menage   Cpuset hardwall f...
1450
  	FILE_MEM_HARDWALL,
029190c51   Paul Jackson   cpuset sched_load...
1451
  	FILE_SCHED_LOAD_BALANCE,
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1452
  	FILE_SCHED_RELAX_DOMAIN_LEVEL,
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1453
1454
  	FILE_MEMORY_PRESSURE_ENABLED,
  	FILE_MEMORY_PRESSURE,
825a46af5   Paul Jackson   [PATCH] cpuset me...
1455
1456
  	FILE_SPREAD_PAGE,
  	FILE_SPREAD_SLAB,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1457
  } cpuset_filetype_t;
182446d08   Tejun Heo   cgroup: pass arou...
1458
1459
  static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
  			    u64 val)
700fe1ab9   Paul Menage   CGroup API files:...
1460
  {
182446d08   Tejun Heo   cgroup: pass arou...
1461
  	struct cpuset *cs = css_cs(css);
700fe1ab9   Paul Menage   CGroup API files:...
1462
  	cpuset_filetype_t type = cft->private;
a903f0865   Li Zefan   cpuset: fix the r...
1463
  	int retval = 0;
700fe1ab9   Paul Menage   CGroup API files:...
1464

5d21cc2db   Tejun Heo   cpuset: replace c...
1465
  	mutex_lock(&cpuset_mutex);
a903f0865   Li Zefan   cpuset: fix the r...
1466
1467
  	if (!is_cpuset_online(cs)) {
  		retval = -ENODEV;
5d21cc2db   Tejun Heo   cpuset: replace c...
1468
  		goto out_unlock;
a903f0865   Li Zefan   cpuset: fix the r...
1469
  	}
700fe1ab9   Paul Menage   CGroup API files:...
1470
1471
  
  	switch (type) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1472
  	case FILE_CPU_EXCLUSIVE:
700fe1ab9   Paul Menage   CGroup API files:...
1473
  		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1474
1475
  		break;
  	case FILE_MEM_EXCLUSIVE:
700fe1ab9   Paul Menage   CGroup API files:...
1476
  		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1477
  		break;
786083667   Paul Menage   Cpuset hardwall f...
1478
1479
1480
  	case FILE_MEM_HARDWALL:
  		retval = update_flag(CS_MEM_HARDWALL, cs, val);
  		break;
029190c51   Paul Jackson   cpuset sched_load...
1481
  	case FILE_SCHED_LOAD_BALANCE:
700fe1ab9   Paul Menage   CGroup API files:...
1482
  		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1483
  		break;
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1484
  	case FILE_MEMORY_MIGRATE:
700fe1ab9   Paul Menage   CGroup API files:...
1485
  		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1486
  		break;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1487
  	case FILE_MEMORY_PRESSURE_ENABLED:
700fe1ab9   Paul Menage   CGroup API files:...
1488
  		cpuset_memory_pressure_enabled = !!val;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1489
1490
1491
1492
  		break;
  	case FILE_MEMORY_PRESSURE:
  		retval = -EACCES;
  		break;
825a46af5   Paul Jackson   [PATCH] cpuset me...
1493
  	case FILE_SPREAD_PAGE:
700fe1ab9   Paul Menage   CGroup API files:...
1494
  		retval = update_flag(CS_SPREAD_PAGE, cs, val);
825a46af5   Paul Jackson   [PATCH] cpuset me...
1495
1496
  		break;
  	case FILE_SPREAD_SLAB:
700fe1ab9   Paul Menage   CGroup API files:...
1497
  		retval = update_flag(CS_SPREAD_SLAB, cs, val);
825a46af5   Paul Jackson   [PATCH] cpuset me...
1498
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1499
1500
  	default:
  		retval = -EINVAL;
700fe1ab9   Paul Menage   CGroup API files:...
1501
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1502
  	}
5d21cc2db   Tejun Heo   cpuset: replace c...
1503
1504
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1505
1506
  	return retval;
  }
182446d08   Tejun Heo   cgroup: pass arou...
1507
1508
  static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
  			    s64 val)
5be7a4792   Paul Menage   Fix cpuset sched_...
1509
  {
182446d08   Tejun Heo   cgroup: pass arou...
1510
  	struct cpuset *cs = css_cs(css);
5be7a4792   Paul Menage   Fix cpuset sched_...
1511
  	cpuset_filetype_t type = cft->private;
5d21cc2db   Tejun Heo   cpuset: replace c...
1512
  	int retval = -ENODEV;
5be7a4792   Paul Menage   Fix cpuset sched_...
1513

5d21cc2db   Tejun Heo   cpuset: replace c...
1514
1515
1516
  	mutex_lock(&cpuset_mutex);
  	if (!is_cpuset_online(cs))
  		goto out_unlock;
e37123953   Paul Menage   cgroup files: rem...
1517

5be7a4792   Paul Menage   Fix cpuset sched_...
1518
1519
1520
1521
1522
1523
1524
1525
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		retval = update_relax_domain_level(cs, val);
  		break;
  	default:
  		retval = -EINVAL;
  		break;
  	}
5d21cc2db   Tejun Heo   cpuset: replace c...
1526
1527
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
5be7a4792   Paul Menage   Fix cpuset sched_...
1528
1529
  	return retval;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1530
  /*
e37123953   Paul Menage   cgroup files: rem...
1531
1532
   * Common handling for a write to a "cpus" or "mems" file.
   */
182446d08   Tejun Heo   cgroup: pass arou...
1533
  static int cpuset_write_resmask(struct cgroup_subsys_state *css,
4d3bb511b   Tejun Heo   cgroup: drop cons...
1534
  				struct cftype *cft, char *buf)
e37123953   Paul Menage   cgroup files: rem...
1535
  {
182446d08   Tejun Heo   cgroup: pass arou...
1536
  	struct cpuset *cs = css_cs(css);
645fcc9d2   Li Zefan   cpuset: don't all...
1537
  	struct cpuset *trialcs;
5d21cc2db   Tejun Heo   cpuset: replace c...
1538
  	int retval = -ENODEV;
e37123953   Paul Menage   cgroup files: rem...
1539

3a5a6d0c2   Tejun Heo   cpuset: don't nes...
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
  	/*
  	 * CPU or memory hotunplug may leave @cs w/o any execution
  	 * resources, in which case the hotplug code asynchronously updates
  	 * configuration and transfers all tasks to the nearest ancestor
  	 * which can execute.
  	 *
  	 * As writes to "cpus" or "mems" may restore @cs's execution
  	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't keep removing tasks added
  	 * after execution capability is restored.
  	 */
  	flush_work(&cpuset_hotplug_work);
5d21cc2db   Tejun Heo   cpuset: replace c...
1552
1553
1554
  	mutex_lock(&cpuset_mutex);
  	if (!is_cpuset_online(cs))
  		goto out_unlock;
e37123953   Paul Menage   cgroup files: rem...
1555

645fcc9d2   Li Zefan   cpuset: don't all...
1556
  	trialcs = alloc_trial_cpuset(cs);
b75f38d65   Li Zefan   cpuset: add a mis...
1557
1558
  	if (!trialcs) {
  		retval = -ENOMEM;
5d21cc2db   Tejun Heo   cpuset: replace c...
1559
  		goto out_unlock;
b75f38d65   Li Zefan   cpuset: add a mis...
1560
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1561

e37123953   Paul Menage   cgroup files: rem...
1562
1563
  	switch (cft->private) {
  	case FILE_CPULIST:
645fcc9d2   Li Zefan   cpuset: don't all...
1564
  		retval = update_cpumask(cs, trialcs, buf);
e37123953   Paul Menage   cgroup files: rem...
1565
1566
  		break;
  	case FILE_MEMLIST:
645fcc9d2   Li Zefan   cpuset: don't all...
1567
  		retval = update_nodemask(cs, trialcs, buf);
e37123953   Paul Menage   cgroup files: rem...
1568
1569
1570
1571
1572
  		break;
  	default:
  		retval = -EINVAL;
  		break;
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1573
1574
  
  	free_trial_cpuset(trialcs);
5d21cc2db   Tejun Heo   cpuset: replace c...
1575
1576
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
e37123953   Paul Menage   cgroup files: rem...
1577
1578
1579
1580
  	return retval;
  }
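
A sketch of the userspace side of this write path, assuming a v1 cpuset hierarchy mounted at /sys/fs/cgroup/cpuset and a pre-existing cgroup "demo" (assumptions; the cgroup core prefixes the "cpus"/"mems" file names with "cpuset."); the list strings use the same "0-3,5"-style ranges that the parse helpers accept.

/* write cpu and memory-node lists into a hypothetical cpuset cgroup */
#include <stdio.h>

static int write_list(const char *dir, const char *file, const char *list)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "%s/%s", dir, file);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%s\n", list);
	/* kernel-side errors such as -EINVAL surface as a failed write/close */
	return fclose(f) ? -1 : 0;
}

int main(void)
{
	const char *dir = "/sys/fs/cgroup/cpuset/demo";	/* assumed to exist */

	if (write_list(dir, "cpuset.cpus", "0-3"))
		return 1;
	if (write_list(dir, "cpuset.mems", "0"))
		return 1;
	return 0;
}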
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1581
1582
1583
1584
1585
1586
   * These ascii lists should be read in a single call, by using a user
   * buffer large enough to hold the entire map.  If read in smaller
   * chunks, there is no guarantee of atomicity.  Since the display format
   * used, list of ranges of sequential numbers, is variable length,
   * and since these maps can change value dynamically, one could read
   * gibberish by doing partial reads while a list was changing.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1587
   */
2da8ca822   Tejun Heo   cgroup: replace c...
1588
  static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1589
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1590
1591
  	struct cpuset *cs = css_cs(seq_css(sf));
  	cpuset_filetype_t type = seq_cft(sf)->private;
51ffe4117   Tejun Heo   cpuset: convert a...
1592
1593
1594
  	ssize_t count;
  	char *buf, *s;
  	int ret = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1595

51ffe4117   Tejun Heo   cpuset: convert a...
1596
1597
  	count = seq_get_buf(sf, &buf);
  	s = buf;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1598

51ffe4117   Tejun Heo   cpuset: convert a...
1599
  	mutex_lock(&callback_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1600
1601
1602
  
  	switch (type) {
  	case FILE_CPULIST:
51ffe4117   Tejun Heo   cpuset: convert a...
1603
  		s += cpulist_scnprintf(s, count, cs->cpus_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1604
1605
  		break;
  	case FILE_MEMLIST:
51ffe4117   Tejun Heo   cpuset: convert a...
1606
  		s += nodelist_scnprintf(s, count, cs->mems_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1607
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1608
  	default:
51ffe4117   Tejun Heo   cpuset: convert a...
1609
1610
  		ret = -EINVAL;
  		goto out_unlock;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1611
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1612

51ffe4117   Tejun Heo   cpuset: convert a...
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
  	if (s < buf + count - 1) {
		*s++ = '\n';
  		seq_commit(sf, s - buf);
  	} else {
  		seq_commit(sf, -1);
  	}
  out_unlock:
  	mutex_unlock(&callback_mutex);
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1623
  }
182446d08   Tejun Heo   cgroup: pass arou...
1624
  static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
700fe1ab9   Paul Menage   CGroup API files:...
1625
  {
182446d08   Tejun Heo   cgroup: pass arou...
1626
  	struct cpuset *cs = css_cs(css);
700fe1ab9   Paul Menage   CGroup API files:...
1627
1628
1629
1630
1631
1632
  	cpuset_filetype_t type = cft->private;
  	switch (type) {
  	case FILE_CPU_EXCLUSIVE:
  		return is_cpu_exclusive(cs);
  	case FILE_MEM_EXCLUSIVE:
  		return is_mem_exclusive(cs);
786083667   Paul Menage   Cpuset hardwall f...
1633
1634
  	case FILE_MEM_HARDWALL:
  		return is_mem_hardwall(cs);
700fe1ab9   Paul Menage   CGroup API files:...
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
  	case FILE_SCHED_LOAD_BALANCE:
  		return is_sched_load_balance(cs);
  	case FILE_MEMORY_MIGRATE:
  		return is_memory_migrate(cs);
  	case FILE_MEMORY_PRESSURE_ENABLED:
  		return cpuset_memory_pressure_enabled;
  	case FILE_MEMORY_PRESSURE:
  		return fmeter_getrate(&cs->fmeter);
  	case FILE_SPREAD_PAGE:
  		return is_spread_page(cs);
  	case FILE_SPREAD_SLAB:
  		return is_spread_slab(cs);
  	default:
  		BUG();
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
1650
1651
1652
  
  	/* Unreachable but makes gcc happy */
  	return 0;
700fe1ab9   Paul Menage   CGroup API files:...
1653
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1654

182446d08   Tejun Heo   cgroup: pass arou...
1655
  static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
5be7a4792   Paul Menage   Fix cpuset sched_...
1656
  {
182446d08   Tejun Heo   cgroup: pass arou...
1657
  	struct cpuset *cs = css_cs(css);
5be7a4792   Paul Menage   Fix cpuset sched_...
1658
1659
1660
1661
1662
1663
1664
  	cpuset_filetype_t type = cft->private;
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		return cs->relax_domain_level;
  	default:
  		BUG();
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
1665
1666
1667
  
	/* Unreachable but makes gcc happy */
  	return 0;
5be7a4792   Paul Menage   Fix cpuset sched_...
1668
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1669
1670
1671
1672
  
  /*
   * for the common functions, 'private' gives the type of file
   */
addf2c739   Paul Menage   Cpuset hardwall f...
1673
1674
1675
  static struct cftype files[] = {
  	{
  		.name = "cpus",
2da8ca822   Tejun Heo   cgroup: replace c...
1676
  		.seq_show = cpuset_common_seq_show,
e37123953   Paul Menage   cgroup files: rem...
1677
1678
  		.write_string = cpuset_write_resmask,
  		.max_write_len = (100U + 6 * NR_CPUS),
addf2c739   Paul Menage   Cpuset hardwall f...
1679
1680
1681
1682
1683
  		.private = FILE_CPULIST,
  	},
  
  	{
  		.name = "mems",
2da8ca822   Tejun Heo   cgroup: replace c...
1684
  		.seq_show = cpuset_common_seq_show,
e37123953   Paul Menage   cgroup files: rem...
1685
1686
  		.write_string = cpuset_write_resmask,
  		.max_write_len = (100U + 6 * MAX_NUMNODES),
addf2c739   Paul Menage   Cpuset hardwall f...
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
  		.private = FILE_MEMLIST,
  	},
  
  	{
  		.name = "cpu_exclusive",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_CPU_EXCLUSIVE,
  	},
  
  	{
  		.name = "mem_exclusive",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEM_EXCLUSIVE,
  	},
  
  	{
786083667   Paul Menage   Cpuset hardwall f...
1705
1706
1707
1708
1709
1710
1711
  		.name = "mem_hardwall",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEM_HARDWALL,
  	},
  
  	{
addf2c739   Paul Menage   Cpuset hardwall f...
1712
1713
1714
1715
1716
1717
1718
1719
  		.name = "sched_load_balance",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SCHED_LOAD_BALANCE,
  	},
  
  	{
  		.name = "sched_relax_domain_level",
5be7a4792   Paul Menage   Fix cpuset sched_...
1720
1721
  		.read_s64 = cpuset_read_s64,
  		.write_s64 = cpuset_write_s64,
addf2c739   Paul Menage   Cpuset hardwall f...
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
  		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
  	},
  
  	{
  		.name = "memory_migrate",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_MIGRATE,
  	},
  
  	{
  		.name = "memory_pressure",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_PRESSURE,
099fca322   Li Zefan   cgroups: show cor...
1737
  		.mode = S_IRUGO,
addf2c739   Paul Menage   Cpuset hardwall f...
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
  	},
  
  	{
  		.name = "memory_spread_page",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SPREAD_PAGE,
  	},
  
  	{
  		.name = "memory_spread_slab",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SPREAD_SLAB,
  	},
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1753

4baf6e332   Tejun Heo   cgroup: convert a...
1754
1755
1756
1757
1758
1759
1760
  	{
  		.name = "memory_pressure_enabled",
  		.flags = CFTYPE_ONLY_ON_ROOT,
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_PRESSURE_ENABLED,
  	},
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1761

4baf6e332   Tejun Heo   cgroup: convert a...
1762
1763
  	{ }	/* terminate */
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1764
1765
  
  /*
92fb97487   Tejun Heo   cgroup: rename ->...
1766
   *	cpuset_css_alloc - allocate a cpuset css
c9e5fe66f   Li Zefan   cpuset: rename @c...
1767
   *	cgrp:	control group that the new cpuset will be part of
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1768
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
1769
1770
  static struct cgroup_subsys_state *
  cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1771
  {
c8f699bb5   Tejun Heo   cpuset: introduce...
1772
  	struct cpuset *cs;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1773

eb95419b0   Tejun Heo   cgroup: pass arou...
1774
  	if (!parent_css)
8793d854e   Paul Menage   Task Control Grou...
1775
  		return &top_cpuset.css;
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1776

c8f699bb5   Tejun Heo   cpuset: introduce...
1777
  	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1778
  	if (!cs)
8793d854e   Paul Menage   Task Control Grou...
1779
  		return ERR_PTR(-ENOMEM);
300ed6cbb   Li Zefan   cpuset: convert c...
1780
1781
1782
1783
  	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
  		kfree(cs);
  		return ERR_PTR(-ENOMEM);
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1784

029190c51   Paul Jackson   cpuset sched_load...
1785
  	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
300ed6cbb   Li Zefan   cpuset: convert c...
1786
  	cpumask_clear(cs->cpus_allowed);
f9a86fcbb   Mike Travis   cpuset: modify cp...
1787
  	nodes_clear(cs->mems_allowed);
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1788
  	fmeter_init(&cs->fmeter);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1789
  	cs->relax_domain_level = -1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1790

c8f699bb5   Tejun Heo   cpuset: introduce...
1791
1792
  	return &cs->css;
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
1793
  static int cpuset_css_online(struct cgroup_subsys_state *css)
c8f699bb5   Tejun Heo   cpuset: introduce...
1794
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1795
  	struct cpuset *cs = css_cs(css);
c431069fe   Tejun Heo   cpuset: remove cp...
1796
  	struct cpuset *parent = parent_cs(cs);
ae8086ce1   Tejun Heo   cpuset: introduce...
1797
  	struct cpuset *tmp_cs;
492eb21b9   Tejun Heo   cgroup: make hier...
1798
  	struct cgroup_subsys_state *pos_css;
c8f699bb5   Tejun Heo   cpuset: introduce...
1799
1800
1801
  
  	if (!parent)
  		return 0;
5d21cc2db   Tejun Heo   cpuset: replace c...
1802
  	mutex_lock(&cpuset_mutex);
efeb77b2f   Tejun Heo   cpuset: introduce...
1803
  	set_bit(CS_ONLINE, &cs->flags);
c8f699bb5   Tejun Heo   cpuset: introduce...
1804
1805
1806
1807
  	if (is_spread_page(parent))
  		set_bit(CS_SPREAD_PAGE, &cs->flags);
  	if (is_spread_slab(parent))
  		set_bit(CS_SPREAD_SLAB, &cs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1808

202f72d5d   Paul Jackson   [PATCH] cpuset: n...
1809
  	number_of_cpusets++;
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1810

eb95419b0   Tejun Heo   cgroup: pass arou...
1811
  	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
5d21cc2db   Tejun Heo   cpuset: replace c...
1812
  		goto out_unlock;
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
  
  	/*
  	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
  	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
  	 *
  	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing to admit the
	 * task, and as a result failing the sys_unshare() or
  	 * clone() which initiated it.  If this becomes a problem for some
  	 * users who wish to allow that scenario, then this could be
  	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
  	 * (and likewise for mems) to the new cgroup.
  	 */
ae8086ce1   Tejun Heo   cpuset: introduce...
1827
  	rcu_read_lock();
492eb21b9   Tejun Heo   cgroup: make hier...
1828
  	cpuset_for_each_child(tmp_cs, pos_css, parent) {
ae8086ce1   Tejun Heo   cpuset: introduce...
1829
1830
  		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
  			rcu_read_unlock();
5d21cc2db   Tejun Heo   cpuset: replace c...
1831
  			goto out_unlock;
ae8086ce1   Tejun Heo   cpuset: introduce...
1832
  		}
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1833
  	}
ae8086ce1   Tejun Heo   cpuset: introduce...
1834
  	rcu_read_unlock();
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1835
1836
1837
1838
1839
  
  	mutex_lock(&callback_mutex);
  	cs->mems_allowed = parent->mems_allowed;
  	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
  	mutex_unlock(&callback_mutex);
5d21cc2db   Tejun Heo   cpuset: replace c...
1840
1841
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
c8f699bb5   Tejun Heo   cpuset: introduce...
1842
1843
  	return 0;
  }
0b9e6965a   Zhao Hongjiang   cpuset: relocate ...
1844
1845
1846
1847
1848
  /*
   * If the cpuset being removed has its flag 'sched_load_balance'
   * enabled, then simulate turning sched_load_balance off, which
   * will call rebuild_sched_domains_locked().
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
1849
  static void cpuset_css_offline(struct cgroup_subsys_state *css)
c8f699bb5   Tejun Heo   cpuset: introduce...
1850
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1851
  	struct cpuset *cs = css_cs(css);
c8f699bb5   Tejun Heo   cpuset: introduce...
1852

5d21cc2db   Tejun Heo   cpuset: replace c...
1853
  	mutex_lock(&cpuset_mutex);
c8f699bb5   Tejun Heo   cpuset: introduce...
1854
1855
1856
1857
1858
  
  	if (is_sched_load_balance(cs))
  		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
  
  	number_of_cpusets--;
efeb77b2f   Tejun Heo   cpuset: introduce...
1859
  	clear_bit(CS_ONLINE, &cs->flags);
c8f699bb5   Tejun Heo   cpuset: introduce...
1860

5d21cc2db   Tejun Heo   cpuset: replace c...
1861
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1862
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
1863
  static void cpuset_css_free(struct cgroup_subsys_state *css)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1864
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1865
  	struct cpuset *cs = css_cs(css);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1866

300ed6cbb   Li Zefan   cpuset: convert c...
1867
  	free_cpumask_var(cs->cpus_allowed);
8793d854e   Paul Menage   Task Control Grou...
1868
  	kfree(cs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1869
  }
073219e99   Tejun Heo   cgroup: clean up ...
1870
  struct cgroup_subsys cpuset_cgrp_subsys = {
92fb97487   Tejun Heo   cgroup: rename ->...
1871
  	.css_alloc = cpuset_css_alloc,
c8f699bb5   Tejun Heo   cpuset: introduce...
1872
1873
  	.css_online = cpuset_css_online,
  	.css_offline = cpuset_css_offline,
92fb97487   Tejun Heo   cgroup: rename ->...
1874
  	.css_free = cpuset_css_free,
8793d854e   Paul Menage   Task Control Grou...
1875
  	.can_attach = cpuset_can_attach,
452477fa6   Tejun Heo   cpuset: pin down ...
1876
  	.cancel_attach = cpuset_cancel_attach,
8793d854e   Paul Menage   Task Control Grou...
1877
  	.attach = cpuset_attach,
4baf6e332   Tejun Heo   cgroup: convert a...
1878
  	.base_cftypes = files,
8793d854e   Paul Menage   Task Control Grou...
1879
1880
  	.early_init = 1,
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1881
1882
1883
1884
1885
1886
1887
1888
  /**
   * cpuset_init - initialize cpusets at system boot
   *
   * Description: Initialize top_cpuset and the cpuset internal file system,
   **/
  
  int __init cpuset_init(void)
  {
8793d854e   Paul Menage   Task Control Grou...
1889
  	int err = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1890

58568d2a8   Miao Xie   cpuset,mm: update...
1891
1892
  	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
  		BUG();
300ed6cbb   Li Zefan   cpuset: convert c...
1893
  	cpumask_setall(top_cpuset.cpus_allowed);
f9a86fcbb   Mike Travis   cpuset: modify cp...
1894
  	nodes_setall(top_cpuset.mems_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1895

3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1896
  	fmeter_init(&top_cpuset.fmeter);
029190c51   Paul Jackson   cpuset sched_load...
1897
  	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1898
  	top_cpuset.relax_domain_level = -1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1899

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1900
1901
  	err = register_filesystem(&cpuset_fs_type);
  	if (err < 0)
8793d854e   Paul Menage   Task Control Grou...
1902
  		return err;
2341d1b65   Li Zefan   cpuset: convert c...
1903
1904
  	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
  		BUG();
202f72d5d   Paul Jackson   [PATCH] cpuset: n...
1905
  	number_of_cpusets = 1;
8793d854e   Paul Menage   Task Control Grou...
1906
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1907
  }
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1908
  /*
cf417141c   Max Krasnyansky   sched, cpuset: re...
1909
   * If CPU and/or memory hotplug handlers, below, unplug any CPUs
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1910
1911
   * or memory nodes, we need to walk over the cpuset hierarchy,
   * removing that CPU or node from all cpusets.  If this removes the
956db3ca0   Cliff Wickman   hotplug cpu: move...
1912
1913
   * last CPU or node from a cpuset, then move the tasks in the empty
   * cpuset to its next-highest non-empty parent.
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1914
   */
956db3ca0   Cliff Wickman   hotplug cpu: move...
1915
1916
1917
  static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  {
  	struct cpuset *parent;
c8d9c90c7   Paul Jackson   hotplug cpu: move...
1918
  	/*
956db3ca0   Cliff Wickman   hotplug cpu: move...
1919
1920
1921
	 * Find its next-highest non-empty parent (the top cpuset
	 * has online cpus, so it can't be empty).
  	 */
c431069fe   Tejun Heo   cpuset: remove cp...
1922
  	parent = parent_cs(cs);
300ed6cbb   Li Zefan   cpuset: convert c...
1923
  	while (cpumask_empty(parent->cpus_allowed) ||
b45012955   Paul Jackson   hotplug cpu move ...
1924
  			nodes_empty(parent->mems_allowed))
c431069fe   Tejun Heo   cpuset: remove cp...
1925
  		parent = parent_cs(parent);
956db3ca0   Cliff Wickman   hotplug cpu: move...
1926

8cc993452   Tejun Heo   cgroup, cpuset: r...
1927
  	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
e61734c55   Tejun Heo   cgroup: remove cg...
1928
1929
1930
1931
  		printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset ");
  		pr_cont_cgroup_name(cs->css.cgroup);
		pr_cont("\n");
8cc993452   Tejun Heo   cgroup, cpuset: r...
1932
  	}
956db3ca0   Cliff Wickman   hotplug cpu: move...
1933
  }
deb7aa308   Tejun Heo   cpuset: reorganiz...
1934
  /**
388afd854   Li Zefan   cpuset: remove as...
1935
   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
deb7aa308   Tejun Heo   cpuset: reorganiz...
1936
   * @cs: cpuset in interest
956db3ca0   Cliff Wickman   hotplug cpu: move...
1937
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
1938
1939
1940
   * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
   * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
   * all its tasks are moved to the nearest ancestor with both resources.
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
1941
   */
388afd854   Li Zefan   cpuset: remove as...
1942
  static void cpuset_hotplug_update_tasks(struct cpuset *cs)
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
1943
  {
deb7aa308   Tejun Heo   cpuset: reorganiz...
1944
  	static cpumask_t off_cpus;
33ad801df   Li Zefan   cpuset: record ol...
1945
  	static nodemask_t off_mems;
5d21cc2db   Tejun Heo   cpuset: replace c...
1946
  	bool is_empty;
5c5cc6232   Li Zefan   cpuset: allow to ...
1947
  	bool sane = cgroup_sane_behavior(cs->css.cgroup);
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
1948

e44193d39   Li Zefan   cpuset: let hotpl...
1949
1950
  retry:
  	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
1951

5d21cc2db   Tejun Heo   cpuset: replace c...
1952
  	mutex_lock(&cpuset_mutex);
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
1953

e44193d39   Li Zefan   cpuset: let hotpl...
1954
1955
1956
1957
1958
1959
1960
1961
  	/*
  	 * We have raced with task attaching. We wait until attaching
  	 * is finished, so we won't attach a task to an empty cpuset.
  	 */
  	if (cs->attach_in_progress) {
  		mutex_unlock(&cpuset_mutex);
  		goto retry;
  	}
deb7aa308   Tejun Heo   cpuset: reorganiz...
1962
1963
  	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
  	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
1964

5c5cc6232   Li Zefan   cpuset: allow to ...
1965
1966
1967
1968
1969
1970
  	mutex_lock(&callback_mutex);
  	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
  	mutex_unlock(&callback_mutex);
  
  	/*
  	 * If sane_behavior flag is set, we need to update tasks' cpumask
f047cecf2   Li Zefan   cpuset: fix to mi...
1971
1972
1973
  	 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
  	 * call update_tasks_cpumask() if the cpuset becomes empty, as
  	 * the tasks in it will be migrated to an ancestor.
5c5cc6232   Li Zefan   cpuset: allow to ...
1974
1975
  	 */
  	if ((sane && cpumask_empty(cs->cpus_allowed)) ||
f047cecf2   Li Zefan   cpuset: fix to mi...
1976
  	    (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
d66393e54   Tejun Heo   cpuset: use css_t...
1977
  		update_tasks_cpumask(cs);
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
1978

5c5cc6232   Li Zefan   cpuset: allow to ...
1979
1980
1981
1982
1983
1984
  	mutex_lock(&callback_mutex);
  	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
  	mutex_unlock(&callback_mutex);
  
  	/*
  	 * If sane_behavior flag is set, we need to update tasks' nodemask
f047cecf2   Li Zefan   cpuset: fix to mi...
1985
1986
1987
  	 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
  	 * call update_tasks_nodemask() if the cpuset becomes empty, as
	 * the tasks in it will be migrated to an ancestor.
5c5cc6232   Li Zefan   cpuset: allow to ...
1988
1989
  	 */
  	if ((sane && nodes_empty(cs->mems_allowed)) ||
f047cecf2   Li Zefan   cpuset: fix to mi...
1990
  	    (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
d66393e54   Tejun Heo   cpuset: use css_t...
1991
  		update_tasks_nodemask(cs);
deb7aa308   Tejun Heo   cpuset: reorganiz...
1992

5d21cc2db   Tejun Heo   cpuset: replace c...
1993
1994
  	is_empty = cpumask_empty(cs->cpus_allowed) ||
  		nodes_empty(cs->mems_allowed);
8d0339487   Tejun Heo   cpuset: make CPU ...
1995

5d21cc2db   Tejun Heo   cpuset: replace c...
1996
1997
1998
  	mutex_unlock(&cpuset_mutex);
  
  	/*
5c5cc6232   Li Zefan   cpuset: allow to ...
1999
2000
2001
2002
  	 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
  	 *
  	 * Otherwise move tasks to the nearest ancestor with execution
	 * resources.  This is a full cgroup operation which will
5d21cc2db   Tejun Heo   cpuset: replace c...
2003
2004
  	 * also call back into cpuset.  Should be done outside any lock.
  	 */
5c5cc6232   Li Zefan   cpuset: allow to ...
2005
  	if (!sane && is_empty)
5d21cc2db   Tejun Heo   cpuset: replace c...
2006
  		remove_tasks_in_empty_cpuset(cs);
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2007
  }
deb7aa308   Tejun Heo   cpuset: reorganiz...
2008
  /**
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2009
   * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
956db3ca0   Cliff Wickman   hotplug cpu: move...
2010
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2011
2012
2013
2014
2015
   * This function is called after either CPU or memory configuration has
   * changed and updates cpuset accordingly.  The top_cpuset is always
   * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
   * order to make cpusets transparent (of no effect) on systems that are
   * actively using CPU hotplug but making no active use of cpusets.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2016
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2017
   * Non-root cpusets are only affected by offlining.  If any CPUs or memory
388afd854   Li Zefan   cpuset: remove as...
2018
2019
   * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
   * all descendants.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2020
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2021
2022
   * Note that CPU offlining during suspend is ignored.  We don't modify
   * cpusets across suspend/resume cycles at all.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2023
   */
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2024
  static void cpuset_hotplug_workfn(struct work_struct *work)
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2025
  {
5c5cc6232   Li Zefan   cpuset: allow to ...
2026
2027
  	static cpumask_t new_cpus;
  	static nodemask_t new_mems;
deb7aa308   Tejun Heo   cpuset: reorganiz...
2028
  	bool cpus_updated, mems_updated;
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2029

5d21cc2db   Tejun Heo   cpuset: replace c...
2030
  	mutex_lock(&cpuset_mutex);
956db3ca0   Cliff Wickman   hotplug cpu: move...
2031

deb7aa308   Tejun Heo   cpuset: reorganiz...
2032
2033
2034
  	/* fetch the available cpus/mems and find out which changed how */
  	cpumask_copy(&new_cpus, cpu_active_mask);
  	new_mems = node_states[N_MEMORY];
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2035

deb7aa308   Tejun Heo   cpuset: reorganiz...
2036
  	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
deb7aa308   Tejun Heo   cpuset: reorganiz...
2037
  	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2038

deb7aa308   Tejun Heo   cpuset: reorganiz...
2039
2040
2041
2042
2043
2044
2045
  	/* synchronize cpus_allowed to cpu_active_mask */
  	if (cpus_updated) {
  		mutex_lock(&callback_mutex);
  		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
  		mutex_unlock(&callback_mutex);
  		/* we don't mess with cpumasks of tasks in top_cpuset */
  	}
b45012955   Paul Jackson   hotplug cpu move ...
2046

deb7aa308   Tejun Heo   cpuset: reorganiz...
2047
2048
  	/* synchronize mems_allowed to N_MEMORY */
  	if (mems_updated) {
deb7aa308   Tejun Heo   cpuset: reorganiz...
2049
2050
2051
  		mutex_lock(&callback_mutex);
  		top_cpuset.mems_allowed = new_mems;
  		mutex_unlock(&callback_mutex);
d66393e54   Tejun Heo   cpuset: use css_t...
2052
  		update_tasks_nodemask(&top_cpuset);
deb7aa308   Tejun Heo   cpuset: reorganiz...
2053
  	}
b45012955   Paul Jackson   hotplug cpu move ...
2054

388afd854   Li Zefan   cpuset: remove as...
2055
  	mutex_unlock(&cpuset_mutex);
5c5cc6232   Li Zefan   cpuset: allow to ...
2056
2057
  	/* if cpus or mems changed, we need to propagate to descendants */
  	if (cpus_updated || mems_updated) {
deb7aa308   Tejun Heo   cpuset: reorganiz...
2058
  		struct cpuset *cs;
492eb21b9   Tejun Heo   cgroup: make hier...
2059
  		struct cgroup_subsys_state *pos_css;
f9b4fb8da   Miao Xie   cpusets: update t...
2060

fc560a26a   Tejun Heo   cpuset: replace c...
2061
  		rcu_read_lock();
492eb21b9   Tejun Heo   cgroup: make hier...
2062
  		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
bd8815a6d   Tejun Heo   cgroup: make css_...
2063
  			if (cs == &top_cpuset || !css_tryget(&cs->css))
388afd854   Li Zefan   cpuset: remove as...
2064
2065
  				continue;
  			rcu_read_unlock();
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2066

388afd854   Li Zefan   cpuset: remove as...
2067
  			cpuset_hotplug_update_tasks(cs);
b45012955   Paul Jackson   hotplug cpu move ...
2068

388afd854   Li Zefan   cpuset: remove as...
2069
2070
2071
2072
2073
  			rcu_read_lock();
  			css_put(&cs->css);
  		}
  		rcu_read_unlock();
  	}
8d0339487   Tejun Heo   cpuset: make CPU ...
2074

deb7aa308   Tejun Heo   cpuset: reorganiz...
2075
  	/* rebuild sched domains if cpus_allowed has changed */
e0e80a02e   Li Zhong   cpuset: use rebui...
2076
2077
  	if (cpus_updated)
  		rebuild_sched_domains();
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2078
  }
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2079
  void cpuset_update_active_cpus(bool cpu_online)
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2080
  {
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
  	/*
  	 * We're inside cpu hotplug critical region which usually nests
  	 * inside cgroup synchronization.  Bounce actual hotplug processing
  	 * to a work item to avoid reverse locking order.
  	 *
  	 * We still need to do partition_sched_domains() synchronously;
  	 * otherwise, the scheduler will get confused and put tasks to the
  	 * dead CPU.  Fall back to the default single domain.
  	 * cpuset_hotplug_workfn() will rebuild it as necessary.
  	 */
  	partition_sched_domains(1, NULL, NULL);
  	schedule_work(&cpuset_hotplug_work);
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2093
  }
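/*
 * Example (editor's sketch, not part of this file): cpuset_update_active_cpus()
 * is expected to be called from the scheduler's CPU hotplug callbacks in
 * kernel/sched/core.c.  Roughly, assuming the notifier-style hotplug API of
 * this kernel generation:
 */
#if 0	/* illustrative only */
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
			     void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
	case CPU_DOWN_FAILED:
		cpuset_update_active_cpus(true);
		return NOTIFY_OK;
	default:
		return NOTIFY_DONE;
	}
}
#endif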
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2094

38837fc75   Paul Jackson   [PATCH] cpuset: t...
2095
  /*
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2096
2097
   * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
   * Call this routine anytime after node_states[N_MEMORY] changes.
a1cd2b13f   Srivatsa S. Bhat   cpusets: Remove/u...
2098
   * See cpuset_update_active_cpus() for CPU hotplug handling.
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2099
   */
f481891fd   Miao Xie   cpuset: update to...
2100
2101
  static int cpuset_track_online_nodes(struct notifier_block *self,
  				unsigned long action, void *arg)
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2102
  {
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2103
  	schedule_work(&cpuset_hotplug_work);
f481891fd   Miao Xie   cpuset: update to...
2104
  	return NOTIFY_OK;
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2105
  }
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2106
2107
2108
2109
2110
  
  static struct notifier_block cpuset_track_online_nodes_nb = {
  	.notifier_call = cpuset_track_online_nodes,
  	.priority = 10,		/* ??! */
  };
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2111

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2112
2113
2114
2115
  /**
   * cpuset_init_smp - initialize cpus_allowed
   *
   * Description: Finish top cpuset after cpu, node maps are initialized
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2116
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2117
2118
  void __init cpuset_init_smp(void)
  {
6ad4c1888   Peter Zijlstra   sched: Fix balanc...
2119
  	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2120
  	top_cpuset.mems_allowed = node_states[N_MEMORY];
33ad801df   Li Zefan   cpuset: record ol...
2121
  	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2122

d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2123
  	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2124
2125
2126
  }
  
  /**
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2127
2128
   * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
   * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
6af866af3   Li Zefan   cpuset: remove re...
2129
   * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2130
   *
300ed6cbb   Li Zefan   cpuset: convert c...
2131
   * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2132
   * attached to the specified @tsk.  Guaranteed to return some non-empty
5f054e31c   Rusty Russell   documentation: re...
2133
   * subset of cpu_online_mask, even if this means going outside the
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2134
2135
   * tasks cpuset.
   **/
6af866af3   Li Zefan   cpuset: remove re...
2136
  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2137
  {
070b57fca   Li Zefan   cpuset: introduce...
2138
  	struct cpuset *cpus_cs;
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
2139
  	mutex_lock(&callback_mutex);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2140
  	rcu_read_lock();
070b57fca   Li Zefan   cpuset: introduce...
2141
2142
  	cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
  	guarantee_online_cpus(cpus_cs, pmask);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2143
  	rcu_read_unlock();
897f0b3c3   Oleg Nesterov   sched: Kill the b...
2144
  	mutex_unlock(&callback_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2145
  }
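/*
 * Example (editor's sketch, not part of this file): a typical caller such as
 * sched_setaffinity() uses cpuset_cpus_allowed() to clamp a user-requested
 * affinity mask to the task's cpuset before applying it, roughly:
 */
#if 0	/* illustrative only */
	cpumask_var_t cpus_allowed, new_mask;	/* alloc_cpumask_var()'d by the caller */

	cpuset_cpus_allowed(p, cpus_allowed);
	cpumask_and(new_mask, in_mask, cpus_allowed);
	retval = set_cpus_allowed_ptr(p, new_mask);
#endif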
2baab4e90   Peter Zijlstra   sched: Fix select...
2146
  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
9084bb824   Oleg Nesterov   sched: Make selec...
2147
  {
c9710d801   Tejun Heo   cpuset: drop "con...
2148
  	struct cpuset *cpus_cs;
9084bb824   Oleg Nesterov   sched: Make selec...
2149
2150
  
  	rcu_read_lock();
070b57fca   Li Zefan   cpuset: introduce...
2151
2152
  	cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
  	do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
9084bb824   Oleg Nesterov   sched: Make selec...
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
  	rcu_read_unlock();
  
  	/*
  	 * We own tsk->cpus_allowed, nobody can change it under us.
  	 *
  	 * But we used cs && cs->cpus_allowed lockless and thus can
  	 * race with cgroup_attach_task() or update_cpumask() and get
  	 * the wrong tsk->cpus_allowed. However, both cases imply the
  	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
  	 * which takes task_rq_lock().
  	 *
  	 * If we are called after it dropped the lock we must see all
  	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
  	 * set any mask even if it is not right from task_cs() pov,
  	 * the pending set_cpus_allowed_ptr() will fix things.
2baab4e90   Peter Zijlstra   sched: Fix select...
2168
2169
2170
  	 *
  	 * select_fallback_rq() will fix things ups and set cpu_possible_mask
  	 * if required.
9084bb824   Oleg Nesterov   sched: Make selec...
2171
  	 */
9084bb824   Oleg Nesterov   sched: Make selec...
2172
  }
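/*
 * Example (editor's sketch, not part of this file): the scheduler's
 * select_fallback_rq() is the caller this helper was written for.  When all
 * of a task's allowed CPUs are offline it widens the mask step by step;
 * a trimmed excerpt looks roughly like:
 */
#if 0	/* illustrative only */
	switch (state) {
	case cpuset:
		/* try the cpuset's (or an ancestor's) cpus_allowed first */
		cpuset_cpus_allowed_fallback(p);
		state = possible;
		break;
	case possible:
		do_set_cpus_allowed(p, cpu_possible_mask);
		state = fail;
		break;
	case fail:
		BUG();
		break;
	}
#endif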
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2173
2174
  void cpuset_init_current_mems_allowed(void)
  {
f9a86fcbb   Mike Travis   cpuset: modify cp...
2175
  	nodes_setall(current->mems_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2176
  }
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2177
  /**
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2178
2179
2180
2181
2182
   * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
   * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
   *
   * Description: Returns the nodemask_t mems_allowed of the cpuset
   * attached to the specified @tsk.  Guaranteed to return some non-empty
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2183
   * subset of node_states[N_MEMORY], even if this means going outside the
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2184
2185
2186
2187
2188
   * tasks cpuset.
   **/
  
  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
  {
070b57fca   Li Zefan   cpuset: introduce...
2189
  	struct cpuset *mems_cs;
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2190
  	nodemask_t mask;
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
2191
  	mutex_lock(&callback_mutex);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2192
  	rcu_read_lock();
070b57fca   Li Zefan   cpuset: introduce...
2193
2194
  	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
  	guarantee_online_mems(mems_cs, &mask);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2195
  	rcu_read_unlock();
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
2196
  	mutex_unlock(&callback_mutex);
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2197
2198
2199
2200
2201
  
  	return mask;
  }
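/*
 * Example (editor's sketch, not part of this file): sys_migrate_pages() in
 * mm/mempolicy.c uses cpuset_mems_allowed() to learn which nodes the target
 * task may use before deciding whether the requested migration is permitted,
 * roughly (identifier names illustrative):
 */
#if 0	/* illustrative only */
	nodemask_t task_nodes;

	task_nodes = cpuset_mems_allowed(task);
	if (!nodes_subset(*new_nodes, task_nodes) && !capable(CAP_SYS_NICE)) {
		err = -EPERM;
		goto out;
	}
#endif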
  
  /**
19770b326   Mel Gorman   mm: filter based ...
2202
2203
   * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
   * @nodemask: the nodemask to be checked
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2204
   *
19770b326   Mel Gorman   mm: filter based ...
2205
   * Are any of the nodes in the nodemask allowed in current->mems_allowed?
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2206
   */
19770b326   Mel Gorman   mm: filter based ...
2207
  int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2208
  {
19770b326   Mel Gorman   mm: filter based ...
2209
  	return nodes_intersects(*nodemask, current->mems_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2210
  }
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2211
  /*
786083667   Paul Menage   Cpuset hardwall f...
2212
2213
2214
2215
   * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
   * mem_hardwall ancestor to the specified cpuset.  Call holding
   * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
   * (an unusual configuration), then returns the root cpuset.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2216
   */
c9710d801   Tejun Heo   cpuset: drop "con...
2217
  static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2218
  {
c431069fe   Tejun Heo   cpuset: remove cp...
2219
2220
  	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
  		cs = parent_cs(cs);
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2221
2222
  	return cs;
  }
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2223
  /**
a1bc5a4ee   David Rientjes   cpusets: replace ...
2224
2225
   * cpuset_node_allowed_softwall - Can we allocate on a memory node?
   * @node: is this an allowed node?
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2226
   * @gfp_mask: memory allocation flags
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2227
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2228
2229
2230
2231
2232
2233
   * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
   * set, yes, we can always allocate.  If node is in our task's mems_allowed,
   * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
   * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
   * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
   * flag, yes.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2234
2235
   * Otherwise, no.
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2236
2237
2238
   * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
   * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
   * might sleep, and might allow a node from an enclosing cpuset.
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2239
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2240
2241
   * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
   * cpusets, and never sleeps.
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2242
2243
2244
2245
2246
2247
2248
   *
   * The __GFP_THISNODE placement logic is really handled elsewhere,
   * by forcibly using a zonelist starting at a specified node, and by
   * (in get_page_from_freelist()) refusing to consider the zones for
   * any node on the zonelist except the first.  By the time any such
   * calls get to this routine, we should just shut up and say 'yes'.
   *
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2249
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
c596d9f32   David Rientjes   cpusets: allow TI...
2250
2251
   * and do not allow allocations outside the current tasks cpuset
   * unless the task has been OOM killed and is marked TIF_MEMDIE.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2252
   * GFP_KERNEL allocations are not so marked, so can escape to the
786083667   Paul Menage   Cpuset hardwall f...
2253
   * nearest enclosing hardwalled ancestor cpuset.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2254
   *
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2255
2256
2257
2258
2259
2260
2261
   * Scanning up parent cpusets requires callback_mutex.  The
   * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
   * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
   * current tasks mems_allowed came up empty on the first pass over
   * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
   * cpuset are short of memory, might require taking the callback_mutex
   * mutex.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2262
   *
36be57ffe   Paul Jackson   [PATCH] cpuset: u...
2263
   * The first call here from mm/page_alloc:get_page_from_freelist()
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2264
2265
2266
   * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
   * so no allocation on a node outside the cpuset is allowed (unless
   * in interrupt, of course).
36be57ffe   Paul Jackson   [PATCH] cpuset: u...
2267
2268
2269
2270
2271
2272
   *
   * The second pass through get_page_from_freelist() doesn't even call
   * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
   * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
   * in alloc_flags.  That logic and the checks below have the combined
   * effect that:
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2273
2274
   *	in_interrupt - any node ok (current task context irrelevant)
   *	GFP_ATOMIC   - any node ok
c596d9f32   David Rientjes   cpusets: allow TI...
2275
   *	TIF_MEMDIE   - any node ok
786083667   Paul Menage   Cpuset hardwall f...
2276
   *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2277
   *	GFP_USER     - only nodes in current tasks mems allowed ok.
36be57ffe   Paul Jackson   [PATCH] cpuset: u...
2278
2279
   *
   * Rule:
a1bc5a4ee   David Rientjes   cpusets: replace ...
2280
   *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
36be57ffe   Paul Jackson   [PATCH] cpuset: u...
2281
2282
   *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
   *    the code that might scan up ancestor cpusets and sleep.
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2283
   */
a1bc5a4ee   David Rientjes   cpusets: replace ...
2284
  int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2285
  {
c9710d801   Tejun Heo   cpuset: drop "con...
2286
  	struct cpuset *cs;		/* current cpuset ancestors */
29afd49b7   Paul Jackson   [PATCH] cpuset: r...
2287
  	int allowed;			/* is allocation on this node allowed? */
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2288

9b819d204   Christoph Lameter   [PATCH] Add __GFP...
2289
  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2290
  		return 1;
92d1dbd27   Paul Jackson   [PATCH] cpuset: m...
2291
  	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2292
2293
  	if (node_isset(node, current->mems_allowed))
  		return 1;
c596d9f32   David Rientjes   cpusets: allow TI...
2294
2295
2296
2297
2298
2299
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return 1;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2300
2301
  	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
  		return 0;
5563e7707   Bob Picco   [PATCH] cpuset: f...
2302
2303
  	if (current->flags & PF_EXITING) /* Let dying task have memory */
  		return 1;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2304
  	/* Not hardwall and node outside mems_allowed: scan up cpusets */
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
2305
  	mutex_lock(&callback_mutex);
053199edf   Paul Jackson   [PATCH] cpusets: ...
2306

b8dadcb58   Li Zefan   cpuset: use rcu_r...
2307
  	rcu_read_lock();
786083667   Paul Menage   Cpuset hardwall f...
2308
  	cs = nearest_hardwall_ancestor(task_cs(current));
99afb0fd5   Li Zefan   cpuset: fix a rac...
2309
  	allowed = node_isset(node, cs->mems_allowed);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2310
  	rcu_read_unlock();
053199edf   Paul Jackson   [PATCH] cpusets: ...
2311

3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
2312
  	mutex_unlock(&callback_mutex);
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2313
  	return allowed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2314
  }
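/*
 * Example (editor's sketch, not part of this file): the double-underscore name
 * exists because include/linux/cpuset.h wraps it in an inline helper that
 * short-circuits the common single-cpuset case, roughly:
 */
#if 0	/* illustrative only */
static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
	return number_of_cpusets <= 1 ||
		__cpuset_node_allowed_softwall(node, gfp_mask);
}
#endif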
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2315
  /*
a1bc5a4ee   David Rientjes   cpusets: replace ...
2316
2317
   * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
   * @node: is this an allowed node?
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2318
2319
   * @gfp_mask: memory allocation flags
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2320
2321
2322
2323
2324
   * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
   * set, yes, we can always allocate.  If node is in our task's mems_allowed,
   * yes.  If the task has been OOM killed and has access to memory reserves as
   * specified by the TIF_MEMDIE flag, yes.
   * Otherwise, no.
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2325
2326
2327
2328
2329
2330
2331
   *
   * The __GFP_THISNODE placement logic is really handled elsewhere,
   * by forcibly using a zonelist starting at a specified node, and by
   * (in get_page_from_freelist()) refusing to consider the zones for
   * any node on the zonelist except the first.  By the time any such
   * calls get to this routine, we should just shut up and say 'yes'.
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2332
2333
   * Unlike the cpuset_node_allowed_softwall() variant, above,
   * this variant requires that the node be in the current task's
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2334
2335
2336
2337
   * mems_allowed or that we're in interrupt.  It does not scan up the
   * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
   * It never sleeps.
   */
a1bc5a4ee   David Rientjes   cpusets: replace ...
2338
  int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2339
  {
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2340
2341
  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
  		return 1;
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2342
2343
  	if (node_isset(node, current->mems_allowed))
  		return 1;
dedf8b79e   Daniel Walker   whitespace fixes:...
2344
2345
2346
2347
2348
2349
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return 1;
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2350
2351
  	return 0;
  }
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2352
  /**
6adef3ebe   Jack Steiner   cpusets: new roun...
2353
2354
   * cpuset_mem_spread_node() - On which node to begin search for a file page
   * cpuset_slab_spread_node() - On which node to begin search for a slab page
825a46af5   Paul Jackson   [PATCH] cpuset me...
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
   *
   * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
   * tasks in a cpuset with is_spread_page or is_spread_slab set),
   * and if the memory allocation used cpuset_mem_spread_node()
   * to determine on which node to start looking, as it will for
   * certain page cache or slab cache pages such as used for file
   * system buffers and inode caches, then instead of starting on the
   * local node to look for a free page, rather spread the starting
   * node around the tasks mems_allowed nodes.
   *
   * We don't have to worry about the returned node being offline
   * because "it can't happen", and even if it did, it would be ok.
   *
   * The routines calling guarantee_online_mems() are careful to
   * only set nodes in task->mems_allowed that are online.  So it
   * should not be possible for the following code to return an
   * offline node.  But if it did, that would be ok, as this routine
   * is not returning the node where the allocation must be, only
   * the node where the search should start.  The zonelist passed to
   * __alloc_pages() will include all nodes.  If the slab allocator
   * is passed an offline node, it will fall back to the local node.
   * See kmem_cache_alloc_node().
   */
6adef3ebe   Jack Steiner   cpusets: new roun...
2378
  static int cpuset_spread_node(int *rotor)
825a46af5   Paul Jackson   [PATCH] cpuset me...
2379
2380
  {
  	int node;
6adef3ebe   Jack Steiner   cpusets: new roun...
2381
  	node = next_node(*rotor, current->mems_allowed);
825a46af5   Paul Jackson   [PATCH] cpuset me...
2382
2383
  	if (node == MAX_NUMNODES)
  		node = first_node(current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2384
  	*rotor = node;
825a46af5   Paul Jackson   [PATCH] cpuset me...
2385
2386
  	return node;
  }
6adef3ebe   Jack Steiner   cpusets: new roun...
2387
2388
2389
  
  int cpuset_mem_spread_node(void)
  {
778d3b0ff   Michal Hocko   cpusets: randomiz...
2390
2391
2392
  	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
  		current->cpuset_mem_spread_rotor =
  			node_random(&current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2393
2394
2395
2396
2397
  	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
  }
  
  int cpuset_slab_spread_node(void)
  {
778d3b0ff   Michal Hocko   cpusets: randomiz...
2398
2399
2400
  	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
  		current->cpuset_slab_spread_rotor =
  			node_random(&current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2401
2402
  	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
  }
825a46af5   Paul Jackson   [PATCH] cpuset me...
2403
2404
2405
  EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
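/*
 * Example (editor's sketch, not part of this file): the page cache is the
 * classic user of cpuset_mem_spread_node().  mm/filemap.c picks the starting
 * node for a new page roughly like this when spreading is enabled (the real
 * code also retries under the mems_allowed seqcount):
 */
#if 0	/* illustrative only */
struct page *__page_cache_alloc(gfp_t gfp)
{
	if (cpuset_do_page_mem_spread()) {
		int n = cpuset_mem_spread_node();

		return alloc_pages_exact_node(n, gfp, 0);
	}
	return alloc_pages(gfp, 0);
}
#endif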
  
  /**
bbe373f2c   David Rientjes   oom: compare cpus...
2406
2407
2408
2409
2410
2411
2412
2413
   * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
   * @tsk1: pointer to task_struct of some task.
   * @tsk2: pointer to task_struct of some other task.
   *
   * Description: Return true if @tsk1's mems_allowed intersects the
   * mems_allowed of @tsk2.  Used by the OOM killer to determine if
   * one of the task's memory usage might impact the memory available
   * to the other.
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2414
   **/
bbe373f2c   David Rientjes   oom: compare cpus...
2415
2416
  int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  				   const struct task_struct *tsk2)
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2417
  {
bbe373f2c   David Rientjes   oom: compare cpus...
2418
  	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2419
  }
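/*
 * Example (editor's sketch, not part of this file): the OOM killer's
 * has_intersects_mems_allowed() in mm/oom_kill.c relies on this check when no
 * explicit nodemask was supplied, roughly:
 */
#if 0	/* illustrative only */
	if (!mask) {
		/*
		 * Only consider killing @tsk if its cpuset overlaps the
		 * triggering task's; otherwise killing it frees nothing
		 * the current task could use.
		 */
		if (cpuset_mems_allowed_intersects(current, tsk))
			return true;
	}
#endif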
f440d98f8   Li Zefan   cpuset: use cgrou...
2420
  #define CPUSET_NODELIST_LEN	(256)
75aa19941   David Rientjes   oom: print trigge...
2421
2422
2423
2424
2425
  /**
   * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
   * @task: pointer to task_struct of some task.
   *
   * Description: Prints @task's name, cpuset name, and cached copy of its
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2426
   * mems_allowed to the kernel log.
75aa19941   David Rientjes   oom: print trigge...
2427
2428
2429
   */
  void cpuset_print_task_mems_allowed(struct task_struct *tsk)
  {
f440d98f8   Li Zefan   cpuset: use cgrou...
2430
2431
2432
  	 /* Statically allocated to prevent using excess stack. */
  	static char cpuset_nodelist[CPUSET_NODELIST_LEN];
  	static DEFINE_SPINLOCK(cpuset_buffer_lock);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2433
  	struct cgroup *cgrp;
75aa19941   David Rientjes   oom: print trigge...
2434

f440d98f8   Li Zefan   cpuset: use cgrou...
2435
  	spin_lock(&cpuset_buffer_lock);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2436
  	rcu_read_lock();
63f43f55c   Li Zefan   cpuset: fix cpuse...
2437

b8dadcb58   Li Zefan   cpuset: use rcu_r...
2438
  	cgrp = task_cs(tsk)->css.cgroup;
75aa19941   David Rientjes   oom: print trigge...
2439
2440
  	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
  			   tsk->mems_allowed);
e61734c55   Tejun Heo   cgroup: remove cg...
2441
2442
2443
2444
  	printk(KERN_INFO "%s cpuset=", tsk->comm);
  	pr_cont_cgroup_name(cgrp);
  	pr_cont(" mems_allowed=%s
  ", cpuset_nodelist);
f440d98f8   Li Zefan   cpuset: use cgrou...
2445

cfb5966be   Li Zefan   cpuset: fix RCU l...
2446
  	rcu_read_unlock();
75aa19941   David Rientjes   oom: print trigge...
2447
2448
  	spin_unlock(&cpuset_buffer_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2449
  /*
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2450
2451
2452
2453
   * Collection of memory_pressure is suppressed unless
   * this flag is enabled by writing "1" to the special
   * cpuset file 'memory_pressure_enabled' in the root cpuset.
   */
c5b2aff89   Paul Jackson   [PATCH] cpuset: m...
2454
  int cpuset_memory_pressure_enabled __read_mostly;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
  
  /**
   * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
   *
   * Keep a running average of the rate of synchronous (direct)
   * page reclaim efforts initiated by tasks in each cpuset.
   *
   * This represents the rate at which some task in the cpuset
   * ran low on memory on all nodes it was allowed to use, and
   * had to enter the kernels page reclaim code in an effort to
   * create more free memory by tossing clean pages or swapping
   * or writing dirty pages.
   *
   * Display to user space in the per-cpuset read-only file
   * "memory_pressure".  Value displayed is an integer
   * representing the recent rate of entry into the synchronous
   * (direct) page reclaim by any task attached to the cpuset.
   **/
  
  void __cpuset_memory_pressure_bump(void)
  {
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2476
  	rcu_read_lock();
8793d854e   Paul Menage   Task Control Grou...
2477
  	fmeter_markevent(&task_cs(current)->fmeter);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2478
  	rcu_read_unlock();
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2479
  }
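/*
 * Example (editor's sketch, not part of this file): callers in the direct
 * reclaim path go through a wrapper in include/linux/cpuset.h so the fmeter
 * is only touched when the feature has been enabled, roughly:
 */
#if 0	/* illustrative only */
#define cpuset_memory_pressure_bump()				\
	do {							\
		if (cpuset_memory_pressure_enabled)		\
			__cpuset_memory_pressure_bump();	\
	} while (0)
#endif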
8793d854e   Paul Menage   Task Control Grou...
2480
  #ifdef CONFIG_PROC_PID_CPUSET
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2481
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2482
2483
2484
   * proc_cpuset_show()
   *  - Print tasks cpuset path into seq_file.
   *  - Used for /proc/<pid>/cpuset.
053199edf   Paul Jackson   [PATCH] cpusets: ...
2485
2486
   *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
   *    doesn't really matter if tsk->cpuset changes after we read it,
5d21cc2db   Tejun Heo   cpuset: replace c...
2487
   *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
2df167a30   Paul Menage   cgroups: update c...
2488
   *    anyway.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2489
   */
8d8b97ba4   Al Viro   take cgroup_open(...
2490
  int proc_cpuset_show(struct seq_file *m, void *unused_v)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2491
  {
13b41b094   Eric W. Biederman   [PATCH] proc: Use...
2492
  	struct pid *pid;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2493
  	struct task_struct *tsk;
e61734c55   Tejun Heo   cgroup: remove cg...
2494
  	char *buf, *p;
8793d854e   Paul Menage   Task Control Grou...
2495
  	struct cgroup_subsys_state *css;
99f895518   Eric W. Biederman   [PATCH] proc: don...
2496
  	int retval;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2497

99f895518   Eric W. Biederman   [PATCH] proc: don...
2498
  	retval = -ENOMEM;
e61734c55   Tejun Heo   cgroup: remove cg...
2499
  	buf = kmalloc(PATH_MAX, GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2500
  	if (!buf)
99f895518   Eric W. Biederman   [PATCH] proc: don...
2501
2502
2503
  		goto out;
  
  	retval = -ESRCH;
13b41b094   Eric W. Biederman   [PATCH] proc: Use...
2504
2505
  	pid = m->private;
  	tsk = get_pid_task(pid, PIDTYPE_PID);
99f895518   Eric W. Biederman   [PATCH] proc: don...
2506
2507
  	if (!tsk)
  		goto out_free;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2508

e61734c55   Tejun Heo   cgroup: remove cg...
2509
  	retval = -ENAMETOOLONG;
27e89ae5d   Li Zefan   cpuset: fix RCU l...
2510
  	rcu_read_lock();
073219e99   Tejun Heo   cgroup: clean up ...
2511
  	css = task_css(tsk, cpuset_cgrp_id);
e61734c55   Tejun Heo   cgroup: remove cg...
2512
  	p = cgroup_path(css->cgroup, buf, PATH_MAX);
27e89ae5d   Li Zefan   cpuset: fix RCU l...
2513
  	rcu_read_unlock();
e61734c55   Tejun Heo   cgroup: remove cg...
2514
  	if (!p)
27e89ae5d   Li Zefan   cpuset: fix RCU l...
2515
  		goto out_put_task;
e61734c55   Tejun Heo   cgroup: remove cg...
2516
  	seq_puts(m, p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2517
2518
  	seq_putc(m, '\n');
e61734c55   Tejun Heo   cgroup: remove cg...
2519
  	retval = 0;
27e89ae5d   Li Zefan   cpuset: fix RCU l...
2520
  out_put_task:
99f895518   Eric W. Biederman   [PATCH] proc: don...
2521
2522
  	put_task_struct(tsk);
  out_free:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2523
  	kfree(buf);
99f895518   Eric W. Biederman   [PATCH] proc: don...
2524
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2525
2526
  	return retval;
  }
8793d854e   Paul Menage   Task Control Grou...
2527
  #endif /* CONFIG_PROC_PID_CPUSET */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2528

d01d48278   Heiko Carstens   sched: Always sho...
2529
  /* Display task mems_allowed in /proc/<pid>/status file. */
df5f8314c   Eric W. Biederman   proc: seqfile con...
2530
2531
  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
  {
df5f8314c   Eric W. Biederman   proc: seqfile con...
2532
  	seq_printf(m, "Mems_allowed:\t");
30e8e1360   Lai Jiangshan   cpuset: use seq_*...
2533
  	seq_nodemask(m, &task->mems_allowed);
df5f8314c   Eric W. Biederman   proc: seqfile con...
2534
2535
  	seq_printf(m, "
  ");
39106dcf8   Mike Travis   cpumask: use new ...
2536
  	seq_printf(m, "Mems_allowed_list:\t");
30e8e1360   Lai Jiangshan   cpuset: use seq_*...
2537
  	seq_nodemask_list(m, &task->mems_allowed);
39106dcf8   Mike Travis   cpumask: use new ...
2538
2539
  	seq_printf(m, "
  ");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2540
  }
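/*
 * Example (editor's note, not part of this file): on a single-node machine the
 * two lines emitted above typically appear in /proc/<pid>/status as something
 * like (the width of the hex mask depends on MAX_NUMNODES):
 *
 *	Mems_allowed:	00000000,00000001
 *	Mems_allowed_list:	0
 */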