Blame view

kernel/cpuset.c 77.1 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
  /*
   *  kernel/cpuset.c
   *
   *  Processor and Memory placement constraints for sets of tasks.
   *
   *  Copyright (C) 2003 BULL SA.
029190c51   Paul Jackson   cpuset sched_load...
7
   *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
8793d854e   Paul Menage   Task Control Grou...
8
   *  Copyright (C) 2006 Google, Inc
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
9
10
11
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
   *
825a46af5   Paul Jackson   [PATCH] cpuset me...
13
   *  2003-10-10 Written by Simon Derr.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
14
   *  2003-10-22 Updates by Stephen Hemminger.
825a46af5   Paul Jackson   [PATCH] cpuset me...
15
   *  2004 May-July Rework by Paul Jackson.
8793d854e   Paul Menage   Task Control Grou...
16
   *  2006 Rework by Paul Menage to use generic cgroups
cf417141c   Max Krasnyansky   sched, cpuset: re...
17
18
   *  2008 Rework of the scheduler domains and CPU hotplug handling
   *       by Max Krasnyansky
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
19
20
21
22
23
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
24
25
26
27
28
29
30
31
32
33
34
35
  #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/cpuset.h>
  #include <linux/err.h>
  #include <linux/errno.h>
  #include <linux/file.h>
  #include <linux/fs.h>
  #include <linux/init.h>
  #include <linux/interrupt.h>
  #include <linux/kernel.h>
  #include <linux/kmod.h>
  #include <linux/list.h>
68860ec10   Paul Jackson   [PATCH] cpusets: ...
36
  #include <linux/mempolicy.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
37
  #include <linux/mm.h>
f481891fd   Miao Xie   cpuset: update to...
38
  #include <linux/memory.h>
9984de1a5   Paul Gortmaker   kernel: Map most ...
39
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
40
41
42
43
  #include <linux/mount.h>
  #include <linux/namei.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
6b9c2603c   Paul Jackson   [PATCH] cpuset: u...
44
  #include <linux/rcupdate.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
45
46
  #include <linux/sched.h>
  #include <linux/seq_file.h>
22fb52dd7   David Quigley   [PATCH] SELinux: ...
47
  #include <linux/security.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
  #include <linux/slab.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
49
50
51
52
53
54
55
56
  #include <linux/spinlock.h>
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/time.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
  
  #include <asm/uaccess.h>
60063497a   Arun Sharma   atomic: use <linu...
57
  #include <linux/atomic.h>
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
58
  #include <linux/mutex.h>
956db3ca0   Cliff Wickman   hotplug cpu: move...
59
60
  #include <linux/workqueue.h>
  #include <linux/cgroup.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
61

202f72d5d   Paul Jackson   [PATCH] cpuset: n...
62
63
64
65
66
/*
 * Tracks how many cpusets are currently defined in system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
 *
 * Read frequently, rarely written - hence __read_mostly.
 */
int number_of_cpusets __read_mostly;
202f72d5d   Paul Jackson   [PATCH] cpuset: n...
68

2df167a30   Paul Menage   cgroups: update c...
69
/* Forward declare cgroup structures */
struct cgroup_subsys cpuset_subsys;	/* tentative definition; initialized elsewhere in this file */
struct cpuset;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
72
73
74
75
76
77
78
79
/* See "Frequency meter" comments, below. */

/* Per-cpuset event-rate meter; used for the memory_pressure filter. */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
80
struct cpuset {
	struct cgroup_subsys_state css;	/* embedded cgroup state; see cgroup_cs() */

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* NOTE(review): presumably queued via schedule_cpuset_propagate_hotplug() -
	 * the worker body is not visible here; confirm against the rest of the file. */
	struct work_struct hotplug_work;
};
8793d854e   Paul Menage   Task Control Grou...
99
100
101
102
103
104
105
106
107
108
109
110
111
/* Retrieve the cpuset for a cgroup (container_of maps the embedded css
 * back to its enclosing struct cpuset). */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
			    struct cpuset, css);
}
  
/* Retrieve the cpuset for a task (container_of maps the task's css
 * back to its enclosing struct cpuset). */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}
8793d854e   Paul Menage   Task Control Grou...
112

c431069fe   Tejun Heo   cpuset: remove cp...
113
114
115
116
117
118
119
120
  static inline struct cpuset *parent_cs(const struct cpuset *cs)
  {
  	struct cgroup *pcgrp = cs->css.cgroup->parent;
  
  	if (pcgrp)
  		return cgroup_cs(pcgrp);
  	return NULL;
  }
b246272ec   David Rientjes   cpusets: stall wh...
121
122
123
124
125
126
127
128
129
130
131
  #ifdef CONFIG_NUMA
  static inline bool task_has_mempolicy(struct task_struct *task)
  {
  	return task->mempolicy;
  }
  #else
  static inline bool task_has_mempolicy(struct task_struct *task)
  {
  	return false;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
132
133
/* bits in struct cpuset flags field */
typedef enum {
	CS_ONLINE,		/* tested by is_cpuset_online() */
	CS_CPU_EXCLUSIVE,	/* tested by is_cpu_exclusive() */
	CS_MEM_EXCLUSIVE,	/* tested by is_mem_exclusive() */
	CS_MEM_HARDWALL,	/* tested by is_mem_hardwall() */
	CS_MEMORY_MIGRATE,	/* tested by is_memory_migrate() */
	CS_SCHED_LOAD_BALANCE,	/* tested by is_sched_load_balance() */
	CS_SPREAD_PAGE,		/* tested by is_spread_page() */
	CS_SPREAD_SLAB,		/* tested by is_spread_slab() */
} cpuset_flagbits_t;
  
/* convenient tests for these bits */

/* Is the CS_ONLINE bit set on @cs? */
static inline bool is_cpuset_online(const struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
149
150
/* Is the CS_CPU_EXCLUSIVE bit set on @cs? */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}
  
/* Is the CS_MEM_EXCLUSIVE bit set on @cs? */
static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}
786083667   Paul Menage   Cpuset hardwall f...
158
159
160
161
/* Is the CS_MEM_HARDWALL bit set on @cs? */
static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}
029190c51   Paul Jackson   cpuset sched_load...
162
163
164
165
/* Is the CS_SCHED_LOAD_BALANCE bit set on @cs? */
static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
166
167
/* Is the CS_MEMORY_MIGRATE bit set on @cs? */
static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}
825a46af5   Paul Jackson   [PATCH] cpuset me...
170
171
172
173
174
175
176
177
178
/* Is the CS_SPREAD_PAGE bit set on @cs? */
static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}
  
/* Is the CS_SPREAD_SLAB bit set on @cs? */
static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
179
/*
 * The root cpuset (see the cur == &top_cpuset special case in
 * validate_change()): online and exclusive for both CPUs and memory.
 */
static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
};
ae8086ce1   Tejun Heo   cpuset: introduce...
183
184
185
186
187
188
189
190
191
192
193
194
  /**
   * cpuset_for_each_child - traverse online children of a cpuset
   * @child_cs: loop cursor pointing to the current child
   * @pos_cgrp: used for iteration
   * @parent_cs: target cpuset to walk children of
   *
   * Walk @child_cs through the online children of @parent_cs.  Must be used
   * with RCU read locked.
   */
  #define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs)		\
  	cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup)	\
  		if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
fc560a26a   Tejun Heo   cpuset: replace c...
195
196
197
198
199
200
201
202
203
204
205
206
207
  /**
   * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
   * @des_cs: loop cursor pointing to the current descendant
   * @pos_cgrp: used for iteration
   * @root_cs: target cpuset to walk ancestor of
   *
   * Walk @des_cs through the online descendants of @root_cs.  Must be used
   * with RCU read locked.  The caller may modify @pos_cgrp by calling
   * cgroup_rightmost_descendant() to skip subtree.
   */
  #define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs)	\
  	cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
  		if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
208
/*
 * There are two global mutexes guarding cpuset structures - cpuset_mutex
 * and callback_mutex.  The latter may nest inside the former.  We also
 * require taking task_lock() when dereferencing a task's cpuset pointer.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold both mutexes to modify cpusets.  If a task holds
 * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
 * is the only task able to also acquire callback_mutex and be able to
 * modify cpusets.  It can perform various checks on the cpuset structure
 * first, knowing nothing will change.  It can also allocate memory while
 * just holding cpuset_mutex.  While it is performing these checks, various
 * callback routines can briefly acquire callback_mutex to query cpusets.
 * Once it is ready to make the changes, it takes callback_mutex, blocking
 * everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_mutex, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by other task, we use alloc_lock in the task_struct fields to protect
 * them.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */
5d21cc2db   Tejun Heo   cpuset: replace c...
243
/*
 * The two global mutexes described in the locking comment above;
 * callback_mutex may nest inside cpuset_mutex.
 */
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex);
4247bdc60   Paul Jackson   [PATCH] cpuset se...
245

cf417141c   Max Krasnyansky   sched, cpuset: re...
246
  /*
75aa19941   David Rientjes   oom: print trigge...
247
248
249
250
251
252
253
254
255
256
257
   * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
   * buffers.  They are statically allocated to prevent using excess stack
   * when calling cpuset_print_task_mems_allowed().
   */
  #define CPUSET_NAME_LEN		(128)
  #define	CPUSET_NODELIST_LEN	(256)
  static char cpuset_name[CPUSET_NAME_LEN];
  static char cpuset_nodelist[CPUSET_NODELIST_LEN];
  static DEFINE_SPINLOCK(cpuset_buffer_lock);
  
/*
 * CPU / memory hotplug is handled asynchronously.
 */

/* workqueue serving schedule_cpuset_propagate_hotplug() */
static struct workqueue_struct *cpuset_propagate_hotplug_wq;

/* worker bodies are defined later in this file */
static void cpuset_hotplug_workfn(struct work_struct *work);
static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);

static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
  
/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);	/* returned if "cgroup" fs is absent */
	if (cgroup_fs) {
		/* remount as cgroup with the cpuset controller and legacy options */
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		/* drop the reference taken by get_fs_type() */
		put_filesystem(cgroup_fs);
	}
	return ret;
}
  
/* Legacy "cpuset" filesystem type; mounting it redirects to "cgroup". */
static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
292
/*
 * Return in pmask the portion of a cpusets's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_mask.  Or if passed a NULL cs from an exit'ing
 * task, return cpu_online_mask.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_mutex held.
 */
static void guarantee_online_cpus(const struct cpuset *cs,
				  struct cpumask *pmask)
{
	/* climb toward the root until some allowed CPU is online */
	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
		cs = parent_cs(cs);
	if (cs)
		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
	else
		/* ran past the root (or NULL cs): fall back to all online CPUs */
		cpumask_copy(pmask, cpu_online_mask);
	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
}
  
/*
 * Return in *pmask the portion of a cpusets's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  If we get all the way to the top and still haven't
 * found any online mems, return node_states[N_MEMORY].
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_mutex held.
 */
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	/* climb toward the root until some allowed node has memory online */
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_MEMORY]))
		cs = parent_cs(cs);
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
					node_states[N_MEMORY]);
	else
		/* ran past the root (or NULL cs): all memory-bearing nodes */
		*pmask = node_states[N_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
}
f3b39d47e   Miao Xie   cpusets: restruct...
342
343
344
/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Called with callback_mutex/cpuset_mutex held
 *
 * Mirrors @cs's CS_SPREAD_PAGE / CS_SPREAD_SLAB bits into @tsk's
 * PF_SPREAD_PAGE / PF_SPREAD_SLAB task flags, setting or clearing
 * each one to match the cpuset.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		tsk->flags |= PF_SPREAD_PAGE;
	else
		tsk->flags &= ~PF_SPREAD_PAGE;
	if (is_spread_slab(cs))
		tsk->flags |= PF_SPREAD_SLAB;
	else
		tsk->flags &= ~PF_SPREAD_SLAB;
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
359
360
361
362
363
/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	/* "<=" on the 0/1 flag values encodes "p exclusive implies q exclusive" */
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}
645fcc9d2   Li Zefan   cpuset: don't all...
374
375
376
377
378
379
/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 *
 * Returns a heap copy of @cs with its own cpus_allowed mask (the
 * kmemdup only shallow-copies the cpumask_var_t), or NULL on
 * allocation failure.  Caller frees with free_trial_cpuset().
 */
static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
		kfree(trial);
		return NULL;
	}
	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);

	return trial;
}
  
/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 *
 * Releases the separately-allocated cpus_allowed mask, then the
 * cpuset itself (matches alloc_trial_cpuset()).
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
404
405
406
407
408
409
410
/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;
	int ret;

	/* RCU protects the child/sibling walks below */
	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, cont, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* We must be a subset of our parent cpuset */
	ret = -EACCES;
	if (!is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, cont, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
	    (cpumask_empty(trial->cpus_allowed) ||
	     nodes_empty(trial->mems_allowed)))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}
db7f47cf4   Paul Menage   cpusets: allow cp...
480
#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
}
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
489
490
491
  static void
  update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  {
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
492
493
494
495
  	if (dattr->relax_domain_level < c->relax_domain_level)
  		dattr->relax_domain_level = c->relax_domain_level;
  	return;
  }
fc560a26a   Tejun Heo   cpuset: replace c...
496
497
/*
 * Fold the relax_domain_level of every online, load-balanced
 * descendant of @root_cs into @dattr via update_domain_attr().
 * Subtrees whose root has an empty cpus_allowed are skipped entirely.
 */
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup *pos_cgrp;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}
029190c51   Paul Jackson   cpuset sched_load...
515
  /*
cf417141c   Max Krasnyansky   sched, cpuset: re...
516
517
518
519
520
521
522
523
524
   * generate_sched_domains()
   *
   * This function builds a partial partition of the systems CPUs
   * A 'partial partition' is a set of non-overlapping subsets whose
   * union is a subset of that set.
   * The output of this function needs to be passed to kernel/sched.c
   * partition_sched_domains() routine, which will rebuild the scheduler's
   * load balancing domains (sched domains) as specified by that partial
   * partition.
029190c51   Paul Jackson   cpuset sched_load...
525
   *
45ce80fb6   Li Zefan   cgroups: consolid...
526
   * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
029190c51   Paul Jackson   cpuset sched_load...
527
528
529
530
531
532
533
   * for a background explanation of this.
   *
   * Does not return errors, on the theory that the callers of this
   * routine would rather not worry about failures to rebuild sched
   * domains when operating in the severe memory shortage situations
   * that could cause allocation failures below.
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
534
   * Must be called with cpuset_mutex held.
029190c51   Paul Jackson   cpuset sched_load...
535
536
   *
   * The three key local variables below are:
aeed68242   Li Zefan   cpuset: clean up ...
537
   *    q  - a linked-list queue of cpuset pointers, used to implement a
029190c51   Paul Jackson   cpuset sched_load...
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
   *	   top-down scan of all cpusets.  This scan loads a pointer
   *	   to each cpuset marked is_sched_load_balance into the
   *	   array 'csa'.  For our purposes, rebuilding the schedulers
   *	   sched domains, we can ignore !is_sched_load_balance cpusets.
   *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
   *	   that need to be load balanced, for convenient iterative
   *	   access by the subsequent code that finds the best partition,
   *	   i.e the set of domains (subsets) of CPUs such that the
   *	   cpus_allowed of every cpuset marked is_sched_load_balance
   *	   is a subset of one of these domains, while there are as
   *	   many such domains as possible, each as small as possible.
   * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
   *	   the kernel/sched.c routine partition_sched_domains() in a
   *	   convenient format, that can be easily compared to the prior
   *	   value to determine what partition elements (sched domains)
   *	   were changed (added or removed.)
   *
   * Finding the best partition (set of domains):
   *	The triple nested loops below over i, j, k scan over the
   *	load balanced cpusets (using the array of cpuset pointers in
   *	csa[]) looking for pairs of cpusets that have overlapping
   *	cpus_allowed, but which don't have the same 'pn' partition
   *	number and gives them in the same partition number.  It keeps
   *	looping on the 'restart' label until it can no longer find
   *	any such pairs.
   *
   *	The union of the cpus_allowed masks from the set of
   *	all cpusets having the same 'pn' value then form the one
   *	element of the partition (one sched domain) to be passed to
   *	partition_sched_domains().
   */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
569
  static int generate_sched_domains(cpumask_var_t **domains,
cf417141c   Max Krasnyansky   sched, cpuset: re...
570
  			struct sched_domain_attr **attributes)
029190c51   Paul Jackson   cpuset sched_load...
571
  {
029190c51   Paul Jackson   cpuset sched_load...
572
573
574
575
  	struct cpuset *cp;	/* scans q */
  	struct cpuset **csa;	/* array of all cpuset ptrs */
  	int csn;		/* how many cpuset ptrs in csa so far */
  	int i, j, k;		/* indices for partition finding loops */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
576
  	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
577
  	struct sched_domain_attr *dattr;  /* attributes for custom domains */
1583715dd   Ingo Molnar   sched, cpusets: f...
578
  	int ndoms = 0;		/* number of sched domains in result */
6af866af3   Li Zefan   cpuset: remove re...
579
  	int nslot;		/* next empty doms[] struct cpumask slot */
fc560a26a   Tejun Heo   cpuset: replace c...
580
  	struct cgroup *pos_cgrp;
029190c51   Paul Jackson   cpuset sched_load...
581

029190c51   Paul Jackson   cpuset sched_load...
582
  	doms = NULL;
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
583
  	dattr = NULL;
cf417141c   Max Krasnyansky   sched, cpuset: re...
584
  	csa = NULL;
029190c51   Paul Jackson   cpuset sched_load...
585
586
587
  
  	/* Special case for the 99% of systems with one, full, sched domain */
  	if (is_sched_load_balance(&top_cpuset)) {
acc3f5d7c   Rusty Russell   cpumask: Partitio...
588
589
  		ndoms = 1;
  		doms = alloc_sched_domains(ndoms);
029190c51   Paul Jackson   cpuset sched_load...
590
  		if (!doms)
cf417141c   Max Krasnyansky   sched, cpuset: re...
591
  			goto done;
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
592
593
594
  		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
  		if (dattr) {
  			*dattr = SD_ATTR_INIT;
93a655755   Li Zefan   cpuset: fix wrong...
595
  			update_domain_attr_tree(dattr, &top_cpuset);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
596
  		}
acc3f5d7c   Rusty Russell   cpumask: Partitio...
597
  		cpumask_copy(doms[0], top_cpuset.cpus_allowed);
cf417141c   Max Krasnyansky   sched, cpuset: re...
598

cf417141c   Max Krasnyansky   sched, cpuset: re...
599
  		goto done;
029190c51   Paul Jackson   cpuset sched_load...
600
  	}
029190c51   Paul Jackson   cpuset sched_load...
601
602
603
604
  	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
  	if (!csa)
  		goto done;
  	csn = 0;
fc560a26a   Tejun Heo   cpuset: replace c...
605
606
  	rcu_read_lock();
  	cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
f5393693e   Lai Jiangshan   cpuset: speed up ...
607
  		/*
fc560a26a   Tejun Heo   cpuset: replace c...
608
609
610
611
612
613
  		 * Continue traversing beyond @cp iff @cp has some CPUs and
  		 * isn't load balancing.  The former is obvious.  The
  		 * latter: All child cpusets contain a subset of the
  		 * parent's cpus, so just skip them, and then we call
  		 * update_domain_attr_tree() to calc relax_domain_level of
  		 * the corresponding sched domain.
f5393693e   Lai Jiangshan   cpuset: speed up ...
614
  		 */
fc560a26a   Tejun Heo   cpuset: replace c...
615
616
  		if (!cpumask_empty(cp->cpus_allowed) &&
  		    !is_sched_load_balance(cp))
f5393693e   Lai Jiangshan   cpuset: speed up ...
617
  			continue;
489a5393a   Lai Jiangshan   cpuset: don't pas...
618

fc560a26a   Tejun Heo   cpuset: replace c...
619
620
621
622
623
624
625
  		if (is_sched_load_balance(cp))
  			csa[csn++] = cp;
  
  		/* skip @cp's subtree */
  		pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
  	}
  	rcu_read_unlock();
029190c51   Paul Jackson   cpuset sched_load...
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
  
  	for (i = 0; i < csn; i++)
  		csa[i]->pn = i;
  	ndoms = csn;
  
  restart:
  	/* Find the best partition (set of sched domains) */
  	for (i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		int apn = a->pn;
  
  		for (j = 0; j < csn; j++) {
  			struct cpuset *b = csa[j];
  			int bpn = b->pn;
  
  			if (apn != bpn && cpusets_overlap(a, b)) {
  				for (k = 0; k < csn; k++) {
  					struct cpuset *c = csa[k];
  
  					if (c->pn == bpn)
  						c->pn = apn;
  				}
  				ndoms--;	/* one less element */
  				goto restart;
  			}
  		}
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
653
654
655
656
  	/*
  	 * Now we know how many domains to create.
  	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
  	 */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
657
  	doms = alloc_sched_domains(ndoms);
700018e0a   Li Zefan   cpuset: fix regre...
658
  	if (!doms)
cf417141c   Max Krasnyansky   sched, cpuset: re...
659
  		goto done;
cf417141c   Max Krasnyansky   sched, cpuset: re...
660
661
662
663
664
  
  	/*
  	 * The rest of the code, including the scheduler, can deal with
  	 * dattr==NULL case. No need to abort if alloc fails.
  	 */
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
665
  	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
029190c51   Paul Jackson   cpuset sched_load...
666
667
668
  
  	for (nslot = 0, i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
6af866af3   Li Zefan   cpuset: remove re...
669
  		struct cpumask *dp;
029190c51   Paul Jackson   cpuset sched_load...
670
  		int apn = a->pn;
cf417141c   Max Krasnyansky   sched, cpuset: re...
671
672
673
674
  		if (apn < 0) {
  			/* Skip completed partitions */
  			continue;
  		}
acc3f5d7c   Rusty Russell   cpumask: Partitio...
675
  		dp = doms[nslot];
cf417141c   Max Krasnyansky   sched, cpuset: re...
676
677
678
679
680
681
682
683
684
685
686
  
  		if (nslot == ndoms) {
  			static int warnings = 10;
  			if (warnings) {
  				printk(KERN_WARNING
  				 "rebuild_sched_domains confused:"
  				  " nslot %d, ndoms %d, csn %d, i %d,"
  				  " apn %d
  ",
  				  nslot, ndoms, csn, i, apn);
  				warnings--;
029190c51   Paul Jackson   cpuset sched_load...
687
  			}
cf417141c   Max Krasnyansky   sched, cpuset: re...
688
689
  			continue;
  		}
029190c51   Paul Jackson   cpuset sched_load...
690

6af866af3   Li Zefan   cpuset: remove re...
691
  		cpumask_clear(dp);
cf417141c   Max Krasnyansky   sched, cpuset: re...
692
693
694
695
696
697
  		if (dattr)
  			*(dattr + nslot) = SD_ATTR_INIT;
  		for (j = i; j < csn; j++) {
  			struct cpuset *b = csa[j];
  
  			if (apn == b->pn) {
300ed6cbb   Li Zefan   cpuset: convert c...
698
  				cpumask_or(dp, dp, b->cpus_allowed);
cf417141c   Max Krasnyansky   sched, cpuset: re...
699
700
701
702
703
  				if (dattr)
  					update_domain_attr_tree(dattr + nslot, b);
  
  				/* Done with this partition */
  				b->pn = -1;
029190c51   Paul Jackson   cpuset sched_load...
704
  			}
029190c51   Paul Jackson   cpuset sched_load...
705
  		}
cf417141c   Max Krasnyansky   sched, cpuset: re...
706
  		nslot++;
029190c51   Paul Jackson   cpuset sched_load...
707
708
  	}
  	BUG_ON(nslot != ndoms);
cf417141c   Max Krasnyansky   sched, cpuset: re...
709
710
  done:
  	kfree(csa);
700018e0a   Li Zefan   cpuset: fix regre...
711
712
713
714
715
716
  	/*
  	 * Fallback to the default domain if kmalloc() failed.
  	 * See comments in partition_sched_domains().
  	 */
  	if (doms == NULL)
  		ndoms = 1;
cf417141c   Max Krasnyansky   sched, cpuset: re...
717
718
719
720
721
722
723
724
  	*domains    = doms;
  	*attributes = dattr;
  	return ndoms;
  }
  
/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held.  Takes get_online_cpus().
 */
static void rebuild_sched_domains_locked(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	lockdep_assert_held(&cpuset_mutex);
	get_online_cpus();

	/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	put_online_cpus();
}
db7f47cf4   Paul Menage   cpusets: allow cp...
748
  #else /* !CONFIG_SMP */
699140ba8   Tejun Heo   cpuset: drop asyn...
749
  static void rebuild_sched_domains_locked(void)
db7f47cf4   Paul Menage   cpusets: allow cp...
750
751
  {
  }
e1b8090bd   Geert Uytterhoeven   cpumask: Fix gene...
752
  static int generate_sched_domains(cpumask_var_t **domains,
db7f47cf4   Paul Menage   cpusets: allow cp...
753
754
755
756
757
758
  			struct sched_domain_attr **attributes)
  {
  	*domains = NULL;
  	return 1;
  }
  #endif /* CONFIG_SMP */
029190c51   Paul Jackson   cpuset sched_load...
759

cf417141c   Max Krasnyansky   sched, cpuset: re...
760
761
/* Public entry point: take cpuset_mutex around the locked rebuild. */
void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}
58f4790b7   Cliff Wickman   cpusets: update_c...
766
767
768
769
770
  /**
   * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
   * @tsk: task to test
   * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
771
   * Call with cpuset_mutex held.  May take callback_mutex during call.
58f4790b7   Cliff Wickman   cpusets: update_c...
772
773
774
   * Called for each task in a cgroup by cgroup_scan_tasks().
   * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
   * words, if its mask is not equal to its cpuset's mask).
053199edf   Paul Jackson   [PATCH] cpusets: ...
775
   */
9e0c914ca   Adrian Bunk   kernel/cpuset.c: ...
776
777
  static int cpuset_test_cpumask(struct task_struct *tsk,
  			       struct cgroup_scanner *scan)
58f4790b7   Cliff Wickman   cpusets: update_c...
778
  {
300ed6cbb   Li Zefan   cpuset: convert c...
779
  	return !cpumask_equal(&tsk->cpus_allowed,
58f4790b7   Cliff Wickman   cpusets: update_c...
780
781
  			(cgroup_cs(scan->cg))->cpus_allowed);
  }
053199edf   Paul Jackson   [PATCH] cpusets: ...
782

58f4790b7   Cliff Wickman   cpusets: update_c...
783
784
785
786
787
788
789
790
791
  /**
   * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
   * @tsk: task to test
   * @scan: struct cgroup_scanner containing the cgroup of the task
   *
   * Called by cgroup_scan_tasks() for each task in a cgroup whose
   * cpus_allowed mask needs to be changed.
   *
   * We don't need to re-check for the cgroup/cpuset membership, since we're
5d21cc2db   Tejun Heo   cpuset: replace c...
792
   * holding cpuset_mutex at this point.
58f4790b7   Cliff Wickman   cpusets: update_c...
793
   */
9e0c914ca   Adrian Bunk   kernel/cpuset.c: ...
794
795
  static void cpuset_change_cpumask(struct task_struct *tsk,
  				  struct cgroup_scanner *scan)
58f4790b7   Cliff Wickman   cpusets: update_c...
796
  {
300ed6cbb   Li Zefan   cpuset: convert c...
797
  	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
58f4790b7   Cliff Wickman   cpusets: update_c...
798
799
800
  }
  
  /**
0b2f630a2   Miao Xie   cpusets: restruct...
801
802
   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
4e74339af   Li Zefan   cpuset: avoid cha...
803
   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
0b2f630a2   Miao Xie   cpusets: restruct...
804
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
805
   * Called with cpuset_mutex held
0b2f630a2   Miao Xie   cpusets: restruct...
806
807
808
809
   *
   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
   * calling callback functions for each.
   *
4e74339af   Li Zefan   cpuset: avoid cha...
810
811
   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
   * if @heap != NULL.
0b2f630a2   Miao Xie   cpusets: restruct...
812
   */
4e74339af   Li Zefan   cpuset: avoid cha...
813
  static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
0b2f630a2   Miao Xie   cpusets: restruct...
814
815
  {
  	struct cgroup_scanner scan;
0b2f630a2   Miao Xie   cpusets: restruct...
816
817
818
819
  
  	scan.cg = cs->css.cgroup;
  	scan.test_task = cpuset_test_cpumask;
  	scan.process_task = cpuset_change_cpumask;
4e74339af   Li Zefan   cpuset: avoid cha...
820
821
  	scan.heap = heap;
  	cgroup_scan_tasks(&scan);
0b2f630a2   Miao Xie   cpusets: restruct...
822
823
824
  }
  
/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: scratch cpuset used to validate the proposed change
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * Called with cpuset_mutex held.  Returns 0 on success, -errno on failure.
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	struct ptr_heap heap;
	int retval;
	int is_load_balanced;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		/* Reject CPUs that are not currently schedulable. */
		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
			return -EINVAL;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	/* Pre-allocate the scan heap so the task scan below cannot fail. */
	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	/* Sample before committing: decides whether domains need a rebuild. */
	is_load_balanced = is_sched_load_balance(trialcs);

	mutex_lock(&callback_mutex);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	/*
	 * Scan tasks in the cpuset, and update the cpumasks of any
	 * that need an update.
	 */
	update_tasks_cpumask(cs, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		rebuild_sched_domains_locked();
	return 0;
}
053199edf   Paul Jackson   [PATCH] cpusets: ...
884
/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    Call holding cpuset_mutex, so current's cpuset won't change
 *    during this call, as manage_mutex holds off any cpuset_attach()
 *    calls.  Therefore we don't need to take task_lock around the
 *    call to guarantee_online_mems(), as we know no one is changing
 *    our task's cpuset.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	/* Widen current's allowed nodes for the duration of the migration. */
	tsk->mems_allowed = *to;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	/* Restore mems_allowed to what current's own cpuset permits. */
	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
}
3b6766fe6   Li Zefan   cpuset: rewrite u...
913
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing newly
 * disallowed ones.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	bool need_loop;

	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return;

	task_lock(tsk);
	/*
	 * Determine if a loop is necessary if another thread is doing
	 * get_mems_allowed().  If at least one node remains unchanged and
	 * tsk does not have a mempolicy, then an empty nodemask will not be
	 * possible when mems_allowed is larger than a word.
	 */
	need_loop = task_has_mempolicy(tsk) ||
			!nodes_intersects(*newmems, tsk->mems_allowed);

	/* seqcount makes concurrent readers retry instead of seeing the
	 * transient grow-then-shrink states below. */
	if (need_loop)
		write_seqcount_begin(&tsk->mems_allowed_seq);

	/* Step 1: grow - union old and new so no node disappears yet. */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	/* Step 2: shrink down to exactly the new mask. */
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;

	if (need_loop)
		write_seqcount_end(&tsk->mems_allowed_seq);

	task_unlock(tsk);
}
  
/*
 * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
 * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
 * memory_migrate flag is set. Called with cpuset_mutex held.
 */
static void cpuset_change_nodemask(struct task_struct *p,
				   struct cgroup_scanner *scan)
{
	struct mm_struct *mm;
	struct cpuset *cs;
	int migrate;
	const nodemask_t *oldmem = scan->data;	/* set by update_tasks_nodemask() */
	static nodemask_t newmems;	/* protected by cpuset_mutex */

	cs = cgroup_cs(scan->cg);
	guarantee_online_mems(cs, &newmems);

	cpuset_change_task_nodemask(p, &newmems);

	/* Kernel threads and exiting tasks may have no mm; nothing to rebind. */
	mm = get_task_mm(p);
	if (!mm)
		return;

	migrate = is_memory_migrate(cs);

	mpol_rebind_mm(mm, &cs->mems_allowed);
	if (migrate)
		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
	mmput(mm);
}
8793d854e   Paul Menage   Task Control Grou...
989
  static void *cpuset_being_rebound;
0b2f630a2   Miao Xie   cpusets: restruct...
990
991
992
993
/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cpuset_mutex held
 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
				 struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	scan.cg = cs->css.cgroup;
	scan.test_task = NULL;	/* no filter: process every task */
	scan.process_task = cpuset_change_nodemask;
	scan.heap = heap;
	scan.data = (nodemask_t *)oldmem;	/* consumed by cpuset_change_nodemask() */

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	cgroup_scan_tasks(&scan);

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}
0b2f630a2   Miao Xie   cpusets: restruct...
1028
1029
1030
/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpusets mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind task's mempolicy and any vma
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
 * Call with cpuset_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
 *
 * Returns 0 on success, -errno on failure.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{
	/* nodemask_t may be too large for the stack; allocate it. */
	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
	int retval;
	struct ptr_heap heap;

	if (!oldmem)
		return -ENOMEM;

	/*
	 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset) {
		retval = -EACCES;
		goto done;
	}

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs->mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs->mems_allowed);
		if (retval < 0)
			goto done;
		/* Only nodes that actually have memory are acceptable. */
		if (!nodes_subset(trialcs->mems_allowed,
				node_states[N_MEMORY])) {
			retval =  -EINVAL;
			goto done;
		}
	}
	*oldmem = cs->mems_allowed;
	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, trialcs);
	if (retval < 0)
		goto done;

	/* Pre-allocate the scan heap so the task scan below cannot fail. */
	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs->mems_allowed;
	mutex_unlock(&callback_mutex);

	update_tasks_nodemask(cs, oldmem, &heap);

	heap_free(&heap);
done:
	NODEMASK_FREE(oldmem);
	return retval;
}
8793d854e   Paul Menage   Task Control Grou...
1098
1099
1100
1101
/* True while update_tasks_nodemask() is rebinding current's own cpuset. */
int current_cpuset_is_being_rebound(void)
{
	return task_cs(current) == cpuset_being_rebound;
}
5be7a4792   Paul Menage   Fix cpuset sched_...
1102
/*
 * Validate and apply a new sched relax_domain_level for @cs, rebuilding
 * sched domains if the cpuset contributes one.  Returns 0 or -EINVAL.
 */
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	/* -1 is accepted (presumably "use default" — see sched docs);
	 * anything at or above sched_domain_level_max is rejected. */
	if (val < -1 || val >= sched_domain_level_max)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		/* Only rebuild if this cpuset actually defines a sched domain. */
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			rebuild_sched_domains_locked();
	}

	return 0;
}
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1118
/*
 * cpuset_change_flag - make a task's spread flags the same as its cpuset's
 * @tsk: task to be updated
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cpuset_mutex at this point.
 */
static void cpuset_change_flag(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
}
  
  /*
   * update_tasks_flags - update the spread flags of tasks in the cpuset.
   * @cs: the cpuset in which each task's spread flags needs to be changed
   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
1139
   * Called with cpuset_mutex held
950592f7b   Miao Xie   cpusets: update t...
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
   *
   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
   * calling callback functions for each.
   *
   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
   * if @heap != NULL.
   */
  static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
  {
  	struct cgroup_scanner scan;
  
  	scan.cg = cs->css.cgroup;
  	scan.test_task = NULL;
  	scan.process_task = cpuset_change_flag;
  	scan.heap = heap;
  	cgroup_scan_tasks(&scan);
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1159
   * update_flag - read a 0 or a 1 in a file and update associated flag
786083667   Paul Menage   Cpuset hardwall f...
1160
1161
1162
   * bit:		the bit to update (see cpuset_flagbits_t)
   * cs:		the cpuset to update
   * turning_on: 	whether the flag is being set or cleared
053199edf   Paul Jackson   [PATCH] cpusets: ...
1163
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
1164
   * Call with cpuset_mutex held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1165
   */
700fe1ab9   Paul Menage   CGroup API files:...
1166
1167
  static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
  		       int turning_on)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1168
  {
645fcc9d2   Li Zefan   cpuset: don't all...
1169
  	struct cpuset *trialcs;
40b6a7623   Rakib Mullick   cpuset.c: remove ...
1170
  	int balance_flag_changed;
950592f7b   Miao Xie   cpusets: update t...
1171
1172
1173
  	int spread_flag_changed;
  	struct ptr_heap heap;
  	int err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1174

645fcc9d2   Li Zefan   cpuset: don't all...
1175
1176
1177
  	trialcs = alloc_trial_cpuset(cs);
  	if (!trialcs)
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1178
  	if (turning_on)
645fcc9d2   Li Zefan   cpuset: don't all...
1179
  		set_bit(bit, &trialcs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1180
  	else
645fcc9d2   Li Zefan   cpuset: don't all...
1181
  		clear_bit(bit, &trialcs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1182

645fcc9d2   Li Zefan   cpuset: don't all...
1183
  	err = validate_change(cs, trialcs);
85d7b9498   Dinakar Guniguntala   [PATCH] Dynamic s...
1184
  	if (err < 0)
645fcc9d2   Li Zefan   cpuset: don't all...
1185
  		goto out;
029190c51   Paul Jackson   cpuset sched_load...
1186

950592f7b   Miao Xie   cpusets: update t...
1187
1188
1189
  	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
  	if (err < 0)
  		goto out;
029190c51   Paul Jackson   cpuset sched_load...
1190
  	balance_flag_changed = (is_sched_load_balance(cs) !=
645fcc9d2   Li Zefan   cpuset: don't all...
1191
  				is_sched_load_balance(trialcs));
029190c51   Paul Jackson   cpuset sched_load...
1192

950592f7b   Miao Xie   cpusets: update t...
1193
1194
  	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
  			|| (is_spread_page(cs) != is_spread_page(trialcs)));
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1195
  	mutex_lock(&callback_mutex);
645fcc9d2   Li Zefan   cpuset: don't all...
1196
  	cs->flags = trialcs->flags;
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1197
  	mutex_unlock(&callback_mutex);
85d7b9498   Dinakar Guniguntala   [PATCH] Dynamic s...
1198

300ed6cbb   Li Zefan   cpuset: convert c...
1199
  	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
699140ba8   Tejun Heo   cpuset: drop asyn...
1200
  		rebuild_sched_domains_locked();
029190c51   Paul Jackson   cpuset sched_load...
1201

950592f7b   Miao Xie   cpusets: update t...
1202
1203
1204
  	if (spread_flag_changed)
  		update_tasks_flags(cs, &heap);
  	heap_free(&heap);
645fcc9d2   Li Zefan   cpuset: don't all...
1205
1206
1207
  out:
  	free_trial_cpuset(trialcs);
  	return err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1208
  }
053199edf   Paul Jackson   [PATCH] cpusets: ...
1209
  /*
80f7228b5   Adrian Bunk   typo fixes: occur...
1210
   * Frequency meter - How fast is some event occurring?
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
   *
   * These routines manage a digitally filtered, constant time based,
   * event frequency meter.  There are four routines:
   *   fmeter_init() - initialize a frequency meter.
   *   fmeter_markevent() - called each time the event happens.
   *   fmeter_getrate() - returns the recent rate of such events.
   *   fmeter_update() - internal routine used to update fmeter.
   *
   * A common data structure is passed to each of these routines,
   * which is used to keep track of the state required to manage the
   * frequency meter and its digital filter.
   *
   * The filter works on the number of events marked per unit time.
   * The filter is single-pole low-pass recursive (IIR).  The time unit
   * is 1 second.  Arithmetic is done using 32-bit integers scaled to
   * simulate 3 decimal digits of precision (multiplied by 1000).
   *
   * With an FM_COEF of 933, and a time base of 1 second, the filter
   * has a half-life of 10 seconds, meaning that if the events quit
   * happening, then the rate returned from the fmeter_getrate()
   * will be cut in half each 10 seconds, until it converges to zero.
   *
   * It is not worth doing a real infinitely recursive filter.  If more
   * than FM_MAXTICKS ticks have elapsed since the last filter event,
   * just compute FM_MAXTICKS ticks worth, by which point the level
   * will be stable.
   *
   * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
   * arithmetic overflow in the fmeter_update() routine.
   *
   * Given the simple 32 bit integer arithmetic used, this meter works
   * best for reporting rates between one per millisecond (msec) and
   * one per 32 (approx) seconds.  At constant rates faster than one
   * per msec it maxes out at values just under 1,000,000.  At constant
   * rates between one per msec, and one per second it will stabilize
   * to a value N*1000, where N is the rate of events per second.
   * At constant rates between one per second and one per 32 seconds,
   * it will be choppy, moving up on the seconds that have an event,
   * and then decaying until the next event.  At rates slower than
   * about one in 32 seconds, it decays all the way back to zero between
   * each event.
   */
  
  #define FM_COEF 933		/* coefficient for half-life of 10 secs */
  #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
  #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
  #define FM_SCALE 1000		/* faux fixed point scale */
  
/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;	/* unprocessed event count (scaled by FM_SCALE) */
	fmp->val = 0;	/* current filter output */
	fmp->time = 0;	/* seconds clock of last fmeter_update() */
	spin_lock_init(&fmp->lock);	/* guards all of the above */
}
  
  /* Internal meter update - process cnt events and update value */
  static void fmeter_update(struct fmeter *fmp)
  {
  	time_t now = get_seconds();
  	time_t ticks = now - fmp->time;
  
  	if (ticks == 0)
  		return;
  
  	ticks = min(FM_MAXTICKS, ticks);
  	while (ticks-- > 0)
  		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
  	fmp->time = now;
  
  	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
  	fmp->cnt = 0;
  }
  
/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	/* Saturate at FM_MAXCNT to avoid overflow in fmeter_update(). */
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}
  
  /* Process any previous ticks, then return current value. */
  static int fmeter_getrate(struct fmeter *fmp)
  {
  	int val;
  
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	val = fmp->val;
  	spin_unlock(&fmp->lock);
  	return val;
  }
5d21cc2db   Tejun Heo   cpuset: replace c...
1306
  /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
761b3ef50   Li Zefan   cgroup: remove cg...
1307
  static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
f780bdb7c   Ben Blum   cgroups: add per-...
1308
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
1309
  	struct cpuset *cs = cgroup_cs(cgrp);
bb9d97b6d   Tejun Heo   cgroup: don't use...
1310
1311
  	struct task_struct *task;
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1312

5d21cc2db   Tejun Heo   cpuset: replace c...
1313
1314
1315
  	mutex_lock(&cpuset_mutex);
  
  	ret = -ENOSPC;
300ed6cbb   Li Zefan   cpuset: convert c...
1316
  	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
5d21cc2db   Tejun Heo   cpuset: replace c...
1317
  		goto out_unlock;
9985b0bab   David Rientjes   sched: prevent bo...
1318

bb9d97b6d   Tejun Heo   cgroup: don't use...
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
  	cgroup_taskset_for_each(task, cgrp, tset) {
  		/*
  		 * Kthreads bound to specific cpus cannot be moved to a new
  		 * cpuset; we cannot change their cpu affinity and
  		 * isolating such threads by their set of allowed nodes is
  		 * unnecessary.  Thus, cpusets are not applicable for such
  		 * threads.  This prevents checking for success of
  		 * set_cpus_allowed_ptr() on all attached tasks before
  		 * cpus_allowed may be changed.
  		 */
5d21cc2db   Tejun Heo   cpuset: replace c...
1329
  		ret = -EINVAL;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1330
  		if (task->flags & PF_THREAD_BOUND)
5d21cc2db   Tejun Heo   cpuset: replace c...
1331
1332
1333
1334
  			goto out_unlock;
  		ret = security_task_setscheduler(task);
  		if (ret)
  			goto out_unlock;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1335
  	}
f780bdb7c   Ben Blum   cgroups: add per-...
1336

452477fa6   Tejun Heo   cpuset: pin down ...
1337
1338
1339
1340
1341
  	/*
  	 * Mark attach is in progress.  This makes validate_change() fail
  	 * changes which zero cpus/mems_allowed.
  	 */
  	cs->attach_in_progress++;
5d21cc2db   Tejun Heo   cpuset: replace c...
1342
1343
1344
1345
  	ret = 0;
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
  	return ret;
8793d854e   Paul Menage   Task Control Grou...
1346
  }
f780bdb7c   Ben Blum   cgroups: add per-...
1347

452477fa6   Tejun Heo   cpuset: pin down ...
1348
1349
1350
  static void cpuset_cancel_attach(struct cgroup *cgrp,
  				 struct cgroup_taskset *tset)
  {
5d21cc2db   Tejun Heo   cpuset: replace c...
1351
  	mutex_lock(&cpuset_mutex);
452477fa6   Tejun Heo   cpuset: pin down ...
1352
  	cgroup_cs(cgrp)->attach_in_progress--;
5d21cc2db   Tejun Heo   cpuset: replace c...
1353
  	mutex_unlock(&cpuset_mutex);
8793d854e   Paul Menage   Task Control Grou...
1354
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1355

4e4c9a140   Tejun Heo   cpuset: cleanup c...
1356
  /*
5d21cc2db   Tejun Heo   cpuset: replace c...
1357
   * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1358
1359
1360
1361
   * but we can't allocate it dynamically there.  Define it global and
   * allocate from cpuset_init().
   */
  static cpumask_var_t cpus_attach;
761b3ef50   Li Zefan   cgroup: remove cg...
1362
  static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
8793d854e   Paul Menage   Task Control Grou...
1363
  {
5d21cc2db   Tejun Heo   cpuset: replace c...
1364
  	/* static bufs protected by cpuset_mutex */
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1365
1366
  	static nodemask_t cpuset_attach_nodemask_from;
  	static nodemask_t cpuset_attach_nodemask_to;
8793d854e   Paul Menage   Task Control Grou...
1367
  	struct mm_struct *mm;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1368
1369
  	struct task_struct *task;
  	struct task_struct *leader = cgroup_taskset_first(tset);
2f7ee5691   Tejun Heo   cgroup: introduce...
1370
1371
1372
  	struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
  	struct cpuset *cs = cgroup_cs(cgrp);
  	struct cpuset *oldcs = cgroup_cs(oldcgrp);
22fb52dd7   David Quigley   [PATCH] SELinux: ...
1373

5d21cc2db   Tejun Heo   cpuset: replace c...
1374
  	mutex_lock(&cpuset_mutex);
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1375
1376
1377
1378
1379
1380
1381
  	/* prepare for attach */
  	if (cs == &top_cpuset)
  		cpumask_copy(cpus_attach, cpu_possible_mask);
  	else
  		guarantee_online_cpus(cs, cpus_attach);
  
  	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
bb9d97b6d   Tejun Heo   cgroup: don't use...
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
  	cgroup_taskset_for_each(task, cgrp, tset) {
  		/*
  		 * can_attach beforehand should guarantee that this doesn't
  		 * fail.  TODO: have a better way to handle failure here
  		 */
  		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
  
  		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
  		cpuset_update_task_spread_flag(cs, task);
  	}
22fb52dd7   David Quigley   [PATCH] SELinux: ...
1392

f780bdb7c   Ben Blum   cgroups: add per-...
1393
1394
1395
1396
1397
1398
  	/*
  	 * Change mm, possibly for multiple threads in a threadgroup. This is
  	 * expensive and may sleep.
  	 */
  	cpuset_attach_nodemask_from = oldcs->mems_allowed;
  	cpuset_attach_nodemask_to = cs->mems_allowed;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1399
  	mm = get_task_mm(leader);
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1400
  	if (mm) {
f780bdb7c   Ben Blum   cgroups: add per-...
1401
  		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
2741a559a   Paul Jackson   [PATCH] cpuset: u...
1402
  		if (is_memory_migrate(cs))
f780bdb7c   Ben Blum   cgroups: add per-...
1403
1404
  			cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
  					  &cpuset_attach_nodemask_to);
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1405
1406
  		mmput(mm);
  	}
452477fa6   Tejun Heo   cpuset: pin down ...
1407
1408
  
  	cs->attach_in_progress--;
02bb58637   Tejun Heo   cpuset: schedule ...
1409
1410
1411
1412
1413
1414
1415
1416
  
  	/*
  	 * We may have raced with CPU/memory hotunplug.  Trigger hotplug
  	 * propagation if @cs doesn't have any CPU or memory.  It will move
  	 * the newly added tasks to the nearest parent which can execute.
  	 */
  	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
  		schedule_cpuset_propagate_hotplug(cs);
5d21cc2db   Tejun Heo   cpuset: replace c...
1417
1418
  
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1419
1420
1421
1422
1423
  }
  
  /* The various types of files and directories in a cpuset file system */
  
  typedef enum {
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1424
  	FILE_MEMORY_MIGRATE,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1425
1426
1427
1428
  	FILE_CPULIST,
  	FILE_MEMLIST,
  	FILE_CPU_EXCLUSIVE,
  	FILE_MEM_EXCLUSIVE,
786083667   Paul Menage   Cpuset hardwall f...
1429
  	FILE_MEM_HARDWALL,
029190c51   Paul Jackson   cpuset sched_load...
1430
  	FILE_SCHED_LOAD_BALANCE,
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1431
  	FILE_SCHED_RELAX_DOMAIN_LEVEL,
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1432
1433
  	FILE_MEMORY_PRESSURE_ENABLED,
  	FILE_MEMORY_PRESSURE,
825a46af5   Paul Jackson   [PATCH] cpuset me...
1434
1435
  	FILE_SPREAD_PAGE,
  	FILE_SPREAD_SLAB,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1436
  } cpuset_filetype_t;
700fe1ab9   Paul Menage   CGroup API files:...
1437
1438
  static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
  {
700fe1ab9   Paul Menage   CGroup API files:...
1439
1440
  	struct cpuset *cs = cgroup_cs(cgrp);
  	cpuset_filetype_t type = cft->private;
5d21cc2db   Tejun Heo   cpuset: replace c...
1441
  	int retval = -ENODEV;
700fe1ab9   Paul Menage   CGroup API files:...
1442

5d21cc2db   Tejun Heo   cpuset: replace c...
1443
1444
1445
  	mutex_lock(&cpuset_mutex);
  	if (!is_cpuset_online(cs))
  		goto out_unlock;
700fe1ab9   Paul Menage   CGroup API files:...
1446
1447
  
  	switch (type) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1448
  	case FILE_CPU_EXCLUSIVE:
700fe1ab9   Paul Menage   CGroup API files:...
1449
  		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1450
1451
  		break;
  	case FILE_MEM_EXCLUSIVE:
700fe1ab9   Paul Menage   CGroup API files:...
1452
  		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1453
  		break;
786083667   Paul Menage   Cpuset hardwall f...
1454
1455
1456
  	case FILE_MEM_HARDWALL:
  		retval = update_flag(CS_MEM_HARDWALL, cs, val);
  		break;
029190c51   Paul Jackson   cpuset sched_load...
1457
  	case FILE_SCHED_LOAD_BALANCE:
700fe1ab9   Paul Menage   CGroup API files:...
1458
  		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1459
  		break;
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1460
  	case FILE_MEMORY_MIGRATE:
700fe1ab9   Paul Menage   CGroup API files:...
1461
  		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1462
  		break;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1463
  	case FILE_MEMORY_PRESSURE_ENABLED:
700fe1ab9   Paul Menage   CGroup API files:...
1464
  		cpuset_memory_pressure_enabled = !!val;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1465
1466
1467
1468
  		break;
  	case FILE_MEMORY_PRESSURE:
  		retval = -EACCES;
  		break;
825a46af5   Paul Jackson   [PATCH] cpuset me...
1469
  	case FILE_SPREAD_PAGE:
700fe1ab9   Paul Menage   CGroup API files:...
1470
  		retval = update_flag(CS_SPREAD_PAGE, cs, val);
825a46af5   Paul Jackson   [PATCH] cpuset me...
1471
1472
  		break;
  	case FILE_SPREAD_SLAB:
700fe1ab9   Paul Menage   CGroup API files:...
1473
  		retval = update_flag(CS_SPREAD_SLAB, cs, val);
825a46af5   Paul Jackson   [PATCH] cpuset me...
1474
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1475
1476
  	default:
  		retval = -EINVAL;
700fe1ab9   Paul Menage   CGroup API files:...
1477
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1478
  	}
5d21cc2db   Tejun Heo   cpuset: replace c...
1479
1480
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1481
1482
  	return retval;
  }
5be7a4792   Paul Menage   Fix cpuset sched_...
1483
1484
  static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
  {
5be7a4792   Paul Menage   Fix cpuset sched_...
1485
1486
  	struct cpuset *cs = cgroup_cs(cgrp);
  	cpuset_filetype_t type = cft->private;
5d21cc2db   Tejun Heo   cpuset: replace c...
1487
  	int retval = -ENODEV;
5be7a4792   Paul Menage   Fix cpuset sched_...
1488

5d21cc2db   Tejun Heo   cpuset: replace c...
1489
1490
1491
  	mutex_lock(&cpuset_mutex);
  	if (!is_cpuset_online(cs))
  		goto out_unlock;
e37123953   Paul Menage   cgroup files: rem...
1492

5be7a4792   Paul Menage   Fix cpuset sched_...
1493
1494
1495
1496
1497
1498
1499
1500
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		retval = update_relax_domain_level(cs, val);
  		break;
  	default:
  		retval = -EINVAL;
  		break;
  	}
5d21cc2db   Tejun Heo   cpuset: replace c...
1501
1502
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
5be7a4792   Paul Menage   Fix cpuset sched_...
1503
1504
  	return retval;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1505
  /*
e37123953   Paul Menage   cgroup files: rem...
1506
1507
1508
1509
1510
   * Common handling for a write to a "cpus" or "mems" file.
   */
  static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
  				const char *buf)
  {
645fcc9d2   Li Zefan   cpuset: don't all...
1511
1512
  	struct cpuset *cs = cgroup_cs(cgrp);
  	struct cpuset *trialcs;
5d21cc2db   Tejun Heo   cpuset: replace c...
1513
  	int retval = -ENODEV;
e37123953   Paul Menage   cgroup files: rem...
1514

3a5a6d0c2   Tejun Heo   cpuset: don't nes...
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
  	/*
  	 * CPU or memory hotunplug may leave @cs w/o any execution
  	 * resources, in which case the hotplug code asynchronously updates
  	 * configuration and transfers all tasks to the nearest ancestor
  	 * which can execute.
  	 *
  	 * As writes to "cpus" or "mems" may restore @cs's execution
  	 * resources, wait for the previously scheduled operations before
  	 * proceeding, so that we don't end up keep removing tasks added
  	 * after execution capability is restored.
02bb58637   Tejun Heo   cpuset: schedule ...
1525
1526
1527
1528
  	 *
  	 * Flushing cpuset_hotplug_work is enough to synchronize against
  	 * hotplug hanlding; however, cpuset_attach() may schedule
  	 * propagation work directly.  Flush the workqueue too.
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
1529
1530
  	 */
  	flush_work(&cpuset_hotplug_work);
02bb58637   Tejun Heo   cpuset: schedule ...
1531
  	flush_workqueue(cpuset_propagate_hotplug_wq);
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
1532

5d21cc2db   Tejun Heo   cpuset: replace c...
1533
1534
1535
  	mutex_lock(&cpuset_mutex);
  	if (!is_cpuset_online(cs))
  		goto out_unlock;
e37123953   Paul Menage   cgroup files: rem...
1536

645fcc9d2   Li Zefan   cpuset: don't all...
1537
  	trialcs = alloc_trial_cpuset(cs);
b75f38d65   Li Zefan   cpuset: add a mis...
1538
1539
  	if (!trialcs) {
  		retval = -ENOMEM;
5d21cc2db   Tejun Heo   cpuset: replace c...
1540
  		goto out_unlock;
b75f38d65   Li Zefan   cpuset: add a mis...
1541
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1542

e37123953   Paul Menage   cgroup files: rem...
1543
1544
  	switch (cft->private) {
  	case FILE_CPULIST:
645fcc9d2   Li Zefan   cpuset: don't all...
1545
  		retval = update_cpumask(cs, trialcs, buf);
e37123953   Paul Menage   cgroup files: rem...
1546
1547
  		break;
  	case FILE_MEMLIST:
645fcc9d2   Li Zefan   cpuset: don't all...
1548
  		retval = update_nodemask(cs, trialcs, buf);
e37123953   Paul Menage   cgroup files: rem...
1549
1550
1551
1552
1553
  		break;
  	default:
  		retval = -EINVAL;
  		break;
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1554
1555
  
  	free_trial_cpuset(trialcs);
5d21cc2db   Tejun Heo   cpuset: replace c...
1556
1557
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
e37123953   Paul Menage   cgroup files: rem...
1558
1559
1560
1561
  	return retval;
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
   * These ascii lists should be read in a single call, by using a user
   * buffer large enough to hold the entire map.  If read in smaller
   * chunks, there is no guarantee of atomicity.  Since the display format
   * used, list of ranges of sequential numbers, is variable length,
   * and since these maps can change value dynamically, one could read
   * gibberish by doing partial reads while a list was changing.
   * A single large read to a buffer that crosses a page boundary is
   * ok, because the result being copied to user land is not recomputed
   * across a page fault.
   */
9303e0c48   Li Zefan   cpuset: remove un...
1572
  static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1573
  {
9303e0c48   Li Zefan   cpuset: remove un...
1574
  	size_t count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1575

3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1576
  	mutex_lock(&callback_mutex);
9303e0c48   Li Zefan   cpuset: remove un...
1577
  	count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1578
  	mutex_unlock(&callback_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1579

9303e0c48   Li Zefan   cpuset: remove un...
1580
  	return count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1581
  }
9303e0c48   Li Zefan   cpuset: remove un...
1582
  static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1583
  {
9303e0c48   Li Zefan   cpuset: remove un...
1584
  	size_t count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1585

3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1586
  	mutex_lock(&callback_mutex);
9303e0c48   Li Zefan   cpuset: remove un...
1587
  	count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
1588
  	mutex_unlock(&callback_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1589

9303e0c48   Li Zefan   cpuset: remove un...
1590
  	return count;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1591
  }
8793d854e   Paul Menage   Task Control Grou...
1592
1593
1594
1595
1596
  static ssize_t cpuset_common_file_read(struct cgroup *cont,
  				       struct cftype *cft,
  				       struct file *file,
  				       char __user *buf,
  				       size_t nbytes, loff_t *ppos)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1597
  {
8793d854e   Paul Menage   Task Control Grou...
1598
  	struct cpuset *cs = cgroup_cs(cont);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1599
1600
1601
1602
  	cpuset_filetype_t type = cft->private;
  	char *page;
  	ssize_t retval = 0;
  	char *s;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1603

e12ba74d8   Mel Gorman   Group short-lived...
1604
  	if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
  		return -ENOMEM;
  
  	s = page;
  
  	switch (type) {
  	case FILE_CPULIST:
  		s += cpuset_sprintf_cpulist(s, cs);
  		break;
  	case FILE_MEMLIST:
  		s += cpuset_sprintf_memlist(s, cs);
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1616
1617
1618
1619
1620
1621
  	default:
  		retval = -EINVAL;
  		goto out;
  	}
  	*s++ = '
  ';
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1622

eacaa1f5a   Al Viro   [PATCH] cpuset cr...
1623
  	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1624
1625
1626
1627
  out:
  	free_page((unsigned long)page);
  	return retval;
  }
700fe1ab9   Paul Menage   CGroup API files:...
1628
1629
1630
1631
1632
1633
1634
1635
1636
  static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
  {
  	struct cpuset *cs = cgroup_cs(cont);
  	cpuset_filetype_t type = cft->private;
  	switch (type) {
  	case FILE_CPU_EXCLUSIVE:
  		return is_cpu_exclusive(cs);
  	case FILE_MEM_EXCLUSIVE:
  		return is_mem_exclusive(cs);
786083667   Paul Menage   Cpuset hardwall f...
1637
1638
  	case FILE_MEM_HARDWALL:
  		return is_mem_hardwall(cs);
700fe1ab9   Paul Menage   CGroup API files:...
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
  	case FILE_SCHED_LOAD_BALANCE:
  		return is_sched_load_balance(cs);
  	case FILE_MEMORY_MIGRATE:
  		return is_memory_migrate(cs);
  	case FILE_MEMORY_PRESSURE_ENABLED:
  		return cpuset_memory_pressure_enabled;
  	case FILE_MEMORY_PRESSURE:
  		return fmeter_getrate(&cs->fmeter);
  	case FILE_SPREAD_PAGE:
  		return is_spread_page(cs);
  	case FILE_SPREAD_SLAB:
  		return is_spread_slab(cs);
  	default:
  		BUG();
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
1654
1655
1656
  
  	/* Unreachable but makes gcc happy */
  	return 0;
700fe1ab9   Paul Menage   CGroup API files:...
1657
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1658

5be7a4792   Paul Menage   Fix cpuset sched_...
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
  static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
  {
  	struct cpuset *cs = cgroup_cs(cont);
  	cpuset_filetype_t type = cft->private;
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		return cs->relax_domain_level;
  	default:
  		BUG();
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
1669
1670
1671
  
  	/* Unrechable but makes gcc happy */
  	return 0;
5be7a4792   Paul Menage   Fix cpuset sched_...
1672
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1673
1674
1675
1676
  
  /*
   * for the common functions, 'private' gives the type of file
   */
addf2c739   Paul Menage   Cpuset hardwall f...
1677
1678
1679
1680
  static struct cftype files[] = {
  	{
  		.name = "cpus",
  		.read = cpuset_common_file_read,
e37123953   Paul Menage   cgroup files: rem...
1681
1682
  		.write_string = cpuset_write_resmask,
  		.max_write_len = (100U + 6 * NR_CPUS),
addf2c739   Paul Menage   Cpuset hardwall f...
1683
1684
1685
1686
1687
1688
  		.private = FILE_CPULIST,
  	},
  
  	{
  		.name = "mems",
  		.read = cpuset_common_file_read,
e37123953   Paul Menage   cgroup files: rem...
1689
1690
  		.write_string = cpuset_write_resmask,
  		.max_write_len = (100U + 6 * MAX_NUMNODES),
addf2c739   Paul Menage   Cpuset hardwall f...
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
  		.private = FILE_MEMLIST,
  	},
  
  	{
  		.name = "cpu_exclusive",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_CPU_EXCLUSIVE,
  	},
  
  	{
  		.name = "mem_exclusive",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEM_EXCLUSIVE,
  	},
  
  	{
786083667   Paul Menage   Cpuset hardwall f...
1709
1710
1711
1712
1713
1714
1715
  		.name = "mem_hardwall",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEM_HARDWALL,
  	},
  
  	{
addf2c739   Paul Menage   Cpuset hardwall f...
1716
1717
1718
1719
1720
1721
1722
1723
  		.name = "sched_load_balance",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SCHED_LOAD_BALANCE,
  	},
  
  	{
  		.name = "sched_relax_domain_level",
5be7a4792   Paul Menage   Fix cpuset sched_...
1724
1725
  		.read_s64 = cpuset_read_s64,
  		.write_s64 = cpuset_write_s64,
addf2c739   Paul Menage   Cpuset hardwall f...
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
  		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
  	},
  
  	{
  		.name = "memory_migrate",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_MIGRATE,
  	},
  
  	{
  		.name = "memory_pressure",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_PRESSURE,
099fca322   Li Zefan   cgroups: show cor...
1741
  		.mode = S_IRUGO,
addf2c739   Paul Menage   Cpuset hardwall f...
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
  	},
  
  	{
  		.name = "memory_spread_page",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SPREAD_PAGE,
  	},
  
  	{
  		.name = "memory_spread_slab",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SPREAD_SLAB,
  	},
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1757

4baf6e332   Tejun Heo   cgroup: convert a...
1758
1759
1760
1761
1762
1763
1764
  	{
  		.name = "memory_pressure_enabled",
  		.flags = CFTYPE_ONLY_ON_ROOT,
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_PRESSURE_ENABLED,
  	},
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1765

4baf6e332   Tejun Heo   cgroup: convert a...
1766
1767
  	{ }	/* terminate */
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1768
1769
  
/*
 *	cpuset_css_alloc - allocate a cpuset css
 *	cont:	control group that the new cpuset will be part of
 *
 *	Returns the css embedded in a freshly-allocated cpuset, or the
 *	statically-allocated top_cpuset's css for the root cgroup.
 *	Returns ERR_PTR(-ENOMEM) on allocation failure.
 */

static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
{
	struct cpuset *cs;

	/* the root cgroup maps onto the statically allocated top_cpuset */
	if (!cont->parent)
		return &top_cpuset.css;

	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);
	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	/* load balancing is on by default; masks start empty and are
	 * populated later in cpuset_css_online() */
	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	cpumask_clear(cs->cpus_allowed);
	nodes_clear(cs->mems_allowed);
	fmeter_init(&cs->fmeter);
	INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
	cs->relax_domain_level = -1;

	return &cs->css;
}
  
/*
 * cpuset_css_online - bring a new cpuset online under its parent
 * @cgrp: the cgroup being brought online
 *
 * Inherits the spread_page/spread_slab flags from the parent and, when
 * CGRP_CPUSET_CLONE_CHILDREN is set on @cgrp, also clones the parent's
 * cpus_allowed/mems_allowed.  Always returns 0.
 */
static int cpuset_css_online(struct cgroup *cgrp)
{
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *parent = parent_cs(cs);
	struct cpuset *tmp_cs;
	struct cgroup *pos_cg;

	/* top_cpuset has no parent and needs no online processing */
	if (!parent)
		return 0;

	mutex_lock(&cpuset_mutex);

	set_bit(CS_ONLINE, &cs->flags);
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);

	number_of_cpusets++;

	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
		goto out_unlock;

	/*
	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
	 * set.  This flag handling is implemented in cgroup core for
	 * historical reasons - the flag may be specified during mount.
	 *
	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
	 * refuse to clone the configuration - thereby refusing the task to
	 * be entered, and as a result refusing the sys_unshare() or
	 * clone() which initiated it.  If this becomes a problem for some
	 * users who wish to allow that scenario, then this could be
	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
	 * (and likewise for mems) to the new cgroup.
	 */
	rcu_read_lock();
	cpuset_for_each_child(tmp_cs, pos_cg, parent) {
		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
			rcu_read_unlock();
			goto out_unlock;
		}
	}
	rcu_read_unlock();

	/* callback_mutex guards readers of the masks; see top of file */
	mutex_lock(&callback_mutex);
	cs->mems_allowed = parent->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
	mutex_unlock(&callback_mutex);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return 0;
}
  
/*
 * cpuset_css_offline - take a cpuset offline prior to destruction
 * @cgrp: the cgroup going offline
 *
 * If the cpuset still has load balancing enabled, simulate turning it
 * off so that the scheduler domains are rebuilt without this cpuset.
 */
static void cpuset_css_offline(struct cgroup *cgrp)
{
	struct cpuset *cs = cgroup_cs(cgrp);

	mutex_lock(&cpuset_mutex);

	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	number_of_cpusets--;
	clear_bit(CS_ONLINE, &cs->flags);

	mutex_unlock(&cpuset_mutex);
}
029190c51   Paul Jackson   cpuset sched_load...
1863
/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked().
 *
 * NOTE(review): the comment above describes what cpuset_css_offline()
 * does; cpuset_css_free() below only releases the cpuset's memory.
 */

static void cpuset_css_free(struct cgroup *cont)
{
	struct cpuset *cs = cgroup_cs(cont);

	free_cpumask_var(cs->cpus_allowed);
	kfree(cs);
}
8793d854e   Paul Menage   Task Control Grou...
1875
1876
/* cgroup subsystem descriptor tying the cpuset callbacks into cgroup core */
struct cgroup_subsys cpuset_subsys = {
	.name = "cpuset",
	.css_alloc = cpuset_css_alloc,
	.css_online = cpuset_css_online,
	.css_offline = cpuset_css_offline,
	.css_free = cpuset_css_free,
	.can_attach = cpuset_can_attach,
	.cancel_attach = cpuset_cancel_attach,
	.attach = cpuset_attach,
	.subsys_id = cpuset_subsys_id,
	.base_cftypes = files,
	.early_init = 1,	/* cpusets must be usable very early in boot */
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1888
1889
1890
1891
1892
1893
1894
1895
/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system.
 * Returns 0 on success, or the error from register_filesystem().
 **/

int __init cpuset_init(void)
{
	int err = 0;

	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
		BUG();
	/* the root cpuset starts out allowing all cpus and all nodes */
	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	/* scratch mask used by the attach path */
	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
		BUG();
	number_of_cpusets = 1;	/* only top_cpuset exists so far */
	return 0;
}
956db3ca0   Cliff Wickman   hotplug cpu: move...
1915
1916
1917
1918
1919
1920
1921
1922
/**
 * cpuset_do_move_task - move a given task to another cpuset
 * @tsk: pointer to task_struct the task to move
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.  The
 * destination cgroup is carried in @scan->data.
 */
static void cpuset_do_move_task(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	struct cgroup *new_cgroup = scan->data;

	/* cgroup_attach_task() must run under the cgroup mutex */
	cgroup_lock();
	cgroup_attach_task(new_cgroup, tsk);
	cgroup_unlock();
}
  
/**
 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 *
 * Called with cpuset_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 */
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
	struct cgroup_scanner scan;

	scan.cg = from->css.cgroup;
	scan.test_task = NULL; /* select all tasks in cgroup */
	scan.process_task = cpuset_do_move_task;
	scan.heap = NULL;	/* no priority heap; plain iteration */
	scan.data = to->css.cgroup;

	/* failure here is only reported, not propagated to the caller */
	if (cgroup_scan_tasks(&scan))
		printk(KERN_ERR "move_member_tasks_to_cpuset: "
				"cgroup_scan_tasks failed\n");
}
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1959
/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = parent_cs(cs);
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent_cs(parent);

	move_member_tasks_to_cpuset(cs, parent);
}
deb7aa308   Tejun Heo   cpuset: reorganiz...
1980
/**
 * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
 * @work: embedded hotplug_work of the cpuset in interest
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 *
 * The static scratch masks are safe because this runs on the ordered
 * (single-threaded) cpuset_propagate_hotplug_wq workqueue.
 */
static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t off_cpus;
	static nodemask_t off_mems, tmp_mems;
	struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
	bool is_empty;

	mutex_lock(&cpuset_mutex);

	/* compute what went offline relative to top_cpuset */
	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);

	/* remove offline cpus from @cs */
	if (!cpumask_empty(&off_cpus)) {
		mutex_lock(&callback_mutex);
		cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
		mutex_unlock(&callback_mutex);
		update_tasks_cpumask(cs, NULL);
	}

	/* remove offline mems from @cs */
	if (!nodes_empty(off_mems)) {
		tmp_mems = cs->mems_allowed;
		mutex_lock(&callback_mutex);
		nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
		mutex_unlock(&callback_mutex);
		update_tasks_nodemask(cs, &tmp_mems, NULL);
	}

	is_empty = cpumask_empty(cs->cpus_allowed) ||
		nodes_empty(cs->mems_allowed);

	mutex_unlock(&cpuset_mutex);

	/*
	 * If @cs became empty, move tasks to the nearest ancestor with
	 * execution resources.  This is full cgroup operation which will
	 * also call back into cpuset.  Should be done outside any lock.
	 */
	if (is_empty)
		remove_tasks_in_empty_cpuset(cs);

	/* the following may free @cs, should be the last operation */
	css_put(&cs->css);
}
8d0339487   Tejun Heo   cpuset: make CPU ...
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
/**
 * schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
 * @cs: cpuset of interest
 *
 * Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
 * memory masks according to top_cpuset.
 */
static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
{
	/*
	 * Pin @cs.  The refcnt will be released when the work item
	 * finishes executing.
	 */
	if (!css_tryget(&cs->css))
		return;

	/*
	 * Queue @cs->hotplug_work.  If already pending, lose the css ref.
	 * cpuset_propagate_hotplug_wq is ordered and propagation will
	 * happen in the order this function is called.
	 */
	if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
		css_put(&cs->css);
}
deb7aa308   Tejun Heo   cpuset: reorganiz...
2056
/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 * @work: the scheduled cpuset_hotplug_work item
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
 * descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 *
 * The static scratch masks are safe since only one instance of this work
 * item runs at a time (it is scheduled via schedule_work()).
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus, tmp_cpus;
	static nodemask_t new_mems, tmp_mems;
	bool cpus_updated, mems_updated;
	bool cpus_offlined, mems_offlined;

	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	/* cpumask_andnot() returns true iff the result is non-empty */
	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
	cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
				       &new_cpus);

	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
	nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
	mems_offlined = !nodes_empty(tmp_mems);

	/* synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		mutex_lock(&callback_mutex);
		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		mutex_unlock(&callback_mutex);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		tmp_mems = top_cpuset.mems_allowed;
		mutex_lock(&callback_mutex);
		top_cpuset.mems_allowed = new_mems;
		mutex_unlock(&callback_mutex);
		update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
	}

	/* if cpus or mems went down, we need to propagate to descendants */
	if (cpus_offlined || mems_offlined) {
		struct cpuset *cs;
		struct cgroup *pos_cgrp;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
			schedule_cpuset_propagate_hotplug(cs);
		rcu_read_unlock();
	}

	mutex_unlock(&cpuset_mutex);

	/* wait for propagations to finish */
	flush_workqueue(cpuset_propagate_hotplug_wq);

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated) {
		struct sched_domain_attr *attr;
		cpumask_var_t *doms;
		int ndoms;

		mutex_lock(&cpuset_mutex);
		ndoms = generate_sched_domains(&doms, &attr);
		mutex_unlock(&cpuset_mutex);

		partition_sched_domains(ndoms, doms, attr);
	}
}
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2137
/*
 * cpuset_update_active_cpus - react to a change in cpu_active_mask
 * @cpu_online: true on online, false on offline (currently unused here;
 *		the work item recomputes everything from scratch)
 */
void cpuset_update_active_cpus(bool cpu_online)
{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 *
	 * We still need to do partition_sched_domains() synchronously;
	 * otherwise, the scheduler will get confused and put tasks to the
	 * dead CPU.  Fall back to the default single domain.
	 * cpuset_hotplug_workfn() will rebuild it as necessary.
	 */
	partition_sched_domains(1, NULL, NULL);
	schedule_work(&cpuset_hotplug_work);
}
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2152

38837fc75   Paul Jackson   [PATCH] cpuset: t...
2153
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 *
 * Memory-hotplug notifier callback; @action and @arg are unused since
 * the work item recomputes state from node_states[N_MEMORY] itself.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2164
2165
2166
2167
2168
  
/* notifier registered in cpuset_init_smp() for memory hotplug events */
static struct notifier_block cpuset_track_online_nodes_nb = {
	.notifier_call = cpuset_track_online_nodes,
	.priority = 10,		/* ??! */
};
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2169

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2170
2171
2172
2173
/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */

void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_MEMORY];

	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);

	/* ordered workqueue so hotplug propagation preserves queue order */
	cpuset_propagate_hotplug_wq =
		alloc_ordered_workqueue("cpuset_hotplug", 0);
	BUG_ON(!cpuset_propagate_hotplug_wq);
}
  
/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * tasks cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	/* callback_mutex + task_lock stabilize the task's cpuset binding */
	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);
}
2baab4e90   Peter Zijlstra   sched: Fix select...
2205
/*
 * cpuset_cpus_allowed_fallback - lockless fallback affinity from cpuset
 * @tsk: task whose cpus_allowed is reset from its cpuset's mask
 *
 * Used by the scheduler's fallback path; reads the cpuset mask under
 * RCU only, accepting the race documented below.
 */
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
	const struct cpuset *cs;

	rcu_read_lock();
	cs = task_cs(tsk);
	if (cs)
		do_set_cpus_allowed(tsk, cs->cpus_allowed);
	rcu_read_unlock();

	/*
	 * We own tsk->cpus_allowed, nobody can change it under us.
	 *
	 * But we used cs && cs->cpus_allowed lockless and thus can
	 * race with cgroup_attach_task() or update_cpumask() and get
	 * the wrong tsk->cpus_allowed. However, both cases imply the
	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
	 * which takes task_rq_lock().
	 *
	 * If we are called after it dropped the lock we must see all
	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
	 * set any mask even if it is not right from task_cs() pov,
	 * the pending set_cpus_allowed_ptr() will fix things.
	 *
	 * select_fallback_rq() will fix things up and set cpu_possible_mask
	 * if required.
	 */
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2233
2234
/* Allow the current task to allocate on any memory node. */
void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2237
/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/

nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;

	/* callback_mutex + task_lock stabilize the task's cpuset binding */
	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_mems(task_cs(tsk), &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}
  
/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2269
/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
	/* walk up until a hardwalled cpuset or the root is reached */
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2281
/**
 * cpuset_node_allowed_softwall - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
 * set, yes, we can always allocate.  If node is in our task's mems_allowed,
 * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
 * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
 * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
 * flag, yes.
 * Otherwise, no.
 *
 * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
 * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
 * might sleep, and might allow a node from an enclosing cpuset.
 *
 * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
 * cpusets, and never sleeps.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed and is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_mutex
 * mutex.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	TIF_MEMDIE   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
 *
 * Rule:
 *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
 *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
 *    the code that might scan up ancestor cpusets and sleep.
 */
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
	const struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;			/* is allocation on @node allowed? */

	/* Interrupt context or forced-node placement: always say yes */
	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	/* We may scan up ancestors below, which can sleep */
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	mutex_lock(&callback_mutex);

	/* task_lock() keeps current's cpuset stable while we walk up from it */
	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2372
  /*
a1bc5a4ee   David Rientjes   cpusets: replace ...
2373
2374
   * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
   * @node: is this an allowed node?
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2375
2376
   * @gfp_mask: memory allocation flags
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2377
2378
2379
2380
2381
   * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
   * set, yes, we can always allocate.  If node is in our task's mems_allowed,
   * yes.  If the task has been OOM killed and has access to memory reserves as
   * specified by the TIF_MEMDIE flag, yes.
   * Otherwise, no.
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2382
2383
2384
2385
2386
2387
2388
   *
   * The __GFP_THISNODE placement logic is really handled elsewhere,
   * by forcibly using a zonelist starting at a specified node, and by
   * (in get_page_from_freelist()) refusing to consider the zones for
   * any node on the zonelist except the first.  By the time any such
   * calls get to this routine, we should just shut up and say 'yes'.
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2389
2390
   * Unlike the cpuset_node_allowed_softwall() variant, above,
   * this variant requires that the node be in the current task's
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2391
2392
2393
2394
   * mems_allowed or that we're in interrupt.  It does not scan up the
   * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
   * It never sleeps.
   */
a1bc5a4ee   David Rientjes   cpusets: replace ...
2395
  int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2396
  {
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2397
2398
  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
  		return 1;
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2399
2400
  	if (node_isset(node, current->mems_allowed))
  		return 1;
dedf8b79e   Daniel Walker   whitespace fixes:...
2401
2402
2403
2404
2405
2406
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return 1;
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2407
2408
  	return 0;
  }
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2409
  /**
6adef3ebe   Jack Steiner   cpusets: new roun...
2410
2411
   * cpuset_mem_spread_node() - On which node to begin search for a file page
   * cpuset_slab_spread_node() - On which node to begin search for a slab page
825a46af5   Paul Jackson   [PATCH] cpuset me...
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
   *
   * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
   * tasks in a cpuset with is_spread_page or is_spread_slab set),
   * and if the memory allocation used cpuset_mem_spread_node()
   * to determine on which node to start looking, as it will for
   * certain page cache or slab cache pages such as used for file
   * system buffers and inode caches, then instead of starting on the
   * local node to look for a free page, rather spread the starting
   * node around the tasks mems_allowed nodes.
   *
   * We don't have to worry about the returned node being offline
   * because "it can't happen", and even if it did, it would be ok.
   *
   * The routines calling guarantee_online_mems() are careful to
   * only set nodes in task->mems_allowed that are online.  So it
   * should not be possible for the following code to return an
   * offline node.  But if it did, that would be ok, as this routine
   * is not returning the node where the allocation must be, only
   * the node where the search should start.  The zonelist passed to
   * __alloc_pages() will include all nodes.  If the slab allocator
   * is passed an offline node, it will fall back to the local node.
   * See kmem_cache_alloc_node().
   */
6adef3ebe   Jack Steiner   cpusets: new roun...
2435
  static int cpuset_spread_node(int *rotor)
825a46af5   Paul Jackson   [PATCH] cpuset me...
2436
2437
  {
  	int node;
6adef3ebe   Jack Steiner   cpusets: new roun...
2438
  	node = next_node(*rotor, current->mems_allowed);
825a46af5   Paul Jackson   [PATCH] cpuset me...
2439
2440
  	if (node == MAX_NUMNODES)
  		node = first_node(current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2441
  	*rotor = node;
825a46af5   Paul Jackson   [PATCH] cpuset me...
2442
2443
  	return node;
  }
6adef3ebe   Jack Steiner   cpusets: new roun...
2444
2445
2446
  
  int cpuset_mem_spread_node(void)
  {
778d3b0ff   Michal Hocko   cpusets: randomiz...
2447
2448
2449
  	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
  		current->cpuset_mem_spread_rotor =
  			node_random(&current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2450
2451
2452
2453
2454
  	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
  }
  
  int cpuset_slab_spread_node(void)
  {
778d3b0ff   Michal Hocko   cpusets: randomiz...
2455
2456
2457
  	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
  		current->cpuset_slab_spread_rotor =
  			node_random(&current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2458
2459
  	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
  }
825a46af5   Paul Jackson   [PATCH] cpuset me...
2460
2461
2462
  EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
  
/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2.  Used by the OOM killer to determine if
 * one task's memory usage might impact the memory available
 * to the other.
 **/
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{
	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
75aa19941   David Rientjes   oom: print trigge...
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
/**
 * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
 * @tsk: pointer to task_struct of some task.
 *
 * Description: Prints @tsk's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.  Must hold task_lock(tsk) to allow
 * dereferencing task_cs(tsk).
 */
void cpuset_print_task_mems_allowed(struct task_struct *tsk)
{
	struct dentry *dentry;

	dentry = task_cs(tsk)->css.cgroup->dentry;
	/* cpuset_buffer_lock serializes use of the static name/nodelist buffers */
	spin_lock(&cpuset_buffer_lock);

	if (!dentry) {
		/* No dentry: the task is in the root cpuset */
		strcpy(cpuset_name, "/");
	} else {
		/* d_lock keeps d_name stable while we copy it out */
		spin_lock(&dentry->d_lock);
		strlcpy(cpuset_name, (const char *)dentry->d_name.name,
			CPUSET_NAME_LEN);
		spin_unlock(&dentry->d_lock);
	}

	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
			   tsk->mems_allowed);
	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
	       tsk->comm, cpuset_name, cpuset_nodelist);
	spin_unlock(&cpuset_buffer_lock);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2507
/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
int cpuset_memory_pressure_enabled __read_mostly;

/**
 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernels page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 **/

void __cpuset_memory_pressure_bump(void)
{
	/* task_lock() keeps current's cpuset (task_cs) stable while we mark it */
	task_lock(current);
	fmeter_markevent(&task_cs(current)->fmeter);
	task_unlock(current);
}
8793d854e   Paul Menage   Task Control Grou...
2538
  #ifdef CONFIG_PROC_PID_CPUSET
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2539
/*
 * proc_cpuset_show()
 *  - Print tasks cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;			/* scratch buffer for the cgroup path */
	struct cgroup_subsys_state *css;
	int retval;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;		/* target pid stashed at open time */
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	/* RCU protects the css lookup against concurrent cgroup changes */
	rcu_read_lock();
	css = task_subsys_state(tsk, cpuset_subsys_id);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	rcu_read_unlock();
	if (retval < 0)
		goto out_put_task;
	seq_puts(m, buf);
	seq_putc(m, '\n');
out_put_task:
	put_task_struct(tsk);		/* drop ref taken by get_pid_task() */
out_free:
	kfree(buf);
out:
	return retval;
}
  
  static int cpuset_open(struct inode *inode, struct file *file)
  {
13b41b094   Eric W. Biederman   [PATCH] proc: Use...
2586
2587
  	struct pid *pid = PROC_I(inode)->pid;
  	return single_open(file, proc_cpuset_show, pid);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2588
  }
9a32144e9   Arjan van de Ven   [PATCH] mark stru...
2589
/* File operations for /proc/<pid>/cpuset (single-record seq_file). */
const struct file_operations proc_cpuset_operations = {
	.open		= cpuset_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
8793d854e   Paul Menage   Task Control Grou...
2595
  #endif /* CONFIG_PROC_PID_CPUSET */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2596

d01d48278   Heiko Carstens   sched: Always sho...
2597
  /* Display task mems_allowed in /proc/<pid>/status file. */
df5f8314c   Eric W. Biederman   proc: seqfile con...
2598
2599
  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
  {
df5f8314c   Eric W. Biederman   proc: seqfile con...
2600
  	seq_printf(m, "Mems_allowed:\t");
30e8e1360   Lai Jiangshan   cpuset: use seq_*...
2601
  	seq_nodemask(m, &task->mems_allowed);
df5f8314c   Eric W. Biederman   proc: seqfile con...
2602
2603
  	seq_printf(m, "
  ");
39106dcf8   Mike Travis   cpumask: use new ...
2604
  	seq_printf(m, "Mems_allowed_list:\t");
30e8e1360   Lai Jiangshan   cpuset: use seq_*...
2605
  	seq_nodemask_list(m, &task->mems_allowed);
39106dcf8   Mike Travis   cpumask: use new ...
2606
2607
  	seq_printf(m, "
  ");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2608
  }