  /*
   *  kernel/cpuset.c
   *
   *  Processor and Memory placement constraints for sets of tasks.
   *
   *  Copyright (C) 2003 BULL SA.
   *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
   *  Copyright (C) 2006 Google, Inc
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
   *
   *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
   *  2004 May-July Rework by Paul Jackson.
   *  2006 Rework by Paul Menage to use generic cgroups
   *  2008 Rework of the scheduler domains and CPU hotplug handling
   *       by Max Krasnyansky
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/cpuset.h>
  #include <linux/err.h>
  #include <linux/errno.h>
  #include <linux/file.h>
  #include <linux/fs.h>
  #include <linux/init.h>
  #include <linux/interrupt.h>
  #include <linux/kernel.h>
  #include <linux/kmod.h>
  #include <linux/list.h>
  #include <linux/mempolicy.h>
  #include <linux/mm.h>
  #include <linux/memory.h>
  #include <linux/module.h>
  #include <linux/mount.h>
  #include <linux/namei.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
  #include <linux/seq_file.h>
  #include <linux/security.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/time.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
  
  #include <asm/uaccess.h>
  #include <asm/atomic.h>
  #include <linux/mutex.h>
  #include <linux/workqueue.h>
  #include <linux/cgroup.h>

  /*
   * Workqueue for cpuset related tasks.
   *
   * Using kevent workqueue may cause deadlock when memory_migrate
   * is set. So we create a separate workqueue thread for cpuset.
   */
  static struct workqueue_struct *cpuset_wq;
  
  /*
   * Tracks how many cpusets are currently defined in the system.
   * When there is only one cpuset (the root cpuset) we can
   * short circuit some hooks.
   */
  int number_of_cpusets __read_mostly;

  /* Forward declare cgroup structures */
  struct cgroup_subsys cpuset_subsys;
  struct cpuset;
  /* See "Frequency meter" comments, below. */
  
  struct fmeter {
  	int cnt;		/* unprocessed events count */
  	int val;		/* most recent output value */
  	time_t time;		/* clock (secs) when val computed */
  	spinlock_t lock;	/* guards read or write of above */
  };
  struct cpuset {
  	struct cgroup_subsys_state css;
  	unsigned long flags;		/* "unsigned long" so bitops work */
  	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
  	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
  	struct cpuset *parent;		/* my parent */

  	struct fmeter fmeter;		/* memory_pressure filter */
  
  	/* partition number for rebuild_sched_domains() */
  	int pn;

  	/* for custom sched domain */
  	int relax_domain_level;
  	/* used for walking a cpuset hierarchy */
  	struct list_head stack_list;
  };
  /* Retrieve the cpuset for a cgroup */
  static inline struct cpuset *cgroup_cs(struct cgroup *cont)
  {
  	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
  			    struct cpuset, css);
  }
  
  /* Retrieve the cpuset for a task */
  static inline struct cpuset *task_cs(struct task_struct *task)
  {
  	return container_of(task_subsys_state(task, cpuset_subsys_id),
  			    struct cpuset, css);
  }

  /* bits in struct cpuset flags field */
  typedef enum {
  	CS_CPU_EXCLUSIVE,
  	CS_MEM_EXCLUSIVE,
  	CS_MEM_HARDWALL,
  	CS_MEMORY_MIGRATE,
  	CS_SCHED_LOAD_BALANCE,
  	CS_SPREAD_PAGE,
  	CS_SPREAD_SLAB,
  } cpuset_flagbits_t;
  
  /* convenient tests for these bits */
  static inline int is_cpu_exclusive(const struct cpuset *cs)
  {
  	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
  }
  
  static inline int is_mem_exclusive(const struct cpuset *cs)
  {
  	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
  }
  static inline int is_mem_hardwall(const struct cpuset *cs)
  {
  	return test_bit(CS_MEM_HARDWALL, &cs->flags);
  }
  static inline int is_sched_load_balance(const struct cpuset *cs)
  {
  	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
  }
  static inline int is_memory_migrate(const struct cpuset *cs)
  {
  	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
  }
  static inline int is_spread_page(const struct cpuset *cs)
  {
  	return test_bit(CS_SPREAD_PAGE, &cs->flags);
  }
  
  static inline int is_spread_slab(const struct cpuset *cs)
  {
  	return test_bit(CS_SPREAD_SLAB, &cs->flags);
  }
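  
  /*
   * The root cpuset, statically allocated, covering all CPUs and
   * memory nodes in the system; every other cpuset is a descendant.
   */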
  static struct cpuset top_cpuset = {
  	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
  };
  /*
   * There are two global mutexes guarding cpuset structures.  The first
   * is the main control groups cgroup_mutex, accessed via
   * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
   * callback_mutex, below. They can nest.  It is ok to first take
   * cgroup_mutex, then nest callback_mutex.  We also require taking
   * task_lock() when dereferencing a task's cpuset pointer.  See "The
   * task_lock() exception", at the end of this comment.
   *
   * A task must hold both mutexes to modify cpusets.  If a task
   * holds cgroup_mutex, then it blocks others wanting that mutex,
   * ensuring that it is the only task able to also acquire callback_mutex
   * and be able to modify cpusets.  It can perform various checks on
   * the cpuset structure first, knowing nothing will change.  It can
   * also allocate memory while just holding cgroup_mutex.  While it is
   * performing these checks, various callback routines can briefly
   * acquire callback_mutex to query cpusets.  Once it is ready to make
   * the changes, it takes callback_mutex, blocking everyone else.
   *
   * Calls to the kernel memory allocator can not be made while holding
   * callback_mutex, as that would risk double tripping on callback_mutex
   * from one of the callbacks into the cpuset code from within
   * __alloc_pages().
   *
   * If a task is only holding callback_mutex, then it has read-only
   * access to cpusets.
   *
   * Now, the task_struct fields mems_allowed and mempolicy may be changed
   * by another task, so we use the alloc_lock in task_struct to protect
   * them.
   *
   * The cpuset_common_file_read() handlers only hold callback_mutex across
   * small pieces of code, such as when reading out possibly multi-word
   * cpumasks and nodemasks.
   *
   * Accessing a task's cpuset should be done in accordance with the
   * guidelines for accessing subsystem state in kernel/cgroup.c
   */
  static DEFINE_MUTEX(callback_mutex);

  /*
   * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist
   * buffers.  They are statically allocated to prevent using excess stack
   * when calling cpuset_print_task_mems_allowed().
   */
  #define CPUSET_NAME_LEN		(128)
  #define	CPUSET_NODELIST_LEN	(256)
  static char cpuset_name[CPUSET_NAME_LEN];
  static char cpuset_nodelist[CPUSET_NODELIST_LEN];
  static DEFINE_SPINLOCK(cpuset_buffer_lock);
  
  /*
   * This is ugly, but preserves the userspace API for existing cpuset
   * users. If someone tries to mount the "cpuset" filesystem, we
   * silently switch it to mount "cgroup" instead
   */
  static int cpuset_get_sb(struct file_system_type *fs_type,
  			 int flags, const char *unused_dev_name,
  			 void *data, struct vfsmount *mnt)
  {
  	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
  	int ret = -ENODEV;
  	if (cgroup_fs) {
  		char mountopts[] =
  			"cpuset,noprefix,"
  			"release_agent=/sbin/cpuset_release_agent";
  		ret = cgroup_fs->get_sb(cgroup_fs, flags,
  					   unused_dev_name, mountopts, mnt);
  		put_filesystem(cgroup_fs);
  	}
  	return ret;
  }
  
  static struct file_system_type cpuset_fs_type = {
  	.name = "cpuset",
  	.get_sb = cpuset_get_sb,
  };
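  
  /*
   * Net effect: "mount -t cpuset" behaves like "mount -t cgroup" with
   * the options "cpuset,noprefix,release_agent=/sbin/cpuset_release_agent".
   */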
  /*
   * Return in pmask the portion of a cpuset's cpus_allowed that
   * are online.  If none are online, walk up the cpuset hierarchy
   * until we find one that does have some online cpus.  If we get
   * all the way to the top and still haven't found any online cpus,
   * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
   * task, return cpu_online_map.
   *
   * One way or another, we guarantee to return some non-empty subset
   * of cpu_online_map.
   *
   * Call with callback_mutex held.
   */
  static void guarantee_online_cpus(const struct cpuset *cs,
  				  struct cpumask *pmask)
  {
  	while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
  		cs = cs->parent;
  	if (cs)
  		cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
  	else
  		cpumask_copy(pmask, cpu_online_mask);
  	BUG_ON(!cpumask_intersects(pmask, cpu_online_mask));
  }
  
  /*
   * Return in *pmask the portion of a cpuset's mems_allowed that
   * are online, with memory.  If none are online with memory, walk
   * up the cpuset hierarchy until we find one that does have some
   * online mems.  If we get all the way to the top and still haven't
   * found any online mems, return node_states[N_HIGH_MEMORY].
   *
   * One way or another, we guarantee to return some non-empty subset
   * of node_states[N_HIGH_MEMORY].
   *
   * Call with callback_mutex held.
   */
  
  static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  {
  	while (cs && !nodes_intersects(cs->mems_allowed,
  					node_states[N_HIGH_MEMORY]))
  		cs = cs->parent;
  	if (cs)
  		nodes_and(*pmask, cs->mems_allowed,
  					node_states[N_HIGH_MEMORY]);
  	else
  		*pmask = node_states[N_HIGH_MEMORY];
  	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
  }
  /*
   * update task's spread flag if cpuset's page/slab spread flag is set
   *
   * Called with callback_mutex/cgroup_mutex held
   */
  static void cpuset_update_task_spread_flag(struct cpuset *cs,
  					struct task_struct *tsk)
  {
  	if (is_spread_page(cs))
  		tsk->flags |= PF_SPREAD_PAGE;
  	else
  		tsk->flags &= ~PF_SPREAD_PAGE;
  	if (is_spread_slab(cs))
  		tsk->flags |= PF_SPREAD_SLAB;
  	else
  		tsk->flags &= ~PF_SPREAD_SLAB;
  }
  /*
   * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
   *
   * One cpuset is a subset of another if all its allowed CPUs and
   * Memory Nodes are a subset of the other, and its exclusive flags
   * are only set if the other's are set.  Call holding cgroup_mutex.
   */
  
  static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  {
  	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
  		nodes_subset(p->mems_allowed, q->mems_allowed) &&
  		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
  		is_mem_exclusive(p) <= is_mem_exclusive(q);
  }
  /**
   * alloc_trial_cpuset - allocate a trial cpuset
   * @cs: the cpuset that the trial cpuset duplicates
   */
  static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs)
  {
  	struct cpuset *trial;
  
  	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
  	if (!trial)
  		return NULL;
  
  	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
  		kfree(trial);
  		return NULL;
  	}
  	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
  
  	return trial;
  }
  
  /**
   * free_trial_cpuset - free the trial cpuset
   * @trial: the trial cpuset to be freed
   */
  static void free_trial_cpuset(struct cpuset *trial)
  {
  	free_cpumask_var(trial->cpus_allowed);
  	kfree(trial);
  }
  /*
   * validate_change() - Used to validate that any proposed cpuset change
   *		       follows the structural rules for cpusets.
   *
   * If we replaced the flag and mask values of the current cpuset
   * (cur) with those values in the trial cpuset (trial), would
   * our various subset and exclusive rules still be valid?  Presumes
   * cgroup_mutex held.
   *
   * 'cur' is the address of an actual, in-use cpuset.  Operations
   * such as list traversal that depend on the actual address of the
   * cpuset in the list must use cur below, not trial.
   *
   * 'trial' is the address of bulk structure copy of cur, with
   * perhaps one or more of the fields cpus_allowed, mems_allowed,
   * or flags changed to new, trial values.
   *
   * Return 0 if valid, -errno if not.
   */
  
  static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  {
  	struct cgroup *cont;
  	struct cpuset *c, *par;
  
  	/* Each of our child cpusets must be a subset of us */
  	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
  		if (!is_cpuset_subset(cgroup_cs(cont), trial))
  			return -EBUSY;
  	}
  
  	/* Remaining checks don't apply to root cpuset */
  	if (cur == &top_cpuset)
  		return 0;
  	par = cur->parent;
  	/* We must be a subset of our parent cpuset */
  	if (!is_cpuset_subset(trial, par))
  		return -EACCES;
  	/*
  	 * If either I or some sibling (!= me) is exclusive, we can't
  	 * overlap
  	 */
  	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
  		c = cgroup_cs(cont);
  		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
  		    c != cur &&
  		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
  			return -EINVAL;
  		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
  		    c != cur &&
  		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
  			return -EINVAL;
  	}
  	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
  	if (cgroup_task_count(cur->css.cgroup)) {
  		if (cpumask_empty(trial->cpus_allowed) ||
  		    nodes_empty(trial->mems_allowed)) {
  			return -ENOSPC;
  		}
  	}
  	return 0;
  }
  #ifdef CONFIG_SMP
  /*
   * Helper routine for generate_sched_domains().
   * Do cpusets a, b have overlapping cpus_allowed masks?
   */
  static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
  {
  	return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
  }
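  
  /*
   * Fold cpuset c's relax_domain_level into *dattr, keeping the largest
   * (most relaxed) level seen so far.
   */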
  static void
  update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  {
  	if (dattr->relax_domain_level < c->relax_domain_level)
  		dattr->relax_domain_level = c->relax_domain_level;
  	return;
  }
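  
  /*
   * Walk the cpuset subtree rooted at c (breadth-first, via stack_list),
   * folding the relax_domain_level of every non-empty, load-balanced
   * cpuset in it into *dattr.
   */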
  static void
  update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
  {
  	LIST_HEAD(q);
  
  	list_add(&c->stack_list, &q);
  	while (!list_empty(&q)) {
  		struct cpuset *cp;
  		struct cgroup *cont;
  		struct cpuset *child;
  
  		cp = list_first_entry(&q, struct cpuset, stack_list);
  		list_del(q.next);
  		if (cpumask_empty(cp->cpus_allowed))
  			continue;
  
  		if (is_sched_load_balance(cp))
  			update_domain_attr(dattr, cp);
  
  		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
  			child = cgroup_cs(cont);
  			list_add_tail(&child->stack_list, &q);
  		}
  	}
  }
  /*
   * generate_sched_domains()
   *
   * This function builds a partial partition of the system's CPUs.
   * A 'partial partition' is a set of non-overlapping subsets whose
   * union is a subset of that set.
   * The output of this function needs to be passed to kernel/sched.c
   * partition_sched_domains() routine, which will rebuild the scheduler's
   * load balancing domains (sched domains) as specified by that partial
   * partition.
   *
   * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
   * for a background explanation of this.
   *
   * Does not return errors, on the theory that the callers of this
   * routine would rather not worry about failures to rebuild sched
   * domains when operating in the severe memory shortage situations
   * that could cause allocation failures below.
   *
   * Must be called with cgroup_lock held.
   *
   * The three key local variables below are:
   *    q  - a linked-list queue of cpuset pointers, used to implement a
   *	   top-down scan of all cpusets.  This scan loads a pointer
   *	   to each cpuset marked is_sched_load_balance into the
   *	   array 'csa'.  For our purposes, rebuilding the schedulers
   *	   sched domains, we can ignore !is_sched_load_balance cpusets.
   *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
   *	   that need to be load balanced, for convenient iterative
   *	   access by the subsequent code that finds the best partition,
   *	   i.e the set of domains (subsets) of CPUs such that the
   *	   cpus_allowed of every cpuset marked is_sched_load_balance
   *	   is a subset of one of these domains, while there are as
   *	   many such domains as possible, each as small as possible.
   * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
   *	   the kernel/sched.c routine partition_sched_domains() in a
   *	   convenient format, that can be easily compared to the prior
   *	   value to determine what partition elements (sched domains)
   *	   were changed (added or removed.)
   *
   * Finding the best partition (set of domains):
   *	The triple nested loops below over i, j, k scan over the
   *	load balanced cpusets (using the array of cpuset pointers in
   *	csa[]) looking for pairs of cpusets that have overlapping
   *	cpus_allowed, but which don't have the same 'pn' partition
   *	number, and merges them into the same partition.  It keeps
   *	looping on the 'restart' label until it can no longer find
   *	any such pairs.
   *
   *	The union of the cpus_allowed masks from the set of
   *	all cpusets having the same 'pn' value then form the one
   *	element of the partition (one sched domain) to be passed to
   *	partition_sched_domains().
   */
  static int generate_sched_domains(cpumask_var_t **domains,
  			struct sched_domain_attr **attributes)
  {
  	LIST_HEAD(q);		/* queue of cpusets to be scanned */
  	struct cpuset *cp;	/* scans q */
  	struct cpuset **csa;	/* array of all cpuset ptrs */
  	int csn;		/* how many cpuset ptrs in csa so far */
  	int i, j, k;		/* indices for partition finding loops */
  	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
  	struct sched_domain_attr *dattr;  /* attributes for custom domains */
  	int ndoms = 0;		/* number of sched domains in result */
  	int nslot;		/* next empty doms[] struct cpumask slot */

  	doms = NULL;
  	dattr = NULL;
  	csa = NULL;
  
  	/* Special case for the 99% of systems with one, full, sched domain */
  	if (is_sched_load_balance(&top_cpuset)) {
  		ndoms = 1;
  		doms = alloc_sched_domains(ndoms);
  		if (!doms)
  			goto done;
  		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
  		if (dattr) {
  			*dattr = SD_ATTR_INIT;
  			update_domain_attr_tree(dattr, &top_cpuset);
  		}
  		cpumask_copy(doms[0], top_cpuset.cpus_allowed);

  		goto done;
  	}
  	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
  	if (!csa)
  		goto done;
  	csn = 0;
  	list_add(&top_cpuset.stack_list, &q);
  	while (!list_empty(&q)) {
  		struct cgroup *cont;
  		struct cpuset *child;   /* scans child cpusets of cp */

  		cp = list_first_entry(&q, struct cpuset, stack_list);
  		list_del(q.next);
  		if (cpumask_empty(cp->cpus_allowed))
  			continue;
  		/*
  		 * All child cpusets contain a subset of the parent's cpus, so
  		 * just skip them, and then we call update_domain_attr_tree()
  		 * to calc relax_domain_level of the corresponding sched
  		 * domain.
  		 */
  		if (is_sched_load_balance(cp)) {
  			csa[csn++] = cp;
  			continue;
  		}

  		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
  			child = cgroup_cs(cont);
  			list_add_tail(&child->stack_list, &q);
  		}
  	}
  
  	for (i = 0; i < csn; i++)
  		csa[i]->pn = i;
  	ndoms = csn;
  
  restart:
  	/* Find the best partition (set of sched domains) */
  	for (i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		int apn = a->pn;
  
  		for (j = 0; j < csn; j++) {
  			struct cpuset *b = csa[j];
  			int bpn = b->pn;
  
  			if (apn != bpn && cpusets_overlap(a, b)) {
  				for (k = 0; k < csn; k++) {
  					struct cpuset *c = csa[k];
  
  					if (c->pn == bpn)
  						c->pn = apn;
  				}
  				ndoms--;	/* one less element */
  				goto restart;
  			}
  		}
  	}
  	/*
  	 * Now we know how many domains to create.
  	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
  	 */
  	doms = alloc_sched_domains(ndoms);
  	if (!doms)
  		goto done;
  
  	/*
  	 * The rest of the code, including the scheduler, can deal with
  	 * dattr==NULL case. No need to abort if alloc fails.
  	 */
  	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
  
  	for (nslot = 0, i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		struct cpumask *dp;
  		int apn = a->pn;
  		if (apn < 0) {
  			/* Skip completed partitions */
  			continue;
  		}
  		dp = doms[nslot];
  
  		if (nslot == ndoms) {
  			static int warnings = 10;
  			if (warnings) {
  				printk(KERN_WARNING
  				 "rebuild_sched_domains confused:"
  				  " nslot %d, ndoms %d, csn %d, i %d,"
  				  " apn %d
  ",
  				  nslot, ndoms, csn, i, apn);
  				warnings--;
  			}
  			continue;
  		}

  		cpumask_clear(dp);
  		if (dattr)
  			*(dattr + nslot) = SD_ATTR_INIT;
  		for (j = i; j < csn; j++) {
  			struct cpuset *b = csa[j];
  
  			if (apn == b->pn) {
  				cpumask_or(dp, dp, b->cpus_allowed);
  				if (dattr)
  					update_domain_attr_tree(dattr + nslot, b);
  
  				/* Done with this partition */
  				b->pn = -1;
  			}
  		}
  		nslot++;
  	}
  	BUG_ON(nslot != ndoms);
  done:
  	kfree(csa);
  	/*
  	 * Fallback to the default domain if kmalloc() failed.
  	 * See comments in partition_sched_domains().
  	 */
  	if (doms == NULL)
  		ndoms = 1;
  	*domains    = doms;
  	*attributes = dattr;
  	return ndoms;
  }
  
  /*
   * Rebuild scheduler domains.
   *
   * Call with neither cgroup_mutex held nor within get_online_cpus().
   * Takes both cgroup_mutex and get_online_cpus().
   *
   * Cannot be directly called from cpuset code handling changes
   * to the cpuset pseudo-filesystem, because it cannot be called
   * from code that already holds cgroup_mutex.
   */
  static void do_rebuild_sched_domains(struct work_struct *unused)
  {
  	struct sched_domain_attr *attr;
  	cpumask_var_t *doms;
  	int ndoms;
  	get_online_cpus();
  
  	/* Generate domain masks and attrs */
  	cgroup_lock();
  	ndoms = generate_sched_domains(&doms, &attr);
  	cgroup_unlock();
  
  	/* Have scheduler rebuild the domains */
  	partition_sched_domains(ndoms, doms, attr);
  	put_online_cpus();
  }
  #else /* !CONFIG_SMP */
  static void do_rebuild_sched_domains(struct work_struct *unused)
  {
  }
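  
  /*
   * On UP there is only the trivial sched domain: report a single domain
   * and a NULL mask, for which partition_sched_domains() falls back to
   * its default domain.
   */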
  static int generate_sched_domains(cpumask_var_t **domains,
  			struct sched_domain_attr **attributes)
  {
  	*domains = NULL;
  	return 1;
  }
  #endif /* CONFIG_SMP */

  static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
  
  /*
   * Rebuild scheduler domains, asynchronously via workqueue.
   *
   * If the flag 'sched_load_balance' of any cpuset with non-empty
   * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
   * which has that flag enabled, or if any cpuset with a non-empty
   * 'cpus' is removed, then call this routine to rebuild the
   * scheduler's dynamic sched domains.
   *
   * The rebuild_sched_domains() and partition_sched_domains()
   * routines must nest cgroup_lock() inside get_online_cpus(),
   * but such cpuset changes as these must nest that locking the
   * other way, holding cgroup_lock() for much of the code.
   *
   * So in order to avoid an ABBA deadlock, the cpuset code handling
   * these user changes delegates the actual sched domain rebuilding
   * to a separate workqueue thread, which ends up processing the
   * above do_rebuild_sched_domains() function.
   */
  static void async_rebuild_sched_domains(void)
  {
  	queue_work(cpuset_wq, &rebuild_sched_domains_work);
  }
  
  /*
   * Accomplishes the same scheduler domain rebuild as the above
   * async_rebuild_sched_domains(), however it directly calls the
   * rebuild routine synchronously rather than calling it via an
   * asynchronous work thread.
   *
   * This can only be called from code that is not holding
   * cgroup_mutex (not nested in a cgroup_lock() call.)
   */
  void rebuild_sched_domains(void)
  {
  	do_rebuild_sched_domains(NULL);
  }
  /**
   * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
   * @tsk: task to test
   * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
   *
   * Call with cgroup_mutex held.  May take callback_mutex during call.
   * Called for each task in a cgroup by cgroup_scan_tasks().
   * Return nonzero if this task's cpus_allowed mask should be changed (in other
   * words, if its mask is not equal to its cpuset's mask).
   */
  static int cpuset_test_cpumask(struct task_struct *tsk,
  			       struct cgroup_scanner *scan)
  {
  	return !cpumask_equal(&tsk->cpus_allowed,
  			(cgroup_cs(scan->cg))->cpus_allowed);
  }

  /**
   * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
   * @tsk: task to test
   * @scan: struct cgroup_scanner containing the cgroup of the task
   *
   * Called by cgroup_scan_tasks() for each task in a cgroup whose
   * cpus_allowed mask needs to be changed.
   *
   * We don't need to re-check for the cgroup/cpuset membership, since we're
   * holding cgroup_lock() at this point.
   */
  static void cpuset_change_cpumask(struct task_struct *tsk,
  				  struct cgroup_scanner *scan)
  {
  	set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed));
  }
  
  /**
   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
   *
   * Called with cgroup_mutex held
   *
   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
   * calling callback functions for each.
   *
   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
   * if @heap != NULL.
   */
  static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
  {
  	struct cgroup_scanner scan;
  
  	scan.cg = cs->css.cgroup;
  	scan.test_task = cpuset_test_cpumask;
  	scan.process_task = cpuset_change_cpumask;
  	scan.heap = heap;
  	cgroup_scan_tasks(&scan);
  }
  
  /**
   * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
   * @cs: the cpuset to consider
   * @buf: buffer of cpu numbers written to this cpuset
   */
  static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  			  const char *buf)
  {
  	struct ptr_heap heap;
  	int retval;
  	int is_load_balanced;

  	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
  	if (cs == &top_cpuset)
  		return -EACCES;
  	/*
  	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
  	 * Since cpulist_parse() fails on an empty mask, we special case
  	 * that parsing.  The validate_change() call ensures that cpusets
  	 * with tasks have cpus.
  	 */
  	if (!*buf) {
  		cpumask_clear(trialcs->cpus_allowed);
  	} else {
  		retval = cpulist_parse(buf, trialcs->cpus_allowed);
  		if (retval < 0)
  			return retval;

  		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
  			return -EINVAL;
  	}
  	retval = validate_change(cs, trialcs);
  	if (retval < 0)
  		return retval;

  	/* Nothing to do if the cpus didn't change */
  	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
  		return 0;

  	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
  	if (retval)
  		return retval;
  	is_load_balanced = is_sched_load_balance(trialcs);

  	mutex_lock(&callback_mutex);
  	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
  	mutex_unlock(&callback_mutex);

  	/*
  	 * Scan tasks in the cpuset, and update the cpumasks of any
  	 * that need an update.
  	 */
  	update_tasks_cpumask(cs, &heap);
  
  	heap_free(&heap);

  	if (is_load_balanced)
  		async_rebuild_sched_domains();
  	return 0;
  }
  /*
   * cpuset_migrate_mm
   *
   *    Migrate memory region from one set of nodes to another.
   *
   *    Temporarily set tasks' mems_allowed to target nodes of migration,
   *    so that the migration code can allocate pages on these nodes.
   *
   *    Call holding cgroup_mutex, so current's cpuset won't change
   *    during this call, as manage_mutex holds off any cpuset_attach()
   *    calls.  Therefore we don't need to take task_lock around the
   *    call to guarantee_online_mems(), as we know no one is changing
   *    our task's cpuset.
   *
   *    While the mm_struct we are migrating is typically from some
   *    other task, the task_struct mems_allowed that we are hacking
   *    is for our current task, which must allocate new pages for that
   *    migrating memory region.
   */
  
  static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
  							const nodemask_t *to)
  {
  	struct task_struct *tsk = current;
  	tsk->mems_allowed = *to;
  
  	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
  	guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed);
  }
  /*
   * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
   * @tsk: the task to change
   * @newmems: new nodes that the task will be set
   *
   * In order to avoid seeing no nodes if the old and new nodes are disjoint,
   * we structure updates as setting all new allowed nodes, then clearing newly
   * disallowed ones.
   */
  static void cpuset_change_task_nodemask(struct task_struct *tsk,
  					nodemask_t *newmems)
  {
  repeat:
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return;
  	if (current->flags & PF_EXITING) /* Let dying task have memory */
  		return;
  
  	task_lock(tsk);
  	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
  
  
  	/*
  	 * Ensure the check of ->mems_allowed_change_disable happens after
  	 * all the new allowed nodes have been set.
  	 *
  	 * The read-side task can then see a nodemask with both the new and
  	 * the old allowed nodes, so if it allocates a page while cpuset is
  	 * still clearing the newly disallowed ones, it still sees the new
  	 * allowed bits.
  	 *
  	 * If instead the new allowed nodes were set after the check, setting
  	 * them and clearing the newly disallowed ones could complete back to
  	 * back, and the read-side task might find no node to allocate from.
  	 */
  	smp_mb();
  
  	/*
  	 * Memory allocation is very fast, so we needn't sleep while waiting
  	 * for the read-side.
  	 */
  	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
  		task_unlock(tsk);
  		if (!task_curr(tsk))
  			yield();
  		goto repeat;
  	}
  
  	/*
  	 * Ensure ->mems_allowed_change_disable is checked before the newly
  	 * disallowed nodes are cleared.
  	 *
  	 * If the newly disallowed bits were cleared before the check, the
  	 * read-side task might find no node to allocate from.
  	 */
  	smp_mb();
  
  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
  	tsk->mems_allowed = *newmems;
  	task_unlock(tsk);
  }
  
  /*
   * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
   * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
   * memory_migrate flag is set. Called with cgroup_mutex held.
   */
  static void cpuset_change_nodemask(struct task_struct *p,
  				   struct cgroup_scanner *scan)
  {
  	struct mm_struct *mm;
  	struct cpuset *cs;
  	int migrate;
  	const nodemask_t *oldmem = scan->data;
  	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
  
  	if (!newmems)
  		return;
  
  	cs = cgroup_cs(scan->cg);
  	guarantee_online_mems(cs, newmems);

  	cpuset_change_task_nodemask(p, newmems);

  	NODEMASK_FREE(newmems);
  	mm = get_task_mm(p);
  	if (!mm)
  		return;
  	migrate = is_memory_migrate(cs);
  
  	mpol_rebind_mm(mm, &cs->mems_allowed);
  	if (migrate)
  		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
  	mmput(mm);
  }
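  
  /*
   * Points at the cpuset whose mems are being rebound while
   * update_tasks_nodemask() runs; mpol_dup() checks it (via
   * current_cpuset_is_being_rebound()) to catch forks that race with
   * the rebind.
   */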
  static void *cpuset_being_rebound;
  /**
   * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
   * @oldmem: old mems_allowed of cpuset cs
   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
   *
   * Called with cgroup_mutex held
   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
   * if @heap != NULL.
   */
  static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
  				 struct ptr_heap *heap)
  {
  	struct cgroup_scanner scan;

  	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

  	scan.cg = cs->css.cgroup;
  	scan.test_task = NULL;
  	scan.process_task = cpuset_change_nodemask;
  	scan.heap = heap;
  	scan.data = (nodemask_t *)oldmem;
  
  	/*
  	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
  	 * take while holding tasklist_lock.  Forks can happen - the
  	 * mpol_dup() cpuset_being_rebound check will catch such forks,
  	 * and rebind their vma mempolicies too.  Because we still hold
  	 * the global cgroup_mutex, we know that no other rebind effort
  	 * will be contending for the global variable cpuset_being_rebound.
  	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
  	 * is idempotent.  Also migrate pages in each mm to new nodes.
  	 */
  	cgroup_scan_tasks(&scan);

  	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
  	cpuset_being_rebound = NULL;
  }
  /*
   * Handle user request to change the 'mems' memory placement
   * of a cpuset.  Needs to validate the request, update the
   * cpuset's mems_allowed, and for each task in the cpuset,
   * update mems_allowed and rebind the task's mempolicy and any vma
   * mempolicies, and if the cpuset is marked 'memory_migrate',
   * migrate the task's pages to the new memory.
   *
   * Call with cgroup_mutex held.  May take callback_mutex during call.
   * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
   * lock each such task's mm->mmap_sem, scan its vma's and rebind
   * their mempolicies to the cpuset's new mems_allowed.
   */
  static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
  			   const char *buf)
  {
  	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
  	int retval;
  	struct ptr_heap heap;

  	if (!oldmem)
  		return -ENOMEM;
0b2f630a2   Miao Xie   cpusets: restruct...
1051
1052
1053
1054
  	/*
  	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
  	 * it's read-only
  	 */
53feb2976   Miao Xie   cpuset: alloc nod...
1055
1056
1057
1058
  	if (cs == &top_cpuset) {
  		retval = -EACCES;
  		goto done;
  	}
0b2f630a2   Miao Xie   cpusets: restruct...
1059

0b2f630a2   Miao Xie   cpusets: restruct...
1060
1061
1062
1063
1064
1065
1066
  	/*
  	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
  	 * Since nodelist_parse() fails on an empty mask, we special case
  	 * that parsing.  The validate_change() call ensures that cpusets
  	 * with tasks have memory.
  	 */
  	if (!*buf) {
645fcc9d2   Li Zefan   cpuset: don't all...
1067
  		nodes_clear(trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1068
  	} else {
645fcc9d2   Li Zefan   cpuset: don't all...
1069
  		retval = nodelist_parse(buf, trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1070
1071
  		if (retval < 0)
  			goto done;
645fcc9d2   Li Zefan   cpuset: don't all...
1072
  		if (!nodes_subset(trialcs->mems_allowed,
53feb2976   Miao Xie   cpuset: alloc nod...
1073
1074
1075
1076
  				node_states[N_HIGH_MEMORY])) {
  			retval =  -EINVAL;
  			goto done;
  		}
0b2f630a2   Miao Xie   cpusets: restruct...
1077
  	}
53feb2976   Miao Xie   cpuset: alloc nod...
1078
1079
  	*oldmem = cs->mems_allowed;
  	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
0b2f630a2   Miao Xie   cpusets: restruct...
1080
1081
1082
  		retval = 0;		/* Too easy - nothing to do */
  		goto done;
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1083
  	retval = validate_change(cs, trialcs);
0b2f630a2   Miao Xie   cpusets: restruct...
1084
1085
  	if (retval < 0)
  		goto done;
010cfac4c   Li Zefan   cpuset: avoid cha...
1086
1087
1088
  	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
  	if (retval < 0)
  		goto done;
0b2f630a2   Miao Xie   cpusets: restruct...
1089
  	mutex_lock(&callback_mutex);
645fcc9d2   Li Zefan   cpuset: don't all...
1090
  	cs->mems_allowed = trialcs->mems_allowed;
0b2f630a2   Miao Xie   cpusets: restruct...
1091
  	mutex_unlock(&callback_mutex);
53feb2976   Miao Xie   cpuset: alloc nod...
1092
  	update_tasks_nodemask(cs, oldmem, &heap);
010cfac4c   Li Zefan   cpuset: avoid cha...
1093
1094
  
  	heap_free(&heap);
0b2f630a2   Miao Xie   cpusets: restruct...
1095
  done:
53feb2976   Miao Xie   cpuset: alloc nod...
1096
  	NODEMASK_FREE(oldmem);
0b2f630a2   Miao Xie   cpusets: restruct...
1097
1098
  	return retval;
  }
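
/*
 * Illustrative usage (a sketch, not part of the original source;
 * assumes the cpuset filesystem is mounted at /dev/cpuset):
 *
 *	echo 0-1 > /dev/cpuset/mygroup/mems
 *
 * reaches update_nodemask() via cpuset_write_resmask() with
 * buf == "0-1".  An empty write clears mems_allowed, which
 * validate_change() permits only for cpusets with no tasks.
 */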

int current_cpuset_is_being_rebound(void)
{
	return task_cs(current) == cpuset_being_rebound;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
	if (val < -1 || val >= SD_LV_MAX)
		return -EINVAL;
#endif

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpumask_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			async_rebuild_sched_domains();
	}

	return 0;
}
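
/*
 * Illustrative value map (a sketch following the cpuset documentation,
 * not part of this file): writing -1 to sched_relax_domain_level
 * requests the system default; 0 disables the idle/wakeup balancing
 * search; higher values widen it, e.g. 1 searches sibling
 * hyperthreads and 5 the whole system.
 */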

/*
   * cpuset_change_flag - make a task's spread flags the same as its cpuset's
   * @tsk: task to be updated
   * @scan: struct cgroup_scanner containing the cgroup of the task
   *
   * Called by cgroup_scan_tasks() for each task in a cgroup.
   *
   * We don't need to re-check for the cgroup/cpuset membership, since we're
   * holding cgroup_lock() at this point.
   */
  static void cpuset_change_flag(struct task_struct *tsk,
  				struct cgroup_scanner *scan)
  {
  	cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk);
  }
  
  /*
   * update_tasks_flags - update the spread flags of tasks in the cpuset.
   * @cs: the cpuset in which each task's spread flags needs to be changed
   * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
   *
   * Called with cgroup_mutex held
   *
   * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
   * calling callback functions for each.
   *
   * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
   * if @heap != NULL.
   */
  static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
  {
  	struct cgroup_scanner scan;
  
  	scan.cg = cs->css.cgroup;
  	scan.test_task = NULL;
  	scan.process_task = cpuset_change_flag;
  	scan.heap = heap;
  	cgroup_scan_tasks(&scan);
  }
  
  /*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;
	int balance_flag_changed;
	int spread_flag_changed;
	struct ptr_heap heap;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (err < 0)
		goto out;

	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs->flags;
	mutex_unlock(&callback_mutex);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		async_rebuild_sched_domains();

	if (spread_flag_changed)
		update_tasks_flags(cs, &heap);
	heap_free(&heap);
out:
	free_trial_cpuset(trialcs);
	return err;
}

/*
 * Frequency meter - How fast is some event occurring?
   *
   * These routines manage a digitally filtered, constant time based,
   * event frequency meter.  There are four routines:
   *   fmeter_init() - initialize a frequency meter.
   *   fmeter_markevent() - called each time the event happens.
   *   fmeter_getrate() - returns the recent rate of such events.
   *   fmeter_update() - internal routine used to update fmeter.
   *
   * A common data structure is passed to each of these routines,
   * which is used to keep track of the state required to manage the
   * frequency meter and its digital filter.
   *
   * The filter works on the number of events marked per unit time.
   * The filter is single-pole low-pass recursive (IIR).  The time unit
   * is 1 second.  Arithmetic is done using 32-bit integers scaled to
   * simulate 3 decimal digits of precision (multiplied by 1000).
   *
   * With an FM_COEF of 933, and a time base of 1 second, the filter
   * has a half-life of 10 seconds, meaning that if the events quit
   * happening, then the rate returned from the fmeter_getrate()
   * will be cut in half each 10 seconds, until it converges to zero.
   *
   * It is not worth doing a real infinitely recursive filter.  If more
   * than FM_MAXTICKS ticks have elapsed since the last filter event,
   * just compute FM_MAXTICKS ticks worth, by which point the level
   * will be stable.
   *
   * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
   * arithmetic overflow in the fmeter_update() routine.
   *
   * Given the simple 32 bit integer arithmetic used, this meter works
   * best for reporting rates between one per millisecond (msec) and
   * one per 32 (approx) seconds.  At constant rates faster than one
   * per msec it maxes out at values just under 1,000,000.  At constant
   * rates between one per msec, and one per second it will stabilize
   * to a value N*1000, where N is the rate of events per second.
   * At constant rates between one per second and one per 32 seconds,
   * it will be choppy, moving up on the seconds that have an event,
   * and then decaying until the next event.  At rates slower than
   * about one in 32 seconds, it decays all the way back to zero between
   * each event.
   */
  
  #define FM_COEF 933		/* coefficient for half-life of 10 secs */
  #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
  #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
  #define FM_SCALE 1000		/* faux fixed point scale */
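
/*
 * Worked example (illustrative, not part of the original source): each
 * idle second multiplies val by FM_COEF/FM_SCALE = 0.933, and
 * 0.933^10 ~= 0.5, giving the 10 second half-life described above.
 * Each event adds (FM_SCALE - FM_COEF) = 67 to val, so a steady rate
 * of N events/sec settles where 0.067 * val == 67 * N, i.e.
 * val == N * 1000.
 */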
  
  /* Initialize a frequency meter */
  static void fmeter_init(struct fmeter *fmp)
  {
  	fmp->cnt = 0;
  	fmp->val = 0;
  	fmp->time = 0;
  	spin_lock_init(&fmp->lock);
  }
  
  /* Internal meter update - process cnt events and update value */
  static void fmeter_update(struct fmeter *fmp)
  {
  	time_t now = get_seconds();
  	time_t ticks = now - fmp->time;
  
  	if (ticks == 0)
  		return;
  
  	ticks = min(FM_MAXTICKS, ticks);
  	while (ticks-- > 0)
  		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
  	fmp->time = now;
  
  	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
  	fmp->cnt = 0;
  }
  
  /* Process any previous ticks, then bump cnt by one (times scale). */
  static void fmeter_markevent(struct fmeter *fmp)
  {
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
  	spin_unlock(&fmp->lock);
  }
  
  /* Process any previous ticks, then return current value. */
  static int fmeter_getrate(struct fmeter *fmp)
  {
  	int val;
  
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	val = fmp->val;
  	spin_unlock(&fmp->lock);
  	return val;
  }
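
/*
 * Usage sketch (illustrative, not part of the original source; this is
 * the pattern used elsewhere in this file to back the memory_pressure
 * meter):
 *
 *	fmeter_init(&cs->fmeter);		when a cpuset is created
 *	fmeter_markevent(&cs->fmeter);		on each event of interest
 *	rate = fmeter_getrate(&cs->fmeter);	recent events/sec, x 1000
 */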

/* Protected by cgroup_lock */
static cpumask_var_t cpus_attach;

/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			     struct task_struct *tsk, bool threadgroup)
{
	int ret;
	struct cpuset *cs = cgroup_cs(cont);

	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;

	/*
	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
	 * cannot change their cpu affinity and isolating such threads by their
	 * set of allowed nodes is unnecessary.  Thus, cpusets are not
	 * applicable for such threads.  This prevents checking for success of
	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
	 * be changed.
	 */
	if (tsk->flags & PF_THREAD_BOUND)
		return -EINVAL;

  	ret = security_task_setscheduler(tsk, 0, NULL);
  	if (ret)
  		return ret;
  	if (threadgroup) {
  		struct task_struct *c;
  
  		rcu_read_lock();
  		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
  			ret = security_task_setscheduler(c, 0, NULL);
  			if (ret) {
  				rcu_read_unlock();
  				return ret;
  			}
  		}
  		rcu_read_unlock();
  	}
  	return 0;
  }
  
  static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
  			       struct cpuset *cs)
  {
  	int err;
  	/*
  	 * can_attach beforehand should guarantee that this doesn't fail.
  	 * TODO: have a better way to handle failure here
  	 */
  	err = set_cpus_allowed_ptr(tsk, cpus_attach);
  	WARN_ON_ONCE(err);
	cpuset_change_task_nodemask(tsk, to);
	cpuset_update_task_spread_flag(cs, tsk);
}

static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
			  struct cgroup *oldcont, struct task_struct *tsk,
			  bool threadgroup)
{
	struct mm_struct *mm;
	struct cpuset *cs = cgroup_cs(cont);
	struct cpuset *oldcs = cgroup_cs(oldcont);
	NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);

	if (from == NULL || to == NULL)
		goto alloc_fail;

	if (cs == &top_cpuset) {
		cpumask_copy(cpus_attach, cpu_possible_mask);
	} else {
		guarantee_online_cpus(cs, cpus_attach);
	}
	guarantee_online_mems(cs, to);

	/* do per-task migration stuff possibly for each in the threadgroup */
	cpuset_attach_task(tsk, to, cs);
	if (threadgroup) {
		struct task_struct *c;
		rcu_read_lock();
		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
			cpuset_attach_task(c, to, cs);
		}
		rcu_read_unlock();
	}

	/* change mm; only needs to be done once even if threadgroup */
	*from = oldcs->mems_allowed;
	*to = cs->mems_allowed;
	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, from, to);
		mmput(mm);
	}

alloc_fail:
	NODEMASK_FREE(from);
	NODEMASK_FREE(to);
}
  
  /* The various types of files and directories in a cpuset file system */
  
  typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}

static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	cpuset_filetype_t type = cft->private;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		retval = update_relax_domain_level(cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
	cgroup_unlock();
	return retval;
}

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
				const char *buf)
{
	int retval = 0;
	struct cpuset *cs = cgroup_cs(cgrp);
	struct cpuset *trialcs;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		/* don't leak the cgroup lock on allocation failure */
		retval = -ENOMEM;
		goto out;
	}

	switch (cft->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out:
	cgroup_unlock();
	return retval;
}
  
/*
 * These ascii lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map.  If read in smaller
 * chunks, there is no guarantee of atomicity.  Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 * A single large read to a buffer that crosses a page boundary is
 * ok, because the result being copied to user land is not recomputed
 * across a page fault.
 */
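
/*
 * Illustrative reader (a sketch, not part of the original source): one
 * read(2) with a buffer large enough for the whole list is atomic,
 *
 *	char buf[4096];
 *	ssize_t n = read(fd, buf, sizeof(buf));	 e.g. "0-3,8-11\n"
 *
 * whereas two partial reads could straddle a concurrent update and
 * return gibberish.
 */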
  
static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
	int ret;

	mutex_lock(&callback_mutex);
	ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
	mutex_unlock(&callback_mutex);

	return ret;
}
  
static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
	int retval;

	if (mask == NULL)
		return -ENOMEM;

	mutex_lock(&callback_mutex);
	*mask = cs->mems_allowed;
	mutex_unlock(&callback_mutex);

	retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);

	NODEMASK_FREE(mask);

	return retval;
}

static ssize_t cpuset_common_file_read(struct cgroup *cont,
				       struct cftype *cft,
				       struct file *file,
				       char __user *buf,
				       size_t nbytes, loff_t *ppos)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	char *page;
	ssize_t retval = 0;
	char *s;

	if (!(page = (char *)__get_free_page(GFP_TEMPORARY)))
		return -ENOMEM;

	s = page;

	switch (type) {
	case FILE_CPULIST:
		s += cpuset_sprintf_cpulist(s, cs);
		break;
	case FILE_MEMLIST:
		s += cpuset_sprintf_memlist(s, cs);
		break;
	default:
		retval = -EINVAL;
		goto out;
	}
	*s++ = '\n';

	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
out:
	free_page((unsigned long)page);
	return retval;
}

static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		return is_cpu_exclusive(cs);
	case FILE_MEM_EXCLUSIVE:
		return is_mem_exclusive(cs);
	case FILE_MEM_HARDWALL:
		return is_mem_hardwall(cs);
	case FILE_SCHED_LOAD_BALANCE:
		return is_sched_load_balance(cs);
	case FILE_MEMORY_MIGRATE:
		return is_memory_migrate(cs);
	case FILE_MEMORY_PRESSURE_ENABLED:
		return cpuset_memory_pressure_enabled;
	case FILE_MEMORY_PRESSURE:
		return fmeter_getrate(&cs->fmeter);
	case FILE_SPREAD_PAGE:
		return is_spread_page(cs);
	case FILE_SPREAD_SLAB:
		return is_spread_slab(cs);
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
{
	struct cpuset *cs = cgroup_cs(cont);
	cpuset_filetype_t type = cft->private;
	switch (type) {
	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
		return cs->relax_domain_level;
	default:
		BUG();
	}

	/* Unreachable but makes gcc happy */
	return 0;
}

/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype files[] = {
	{
		.name = "cpus",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * NR_CPUS),
		.private = FILE_CPULIST,
	},

	{
		.name = "mems",
		.read = cpuset_common_file_read,
		.write_string = cpuset_write_resmask,
		.max_write_len = (100U + 6 * MAX_NUMNODES),
		.private = FILE_MEMLIST,
	},

	{
		.name = "cpu_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_CPU_EXCLUSIVE,
	},

	{
		.name = "mem_exclusive",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_EXCLUSIVE,
	},

	{
		.name = "mem_hardwall",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEM_HARDWALL,
	},

	{
		.name = "sched_load_balance",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SCHED_LOAD_BALANCE,
	},

	{
		.name = "sched_relax_domain_level",
		.read_s64 = cpuset_read_s64,
		.write_s64 = cpuset_write_s64,
		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
	},

	{
		.name = "memory_migrate",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_MIGRATE,
	},

	{
		.name = "memory_pressure",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_MEMORY_PRESSURE,
		.mode = S_IRUGO,
	},

	{
		.name = "memory_spread_page",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_PAGE,
	},

	{
		.name = "memory_spread_slab",
		.read_u64 = cpuset_read_u64,
		.write_u64 = cpuset_write_u64,
		.private = FILE_SPREAD_SLAB,
	},
};

static struct cftype cft_memory_pressure_enabled = {
	.name = "memory_pressure_enabled",
	.read_u64 = cpuset_read_u64,
	.write_u64 = cpuset_write_u64,
	.private = FILE_MEMORY_PRESSURE_ENABLED,
};

static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
{
	int err;

	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
	if (err)
		return err;
	/* memory_pressure_enabled is in root cpuset only */
	if (!cont->parent)
		err = cgroup_add_file(cont, ss,
				      &cft_memory_pressure_enabled);
	return err;
}
  
/*
 * post_clone() is called at the end of cgroup_clone().
 * 'cgroup' was just created automatically as a result of
 * a cgroup_clone(), and the current task is about to
 * be moved into 'cgroup'.
 *
 * Currently we refuse to set up the cgroup - thereby
 * refusing the task to be entered, and as a result refusing
 * the sys_unshare() or clone() which initiated it - if any
 * sibling cpusets have exclusive cpus or mem.
 *
 * If this becomes a problem for some users who wish to
 * allow that scenario, then cpuset_post_clone() could be
 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
 * held.
 */
static void cpuset_post_clone(struct cgroup_subsys *ss,
			      struct cgroup *cgroup)
{
	struct cgroup *parent, *child;
	struct cpuset *cs, *parent_cs;

	parent = cgroup->parent;
	list_for_each_entry(child, &parent->children, sibling) {
		cs = cgroup_cs(child);
		if (is_mem_exclusive(cs) || is_cpu_exclusive(cs))
			return;
	}
	cs = cgroup_cs(cgroup);
	parent_cs = cgroup_cs(parent);

	cs->mems_allowed = parent_cs->mems_allowed;
	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
	return;
}
  
/*
 *	cpuset_create - create a cpuset
 *	ss:	cpuset cgroup subsystem
 *	cont:	control group that the new cpuset will be part of
 */

static struct cgroup_subsys_state *cpuset_create(
	struct cgroup_subsys *ss,
	struct cgroup *cont)
{
	struct cpuset *cs;
	struct cpuset *parent;

	if (!cont->parent) {
		return &top_cpuset.css;
	}
	parent = cgroup_cs(cont->parent);
	cs = kmalloc(sizeof(*cs), GFP_KERNEL);
	if (!cs)
		return ERR_PTR(-ENOMEM);
	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
		kfree(cs);
		return ERR_PTR(-ENOMEM);
	}

	cs->flags = 0;
	if (is_spread_page(parent))
		set_bit(CS_SPREAD_PAGE, &cs->flags);
	if (is_spread_slab(parent))
		set_bit(CS_SPREAD_SLAB, &cs->flags);
	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
	cpumask_clear(cs->cpus_allowed);
	nodes_clear(cs->mems_allowed);
	fmeter_init(&cs->fmeter);
	cs->relax_domain_level = -1;

	cs->parent = parent;
	number_of_cpusets++;
	return &cs->css;
}

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call async_rebuild_sched_domains().
 */
static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
{
	struct cpuset *cs = cgroup_cs(cont);

	if (is_sched_load_balance(cs))
		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);

	number_of_cpusets--;
	free_cpumask_var(cs->cpus_allowed);
	kfree(cs);
}

struct cgroup_subsys cpuset_subsys = {
	.name = "cpuset",
	.create = cpuset_create,
	.destroy = cpuset_destroy,
	.can_attach = cpuset_can_attach,
	.attach = cpuset_attach,
	.populate = cpuset_populate,
	.post_clone = cpuset_post_clone,
	.subsys_id = cpuset_subsys_id,
	.early_init = 1,
};

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system.
 **/

int __init cpuset_init(void)
{
	int err = 0;

	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
		BUG();
	cpumask_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
		BUG();

	number_of_cpusets = 1;
	return 0;
}

/**
 * cpuset_do_move_task - move a given task to another cpuset
 * @tsk: pointer to task_struct of the task to move
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 * Return nonzero to stop the walk through the tasks.
 */
static void cpuset_do_move_task(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	struct cgroup *new_cgroup = scan->data;

	cgroup_attach_task(new_cgroup, tsk);
}
  
/**
 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 */
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
	struct cgroup_scanner scan;

	scan.cg = from->css.cgroup;
	scan.test_task = NULL; /* select all tasks in cgroup */
	scan.process_task = cpuset_do_move_task;
	scan.heap = NULL;
	scan.data = to->css.cgroup;

	if (cgroup_scan_tasks(&scan))
		printk(KERN_ERR "move_member_tasks_to_cpuset: "
				"cgroup_scan_tasks failed\n");
}

/*
 * If the CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * The cgroup's css_sets list is in use if there are tasks
	 * in the cpuset; the list is empty if there are none;
	 * the cs->css.refcnt seems always 0.
	 */
	if (list_empty(&cs->css.cgroup->css_sets))
		return;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = cs->parent;
	while (cpumask_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent->parent;

	move_member_tasks_to_cpuset(cs, parent);
}
  
  /*
   * Walk the specified cpuset subtree and look for empty cpusets.
 * The tasks of such a cpuset must be moved to a parent cpuset.
   *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
   * cpus_allowed and mems_allowed.
   *
   * This walk processes the tree from top to bottom, completing one layer
   * before dropping down to the next.  It always processes a node before
   * any of its children.
   *
   * For now, since we lack memory hot unplug, we'll never see a cpuset
   * that has tasks along with an empty 'mems'.  But if we did see such
   * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
   */
static void scan_for_empty_cpusets(struct cpuset *root)
{
	LIST_HEAD(queue);
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset *child;	/* scans child cpusets of cp */
	struct cgroup *cont;
	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);

	if (oldmems == NULL)
		return;

	list_add_tail((struct list_head *)&root->stack_list, &queue);
	while (!list_empty(&queue)) {
		cp = list_first_entry(&queue, struct cpuset, stack_list);
		list_del(queue.next);
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &queue);
		}

		/* Continue past cpusets with all cpus, mems online */
		if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
			continue;

		*oldmems = cp->mems_allowed;

		/* Remove offline cpus and mems from this cpuset. */
		mutex_lock(&callback_mutex);
		cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
			    cpu_active_mask);
		nodes_and(cp->mems_allowed, cp->mems_allowed,
						node_states[N_HIGH_MEMORY]);
		mutex_unlock(&callback_mutex);

		/* Move tasks from the empty cpuset to a parent */
		if (cpumask_empty(cp->cpus_allowed) ||
		    nodes_empty(cp->mems_allowed))
			remove_tasks_in_empty_cpuset(cp);
		else {
			update_tasks_cpumask(cp, NULL);
			update_tasks_nodemask(cp, oldmems, NULL);
		}
	}
	NODEMASK_FREE(oldmems);
}
  
/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no effect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_active_mask on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 */
void cpuset_update_active_cpus(void)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	cgroup_lock();
	mutex_lock(&callback_mutex);
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	mutex_unlock(&callback_mutex);
	scan_for_empty_cpusets(&top_cpuset);
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
 * See also the previous routine cpuset_update_active_cpus().
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);

	if (oldmems == NULL)
		return NOTIFY_DONE;

	cgroup_lock();
	switch (action) {
	case MEM_ONLINE:
		*oldmems = top_cpuset.mems_allowed;
		mutex_lock(&callback_mutex);
		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
		mutex_unlock(&callback_mutex);
		update_tasks_nodemask(&top_cpuset, oldmems, NULL);
		break;
	case MEM_OFFLINE:
		/*
		 * needn't update top_cpuset.mems_allowed explicitly because
		 * scan_for_empty_cpusets() will update it.
		 */
		scan_for_empty_cpusets(&top_cpuset);
		break;
	default:
		break;
	}
	cgroup_unlock();

	NODEMASK_FREE(oldmems);
	return NOTIFY_OK;
}
#endif

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 **/

void __init cpuset_init_smp(void)
{
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

	hotplug_memory_notifier(cpuset_track_online_nodes, 10);

	cpuset_wq = create_singlethread_workqueue("cpuset");
	BUG_ON(!cpuset_wq);
}
  
/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_map, even if this means going outside the
 * task's cpuset.
 **/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);
}

int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
  {
  	const struct cpuset *cs;
  	int cpu;
  
  	rcu_read_lock();
  	cs = task_cs(tsk);
  	if (cs)
  		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
  	rcu_read_unlock();
  
  	/*
  	 * We own tsk->cpus_allowed, nobody can change it under us.
  	 *
  	 * But we used cs and cs->cpus_allowed locklessly above, and thus
  	 * can race with cgroup_attach_task() or update_cpumask() and get
  	 * the wrong tsk->cpus_allowed. However, both cases imply a
  	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr(),
  	 * which takes task_rq_lock().
  	 *
  	 * If we are called after it dropped the lock we must see all
  	 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
  	 * set any mask, even if it is not right from the task_cs() point
  	 * of view; the pending set_cpus_allowed_ptr() will fix things.
  	 */
  
  	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
  	if (cpu >= nr_cpu_ids) {
  		/*
  		 * Either tsk->cpus_allowed is wrong (see above) or it
  		 * is actually empty. The latter case is only possible
  		 * if we are racing with remove_tasks_in_empty_cpuset().
  		 * Like above, we can temporarily set any mask and rely on
  		 * set_cpus_allowed_ptr() as the synchronization point.
  		 */
  		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
  		cpu = cpumask_any(cpu_active_mask);
  	}
  
  	return cpu;
  }
  void cpuset_init_current_mems_allowed(void)
  {
  	nodes_setall(current->mems_allowed);
  }
  /**
   * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
   * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
   *
   * Description: Returns the nodemask_t mems_allowed of the cpuset
   * attached to the specified @tsk.  Guaranteed to return some non-empty
   * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
   * task's cpuset.
   **/
  
  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
  {
  	nodemask_t mask;
  	mutex_lock(&callback_mutex);
  	task_lock(tsk);
  	guarantee_online_mems(task_cs(tsk), &mask);
  	task_unlock(tsk);
  	mutex_unlock(&callback_mutex);
  
  	return mask;
  }
  
  /**
   * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
   * @nodemask: the nodemask to be checked
   *
   * Are any of the nodes in the nodemask allowed in current->mems_allowed?
   */
  int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
  {
  	return nodes_intersects(*nodemask, current->mems_allowed);
  }
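
  /*
   * Editor's illustrative sketch, not part of the original file: a
   * mempolicy-style caller can use the check above to ask whether a
   * bound nodemask still intersects mems_allowed before filtering zones
   * by it.  example_bind_nodes_usable() is hypothetical.
   */
  #if 0
  static int example_bind_nodes_usable(nodemask_t *bind_nodes)
  {
  	/* non-zero if at least one bound node is still allowed */
  	return cpuset_nodemask_valid_mems_allowed(bind_nodes);
  }
  #endif
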
  /*
   * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
   * mem_hardwall ancestor to the specified cpuset.  Call holding
   * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
   * (an unusual configuration), then returns the root cpuset.
   */
  static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
  {
  	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
  		cs = cs->parent;
  	return cs;
  }
  /**
   * cpuset_node_allowed_softwall - Can we allocate on a memory node?
   * @node: is this an allowed node?
   * @gfp_mask: memory allocation flags
   *
   * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
   * set, yes, we can always allocate.  If node is in our task's mems_allowed,
   * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
   * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
   * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
   * flag, yes.
   * Otherwise, no.
   *
   * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
   * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
   * might sleep, and might allow a node from an enclosing cpuset.
   *
   * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
   * cpusets, and never sleeps.
   *
   * The __GFP_THISNODE placement logic is really handled elsewhere,
   * by forcibly using a zonelist starting at a specified node, and by
   * (in get_page_from_freelist()) refusing to consider the zones for
   * any node on the zonelist except the first.  By the time any such
   * calls get to this routine, we should just shut up and say 'yes'.
   *
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
   * and do not allow allocations outside the current task's cpuset
   * unless the task has been OOM killed and is marked TIF_MEMDIE.
   * GFP_KERNEL allocations are not so marked, so can escape to the
   * nearest enclosing hardwalled ancestor cpuset.
   *
   * Scanning up parent cpusets requires callback_mutex.  The
   * __alloc_pages() routine only calls here with the __GFP_HARDWALL bit
   * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
   * current task's mems_allowed came up empty on the first pass over
   * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
   * cpuset are short of memory, might require taking callback_mutex.
   *
   * The first call here from mm/page_alloc:get_page_from_freelist()
   * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
   * so no allocation on a node outside the cpuset is allowed (unless
   * in interrupt, of course).
   *
   * The second pass through get_page_from_freelist() doesn't even call
   * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
   * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
   * in alloc_flags.  That logic and the checks below have the combined
   * effect that:
   *	in_interrupt - any node ok (current task context irrelevant)
   *	GFP_ATOMIC   - any node ok
   *	TIF_MEMDIE   - any node ok
   *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
   *	GFP_USER     - only nodes in current task's mems_allowed ok.
   *
   * Rule:
   *    Don't call cpuset_node_allowed_softwall() if you can't sleep, unless
   *    you pass in the __GFP_HARDWALL flag set in gfp_mask, which disables
   *    the code that might scan up ancestor cpusets and sleep.
   */
  int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
  {
  	const struct cpuset *cs;	/* current cpuset ancestors */
  	int allowed;			/* is allocation on this node allowed? */

  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
  		return 1;
  	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
  	if (node_isset(node, current->mems_allowed))
  		return 1;
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return 1;
  	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
  		return 0;
  	if (current->flags & PF_EXITING) /* Let dying task have memory */
  		return 1;
  	/* Not hardwall and node outside mems_allowed: scan up cpusets */
  	mutex_lock(&callback_mutex);

  	task_lock(current);
  	cs = nearest_hardwall_ancestor(task_cs(current));
  	task_unlock(current);
  	allowed = node_isset(node, cs->mems_allowed);
  	mutex_unlock(&callback_mutex);
  	return allowed;
  }
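
  /*
   * Editor's illustrative sketch, not part of the original file: choosing
   * between the two variants per the "Rule" above.  The un-underscored
   * cpuset_node_allowed_*() wrappers are assumed to be the usual
   * include/linux/cpuset.h entry points; example_node_ok() is hypothetical.
   */
  #if 0
  static int example_node_ok(int node, gfp_t gfp_mask)
  {
  	if (gfp_mask & __GFP_HARDWALL)	/* atomic-safe, never sleeps */
  		return cpuset_node_allowed_hardwall(node, gfp_mask);
  	return cpuset_node_allowed_softwall(node, gfp_mask); /* may sleep */
  }
  #endif
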
  /*
   * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
   * @node: is this an allowed node?
   * @gfp_mask: memory allocation flags
   *
   * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
   * set, yes, we can always allocate.  If node is in our task's mems_allowed,
   * yes.  If the task has been OOM killed and has access to memory reserves as
   * specified by the TIF_MEMDIE flag, yes.
   * Otherwise, no.
   *
   * The __GFP_THISNODE placement logic is really handled elsewhere,
   * by forcibly using a zonelist starting at a specified node, and by
   * (in get_page_from_freelist()) refusing to consider the zones for
   * any node on the zonelist except the first.  By the time any such
   * calls get to this routine, we should just shut up and say 'yes'.
   *
   * Unlike the cpuset_node_allowed_softwall() variant, above,
   * this variant requires that the node be in the current task's
   * mems_allowed or that we're in interrupt.  It does not scan up the
   * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
   * It never sleeps.
   */
  int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
  {
  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
  		return 1;
  	if (node_isset(node, current->mems_allowed))
  		return 1;
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return 1;
  	return 0;
  }
  /**
   * cpuset_unlock - release lock on cpuset changes
   *
   * Undo the lock taken in a previous cpuset_lock() call.
   */
  
  void cpuset_unlock(void)
  {
  	mutex_unlock(&callback_mutex);
  }
  
  /**
   * cpuset_mem_spread_node() - On which node to begin search for a file page
   * cpuset_slab_spread_node() - On which node to begin search for a slab page
   *
   * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
   * tasks in a cpuset with is_spread_page or is_spread_slab set),
   * and if the memory allocation used cpuset_mem_spread_node()
   * to determine on which node to start looking, as it will for
   * certain page cache or slab cache pages such as those used for
   * file system buffers and inode caches, then instead of starting
   * the search for a free page on the local node, spread the
   * starting node around the task's mems_allowed nodes.
   *
   * We don't have to worry about the returned node being offline
   * because "it can't happen", and even if it did, it would be ok.
   *
   * The routines calling guarantee_online_mems() are careful to
   * only set nodes in task->mems_allowed that are online.  So it
   * should not be possible for the following code to return an
   * offline node.  But if it did, that would be ok, as this routine
   * is not returning the node where the allocation must be, only
   * the node where the search should start.  The zonelist passed to
   * __alloc_pages() will include all nodes.  If the slab allocator
   * is passed an offline node, it will fall back to the local node.
   * See kmem_cache_alloc_node().
   */
  static int cpuset_spread_node(int *rotor)
  {
  	int node;
  	node = next_node(*rotor, current->mems_allowed);
  	if (node == MAX_NUMNODES)
  		node = first_node(current->mems_allowed);
  	*rotor = node;
  	return node;
  }
  
  int cpuset_mem_spread_node(void)
  {
  	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
  }
  
  int cpuset_slab_spread_node(void)
  {
  	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
  }
  EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
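
  /*
   * Editor's worked example, not part of the original file: with
   * current->mems_allowed = {0, 2, 3} and the rotor last left at node 0,
   * successive cpuset_mem_spread_node() calls return 2, 3, 0, 2, 3, 0, ...
   * since next_node() advances past the rotor's node and first_node()
   * wraps the search when it runs off the end at MAX_NUMNODES.
   */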
  
  /**
   * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
   * @tsk1: pointer to task_struct of some task.
   * @tsk2: pointer to task_struct of some other task.
   *
   * Description: Return true if @tsk1's mems_allowed intersects the
   * mems_allowed of @tsk2.  Used by the OOM killer to determine if
   * one of the task's memory usage might impact the memory available
   * to the other.
   **/
  int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  				   const struct task_struct *tsk2)
  {
  	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
  }
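
  /*
   * Editor's illustrative sketch, not part of the original file: the OOM
   * killer can skip candidates whose memory cannot relieve current's
   * shortage using a check of this shape.  example_oom_skip() is
   * hypothetical and simplified.
   */
  #if 0
  static int example_oom_skip(struct task_struct *victim)
  {
  	/* killing victim frees nothing that current can use */
  	return !cpuset_mems_allowed_intersects(current, victim);
  }
  #endif
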
  /**
   * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
   * @tsk: pointer to task_struct of some task.
   *
   * Description: Prints @task's name, cpuset name, and cached copy of its
   * mems_allowed to the kernel log.  Must hold task_lock(task) to allow
   * dereferencing task_cs(task).
   */
  void cpuset_print_task_mems_allowed(struct task_struct *tsk)
  {
  	struct dentry *dentry;
  
  	dentry = task_cs(tsk)->css.cgroup->dentry;
  	spin_lock(&cpuset_buffer_lock);
  	snprintf(cpuset_name, CPUSET_NAME_LEN,
  		 dentry ? (const char *)dentry->d_name.name : "/");
  	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
  			   tsk->mems_allowed);
  	printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n",
  	       tsk->comm, cpuset_name, cpuset_nodelist);
  	spin_unlock(&cpuset_buffer_lock);
  }
  /*
   * Collection of memory_pressure is suppressed unless
   * this flag is enabled by writing "1" to the special
   * cpuset file 'memory_pressure_enabled' in the root cpuset.
   */
  int cpuset_memory_pressure_enabled __read_mostly;
  
  /**
   * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
   *
   * Keep a running average of the rate of synchronous (direct)
   * page reclaim efforts initiated by tasks in each cpuset.
   *
   * This represents the rate at which some task in the cpuset
   * ran low on memory on all nodes it was allowed to use, and
   * had to enter the kernel's page reclaim code in an effort to
   * create more free memory by tossing clean pages or swapping
   * or writing dirty pages.
   *
   * Display to user space in the per-cpuset read-only file
   * "memory_pressure".  Value displayed is an integer
   * representing the recent rate of entry into the synchronous
   * (direct) page reclaim by any task attached to the cpuset.
   **/
  
  void __cpuset_memory_pressure_bump(void)
  {
  	task_lock(current);
  	fmeter_markevent(&task_cs(current)->fmeter);
  	task_unlock(current);
  }
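
  /*
   * Editor's illustrative sketch, not part of the original file: the
   * double-underscore function above is meant to sit behind a cheap
   * flag test at the direct reclaim call site, along these lines (the
   * wrapper in include/linux/cpuset.h is assumed to have this shape):
   */
  #if 0
  #define example_memory_pressure_bump()				\
  	do {							\
  		if (cpuset_memory_pressure_enabled)		\
  			__cpuset_memory_pressure_bump();	\
  	} while (0)
  #endif
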
  #ifdef CONFIG_PROC_PID_CPUSET
  /*
   * proc_cpuset_show()
   *  - Print task's cpuset path into seq_file.
   *  - Used for /proc/<pid>/cpuset.
   *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
   *    doesn't really matter if tsk->cpuset changes after we read it,
   *    and we take cgroup_mutex, keeping cpuset_attach() from changing it
   *    anyway.
   */
  static int proc_cpuset_show(struct seq_file *m, void *unused_v)
  {
  	struct pid *pid;
  	struct task_struct *tsk;
  	char *buf;
  	struct cgroup_subsys_state *css;
  	int retval;

  	retval = -ENOMEM;
  	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
  	if (!buf)
  		goto out;
  
  	retval = -ESRCH;
  	pid = m->private;
  	tsk = get_pid_task(pid, PIDTYPE_PID);
  	if (!tsk)
  		goto out_free;
  	retval = -EINVAL;
  	cgroup_lock();
  	css = task_subsys_state(tsk, cpuset_subsys_id);
  	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
  	if (retval < 0)
  		goto out_unlock;
  	seq_puts(m, buf);
  	seq_putc(m, '\n');
  out_unlock:
  	cgroup_unlock();
  	put_task_struct(tsk);
  out_free:
  	kfree(buf);
  out:
  	return retval;
  }
  
  static int cpuset_open(struct inode *inode, struct file *file)
  {
  	struct pid *pid = PROC_I(inode)->pid;
  	return single_open(file, proc_cpuset_show, pid);
  }
  const struct file_operations proc_cpuset_operations = {
  	.open		= cpuset_open,
  	.read		= seq_read,
  	.llseek		= seq_lseek,
  	.release	= single_release,
  };
  #endif /* CONFIG_PROC_PID_CPUSET */
  /* Display task mems_allowed in /proc/<pid>/status file. */
  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
  {
  	seq_printf(m, "Mems_allowed:\t");
  	seq_nodemask(m, &task->mems_allowed);
  	seq_printf(m, "\n");
  	seq_printf(m, "Mems_allowed_list:\t");
  	seq_nodemask_list(m, &task->mems_allowed);
  	seq_printf(m, "\n");
  }