Blame view

kernel/cpuset.c 75.7 KB
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1
2
3
4
5
6
  /*
   *  kernel/cpuset.c
   *
   *  Processor and Memory placement constraints for sets of tasks.
   *
   *  Copyright (C) 2003 BULL SA.
029190c51   Paul Jackson   cpuset sched_load...
7
   *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
8793d854e   Paul Menage   Task Control Grou...
8
   *  Copyright (C) 2006 Google, Inc
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
9
10
11
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
12
   *
825a46af5   Paul Jackson   [PATCH] cpuset me...
13
   *  2003-10-10 Written by Simon Derr.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
14
   *  2003-10-22 Updates by Stephen Hemminger.
825a46af5   Paul Jackson   [PATCH] cpuset me...
15
   *  2004 May-July Rework by Paul Jackson.
8793d854e   Paul Menage   Task Control Grou...
16
   *  2006 Rework by Paul Menage to use generic cgroups
cf417141c   Max Krasnyansky   sched, cpuset: re...
17
18
   *  2008 Rework of the scheduler domains and CPU hotplug handling
   *       by Max Krasnyansky
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
19
20
21
22
23
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
24
25
26
27
28
29
30
31
32
33
34
35
  #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/cpuset.h>
  #include <linux/err.h>
  #include <linux/errno.h>
  #include <linux/file.h>
  #include <linux/fs.h>
  #include <linux/init.h>
  #include <linux/interrupt.h>
  #include <linux/kernel.h>
  #include <linux/kmod.h>
  #include <linux/list.h>
68860ec10   Paul Jackson   [PATCH] cpusets: ...
36
  #include <linux/mempolicy.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
37
  #include <linux/mm.h>
f481891fd   Miao Xie   cpuset: update to...
38
  #include <linux/memory.h>
9984de1a5   Paul Gortmaker   kernel: Map most ...
39
  #include <linux/export.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
40
41
42
43
  #include <linux/mount.h>
  #include <linux/namei.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
6b9c2603c   Paul Jackson   [PATCH] cpuset: u...
44
  #include <linux/rcupdate.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
45
46
  #include <linux/sched.h>
  #include <linux/seq_file.h>
22fb52dd7   David Quigley   [PATCH] SELinux: ...
47
  #include <linux/security.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
48
  #include <linux/slab.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
49
50
51
52
53
54
55
56
  #include <linux/spinlock.h>
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/time.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>
  
  #include <asm/uaccess.h>
60063497a   Arun Sharma   atomic: use <linu...
57
  #include <linux/atomic.h>
3d3f26a7b   Ingo Molnar   [PATCH] kernel/cp...
58
  #include <linux/mutex.h>
956db3ca0   Cliff Wickman   hotplug cpu: move...
59
60
  #include <linux/workqueue.h>
  #include <linux/cgroup.h>
e44193d39   Li Zefan   cpuset: let hotpl...
61
  #include <linux/wait.h>
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
62

664eeddee   Mel Gorman   mm: page_alloc: u...
63
  struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
202f72d5d   Paul Jackson   [PATCH] cpuset: n...
64

3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
65
66
67
68
69
70
71
72
/* See "Frequency meter" comments, below. */

/* Exponentially-decaying event-rate meter (used for memory_pressure). */
struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
73
struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */

	/*
	 * On default hierarchy:
	 *
	 * The user-configured masks can only be changed by writing to
	 * cpuset.cpus and cpuset.mems, and won't be limited by the
	 * parent masks.
	 *
	 * The effective masks are the real masks that apply to the tasks
	 * in the cpuset. They may be changed if the configured masks are
	 * changed or hotplug happens.
	 *
	 * effective_mask == configured_mask & parent's effective_mask,
	 * and if it ends up empty, it will inherit the parent's mask.
	 *
	 *
	 * On legacy hierarchy:
	 *
	 * The user-configured masks are always the same as the effective
	 * masks.
	 */

	/* user-configured CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t cpus_allowed;
	nodemask_t mems_allowed;

	/* effective CPUs and Memory Nodes allowed to tasks */
	cpumask_var_t effective_cpus;
	nodemask_t effective_mems;

	/*
	 * This is the old Memory Nodes that tasks took on.
	 *
	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
	 * - A new cpuset's old_mems_allowed is initialized when some
	 *   task is moved into it.
	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
	 *   then old_mems_allowed is updated to mems_allowed.
	 */
	nodemask_t old_mems_allowed;

	struct fmeter fmeter;		/* memory_pressure filter */

	/*
	 * Tasks are being attached to this cpuset.  Used to prevent
	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
	 */
	int attach_in_progress;

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;
};
a7c6d554a   Tejun Heo   cgroup: add/updat...
128
  static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
8793d854e   Paul Menage   Task Control Grou...
129
  {
a7c6d554a   Tejun Heo   cgroup: add/updat...
130
  	return css ? container_of(css, struct cpuset, css) : NULL;
8793d854e   Paul Menage   Task Control Grou...
131
132
133
134
135
  }
  
  /* Retrieve the cpuset for a task */
  static inline struct cpuset *task_cs(struct task_struct *task)
  {
073219e99   Tejun Heo   cgroup: clean up ...
136
  	return css_cs(task_css(task, cpuset_cgrp_id));
8793d854e   Paul Menage   Task Control Grou...
137
  }
8793d854e   Paul Menage   Task Control Grou...
138

c9710d801   Tejun Heo   cpuset: drop "con...
139
  static inline struct cpuset *parent_cs(struct cpuset *cs)
c431069fe   Tejun Heo   cpuset: remove cp...
140
  {
5c9d535b8   Tejun Heo   cgroup: remove cs...
141
  	return css_cs(cs->css.parent);
c431069fe   Tejun Heo   cpuset: remove cp...
142
  }
b246272ec   David Rientjes   cpusets: stall wh...
143
144
145
146
147
148
149
150
151
152
153
  #ifdef CONFIG_NUMA
  static inline bool task_has_mempolicy(struct task_struct *task)
  {
  	return task->mempolicy;
  }
  #else
  static inline bool task_has_mempolicy(struct task_struct *task)
  {
  	return false;
  }
  #endif
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
154
155
/* bits in struct cpuset flags field; tested via the is_*() helpers below */
typedef enum {
	CS_ONLINE,		/* cpuset is online (see is_cpuset_online()) */
	CS_CPU_EXCLUSIVE,	/* exclusive use of its CPUs */
	CS_MEM_EXCLUSIVE,	/* exclusive use of its Memory Nodes */
	CS_MEM_HARDWALL,	/* hardwalled memory allocation */
	CS_MEMORY_MIGRATE,	/* migrate pages when mems change */
	CS_SCHED_LOAD_BALANCE,	/* load balance across its CPUs */
	CS_SPREAD_PAGE,		/* spread page cache across mems */
	CS_SPREAD_SLAB,		/* spread slab allocations across mems */
} cpuset_flagbits_t;
  
/* convenient tests for the bits above; each reads cs->flags via test_bit() */
static inline bool is_cpuset_online(const struct cpuset *cs)
{
	return test_bit(CS_ONLINE, &cs->flags);
}

static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
201
/*
 * The root cpuset.  Starts online and CPU/memory exclusive; all other
 * fields are zero-initialized.
 */
static struct cpuset top_cpuset = {
	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
		  (1 << CS_MEM_EXCLUSIVE)),
};
ae8086ce1   Tejun Heo   cpuset: introduce...
205
206
207
/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs.  Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
	css_for_each_child((pos_css), &(parent_cs)->css)		\
		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))
ae8086ce1   Tejun Heo   cpuset: introduce...
217

fc560a26a   Tejun Heo   cpuset: replace c...
218
219
220
/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs.  Must be used
 * with RCU read locked.  The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
 * iteration and the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
	css_for_each_descendant_pre((pos_css), &(root_cs)->css)		\
		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
fc560a26a   Tejun Heo   cpuset: replace c...
232

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
233
  /*
8447a0fee   Vladimir Davydov   cpuset: convert c...
234
235
236
237
   * There are two global locks guarding cpuset structures - cpuset_mutex and
   * callback_lock. We also require taking task_lock() when dereferencing a
   * task's cpuset pointer. See "The task_lock() exception", at the end of this
   * comment.
5d21cc2db   Tejun Heo   cpuset: replace c...
238
   *
8447a0fee   Vladimir Davydov   cpuset: convert c...
239
   * A task must hold both locks to modify cpusets.  If a task holds
5d21cc2db   Tejun Heo   cpuset: replace c...
240
   * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
8447a0fee   Vladimir Davydov   cpuset: convert c...
241
   * is the only task able to also acquire callback_lock and be able to
5d21cc2db   Tejun Heo   cpuset: replace c...
242
243
244
   * modify cpusets.  It can perform various checks on the cpuset structure
   * first, knowing nothing will change.  It can also allocate memory while
   * just holding cpuset_mutex.  While it is performing these checks, various
8447a0fee   Vladimir Davydov   cpuset: convert c...
245
246
   * callback routines can briefly acquire callback_lock to query cpusets.
   * Once it is ready to make the changes, it takes callback_lock, blocking
5d21cc2db   Tejun Heo   cpuset: replace c...
247
   * everyone else.
053199edf   Paul Jackson   [PATCH] cpusets: ...
248
249
   *
   * Calls to the kernel memory allocator can not be made while holding
8447a0fee   Vladimir Davydov   cpuset: convert c...
250
   * callback_lock, as that would risk double tripping on callback_lock
053199edf   Paul Jackson   [PATCH] cpusets: ...
251
252
253
   * from one of the callbacks into the cpuset code from within
   * __alloc_pages().
   *
8447a0fee   Vladimir Davydov   cpuset: convert c...
254
   * If a task is only holding callback_lock, then it has read-only
053199edf   Paul Jackson   [PATCH] cpusets: ...
255
256
   * access to cpusets.
   *
58568d2a8   Miao Xie   cpuset,mm: update...
257
258
259
   * Now, the task_struct fields mems_allowed and mempolicy may be changed
   * by other task, we use alloc_lock in the task_struct fields to protect
   * them.
053199edf   Paul Jackson   [PATCH] cpusets: ...
260
   *
8447a0fee   Vladimir Davydov   cpuset: convert c...
261
   * The cpuset_common_file_read() handlers only hold callback_lock across
053199edf   Paul Jackson   [PATCH] cpusets: ...
262
263
264
   * small pieces of code, such as when reading out possibly multi-word
   * cpumasks and nodemasks.
   *
2df167a30   Paul Menage   cgroups: update c...
265
266
   * Accessing a task's cpuset should be done in accordance with the
   * guidelines for accessing subsystem state in kernel/cgroup.c
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
267
   */
5d21cc2db   Tejun Heo   cpuset: replace c...
268
  static DEFINE_MUTEX(cpuset_mutex);
8447a0fee   Vladimir Davydov   cpuset: convert c...
269
  static DEFINE_SPINLOCK(callback_lock);
4247bdc60   Paul Jackson   [PATCH] cpuset se...
270

cf417141c   Max Krasnyansky   sched, cpuset: re...
271
  /*
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
272
273
274
   * CPU / memory hotplug is handled asynchronously.
   */
  static void cpuset_hotplug_workfn(struct work_struct *work);
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
275
  static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
e44193d39   Li Zefan   cpuset: let hotpl...
276
  static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
277
/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static struct dentry *cpuset_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name, void *data)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	struct dentry *ret = ERR_PTR(-ENODEV);

	if (cgroup_fs) {
		/* remap the mount onto the cgroup fs with cpuset options */
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->mount(cgroup_fs, flags,
					   unused_dev_name, mountopts);
		/* drop the reference taken by get_fs_type() */
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.mount = cpuset_mount,
};
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
302
/*
 * Return in pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  The top
 * cpuset always has some cpus online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
	/* walk up until an ancestor has at least one online effective CPU */
	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
		cs = parent_cs(cs);
	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
  
/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
	/* walk up until an ancestor has at least one node with memory */
	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
		cs = parent_cs(cs);
	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
f3b39d47e   Miao Xie   cpusets: restruct...
337
338
339
/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void cpuset_update_task_spread_flag(struct cpuset *cs,
					struct task_struct *tsk)
{
	if (is_spread_page(cs))
		task_set_spread_page(tsk);
	else
		task_clear_spread_page(tsk);
	if (is_spread_slab(cs))
		task_set_spread_slab(tsk);
	else
		task_clear_spread_slab(tsk);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
354
355
356
357
358
/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}
645fcc9d2   Li Zefan   cpuset: don't all...
369
370
371
372
/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 *
 * Returns a heap copy of @cs with its own cpus_allowed/effective_cpus
 * masks, or NULL on allocation failure.  Caller owns the result and
 * must release it with free_trial_cpuset().
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{
	struct cpuset *trial;

	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
	if (!trial)
		return NULL;

	/* the cpumask_var_t fields need their own allocations */
	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
		goto free_cs;
	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
		goto free_cpus;

	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
	return trial;

free_cpus:
	free_cpumask_var(trial->cpus_allowed);
free_cs:
	kfree(trial);
	return NULL;
}
  
/**
 * free_trial_cpuset - free the trial cpuset
 * @trial: the trial cpuset to be freed
 *
 * Releases the masks allocated by alloc_trial_cpuset() and then @trial
 * itself.
 */
static void free_trial_cpuset(struct cpuset *trial)
{
	free_cpumask_var(trial->effective_cpus);
	free_cpumask_var(trial->cpus_allowed);
	kfree(trial);
}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
406
407
408
409
410
411
412
/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
	struct cgroup_subsys_state *css;
	struct cpuset *c, *par;
	int ret;

	/* the child iterations below require RCU read lock */
	rcu_read_lock();

	/* Each of our child cpusets must be a subset of us */
	ret = -EBUSY;
	cpuset_for_each_child(c, css, cur)
		if (!is_cpuset_subset(c, trial))
			goto out;

	/* Remaining checks don't apply to root cpuset */
	ret = 0;
	if (cur == &top_cpuset)
		goto out;

	par = parent_cs(cur);

	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
	ret = -EACCES;
	if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
		goto out;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	ret = -EINVAL;
	cpuset_for_each_child(c, css, par) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
			goto out;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			goto out;
	}

	/*
	 * Cpusets with tasks - existing or newly being attached - can't
	 * be changed to have empty cpus_allowed or mems_allowed.
	 */
	ret = -ENOSPC;
	if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
		if (!cpumask_empty(cur->cpus_allowed) &&
		    cpumask_empty(trial->cpus_allowed))
			goto out;
		if (!nodes_empty(cur->mems_allowed) &&
		    nodes_empty(trial->mems_allowed))
			goto out;
	}

	/*
	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
	 * tasks.
	 */
	ret = -EBUSY;
	if (is_cpu_exclusive(cur) &&
	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
				       trial->cpus_allowed))
		goto out;

	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}
db7f47cf4   Paul Menage   cpusets: allow cp...
494
  #ifdef CONFIG_SMP
85d7b9498   Dinakar Guniguntala   [PATCH] Dynamic s...
495
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
503
504
505
  static void
  update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  {
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
506
507
508
509
  	if (dattr->relax_domain_level < c->relax_domain_level)
  		dattr->relax_domain_level = c->relax_domain_level;
  	return;
  }
fc560a26a   Tejun Heo   cpuset: replace c...
510
511
/*
 * Walk the online descendants of @root_cs and fold the
 * relax_domain_level of every load-balanced cpuset into @dattr.
 * Subtrees whose root has an empty cpus_allowed are skipped entirely.
 */
static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
		/* the pre-order walk includes @root_cs itself; skip it */
		if (cp == root_cs)
			continue;

		/* skip the whole subtree if @cp doesn't have any CPU */
		if (cpumask_empty(cp->cpus_allowed)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);
	}
	rcu_read_unlock();
}
029190c51   Paul Jackson   cpuset sched_load...
531
  /*
cf417141c   Max Krasnyansky   sched, cpuset: re...
532
533
534
535
536
   * generate_sched_domains()
   *
   * This function builds a partial partition of the systems CPUs
   * A 'partial partition' is a set of non-overlapping subsets whose
   * union is a subset of that set.
0a0fca9d8   Viresh Kumar   sched: Rename sch...
537
   * The output of this function needs to be passed to kernel/sched/core.c
cf417141c   Max Krasnyansky   sched, cpuset: re...
538
539
540
   * partition_sched_domains() routine, which will rebuild the scheduler's
   * load balancing domains (sched domains) as specified by that partial
   * partition.
029190c51   Paul Jackson   cpuset sched_load...
541
   *
45ce80fb6   Li Zefan   cgroups: consolid...
542
   * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
029190c51   Paul Jackson   cpuset sched_load...
543
544
545
546
547
548
549
   * for a background explanation of this.
   *
   * Does not return errors, on the theory that the callers of this
   * routine would rather not worry about failures to rebuild sched
   * domains when operating in the severe memory shortage situations
   * that could cause allocation failures below.
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
550
   * Must be called with cpuset_mutex held.
029190c51   Paul Jackson   cpuset sched_load...
551
552
   *
 * The two key local variables below are filled by a top-down scan of
 * all cpusets (see cpuset_for_each_descendant_pre() in the code below).
 * This scan loads a pointer to each cpuset marked is_sched_load_balance
 * into the array 'csa'.  For our purposes, rebuilding the schedulers
 * sched domains, we can ignore !is_sched_load_balance cpusets.
   *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
   *	   that need to be load balanced, for convenient iterative
   *	   access by the subsequent code that finds the best partition,
   *	   i.e the set of domains (subsets) of CPUs such that the
   *	   cpus_allowed of every cpuset marked is_sched_load_balance
   *	   is a subset of one of these domains, while there are as
   *	   many such domains as possible, each as small as possible.
   * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
0a0fca9d8   Viresh Kumar   sched: Rename sch...
566
   *	   the kernel/sched/core.c routine partition_sched_domains() in a
029190c51   Paul Jackson   cpuset sched_load...
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
   *	   convenient format, that can be easily compared to the prior
   *	   value to determine what partition elements (sched domains)
   *	   were changed (added or removed.)
   *
   * Finding the best partition (set of domains):
   *	The triple nested loops below over i, j, k scan over the
   *	load balanced cpusets (using the array of cpuset pointers in
   *	csa[]) looking for pairs of cpusets that have overlapping
   *	cpus_allowed, but which don't have the same 'pn' partition
   *	number and gives them in the same partition number.  It keeps
   *	looping on the 'restart' label until it can no longer find
   *	any such pairs.
   *
   *	The union of the cpus_allowed masks from the set of
   *	all cpusets having the same 'pn' value then form the one
   *	element of the partition (one sched domain) to be passed to
   *	partition_sched_domains().
   */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
585
  static int generate_sched_domains(cpumask_var_t **domains,
cf417141c   Max Krasnyansky   sched, cpuset: re...
586
  			struct sched_domain_attr **attributes)
029190c51   Paul Jackson   cpuset sched_load...
587
  {
029190c51   Paul Jackson   cpuset sched_load...
588
589
590
591
  	struct cpuset *cp;	/* scans q */
  	struct cpuset **csa;	/* array of all cpuset ptrs */
  	int csn;		/* how many cpuset ptrs in csa so far */
  	int i, j, k;		/* indices for partition finding loops */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
592
  	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
593
  	struct sched_domain_attr *dattr;  /* attributes for custom domains */
1583715dd   Ingo Molnar   sched, cpusets: f...
594
  	int ndoms = 0;		/* number of sched domains in result */
6af866af3   Li Zefan   cpuset: remove re...
595
  	int nslot;		/* next empty doms[] struct cpumask slot */
492eb21b9   Tejun Heo   cgroup: make hier...
596
  	struct cgroup_subsys_state *pos_css;
029190c51   Paul Jackson   cpuset sched_load...
597

029190c51   Paul Jackson   cpuset sched_load...
598
  	doms = NULL;
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
599
  	dattr = NULL;
cf417141c   Max Krasnyansky   sched, cpuset: re...
600
  	csa = NULL;
029190c51   Paul Jackson   cpuset sched_load...
601
602
603
  
  	/* Special case for the 99% of systems with one, full, sched domain */
  	if (is_sched_load_balance(&top_cpuset)) {
acc3f5d7c   Rusty Russell   cpumask: Partitio...
604
605
  		ndoms = 1;
  		doms = alloc_sched_domains(ndoms);
029190c51   Paul Jackson   cpuset sched_load...
606
  		if (!doms)
cf417141c   Max Krasnyansky   sched, cpuset: re...
607
  			goto done;
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
608
609
610
  		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
  		if (dattr) {
  			*dattr = SD_ATTR_INIT;
93a655755   Li Zefan   cpuset: fix wrong...
611
  			update_domain_attr_tree(dattr, &top_cpuset);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
612
  		}
8b5f1c52d   Li Zefan   cpuset: use effec...
613
  		cpumask_copy(doms[0], top_cpuset.effective_cpus);
cf417141c   Max Krasnyansky   sched, cpuset: re...
614

cf417141c   Max Krasnyansky   sched, cpuset: re...
615
  		goto done;
029190c51   Paul Jackson   cpuset sched_load...
616
  	}
664eeddee   Mel Gorman   mm: page_alloc: u...
617
  	csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
029190c51   Paul Jackson   cpuset sched_load...
618
619
620
  	if (!csa)
  		goto done;
  	csn = 0;
fc560a26a   Tejun Heo   cpuset: replace c...
621
  	rcu_read_lock();
492eb21b9   Tejun Heo   cgroup: make hier...
622
  	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
bd8815a6d   Tejun Heo   cgroup: make css_...
623
624
  		if (cp == &top_cpuset)
  			continue;
f5393693e   Lai Jiangshan   cpuset: speed up ...
625
  		/*
fc560a26a   Tejun Heo   cpuset: replace c...
626
627
628
629
630
631
  		 * Continue traversing beyond @cp iff @cp has some CPUs and
  		 * isn't load balancing.  The former is obvious.  The
  		 * latter: All child cpusets contain a subset of the
  		 * parent's cpus, so just skip them, and then we call
  		 * update_domain_attr_tree() to calc relax_domain_level of
  		 * the corresponding sched domain.
f5393693e   Lai Jiangshan   cpuset: speed up ...
632
  		 */
fc560a26a   Tejun Heo   cpuset: replace c...
633
634
  		if (!cpumask_empty(cp->cpus_allowed) &&
  		    !is_sched_load_balance(cp))
f5393693e   Lai Jiangshan   cpuset: speed up ...
635
  			continue;
489a5393a   Lai Jiangshan   cpuset: don't pas...
636

fc560a26a   Tejun Heo   cpuset: replace c...
637
638
639
640
  		if (is_sched_load_balance(cp))
  			csa[csn++] = cp;
  
  		/* skip @cp's subtree */
492eb21b9   Tejun Heo   cgroup: make hier...
641
  		pos_css = css_rightmost_descendant(pos_css);
fc560a26a   Tejun Heo   cpuset: replace c...
642
643
  	}
  	rcu_read_unlock();
029190c51   Paul Jackson   cpuset sched_load...
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
  
  	for (i = 0; i < csn; i++)
  		csa[i]->pn = i;
  	ndoms = csn;
  
  restart:
  	/* Find the best partition (set of sched domains) */
  	for (i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		int apn = a->pn;
  
  		for (j = 0; j < csn; j++) {
  			struct cpuset *b = csa[j];
  			int bpn = b->pn;
  
  			if (apn != bpn && cpusets_overlap(a, b)) {
  				for (k = 0; k < csn; k++) {
  					struct cpuset *c = csa[k];
  
  					if (c->pn == bpn)
  						c->pn = apn;
  				}
  				ndoms--;	/* one less element */
  				goto restart;
  			}
  		}
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
671
672
673
674
  	/*
  	 * Now we know how many domains to create.
  	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
  	 */
acc3f5d7c   Rusty Russell   cpumask: Partitio...
675
  	doms = alloc_sched_domains(ndoms);
700018e0a   Li Zefan   cpuset: fix regre...
676
  	if (!doms)
cf417141c   Max Krasnyansky   sched, cpuset: re...
677
  		goto done;
cf417141c   Max Krasnyansky   sched, cpuset: re...
678
679
680
681
682
  
  	/*
  	 * The rest of the code, including the scheduler, can deal with
  	 * dattr==NULL case. No need to abort if alloc fails.
  	 */
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
683
  	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
029190c51   Paul Jackson   cpuset sched_load...
684
685
686
  
  	for (nslot = 0, i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
6af866af3   Li Zefan   cpuset: remove re...
687
  		struct cpumask *dp;
029190c51   Paul Jackson   cpuset sched_load...
688
  		int apn = a->pn;
cf417141c   Max Krasnyansky   sched, cpuset: re...
689
690
691
692
  		if (apn < 0) {
  			/* Skip completed partitions */
  			continue;
  		}
acc3f5d7c   Rusty Russell   cpumask: Partitio...
693
  		dp = doms[nslot];
cf417141c   Max Krasnyansky   sched, cpuset: re...
694
695
696
697
  
  		if (nslot == ndoms) {
  			static int warnings = 10;
  			if (warnings) {
12d3089c1   Fabian Frederick   kernel/cpuset.c: ...
698
699
700
  				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d
  ",
  					nslot, ndoms, csn, i, apn);
cf417141c   Max Krasnyansky   sched, cpuset: re...
701
  				warnings--;
029190c51   Paul Jackson   cpuset sched_load...
702
  			}
cf417141c   Max Krasnyansky   sched, cpuset: re...
703
704
  			continue;
  		}
029190c51   Paul Jackson   cpuset sched_load...
705

6af866af3   Li Zefan   cpuset: remove re...
706
  		cpumask_clear(dp);
cf417141c   Max Krasnyansky   sched, cpuset: re...
707
708
709
710
711
712
  		if (dattr)
  			*(dattr + nslot) = SD_ATTR_INIT;
  		for (j = i; j < csn; j++) {
  			struct cpuset *b = csa[j];
  
  			if (apn == b->pn) {
8b5f1c52d   Li Zefan   cpuset: use effec...
713
  				cpumask_or(dp, dp, b->effective_cpus);
cf417141c   Max Krasnyansky   sched, cpuset: re...
714
715
716
717
718
  				if (dattr)
  					update_domain_attr_tree(dattr + nslot, b);
  
  				/* Done with this partition */
  				b->pn = -1;
029190c51   Paul Jackson   cpuset sched_load...
719
  			}
029190c51   Paul Jackson   cpuset sched_load...
720
  		}
cf417141c   Max Krasnyansky   sched, cpuset: re...
721
  		nslot++;
029190c51   Paul Jackson   cpuset sched_load...
722
723
  	}
  	BUG_ON(nslot != ndoms);
cf417141c   Max Krasnyansky   sched, cpuset: re...
724
725
  done:
  	kfree(csa);
700018e0a   Li Zefan   cpuset: fix regre...
726
727
728
729
730
731
  	/*
  	 * Fallback to the default domain if kmalloc() failed.
  	 * See comments in partition_sched_domains().
  	 */
  	if (doms == NULL)
  		ndoms = 1;
cf417141c   Max Krasnyansky   sched, cpuset: re...
732
733
734
735
736
737
738
739
  	*domains    = doms;
  	*attributes = dattr;
  	return ndoms;
  }
  
  /*
   * Rebuild scheduler domains.
   *
699140ba8   Tejun Heo   cpuset: drop asyn...
740
741
742
743
744
   * If the flag 'sched_load_balance' of any cpuset with non-empty
   * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
   * which has that flag enabled, or if any cpuset with a non-empty
   * 'cpus' is removed, then call this routine to rebuild the
   * scheduler's dynamic sched domains.
cf417141c   Max Krasnyansky   sched, cpuset: re...
745
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
746
   * Call with cpuset_mutex held.  Takes get_online_cpus().
cf417141c   Max Krasnyansky   sched, cpuset: re...
747
   */
699140ba8   Tejun Heo   cpuset: drop asyn...
748
  static void rebuild_sched_domains_locked(void)
cf417141c   Max Krasnyansky   sched, cpuset: re...
749
750
  {
  	struct sched_domain_attr *attr;
acc3f5d7c   Rusty Russell   cpumask: Partitio...
751
  	cpumask_var_t *doms;
cf417141c   Max Krasnyansky   sched, cpuset: re...
752
  	int ndoms;
5d21cc2db   Tejun Heo   cpuset: replace c...
753
  	lockdep_assert_held(&cpuset_mutex);
86ef5c9a8   Gautham R Shenoy   cpu-hotplug: repl...
754
  	get_online_cpus();
cf417141c   Max Krasnyansky   sched, cpuset: re...
755

5b16c2a49   Li Zefan   cpuset: fix cpu h...
756
757
758
759
760
  	/*
  	 * We have raced with CPU hotplug. Don't do anything to avoid
  	 * passing doms with offlined cpu to partition_sched_domains().
  	 * Anyways, hotplug work item will rebuild sched domains.
  	 */
8b5f1c52d   Li Zefan   cpuset: use effec...
761
  	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
5b16c2a49   Li Zefan   cpuset: fix cpu h...
762
  		goto out;
cf417141c   Max Krasnyansky   sched, cpuset: re...
763
  	/* Generate domain masks and attrs */
cf417141c   Max Krasnyansky   sched, cpuset: re...
764
  	ndoms = generate_sched_domains(&doms, &attr);
cf417141c   Max Krasnyansky   sched, cpuset: re...
765
766
767
  
  	/* Have scheduler rebuild the domains */
  	partition_sched_domains(ndoms, doms, attr);
5b16c2a49   Li Zefan   cpuset: fix cpu h...
768
  out:
86ef5c9a8   Gautham R Shenoy   cpu-hotplug: repl...
769
  	put_online_cpus();
cf417141c   Max Krasnyansky   sched, cpuset: re...
770
  }
#else /* !CONFIG_SMP */
/* Without SMP there are no sched domains, so rebuilding is a no-op. */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */
029190c51   Paul Jackson   cpuset sched_load...
776

/*
 * rebuild_sched_domains - unlocked entry point for rebuilding sched domains
 *
 * Grabs cpuset_mutex and then defers to rebuild_sched_domains_locked().
 */
void rebuild_sched_domains(void)
{
	mutex_lock(&cpuset_mutex);
	rebuild_sched_domains_locked();
	mutex_unlock(&cpuset_mutex);
}
58f4790b7   Cliff Wickman   cpusets: update_c...
783
/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_cpumask(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		/* apply the effective (not the configured) mask to each task */
		set_cpus_allowed_ptr(task, cs->effective_cpus);
	css_task_iter_end(&it);
}
5c5cc6232   Li Zefan   cpuset: allow to ...
801
/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_cpus: temp variable for calculating new effective_cpus
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;
	bool need_rebuild_sched_domains = false;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		/* effective = configured, restricted by the parent's effective */
		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some CPUs.
		 */
		if (cpumask_empty(new_cpus))
			cpumask_copy(new_cpus, parent->effective_cpus);

		/* Skip the whole subtree if the cpumask remains the same. */
		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		/* pin @cp so it stays valid while we drop the RCU read lock */
		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		/* publish the new effective mask under callback_lock */
		spin_lock_irq(&callback_lock);
		cpumask_copy(cp->effective_cpus, new_cpus);
		spin_unlock_irq(&callback_lock);

		/* on legacy hierarchy the two masks must always agree */
		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));

		update_tasks_cpumask(cp);

		/*
		 * If the effective cpumask of any non-empty cpuset is changed,
		 * we need to rebuild sched domains.
		 */
		if (!cpumask_empty(cp->cpus_allowed) &&
		    is_sched_load_balance(cp))
			need_rebuild_sched_domains = true;

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();

	if (need_rebuild_sched_domains)
		rebuild_sched_domains_locked();
}
0b2f630a2   Miao Xie   cpusets: restruct...
862
/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * Return: 0 on success, -errno (-EACCES, -EINVAL, or a cpulist_parse() /
 * validate_change() error) on failure.
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{
	int retval;

	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpumask_clear(trialcs->cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs->cpus_allowed);
		if (retval < 0)
			return retval;

		/* the request must stay within the top cpuset's CPUs */
		if (!cpumask_subset(trialcs->cpus_allowed,
				    top_cpuset.cpus_allowed))
			return -EINVAL;
	}

	/* Nothing to do if the cpus didn't change */
	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
		return 0;

	retval = validate_change(cs, trialcs);
	if (retval < 0)
		return retval;

	/* publish the new configured mask under callback_lock */
	spin_lock_irq(&callback_lock);
	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
	spin_unlock_irq(&callback_lock);

	/* use trialcs->cpus_allowed as a temp variable */
	update_cpumasks_hier(cs, trialcs->cpus_allowed);
	return 0;
}
053199edf   Paul Jackson   [PATCH] cpusets: ...
909
/*
 * cpuset_migrate_mm
 *
 *    Migrate memory region from one set of nodes to another.
 *
 *    Temporarily set tasks mems_allowed to target nodes of migration,
 *    so that the migration code can allocate pages on these nodes.
 *
 *    While the mm_struct we are migrating is typically from some
 *    other task, the task_struct mems_allowed that we are hacking
 *    is for our current task, which must allocate new pages for that
 *    migrating memory region.
 */

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	/* let the migration code allocate on the destination nodes */
	tsk->mems_allowed = *to;

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	/* restore current's mems_allowed from its own cpuset */
	rcu_read_lock();
	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
	rcu_read_unlock();
}
3b6766fe6   Li Zefan   cpuset: rewrite u...
934
/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
 * we structure updates as setting all new allowed nodes, then clearing newly
 * disallowed ones.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{
	bool need_loop;

	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return;
	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return;

	task_lock(tsk);
	/*
	 * Determine if a loop is necessary if another thread is doing
	 * read_mems_allowed_begin().  If at least one node remains unchanged and
	 * tsk does not have a mempolicy, then an empty nodemask will not be
	 * possible when mems_allowed is larger than a word.
	 */
	need_loop = task_has_mempolicy(tsk) ||
			!nodes_intersects(*newmems, tsk->mems_allowed);

	/* make concurrent readers retry across the non-atomic update */
	if (need_loop) {
		local_irq_disable();
		write_seqcount_begin(&tsk->mems_allowed_seq);
	}

	/* grow first (STEP1) so the mask is never transiently empty... */
	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);

	/* ...then shrink to exactly the new mask (STEP2) */
	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
	tsk->mems_allowed = *newmems;

	if (need_loop) {
		write_seqcount_end(&tsk->mems_allowed_seq);
		local_irq_enable();
	}

	task_unlock(tsk);
}
8793d854e   Paul Menage   Task Control Grou...
985
  static void *cpuset_being_rebound;
0b2f630a2   Miao Xie   cpusets: restruct...
986
987
988
/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's.  As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{
	static nodemask_t newmems;	/* protected by cpuset_mutex */
	struct css_task_iter it;
	struct task_struct *task;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	guarantee_online_mems(cs, &newmems);

	/*
	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
	 * take while holding tasklist_lock.  Forks can happen - the
	 * mpol_dup() cpuset_being_rebound check will catch such forks,
	 * and rebind their vma mempolicies too.  Because we still hold
	 * the global cpuset_mutex, we know that no other rebind effort
	 * will be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it))) {
		struct mm_struct *mm;
		bool migrate;

		cpuset_change_task_nodemask(task, &newmems);

		mm = get_task_mm(task);
		if (!mm)
			continue;	/* kernel thread or exiting task */

		migrate = is_memory_migrate(cs);

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
		mmput(mm);
	}
	css_task_iter_end(&it);

	/*
	 * All the tasks' nodemasks have been updated, update
	 * cs->old_mems_allowed.
	 */
	cs->old_mems_allowed = newmems;

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	cpuset_being_rebound = NULL;
}
0b2f630a2   Miao Xie   cpusets: restruct...
1042
/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When configured nodemask is changed, the effective nodemasks of this cpuset
 * and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same with mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
	struct cpuset *cp;
	struct cgroup_subsys_state *pos_css;

	rcu_read_lock();
	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
		struct cpuset *parent = parent_cs(cp);

		/* effective = configured, restricted by the parent's effective */
		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);

		/*
		 * If it becomes empty, inherit the effective mask of the
		 * parent, which is guaranteed to have some MEMs.
		 */
		if (nodes_empty(*new_mems))
			*new_mems = parent->effective_mems;

		/* Skip the whole subtree if the nodemask remains the same. */
		if (nodes_equal(*new_mems, cp->effective_mems)) {
			pos_css = css_rightmost_descendant(pos_css);
			continue;
		}

		/* pin @cp so it stays valid while we drop the RCU read lock */
		if (!css_tryget_online(&cp->css))
			continue;
		rcu_read_unlock();

		/* publish the new effective mask under callback_lock */
		spin_lock_irq(&callback_lock);
		cp->effective_mems = *new_mems;
		spin_unlock_irq(&callback_lock);

		/* on legacy hierarchy the two masks must always agree */
		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
			!nodes_equal(cp->mems_allowed, cp->effective_mems));

		update_tasks_nodemask(cp);

		rcu_read_lock();
		css_put(&cp->css);
	}
	rcu_read_unlock();
}
  
  /*
0b2f630a2   Miao Xie   cpusets: restruct...
1095
1096
   * Handle user request to change the 'mems' memory placement
   * of a cpuset.  Needs to validate the request, update the
58568d2a8   Miao Xie   cpuset,mm: update...
1097
1098
1099
1100
   * cpusets mems_allowed, and for each task in the cpuset,
   * update mems_allowed and rebind task's mempolicy and any vma
   * mempolicies and if the cpuset is marked 'memory_migrate',
   * migrate the tasks pages to the new memory.
0b2f630a2   Miao Xie   cpusets: restruct...
1101
   *
8447a0fee   Vladimir Davydov   cpuset: convert c...
1102
   * Call with cpuset_mutex held. May take callback_lock during call.
0b2f630a2   Miao Xie   cpusets: restruct...
1103
1104
1105
1106
   * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
   * lock each such tasks mm->mmap_sem, scan its vma's and rebind
   * their mempolicies to the cpusets new mems_allowed.
   */
645fcc9d2   Li Zefan   cpuset: don't all...
1107
1108
  static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
  			   const char *buf)
0b2f630a2   Miao Xie   cpusets: restruct...
1109
  {
0b2f630a2   Miao Xie   cpusets: restruct...
1110
1111
1112
  	int retval;
  
  	/*
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
1113
  	 * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
0b2f630a2   Miao Xie   cpusets: restruct...
1114
1115
  	 * it's read-only
  	 */
53feb2976   Miao Xie   cpuset: alloc nod...
1116
1117
1118
1119
  	if (cs == &top_cpuset) {
  		retval = -EACCES;
  		goto done;
  	}
0b2f630a2   Miao Xie   cpusets: restruct...
1120

0b2f630a2   Miao Xie   cpusets: restruct...
1121
1122
1123
1124
1125
1126
1127
  	/*
  	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
  	 * Since nodelist_parse() fails on an empty mask, we special case
  	 * that parsing.  The validate_change() call ensures that cpusets
  	 * with tasks have memory.
  	 */
  	if (!*buf) {
645fcc9d2   Li Zefan   cpuset: don't all...
1128
  		nodes_clear(trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1129
  	} else {
645fcc9d2   Li Zefan   cpuset: don't all...
1130
  		retval = nodelist_parse(buf, trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1131
1132
  		if (retval < 0)
  			goto done;
645fcc9d2   Li Zefan   cpuset: don't all...
1133
  		if (!nodes_subset(trialcs->mems_allowed,
5d8ba82c3   Li Zefan   cpuset: allow wri...
1134
1135
  				  top_cpuset.mems_allowed)) {
  			retval = -EINVAL;
53feb2976   Miao Xie   cpuset: alloc nod...
1136
1137
  			goto done;
  		}
0b2f630a2   Miao Xie   cpusets: restruct...
1138
  	}
33ad801df   Li Zefan   cpuset: record ol...
1139
1140
  
  	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
0b2f630a2   Miao Xie   cpusets: restruct...
1141
1142
1143
  		retval = 0;		/* Too easy - nothing to do */
  		goto done;
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1144
  	retval = validate_change(cs, trialcs);
0b2f630a2   Miao Xie   cpusets: restruct...
1145
1146
  	if (retval < 0)
  		goto done;
8447a0fee   Vladimir Davydov   cpuset: convert c...
1147
  	spin_lock_irq(&callback_lock);
645fcc9d2   Li Zefan   cpuset: don't all...
1148
  	cs->mems_allowed = trialcs->mems_allowed;
8447a0fee   Vladimir Davydov   cpuset: convert c...
1149
  	spin_unlock_irq(&callback_lock);
0b2f630a2   Miao Xie   cpusets: restruct...
1150

734d45130   Li Zefan   cpuset: update cs...
1151
1152
  	/* use trialcs->mems_allowed as a temp variable */
  	update_nodemasks_hier(cs, &cs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1153
1154
1155
  done:
  	return retval;
  }
8793d854e   Paul Menage   Task Control Grou...
1156
1157
  int current_cpuset_is_being_rebound(void)
  {
391acf970   Gu Zheng   cpuset,mempolicy:...
1158
1159
1160
1161
1162
1163
1164
  	int ret;
  
  	rcu_read_lock();
  	ret = task_cs(current) == cpuset_being_rebound;
  	rcu_read_unlock();
  
  	return ret;
8793d854e   Paul Menage   Task Control Grou...
1165
  }
5be7a4792   Paul Menage   Fix cpuset sched_...
1166
  static int update_relax_domain_level(struct cpuset *cs, s64 val)
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1167
  {
db7f47cf4   Paul Menage   cpusets: allow cp...
1168
  #ifdef CONFIG_SMP
60495e776   Peter Zijlstra   sched: Dynamic sc...
1169
  	if (val < -1 || val >= sched_domain_level_max)
30e0e1781   Li Zefan   cpuset: limit the...
1170
  		return -EINVAL;
db7f47cf4   Paul Menage   cpusets: allow cp...
1171
  #endif
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1172
1173
1174
  
  	if (val != cs->relax_domain_level) {
  		cs->relax_domain_level = val;
300ed6cbb   Li Zefan   cpuset: convert c...
1175
1176
  		if (!cpumask_empty(cs->cpus_allowed) &&
  		    is_sched_load_balance(cs))
699140ba8   Tejun Heo   cpuset: drop asyn...
1177
  			rebuild_sched_domains_locked();
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1178
1179
1180
1181
  	}
  
  	return 0;
  }
72ec70299   Tejun Heo   cgroup: make task...
1182
  /**
950592f7b   Miao Xie   cpusets: update t...
1183
1184
   * update_tasks_flags - update the spread flags of tasks in the cpuset.
   * @cs: the cpuset in which each task's spread flags needs to be changed
950592f7b   Miao Xie   cpusets: update t...
1185
   *
d66393e54   Tejun Heo   cpuset: use css_t...
1186
1187
1188
   * Iterate through each task of @cs updating its spread flags.  As this
   * function is called with cpuset_mutex held, cpuset membership stays
   * stable.
950592f7b   Miao Xie   cpusets: update t...
1189
   */
d66393e54   Tejun Heo   cpuset: use css_t...
1190
static void update_tasks_flags(struct cpuset *cs)
{
	struct css_task_iter it;
	struct task_struct *task;

	/* Walk every task attached to @cs's css and refresh its spread flags. */
	css_task_iter_start(&cs->css, &it);
	while ((task = css_task_iter_next(&it)))
		cpuset_update_task_spread_flag(cs, task);
	css_task_iter_end(&it);
}
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1202
   * update_flag - read a 0 or a 1 in a file and update associated flag
786083667   Paul Menage   Cpuset hardwall f...
1203
1204
1205
   * bit:		the bit to update (see cpuset_flagbits_t)
   * cs:		the cpuset to update
   * turning_on: 	whether the flag is being set or cleared
053199edf   Paul Jackson   [PATCH] cpusets: ...
1206
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
1207
   * Call with cpuset_mutex held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1208
   */
700fe1ab9   Paul Menage   CGroup API files:...
1209
1210
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset *trialcs;		/* scratch copy used for validation */
	int balance_flag_changed;
	int spread_flag_changed;
	int err;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs)
		return -ENOMEM;

	/* Apply the requested change to the trial copy only. */
	if (turning_on)
		set_bit(bit, &trialcs->flags);
	else
		clear_bit(bit, &trialcs->flags);

	err = validate_change(cs, trialcs);
	if (err < 0)
		goto out;

	/* Record which transitions need follow-up work after commit. */
	balance_flag_changed = (is_sched_load_balance(cs) !=
				is_sched_load_balance(trialcs));

	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
			|| (is_spread_page(cs) != is_spread_page(trialcs)));

	/* Commit the validated flag word under callback_lock. */
	spin_lock_irq(&callback_lock);
	cs->flags = trialcs->flags;
	spin_unlock_irq(&callback_lock);

	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
		rebuild_sched_domains_locked();

	if (spread_flag_changed)
		update_tasks_flags(cs);
out:
	free_trial_cpuset(trialcs);
	return err;
}
053199edf   Paul Jackson   [PATCH] cpusets: ...
1247
  /*
80f7228b5   Adrian Bunk   typo fixes: occur...
1248
   * Frequency meter - How fast is some event occurring?
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
   *
   * These routines manage a digitally filtered, constant time based,
   * event frequency meter.  There are four routines:
   *   fmeter_init() - initialize a frequency meter.
   *   fmeter_markevent() - called each time the event happens.
   *   fmeter_getrate() - returns the recent rate of such events.
   *   fmeter_update() - internal routine used to update fmeter.
   *
   * A common data structure is passed to each of these routines,
   * which is used to keep track of the state required to manage the
   * frequency meter and its digital filter.
   *
   * The filter works on the number of events marked per unit time.
   * The filter is single-pole low-pass recursive (IIR).  The time unit
   * is 1 second.  Arithmetic is done using 32-bit integers scaled to
   * simulate 3 decimal digits of precision (multiplied by 1000).
   *
   * With an FM_COEF of 933, and a time base of 1 second, the filter
   * has a half-life of 10 seconds, meaning that if the events quit
   * happening, then the rate returned from the fmeter_getrate()
   * will be cut in half each 10 seconds, until it converges to zero.
   *
   * It is not worth doing a real infinitely recursive filter.  If more
   * than FM_MAXTICKS ticks have elapsed since the last filter event,
   * just compute FM_MAXTICKS ticks worth, by which point the level
   * will be stable.
   *
   * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
   * arithmetic overflow in the fmeter_update() routine.
   *
   * Given the simple 32 bit integer arithmetic used, this meter works
   * best for reporting rates between one per millisecond (msec) and
   * one per 32 (approx) seconds.  At constant rates faster than one
   * per msec it maxes out at values just under 1,000,000.  At constant
   * rates between one per msec, and one per second it will stabilize
   * to a value N*1000, where N is the rate of events per second.
   * At constant rates between one per second and one per 32 seconds,
   * it will be choppy, moving up on the seconds that have an event,
   * and then decaying until the next event.  At rates slower than
   * about one in 32 seconds, it decays all the way back to zero between
   * each event.
   */
  
  #define FM_COEF 933		/* coefficient for half-life of 10 secs */
  #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
  #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
  #define FM_SCALE 1000		/* faux fixed point scale */
  
  /* Initialize a frequency meter */
  static void fmeter_init(struct fmeter *fmp)
  {
  	fmp->cnt = 0;
  	fmp->val = 0;
  	fmp->time = 0;
  	spin_lock_init(&fmp->lock);
  }
  
  /* Internal meter update - process cnt events and update value */
  static void fmeter_update(struct fmeter *fmp)
  {
  	time_t now = get_seconds();
  	time_t ticks = now - fmp->time;
  
  	if (ticks == 0)
  		return;
  
  	ticks = min(FM_MAXTICKS, ticks);
  	while (ticks-- > 0)
  		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
  	fmp->time = now;
  
  	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
  	fmp->cnt = 0;
  }
  
  /* Process any previous ticks, then bump cnt by one (times scale). */
  static void fmeter_markevent(struct fmeter *fmp)
  {
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
  	spin_unlock(&fmp->lock);
  }
  
  /* Process any previous ticks, then return current value. */
  static int fmeter_getrate(struct fmeter *fmp)
  {
  	int val;
  
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	val = fmp->val;
  	spin_unlock(&fmp->lock);
  	return val;
  }
57fce0a68   Tejun Heo   cpuset: don't use...
1344
  static struct cpuset *cpuset_attach_old_cs;
5d21cc2db   Tejun Heo   cpuset: replace c...
1345
  /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
eb95419b0   Tejun Heo   cgroup: pass arou...
1346
1347
static int cpuset_can_attach(struct cgroup_subsys_state *css,
			     struct cgroup_taskset *tset)
{
	struct cpuset *cs = css_cs(css);
	struct task_struct *task;
	int ret;

	/* used later by cpuset_attach() */
	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
	mutex_lock(&cpuset_mutex);
	/* allow moving tasks into an empty cpuset if on default hierarchy */
	ret = -ENOSPC;
	if (!cgroup_on_dfl(css->cgroup) &&
	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
		goto out_unlock;

	/* Each task must pass scheduler and security checks. */
	cgroup_taskset_for_each(task, tset) {
		ret = task_can_attach(task, cs->cpus_allowed);
		if (ret)
			goto out_unlock;
		ret = security_task_setscheduler(task);
		if (ret)
			goto out_unlock;
	}

	/*
	 * Mark attach is in progress.  This makes validate_change() fail
	 * changes which zero cpus/mems_allowed.
	 */
	cs->attach_in_progress++;
	ret = 0;
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return ret;
}
f780bdb7c   Ben Blum   cgroups: add per-...
1381

eb95419b0   Tejun Heo   cgroup: pass arou...
1382
  static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
452477fa6   Tejun Heo   cpuset: pin down ...
1383
1384
  				 struct cgroup_taskset *tset)
  {
5d21cc2db   Tejun Heo   cpuset: replace c...
1385
  	mutex_lock(&cpuset_mutex);
eb95419b0   Tejun Heo   cgroup: pass arou...
1386
  	css_cs(css)->attach_in_progress--;
5d21cc2db   Tejun Heo   cpuset: replace c...
1387
  	mutex_unlock(&cpuset_mutex);
8793d854e   Paul Menage   Task Control Grou...
1388
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1389

4e4c9a140   Tejun Heo   cpuset: cleanup c...
1390
  /*
5d21cc2db   Tejun Heo   cpuset: replace c...
1391
   * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1392
1393
1394
1395
   * but we can't allocate it dynamically there.  Define it global and
   * allocate from cpuset_init().
   */
  static cpumask_var_t cpus_attach;
eb95419b0   Tejun Heo   cgroup: pass arou...
1396
1397
static void cpuset_attach(struct cgroup_subsys_state *css,
			  struct cgroup_taskset *tset)
{
	/* static buf protected by cpuset_mutex */
	static nodemask_t cpuset_attach_nodemask_to;
	struct mm_struct *mm;
	struct task_struct *task;
	struct task_struct *leader = cgroup_taskset_first(tset);
	struct cpuset *cs = css_cs(css);
	/* source cpuset, recorded earlier by cpuset_can_attach() */
	struct cpuset *oldcs = cpuset_attach_old_cs;

	mutex_lock(&cpuset_mutex);

	/* prepare for attach */
	if (cs == &top_cpuset)
		cpumask_copy(cpus_attach, cpu_possible_mask);
	else
		guarantee_online_cpus(cs, cpus_attach);

	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);

	/* Rebind each migrating task's CPUs, nodemask and spread flags. */
	cgroup_taskset_for_each(task, tset) {
		/*
		 * can_attach beforehand should guarantee that this doesn't
		 * fail.  TODO: have a better way to handle failure here
		 */
		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));

		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
		cpuset_update_task_spread_flag(cs, task);
	}

	/*
	 * Change mm, possibly for multiple threads in a threadgroup. This is
	 * expensive and may sleep.
	 */
	cpuset_attach_nodemask_to = cs->effective_mems;
	mm = get_task_mm(leader);
	if (mm) {
		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);

		/*
		 * old_mems_allowed is the same with mems_allowed here, except
		 * if this task is being moved automatically due to hotplug.
		 * In that case @mems_allowed has been updated and is empty,
		 * so @old_mems_allowed is the right nodesets that we migrate
		 * mm from.
		 */
		if (is_memory_migrate(cs)) {
			cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
					  &cpuset_attach_nodemask_to);
		}
		mmput(mm);
	}

	cs->old_mems_allowed = cpuset_attach_nodemask_to;

	/* Drop the in-progress marker; wake anyone waiting for it to clear. */
	cs->attach_in_progress--;
	if (!cs->attach_in_progress)
		wake_up(&cpuset_attach_wq);

	mutex_unlock(&cpuset_mutex);
}
  
/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,			/* "cpus" mask, written via cpuset_write_resmask() */
	FILE_MEMLIST,			/* "mems" mask, written via cpuset_write_resmask() */
	FILE_EFFECTIVE_CPULIST,
	FILE_EFFECTIVE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_MEM_HARDWALL,
	FILE_SCHED_LOAD_BALANCE,
	FILE_SCHED_RELAX_DOMAIN_LEVEL,	/* s64 value, see cpuset_write_s64() */
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,		/* writes rejected with -EACCES */
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
} cpuset_filetype_t;
182446d08   Tejun Heo   cgroup: pass arou...
1477
1478
/*
 * Write handler for the boolean/flag cpuset control files.  Dispatches
 * on cft->private (a cpuset_filetype_t) under cpuset_mutex; fails with
 * -ENODEV if the cpuset has gone offline.
 */
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{
	struct cpuset *cs = css_cs(css);
	cpuset_filetype_t type = cft->private;
	int retval = 0;

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs)) {
		retval = -ENODEV;
		goto out_unlock;
	}

	switch (type) {
	case FILE_CPU_EXCLUSIVE:
		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_EXCLUSIVE:
		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
		break;
	case FILE_MEM_HARDWALL:
		retval = update_flag(CS_MEM_HARDWALL, cs, val);
		break;
	case FILE_SCHED_LOAD_BALANCE:
		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
		break;
	case FILE_MEMORY_MIGRATE:
		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
		break;
	case FILE_MEMORY_PRESSURE_ENABLED:
		/* global knob, not a per-cpuset flag */
		cpuset_memory_pressure_enabled = !!val;
		break;
	case FILE_MEMORY_PRESSURE:
		/* read-only file */
		retval = -EACCES;
		break;
	case FILE_SPREAD_PAGE:
		retval = update_flag(CS_SPREAD_PAGE, cs, val);
		break;
	case FILE_SPREAD_SLAB:
		retval = update_flag(CS_SPREAD_SLAB, cs, val);
		break;
	default:
		retval = -EINVAL;
		break;
	}
out_unlock:
	mutex_unlock(&cpuset_mutex);
	return retval;
}
182446d08   Tejun Heo   cgroup: pass arou...
1526
1527
  static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
  			    s64 val)
5be7a4792   Paul Menage   Fix cpuset sched_...
1528
  {
182446d08   Tejun Heo   cgroup: pass arou...
1529
  	struct cpuset *cs = css_cs(css);
5be7a4792   Paul Menage   Fix cpuset sched_...
1530
  	cpuset_filetype_t type = cft->private;
5d21cc2db   Tejun Heo   cpuset: replace c...
1531
  	int retval = -ENODEV;
5be7a4792   Paul Menage   Fix cpuset sched_...
1532

5d21cc2db   Tejun Heo   cpuset: replace c...
1533
1534
1535
  	mutex_lock(&cpuset_mutex);
  	if (!is_cpuset_online(cs))
  		goto out_unlock;
e37123953   Paul Menage   cgroup files: rem...
1536

5be7a4792   Paul Menage   Fix cpuset sched_...
1537
1538
1539
1540
1541
1542
1543
1544
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		retval = update_relax_domain_level(cs, val);
  		break;
  	default:
  		retval = -EINVAL;
  		break;
  	}
5d21cc2db   Tejun Heo   cpuset: replace c...
1545
1546
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
5be7a4792   Paul Menage   Fix cpuset sched_...
1547
1548
  	return retval;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1549
  /*
e37123953   Paul Menage   cgroup files: rem...
1550
1551
   * Common handling for a write to a "cpus" or "mems" file.
   */
451af504d   Tejun Heo   cgroup: replace c...
1552
1553
/*
 * Common write handler for the "cpus" and "mems" files; parses the
 * user buffer and applies it via update_cpumask()/update_nodemask().
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{
	struct cpuset *cs = css_cs(of_css(of));
	struct cpuset *trialcs;		/* scratch copy for validation */
	int retval = -ENODEV;

	buf = strstrip(buf);

	/*
	 * CPU or memory hotunplug may leave @cs w/o any execution
	 * resources, in which case the hotplug code asynchronously updates
	 * configuration and transfers all tasks to the nearest ancestor
	 * which can execute.
	 *
	 * As writes to "cpus" or "mems" may restore @cs's execution
	 * resources, wait for the previously scheduled operations before
	 * proceeding, so that we don't end up keep removing tasks added
	 * after execution capability is restored.
	 *
	 * cpuset_hotplug_work calls back into cgroup core via
	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
	 * operation like this one can lead to a deadlock through kernfs
	 * active_ref protection.  Let's break the protection.  Losing the
	 * protection is okay as we check whether @cs is online after
	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
	 * hierarchies.
	 */
	css_get(&cs->css);
	kernfs_break_active_protection(of->kn);
	flush_work(&cpuset_hotplug_work);

	mutex_lock(&cpuset_mutex);
	if (!is_cpuset_online(cs))
		goto out_unlock;

	trialcs = alloc_trial_cpuset(cs);
	if (!trialcs) {
		retval = -ENOMEM;
		goto out_unlock;
	}

	switch (of_cft(of)->private) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, trialcs, buf);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, trialcs, buf);
		break;
	default:
		retval = -EINVAL;
		break;
	}

	free_trial_cpuset(trialcs);
out_unlock:
	mutex_unlock(&cpuset_mutex);
	/* Re-establish the protection broken above and drop our css ref. */
	kernfs_unbreak_active_protection(of->kn);
	css_put(&cs->css);
	return retval ?: nbytes;
}
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1613
1614
1615
1616
1617
1618
   * These ascii lists should be read in a single call, by using a user
   * buffer large enough to hold the entire map.  If read in smaller
   * chunks, there is no guarantee of atomicity.  Since the display format
   * used, list of ranges of sequential numbers, is variable length,
   * and since these maps can change value dynamically, one could read
   * gibberish by doing partial reads while a list was changing.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1619
   */
2da8ca822   Tejun Heo   cgroup: replace c...
1620
/*
 * Read handler for the cpus/mems (configured and effective) list files;
 * formats the relevant mask into the seq_file buffer under callback_lock.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
	struct cpuset *cs = css_cs(seq_css(sf));
	cpuset_filetype_t type = seq_cft(sf)->private;
	ssize_t count;
	char *buf, *s;
	int ret = 0;

	count = seq_get_buf(sf, &buf);
	s = buf;

	/* callback_lock keeps the masks stable while they are printed */
	spin_lock_irq(&callback_lock);

	switch (type) {
	case FILE_CPULIST:
		s += cpulist_scnprintf(s, count, cs->cpus_allowed);
		break;
	case FILE_MEMLIST:
		s += nodelist_scnprintf(s, count, cs->mems_allowed);
		break;
	case FILE_EFFECTIVE_CPULIST:
		s += cpulist_scnprintf(s, count, cs->effective_cpus);
		break;
	case FILE_EFFECTIVE_MEMLIST:
		s += nodelist_scnprintf(s, count, cs->effective_mems);
		break;
	default:
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Append a newline if it fits, else signal seq_file to retry bigger. */
	if (s < buf + count - 1) {
		*s++ = '\n';
		seq_commit(sf, s - buf);
	} else {
		seq_commit(sf, -1);
	}
out_unlock:
	spin_unlock_irq(&callback_lock);
	return ret;
}
182446d08   Tejun Heo   cgroup: pass arou...
1662
  static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
700fe1ab9   Paul Menage   CGroup API files:...
1663
  {
182446d08   Tejun Heo   cgroup: pass arou...
1664
  	struct cpuset *cs = css_cs(css);
700fe1ab9   Paul Menage   CGroup API files:...
1665
1666
1667
1668
1669
1670
  	cpuset_filetype_t type = cft->private;
  	switch (type) {
  	case FILE_CPU_EXCLUSIVE:
  		return is_cpu_exclusive(cs);
  	case FILE_MEM_EXCLUSIVE:
  		return is_mem_exclusive(cs);
786083667   Paul Menage   Cpuset hardwall f...
1671
1672
  	case FILE_MEM_HARDWALL:
  		return is_mem_hardwall(cs);
700fe1ab9   Paul Menage   CGroup API files:...
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
  	case FILE_SCHED_LOAD_BALANCE:
  		return is_sched_load_balance(cs);
  	case FILE_MEMORY_MIGRATE:
  		return is_memory_migrate(cs);
  	case FILE_MEMORY_PRESSURE_ENABLED:
  		return cpuset_memory_pressure_enabled;
  	case FILE_MEMORY_PRESSURE:
  		return fmeter_getrate(&cs->fmeter);
  	case FILE_SPREAD_PAGE:
  		return is_spread_page(cs);
  	case FILE_SPREAD_SLAB:
  		return is_spread_slab(cs);
  	default:
  		BUG();
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
1688
1689
1690
  
  	/* Unreachable but makes gcc happy */
  	return 0;
700fe1ab9   Paul Menage   CGroup API files:...
1691
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1692

182446d08   Tejun Heo   cgroup: pass arou...
1693
  static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
5be7a4792   Paul Menage   Fix cpuset sched_...
1694
  {
182446d08   Tejun Heo   cgroup: pass arou...
1695
  	struct cpuset *cs = css_cs(css);
5be7a4792   Paul Menage   Fix cpuset sched_...
1696
1697
1698
1699
1700
1701
1702
  	cpuset_filetype_t type = cft->private;
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		return cs->relax_domain_level;
  	default:
  		BUG();
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
1703
1704
1705
  
  	/* Unrechable but makes gcc happy */
  	return 0;
5be7a4792   Paul Menage   Fix cpuset sched_...
1706
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1707
1708
1709
1710
  
  /*
   * for the common functions, 'private' gives the type of file
   */
addf2c739   Paul Menage   Cpuset hardwall f...
1711
1712
1713
  static struct cftype files[] = {
  	{
  		.name = "cpus",
2da8ca822   Tejun Heo   cgroup: replace c...
1714
  		.seq_show = cpuset_common_seq_show,
451af504d   Tejun Heo   cgroup: replace c...
1715
  		.write = cpuset_write_resmask,
e37123953   Paul Menage   cgroup files: rem...
1716
  		.max_write_len = (100U + 6 * NR_CPUS),
addf2c739   Paul Menage   Cpuset hardwall f...
1717
1718
1719
1720
1721
  		.private = FILE_CPULIST,
  	},
  
  	{
  		.name = "mems",
2da8ca822   Tejun Heo   cgroup: replace c...
1722
  		.seq_show = cpuset_common_seq_show,
451af504d   Tejun Heo   cgroup: replace c...
1723
  		.write = cpuset_write_resmask,
e37123953   Paul Menage   cgroup files: rem...
1724
  		.max_write_len = (100U + 6 * MAX_NUMNODES),
addf2c739   Paul Menage   Cpuset hardwall f...
1725
1726
1727
1728
  		.private = FILE_MEMLIST,
  	},
  
  	{
afd1a8b3e   Li Zefan   cpuset: export ef...
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
  		.name = "effective_cpus",
  		.seq_show = cpuset_common_seq_show,
  		.private = FILE_EFFECTIVE_CPULIST,
  	},
  
  	{
  		.name = "effective_mems",
  		.seq_show = cpuset_common_seq_show,
  		.private = FILE_EFFECTIVE_MEMLIST,
  	},
  
  	{
addf2c739   Paul Menage   Cpuset hardwall f...
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
  		.name = "cpu_exclusive",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_CPU_EXCLUSIVE,
  	},
  
  	{
  		.name = "mem_exclusive",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEM_EXCLUSIVE,
  	},
  
  	{
786083667   Paul Menage   Cpuset hardwall f...
1755
1756
1757
1758
1759
1760
1761
  		.name = "mem_hardwall",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEM_HARDWALL,
  	},
  
  	{
addf2c739   Paul Menage   Cpuset hardwall f...
1762
1763
1764
1765
1766
1767
1768
1769
  		.name = "sched_load_balance",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SCHED_LOAD_BALANCE,
  	},
  
  	{
  		.name = "sched_relax_domain_level",
5be7a4792   Paul Menage   Fix cpuset sched_...
1770
1771
  		.read_s64 = cpuset_read_s64,
  		.write_s64 = cpuset_write_s64,
addf2c739   Paul Menage   Cpuset hardwall f...
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
  		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
  	},
  
  	{
  		.name = "memory_migrate",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_MIGRATE,
  	},
  
  	{
  		.name = "memory_pressure",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_PRESSURE,
099fca322   Li Zefan   cgroups: show cor...
1787
  		.mode = S_IRUGO,
addf2c739   Paul Menage   Cpuset hardwall f...
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
  	},
  
  	{
  		.name = "memory_spread_page",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SPREAD_PAGE,
  	},
  
  	{
  		.name = "memory_spread_slab",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SPREAD_SLAB,
  	},
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1803

4baf6e332   Tejun Heo   cgroup: convert a...
1804
1805
1806
1807
1808
1809
1810
  	{
  		.name = "memory_pressure_enabled",
  		.flags = CFTYPE_ONLY_ON_ROOT,
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_PRESSURE_ENABLED,
  	},
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1811

4baf6e332   Tejun Heo   cgroup: convert a...
1812
1813
  	{ }	/* terminate */
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1814
1815
  
  /*
92fb97487   Tejun Heo   cgroup: rename ->...
1816
   *	cpuset_css_alloc - allocate a cpuset css
c9e5fe66f   Li Zefan   cpuset: rename @c...
1817
   *	cgrp:	control group that the new cpuset will be part of
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1818
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
1819
1820
  static struct cgroup_subsys_state *
  cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1821
  {
c8f699bb5   Tejun Heo   cpuset: introduce...
1822
  	struct cpuset *cs;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1823

eb95419b0   Tejun Heo   cgroup: pass arou...
1824
  	if (!parent_css)
8793d854e   Paul Menage   Task Control Grou...
1825
  		return &top_cpuset.css;
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1826

c8f699bb5   Tejun Heo   cpuset: introduce...
1827
  	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1828
  	if (!cs)
8793d854e   Paul Menage   Task Control Grou...
1829
  		return ERR_PTR(-ENOMEM);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1830
1831
1832
1833
  	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
  		goto free_cs;
  	if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
  		goto free_cpus;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1834

029190c51   Paul Jackson   cpuset sched_load...
1835
  	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
300ed6cbb   Li Zefan   cpuset: convert c...
1836
  	cpumask_clear(cs->cpus_allowed);
f9a86fcbb   Mike Travis   cpuset: modify cp...
1837
  	nodes_clear(cs->mems_allowed);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1838
1839
  	cpumask_clear(cs->effective_cpus);
  	nodes_clear(cs->effective_mems);
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1840
  	fmeter_init(&cs->fmeter);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1841
  	cs->relax_domain_level = -1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1842

c8f699bb5   Tejun Heo   cpuset: introduce...
1843
  	return &cs->css;
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1844
1845
1846
1847
1848
1849
  
  free_cpus:
  	free_cpumask_var(cs->cpus_allowed);
  free_cs:
  	kfree(cs);
  	return ERR_PTR(-ENOMEM);
c8f699bb5   Tejun Heo   cpuset: introduce...
1850
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
1851
  static int cpuset_css_online(struct cgroup_subsys_state *css)
c8f699bb5   Tejun Heo   cpuset: introduce...
1852
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1853
  	struct cpuset *cs = css_cs(css);
c431069fe   Tejun Heo   cpuset: remove cp...
1854
  	struct cpuset *parent = parent_cs(cs);
ae8086ce1   Tejun Heo   cpuset: introduce...
1855
  	struct cpuset *tmp_cs;
492eb21b9   Tejun Heo   cgroup: make hier...
1856
  	struct cgroup_subsys_state *pos_css;
c8f699bb5   Tejun Heo   cpuset: introduce...
1857
1858
1859
  
  	if (!parent)
  		return 0;
5d21cc2db   Tejun Heo   cpuset: replace c...
1860
  	mutex_lock(&cpuset_mutex);
efeb77b2f   Tejun Heo   cpuset: introduce...
1861
  	set_bit(CS_ONLINE, &cs->flags);
c8f699bb5   Tejun Heo   cpuset: introduce...
1862
1863
1864
1865
  	if (is_spread_page(parent))
  		set_bit(CS_SPREAD_PAGE, &cs->flags);
  	if (is_spread_slab(parent))
  		set_bit(CS_SPREAD_SLAB, &cs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1866

664eeddee   Mel Gorman   mm: page_alloc: u...
1867
  	cpuset_inc();
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1868

8447a0fee   Vladimir Davydov   cpuset: convert c...
1869
  	spin_lock_irq(&callback_lock);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1870
1871
1872
1873
  	if (cgroup_on_dfl(cs->css.cgroup)) {
  		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
  		cs->effective_mems = parent->effective_mems;
  	}
8447a0fee   Vladimir Davydov   cpuset: convert c...
1874
  	spin_unlock_irq(&callback_lock);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1875

eb95419b0   Tejun Heo   cgroup: pass arou...
1876
  	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
5d21cc2db   Tejun Heo   cpuset: replace c...
1877
  		goto out_unlock;
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
  
  	/*
  	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
  	 * set.  This flag handling is implemented in cgroup core for
  	 * histrical reasons - the flag may be specified during mount.
  	 *
  	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
  	 * refuse to clone the configuration - thereby refusing the task to
  	 * be entered, and as a result refusing the sys_unshare() or
  	 * clone() which initiated it.  If this becomes a problem for some
  	 * users who wish to allow that scenario, then this could be
  	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
  	 * (and likewise for mems) to the new cgroup.
  	 */
ae8086ce1   Tejun Heo   cpuset: introduce...
1892
  	rcu_read_lock();
492eb21b9   Tejun Heo   cgroup: make hier...
1893
  	cpuset_for_each_child(tmp_cs, pos_css, parent) {
ae8086ce1   Tejun Heo   cpuset: introduce...
1894
1895
  		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
  			rcu_read_unlock();
5d21cc2db   Tejun Heo   cpuset: replace c...
1896
  			goto out_unlock;
ae8086ce1   Tejun Heo   cpuset: introduce...
1897
  		}
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1898
  	}
ae8086ce1   Tejun Heo   cpuset: introduce...
1899
  	rcu_read_unlock();
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1900

8447a0fee   Vladimir Davydov   cpuset: convert c...
1901
  	spin_lock_irq(&callback_lock);
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1902
1903
  	cs->mems_allowed = parent->mems_allowed;
  	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
cea74465e   Dan Carpenter   cpuset: lock vs u...
1904
  	spin_unlock_irq(&callback_lock);
5d21cc2db   Tejun Heo   cpuset: replace c...
1905
1906
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
c8f699bb5   Tejun Heo   cpuset: introduce...
1907
1908
  	return 0;
  }
0b9e6965a   Zhao Hongjiang   cpuset: relocate ...
1909
1910
1911
1912
1913
  /*
   * If the cpuset being removed has its flag 'sched_load_balance'
   * enabled, then simulate turning sched_load_balance off, which
   * will call rebuild_sched_domains_locked().
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
1914
  static void cpuset_css_offline(struct cgroup_subsys_state *css)
c8f699bb5   Tejun Heo   cpuset: introduce...
1915
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1916
  	struct cpuset *cs = css_cs(css);
c8f699bb5   Tejun Heo   cpuset: introduce...
1917

5d21cc2db   Tejun Heo   cpuset: replace c...
1918
  	mutex_lock(&cpuset_mutex);
c8f699bb5   Tejun Heo   cpuset: introduce...
1919
1920
1921
  
  	if (is_sched_load_balance(cs))
  		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
664eeddee   Mel Gorman   mm: page_alloc: u...
1922
  	cpuset_dec();
efeb77b2f   Tejun Heo   cpuset: introduce...
1923
  	clear_bit(CS_ONLINE, &cs->flags);
c8f699bb5   Tejun Heo   cpuset: introduce...
1924

5d21cc2db   Tejun Heo   cpuset: replace c...
1925
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1926
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
1927
  static void cpuset_css_free(struct cgroup_subsys_state *css)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1928
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1929
  	struct cpuset *cs = css_cs(css);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1930

e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1931
  	free_cpumask_var(cs->effective_cpus);
300ed6cbb   Li Zefan   cpuset: convert c...
1932
  	free_cpumask_var(cs->cpus_allowed);
8793d854e   Paul Menage   Task Control Grou...
1933
  	kfree(cs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1934
  }
39bd0d15e   Li Zefan   cpuset: initializ...
1935
1936
1937
  static void cpuset_bind(struct cgroup_subsys_state *root_css)
  {
  	mutex_lock(&cpuset_mutex);
8447a0fee   Vladimir Davydov   cpuset: convert c...
1938
  	spin_lock_irq(&callback_lock);
39bd0d15e   Li Zefan   cpuset: initializ...
1939
1940
1941
1942
1943
1944
1945
1946
1947
  
  	if (cgroup_on_dfl(root_css->cgroup)) {
  		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
  		top_cpuset.mems_allowed = node_possible_map;
  	} else {
  		cpumask_copy(top_cpuset.cpus_allowed,
  			     top_cpuset.effective_cpus);
  		top_cpuset.mems_allowed = top_cpuset.effective_mems;
  	}
8447a0fee   Vladimir Davydov   cpuset: convert c...
1948
  	spin_unlock_irq(&callback_lock);
39bd0d15e   Li Zefan   cpuset: initializ...
1949
1950
  	mutex_unlock(&cpuset_mutex);
  }
073219e99   Tejun Heo   cgroup: clean up ...
1951
  struct cgroup_subsys cpuset_cgrp_subsys = {
39bd0d15e   Li Zefan   cpuset: initializ...
1952
1953
1954
1955
1956
1957
1958
1959
  	.css_alloc	= cpuset_css_alloc,
  	.css_online	= cpuset_css_online,
  	.css_offline	= cpuset_css_offline,
  	.css_free	= cpuset_css_free,
  	.can_attach	= cpuset_can_attach,
  	.cancel_attach	= cpuset_cancel_attach,
  	.attach		= cpuset_attach,
  	.bind		= cpuset_bind,
5577964e6   Tejun Heo   cgroup: rename cg...
1960
  	.legacy_cftypes	= files,
39bd0d15e   Li Zefan   cpuset: initializ...
1961
  	.early_init	= 1,
8793d854e   Paul Menage   Task Control Grou...
1962
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1963
1964
1965
1966
1967
1968
1969
1970
  /**
   * cpuset_init - initialize cpusets at system boot
   *
   * Description: Initialize top_cpuset and the cpuset internal file system,
   **/
  
  int __init cpuset_init(void)
  {
8793d854e   Paul Menage   Task Control Grou...
1971
  	int err = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1972

58568d2a8   Miao Xie   cpuset,mm: update...
1973
1974
  	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
  		BUG();
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1975
1976
  	if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
  		BUG();
58568d2a8   Miao Xie   cpuset,mm: update...
1977

300ed6cbb   Li Zefan   cpuset: convert c...
1978
  	cpumask_setall(top_cpuset.cpus_allowed);
f9a86fcbb   Mike Travis   cpuset: modify cp...
1979
  	nodes_setall(top_cpuset.mems_allowed);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1980
1981
  	cpumask_setall(top_cpuset.effective_cpus);
  	nodes_setall(top_cpuset.effective_mems);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1982

3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1983
  	fmeter_init(&top_cpuset.fmeter);
029190c51   Paul Jackson   cpuset sched_load...
1984
  	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1985
  	top_cpuset.relax_domain_level = -1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1986

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1987
1988
  	err = register_filesystem(&cpuset_fs_type);
  	if (err < 0)
8793d854e   Paul Menage   Task Control Grou...
1989
  		return err;
2341d1b65   Li Zefan   cpuset: convert c...
1990
1991
  	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
  		BUG();
8793d854e   Paul Menage   Task Control Grou...
1992
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1993
  }
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1994
  /*
cf417141c   Max Krasnyansky   sched, cpuset: re...
1995
   * If CPU and/or memory hotplug handlers, below, unplug any CPUs
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1996
1997
   * or memory nodes, we need to walk over the cpuset hierarchy,
   * removing that CPU or node from all cpusets.  If this removes the
956db3ca0   Cliff Wickman   hotplug cpu: move...
1998
1999
   * last CPU or node from a cpuset, then move the tasks in the empty
   * cpuset to its next-highest non-empty parent.
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2000
   */
956db3ca0   Cliff Wickman   hotplug cpu: move...
2001
2002
2003
  static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  {
  	struct cpuset *parent;
c8d9c90c7   Paul Jackson   hotplug cpu: move...
2004
  	/*
956db3ca0   Cliff Wickman   hotplug cpu: move...
2005
2006
2007
  	 * Find its next-highest non-empty parent, (top cpuset
  	 * has online cpus, so can't be empty).
  	 */
c431069fe   Tejun Heo   cpuset: remove cp...
2008
  	parent = parent_cs(cs);
300ed6cbb   Li Zefan   cpuset: convert c...
2009
  	while (cpumask_empty(parent->cpus_allowed) ||
b45012955   Paul Jackson   hotplug cpu move ...
2010
  			nodes_empty(parent->mems_allowed))
c431069fe   Tejun Heo   cpuset: remove cp...
2011
  		parent = parent_cs(parent);
956db3ca0   Cliff Wickman   hotplug cpu: move...
2012

8cc993452   Tejun Heo   cgroup, cpuset: r...
2013
  	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
12d3089c1   Fabian Frederick   kernel/cpuset.c: ...
2014
  		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
e61734c55   Tejun Heo   cgroup: remove cg...
2015
2016
2017
  		pr_cont_cgroup_name(cs->css.cgroup);
  		pr_cont("
  ");
8cc993452   Tejun Heo   cgroup, cpuset: r...
2018
  	}
956db3ca0   Cliff Wickman   hotplug cpu: move...
2019
  }
be4c9dd7a   Li Zefan   cpuset: enable on...
2020
2021
2022
2023
  static void
  hotplug_update_tasks_legacy(struct cpuset *cs,
  			    struct cpumask *new_cpus, nodemask_t *new_mems,
  			    bool cpus_updated, bool mems_updated)
390a36aad   Li Zefan   cpuset: refactor ...
2024
2025
  {
  	bool is_empty;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2026
  	spin_lock_irq(&callback_lock);
be4c9dd7a   Li Zefan   cpuset: enable on...
2027
2028
2029
2030
  	cpumask_copy(cs->cpus_allowed, new_cpus);
  	cpumask_copy(cs->effective_cpus, new_cpus);
  	cs->mems_allowed = *new_mems;
  	cs->effective_mems = *new_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2031
  	spin_unlock_irq(&callback_lock);
390a36aad   Li Zefan   cpuset: refactor ...
2032
2033
2034
2035
2036
  
  	/*
  	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
  	 * as the tasks will be migratecd to an ancestor.
  	 */
be4c9dd7a   Li Zefan   cpuset: enable on...
2037
  	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
390a36aad   Li Zefan   cpuset: refactor ...
2038
  		update_tasks_cpumask(cs);
be4c9dd7a   Li Zefan   cpuset: enable on...
2039
  	if (mems_updated && !nodes_empty(cs->mems_allowed))
390a36aad   Li Zefan   cpuset: refactor ...
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
  		update_tasks_nodemask(cs);
  
  	is_empty = cpumask_empty(cs->cpus_allowed) ||
  		   nodes_empty(cs->mems_allowed);
  
  	mutex_unlock(&cpuset_mutex);
  
  	/*
  	 * Move tasks to the nearest ancestor with execution resources,
  	 * This is full cgroup operation which will also call back into
  	 * cpuset. Should be done outside any lock.
  	 */
  	if (is_empty)
  		remove_tasks_in_empty_cpuset(cs);
  
  	mutex_lock(&cpuset_mutex);
  }
be4c9dd7a   Li Zefan   cpuset: enable on...
2057
2058
2059
2060
  static void
  hotplug_update_tasks(struct cpuset *cs,
  		     struct cpumask *new_cpus, nodemask_t *new_mems,
  		     bool cpus_updated, bool mems_updated)
390a36aad   Li Zefan   cpuset: refactor ...
2061
  {
be4c9dd7a   Li Zefan   cpuset: enable on...
2062
2063
2064
2065
  	if (cpumask_empty(new_cpus))
  		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
  	if (nodes_empty(*new_mems))
  		*new_mems = parent_cs(cs)->effective_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2066
  	spin_lock_irq(&callback_lock);
be4c9dd7a   Li Zefan   cpuset: enable on...
2067
2068
  	cpumask_copy(cs->effective_cpus, new_cpus);
  	cs->effective_mems = *new_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2069
  	spin_unlock_irq(&callback_lock);
390a36aad   Li Zefan   cpuset: refactor ...
2070

be4c9dd7a   Li Zefan   cpuset: enable on...
2071
  	if (cpus_updated)
390a36aad   Li Zefan   cpuset: refactor ...
2072
  		update_tasks_cpumask(cs);
be4c9dd7a   Li Zefan   cpuset: enable on...
2073
  	if (mems_updated)
390a36aad   Li Zefan   cpuset: refactor ...
2074
2075
  		update_tasks_nodemask(cs);
  }
deb7aa308   Tejun Heo   cpuset: reorganiz...
2076
  /**
388afd854   Li Zefan   cpuset: remove as...
2077
   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
deb7aa308   Tejun Heo   cpuset: reorganiz...
2078
   * @cs: cpuset in interest
956db3ca0   Cliff Wickman   hotplug cpu: move...
2079
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2080
2081
2082
   * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
   * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
   * all its tasks are moved to the nearest ancestor with both resources.
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
2083
   */
388afd854   Li Zefan   cpuset: remove as...
2084
  static void cpuset_hotplug_update_tasks(struct cpuset *cs)
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
2085
  {
be4c9dd7a   Li Zefan   cpuset: enable on...
2086
2087
2088
2089
  	static cpumask_t new_cpus;
  	static nodemask_t new_mems;
  	bool cpus_updated;
  	bool mems_updated;
e44193d39   Li Zefan   cpuset: let hotpl...
2090
2091
  retry:
  	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
2092

5d21cc2db   Tejun Heo   cpuset: replace c...
2093
  	mutex_lock(&cpuset_mutex);
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2094

e44193d39   Li Zefan   cpuset: let hotpl...
2095
2096
2097
2098
2099
2100
2101
2102
  	/*
  	 * We have raced with task attaching. We wait until attaching
  	 * is finished, so we won't attach a task to an empty cpuset.
  	 */
  	if (cs->attach_in_progress) {
  		mutex_unlock(&cpuset_mutex);
  		goto retry;
  	}
be4c9dd7a   Li Zefan   cpuset: enable on...
2103
2104
  	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
  	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
2105

be4c9dd7a   Li Zefan   cpuset: enable on...
2106
2107
  	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
  	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
deb7aa308   Tejun Heo   cpuset: reorganiz...
2108

390a36aad   Li Zefan   cpuset: refactor ...
2109
  	if (cgroup_on_dfl(cs->css.cgroup))
be4c9dd7a   Li Zefan   cpuset: enable on...
2110
2111
  		hotplug_update_tasks(cs, &new_cpus, &new_mems,
  				     cpus_updated, mems_updated);
390a36aad   Li Zefan   cpuset: refactor ...
2112
  	else
be4c9dd7a   Li Zefan   cpuset: enable on...
2113
2114
  		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
  					    cpus_updated, mems_updated);
8d0339487   Tejun Heo   cpuset: make CPU ...
2115

5d21cc2db   Tejun Heo   cpuset: replace c...
2116
  	mutex_unlock(&cpuset_mutex);
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2117
  }
deb7aa308   Tejun Heo   cpuset: reorganiz...
2118
  /**
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2119
   * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
956db3ca0   Cliff Wickman   hotplug cpu: move...
2120
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2121
2122
2123
2124
2125
   * This function is called after either CPU or memory configuration has
   * changed and updates cpuset accordingly.  The top_cpuset is always
   * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
   * order to make cpusets transparent (of no affect) on systems that are
   * actively using CPU hotplug but making no active use of cpusets.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2126
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2127
   * Non-root cpusets are only affected by offlining.  If any CPUs or memory
388afd854   Li Zefan   cpuset: remove as...
2128
2129
   * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
   * all descendants.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2130
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2131
2132
   * Note that CPU offlining during suspend is ignored.  We don't modify
   * cpusets across suspend/resume cycles at all.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2133
   */
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2134
  static void cpuset_hotplug_workfn(struct work_struct *work)
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2135
  {
5c5cc6232   Li Zefan   cpuset: allow to ...
2136
2137
  	static cpumask_t new_cpus;
  	static nodemask_t new_mems;
deb7aa308   Tejun Heo   cpuset: reorganiz...
2138
  	bool cpus_updated, mems_updated;
7e88291be   Li Zefan   cpuset: make cs->...
2139
  	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2140

5d21cc2db   Tejun Heo   cpuset: replace c...
2141
  	mutex_lock(&cpuset_mutex);
956db3ca0   Cliff Wickman   hotplug cpu: move...
2142

deb7aa308   Tejun Heo   cpuset: reorganiz...
2143
2144
2145
  	/* fetch the available cpus/mems and find out which changed how */
  	cpumask_copy(&new_cpus, cpu_active_mask);
  	new_mems = node_states[N_MEMORY];
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2146

7e88291be   Li Zefan   cpuset: make cs->...
2147
2148
  	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
  	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2149

deb7aa308   Tejun Heo   cpuset: reorganiz...
2150
2151
  	/* synchronize cpus_allowed to cpu_active_mask */
  	if (cpus_updated) {
8447a0fee   Vladimir Davydov   cpuset: convert c...
2152
  		spin_lock_irq(&callback_lock);
7e88291be   Li Zefan   cpuset: make cs->...
2153
2154
  		if (!on_dfl)
  			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
1344ab9c2   Li Zefan   cpuset: update cp...
2155
  		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
8447a0fee   Vladimir Davydov   cpuset: convert c...
2156
  		spin_unlock_irq(&callback_lock);
deb7aa308   Tejun Heo   cpuset: reorganiz...
2157
2158
  		/* we don't mess with cpumasks of tasks in top_cpuset */
  	}
b45012955   Paul Jackson   hotplug cpu move ...
2159

deb7aa308   Tejun Heo   cpuset: reorganiz...
2160
2161
  	/* synchronize mems_allowed to N_MEMORY */
  	if (mems_updated) {
8447a0fee   Vladimir Davydov   cpuset: convert c...
2162
  		spin_lock_irq(&callback_lock);
7e88291be   Li Zefan   cpuset: make cs->...
2163
2164
  		if (!on_dfl)
  			top_cpuset.mems_allowed = new_mems;
1344ab9c2   Li Zefan   cpuset: update cp...
2165
  		top_cpuset.effective_mems = new_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2166
  		spin_unlock_irq(&callback_lock);
d66393e54   Tejun Heo   cpuset: use css_t...
2167
  		update_tasks_nodemask(&top_cpuset);
deb7aa308   Tejun Heo   cpuset: reorganiz...
2168
  	}
b45012955   Paul Jackson   hotplug cpu move ...
2169

388afd854   Li Zefan   cpuset: remove as...
2170
  	mutex_unlock(&cpuset_mutex);
5c5cc6232   Li Zefan   cpuset: allow to ...
2171
2172
  	/* if cpus or mems changed, we need to propagate to descendants */
  	if (cpus_updated || mems_updated) {
deb7aa308   Tejun Heo   cpuset: reorganiz...
2173
  		struct cpuset *cs;
492eb21b9   Tejun Heo   cgroup: make hier...
2174
  		struct cgroup_subsys_state *pos_css;
f9b4fb8da   Miao Xie   cpusets: update t...
2175

fc560a26a   Tejun Heo   cpuset: replace c...
2176
  		rcu_read_lock();
492eb21b9   Tejun Heo   cgroup: make hier...
2177
  		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
ec903c0c8   Tejun Heo   cgroup: rename cs...
2178
  			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
388afd854   Li Zefan   cpuset: remove as...
2179
2180
  				continue;
  			rcu_read_unlock();
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2181

388afd854   Li Zefan   cpuset: remove as...
2182
  			cpuset_hotplug_update_tasks(cs);
b45012955   Paul Jackson   hotplug cpu move ...
2183

388afd854   Li Zefan   cpuset: remove as...
2184
2185
2186
2187
2188
  			rcu_read_lock();
  			css_put(&cs->css);
  		}
  		rcu_read_unlock();
  	}
8d0339487   Tejun Heo   cpuset: make CPU ...
2189

deb7aa308   Tejun Heo   cpuset: reorganiz...
2190
  	/* rebuild sched domains if cpus_allowed has changed */
e0e80a02e   Li Zhong   cpuset: use rebui...
2191
2192
  	if (cpus_updated)
  		rebuild_sched_domains();
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2193
  }
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2194
  void cpuset_update_active_cpus(bool cpu_online)
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2195
  {
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
  	/*
  	 * We're inside cpu hotplug critical region which usually nests
  	 * inside cgroup synchronization.  Bounce actual hotplug processing
  	 * to a work item to avoid reverse locking order.
  	 *
  	 * We still need to do partition_sched_domains() synchronously;
  	 * otherwise, the scheduler will get confused and put tasks to the
  	 * dead CPU.  Fall back to the default single domain.
  	 * cpuset_hotplug_workfn() will rebuild it as necessary.
  	 */
  	partition_sched_domains(1, NULL, NULL);
  	schedule_work(&cpuset_hotplug_work);
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2208
  }
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2209

38837fc75   Paul Jackson   [PATCH] cpuset: t...
2210
  /*
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2211
2212
   * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
   * Call this routine anytime after node_states[N_MEMORY] changes.
a1cd2b13f   Srivatsa S. Bhat   cpusets: Remove/u...
2213
   * See cpuset_update_active_cpus() for CPU hotplug handling.
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2214
   */
f481891fd   Miao Xie   cpuset: update to...
2215
2216
  static int cpuset_track_online_nodes(struct notifier_block *self,
  				unsigned long action, void *arg)
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2217
  {
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2218
  	schedule_work(&cpuset_hotplug_work);
f481891fd   Miao Xie   cpuset: update to...
2219
  	return NOTIFY_OK;
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2220
  }
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2221
2222
2223
2224
2225
  
/* Memory-hotplug notifier; its callback just schedules cpuset_hotplug_work. */
static struct notifier_block cpuset_track_online_nodes_nb = {
	.notifier_call = cpuset_track_online_nodes,
	.priority = 10,		/* ??! */
};
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2226

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2227
2228
2229
2230
  /**
   * cpuset_init_smp - initialize cpus_allowed
   *
   * Description: Finish top cpuset after cpu, node maps are initialized
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2231
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2232
2233
  void __init cpuset_init_smp(void)
  {
6ad4c1888   Peter Zijlstra   sched: Fix balanc...
2234
  	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2235
  	top_cpuset.mems_allowed = node_states[N_MEMORY];
33ad801df   Li Zefan   cpuset: record ol...
2236
  	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2237

e2b9a3d7d   Li Zefan   cpuset: add cs->e...
2238
2239
  	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
  	top_cpuset.effective_mems = node_states[N_MEMORY];
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2240
  	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2241
2242
2243
  }
  
  /**
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2244
2245
   * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
   * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
6af866af3   Li Zefan   cpuset: remove re...
2246
   * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2247
   *
300ed6cbb   Li Zefan   cpuset: convert c...
2248
   * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2249
   * attached to the specified @tsk.  Guaranteed to return some non-empty
5f054e31c   Rusty Russell   documentation: re...
2250
   * subset of cpu_online_mask, even if this means going outside the
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2251
2252
   * tasks cpuset.
   **/
6af866af3   Li Zefan   cpuset: remove re...
2253
  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2254
  {
8447a0fee   Vladimir Davydov   cpuset: convert c...
2255
2256
2257
  	unsigned long flags;
  
  	spin_lock_irqsave(&callback_lock, flags);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2258
  	rcu_read_lock();
ae1c80238   Li Zefan   cpuset: apply cs-...
2259
  	guarantee_online_cpus(task_cs(tsk), pmask);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2260
  	rcu_read_unlock();
8447a0fee   Vladimir Davydov   cpuset: convert c...
2261
  	spin_unlock_irqrestore(&callback_lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2262
  }
2baab4e90   Peter Zijlstra   sched: Fix select...
2263
  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
9084bb824   Oleg Nesterov   sched: Make selec...
2264
  {
9084bb824   Oleg Nesterov   sched: Make selec...
2265
  	rcu_read_lock();
ae1c80238   Li Zefan   cpuset: apply cs-...
2266
  	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
9084bb824   Oleg Nesterov   sched: Make selec...
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
  	rcu_read_unlock();
  
  	/*
  	 * We own tsk->cpus_allowed, nobody can change it under us.
  	 *
  	 * But we used cs && cs->cpus_allowed lockless and thus can
  	 * race with cgroup_attach_task() or update_cpumask() and get
  	 * the wrong tsk->cpus_allowed. However, both cases imply the
  	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
  	 * which takes task_rq_lock().
  	 *
  	 * If we are called after it dropped the lock we must see all
  	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
  	 * set any mask even if it is not right from task_cs() pov,
  	 * the pending set_cpus_allowed_ptr() will fix things.
2baab4e90   Peter Zijlstra   sched: Fix select...
2282
2283
2284
  	 *
  	 * select_fallback_rq() will fix things ups and set cpu_possible_mask
  	 * if required.
9084bb824   Oleg Nesterov   sched: Make selec...
2285
  	 */
9084bb824   Oleg Nesterov   sched: Make selec...
2286
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2287
2288
  void cpuset_init_current_mems_allowed(void)
  {
f9a86fcbb   Mike Travis   cpuset: modify cp...
2289
  	nodes_setall(current->mems_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2290
  }
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2291
  /**
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2292
2293
2294
2295
2296
   * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
   * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
   *
   * Description: Returns the nodemask_t mems_allowed of the cpuset
   * attached to the specified @tsk.  Guaranteed to return some non-empty
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2297
   * subset of node_states[N_MEMORY], even if this means going outside the
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2298
2299
2300
2301
2302
2303
   * tasks cpuset.
   **/
  
  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
  {
  	nodemask_t mask;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2304
  	unsigned long flags;
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2305

8447a0fee   Vladimir Davydov   cpuset: convert c...
2306
  	spin_lock_irqsave(&callback_lock, flags);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2307
  	rcu_read_lock();
ae1c80238   Li Zefan   cpuset: apply cs-...
2308
  	guarantee_online_mems(task_cs(tsk), &mask);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2309
  	rcu_read_unlock();
8447a0fee   Vladimir Davydov   cpuset: convert c...
2310
  	spin_unlock_irqrestore(&callback_lock, flags);
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2311
2312
2313
2314
2315
  
  	return mask;
  }
  
  /**
19770b326   Mel Gorman   mm: filter based ...
2316
2317
   * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
   * @nodemask: the nodemask to be checked
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2318
   *
19770b326   Mel Gorman   mm: filter based ...
2319
   * Are any of the nodes in the nodemask allowed in current->mems_allowed?
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2320
   */
19770b326   Mel Gorman   mm: filter based ...
2321
  int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2322
  {
19770b326   Mel Gorman   mm: filter based ...
2323
  	return nodes_intersects(*nodemask, current->mems_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2324
  }
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2325
  /*
786083667   Paul Menage   Cpuset hardwall f...
2326
2327
   * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
   * mem_hardwall ancestor to the specified cpuset.  Call holding
8447a0fee   Vladimir Davydov   cpuset: convert c...
2328
   * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
786083667   Paul Menage   Cpuset hardwall f...
2329
   * (an unusual configuration), then returns the root cpuset.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2330
   */
c9710d801   Tejun Heo   cpuset: drop "con...
2331
  static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2332
  {
c431069fe   Tejun Heo   cpuset: remove cp...
2333
2334
  	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
  		cs = parent_cs(cs);
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2335
2336
  	return cs;
  }
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2337
  /**
344736f29   Vladimir Davydov   cpuset: simplify ...
2338
   * cpuset_node_allowed - Can we allocate on a memory node?
a1bc5a4ee   David Rientjes   cpusets: replace ...
2339
   * @node: is this an allowed node?
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2340
   * @gfp_mask: memory allocation flags
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2341
   *
a1bc5a4ee   David Rientjes   cpusets: replace ...
2342
2343
2344
2345
2346
2347
   * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
   * set, yes, we can always allocate.  If node is in our task's mems_allowed,
   * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
   * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
   * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
   * flag, yes.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2348
2349
   * Otherwise, no.
   *
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2350
2351
2352
2353
2354
2355
   * The __GFP_THISNODE placement logic is really handled elsewhere,
   * by forcibly using a zonelist starting at a specified node, and by
   * (in get_page_from_freelist()) refusing to consider the zones for
   * any node on the zonelist except the first.  By the time any such
   * calls get to this routine, we should just shut up and say 'yes'.
   *
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2356
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
c596d9f32   David Rientjes   cpusets: allow TI...
2357
2358
   * and do not allow allocations outside the current tasks cpuset
   * unless the task has been OOM killed as is marked TIF_MEMDIE.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2359
   * GFP_KERNEL allocations are not so marked, so can escape to the
786083667   Paul Menage   Cpuset hardwall f...
2360
   * nearest enclosing hardwalled ancestor cpuset.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2361
   *
8447a0fee   Vladimir Davydov   cpuset: convert c...
2362
   * Scanning up parent cpusets requires callback_lock.  The
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2363
2364
2365
2366
   * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
   * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
   * current tasks mems_allowed came up empty on the first pass over
   * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
8447a0fee   Vladimir Davydov   cpuset: convert c...
2367
   * cpuset are short of memory, might require taking the callback_lock.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2368
   *
36be57ffe   Paul Jackson   [PATCH] cpuset: u...
2369
   * The first call here from mm/page_alloc:get_page_from_freelist()
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2370
2371
2372
   * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
   * so no allocation on a node outside the cpuset is allowed (unless
   * in interrupt, of course).
36be57ffe   Paul Jackson   [PATCH] cpuset: u...
2373
2374
2375
2376
2377
2378
   *
   * The second pass through get_page_from_freelist() doesn't even call
   * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
   * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
   * in alloc_flags.  That logic and the checks below have the combined
   * affect that:
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2379
2380
   *	in_interrupt - any node ok (current task context irrelevant)
   *	GFP_ATOMIC   - any node ok
c596d9f32   David Rientjes   cpusets: allow TI...
2381
   *	TIF_MEMDIE   - any node ok
786083667   Paul Menage   Cpuset hardwall f...
2382
   *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2383
   *	GFP_USER     - only nodes in current tasks mems allowed ok.
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2384
   */
344736f29   Vladimir Davydov   cpuset: simplify ...
2385
  int __cpuset_node_allowed(int node, gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2386
  {
c9710d801   Tejun Heo   cpuset: drop "con...
2387
  	struct cpuset *cs;		/* current cpuset ancestors */
29afd49b7   Paul Jackson   [PATCH] cpuset: r...
2388
  	int allowed;			/* is allocation in zone z allowed? */
8447a0fee   Vladimir Davydov   cpuset: convert c...
2389
  	unsigned long flags;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2390

9b819d204   Christoph Lameter   [PATCH] Add __GFP...
2391
  	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2392
  		return 1;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2393
2394
  	if (node_isset(node, current->mems_allowed))
  		return 1;
c596d9f32   David Rientjes   cpusets: allow TI...
2395
2396
2397
2398
2399
2400
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return 1;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2401
2402
  	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
  		return 0;
5563e7707   Bob Picco   [PATCH] cpuset: f...
2403
2404
  	if (current->flags & PF_EXITING) /* Let dying task have memory */
  		return 1;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2405
  	/* Not hardwall and node outside mems_allowed: scan up cpusets */
8447a0fee   Vladimir Davydov   cpuset: convert c...
2406
  	spin_lock_irqsave(&callback_lock, flags);
053199edf   Paul Jackson   [PATCH] cpusets: ...
2407

b8dadcb58   Li Zefan   cpuset: use rcu_r...
2408
  	rcu_read_lock();
786083667   Paul Menage   Cpuset hardwall f...
2409
  	cs = nearest_hardwall_ancestor(task_cs(current));
99afb0fd5   Li Zefan   cpuset: fix a rac...
2410
  	allowed = node_isset(node, cs->mems_allowed);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2411
  	rcu_read_unlock();
053199edf   Paul Jackson   [PATCH] cpusets: ...
2412

8447a0fee   Vladimir Davydov   cpuset: convert c...
2413
  	spin_unlock_irqrestore(&callback_lock, flags);
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2414
  	return allowed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2415
  }
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2416
  /**
6adef3ebe   Jack Steiner   cpusets: new roun...
2417
2418
   * cpuset_mem_spread_node() - On which node to begin search for a file page
   * cpuset_slab_spread_node() - On which node to begin search for a slab page
825a46af5   Paul Jackson   [PATCH] cpuset me...
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
   *
   * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
   * tasks in a cpuset with is_spread_page or is_spread_slab set),
   * and if the memory allocation used cpuset_mem_spread_node()
   * to determine on which node to start looking, as it will for
   * certain page cache or slab cache pages such as used for file
   * system buffers and inode caches, then instead of starting on the
   * local node to look for a free page, rather spread the starting
   * node around the tasks mems_allowed nodes.
   *
   * We don't have to worry about the returned node being offline
   * because "it can't happen", and even if it did, it would be ok.
   *
   * The routines calling guarantee_online_mems() are careful to
   * only set nodes in task->mems_allowed that are online.  So it
   * should not be possible for the following code to return an
   * offline node.  But if it did, that would be ok, as this routine
   * is not returning the node where the allocation must be, only
   * the node where the search should start.  The zonelist passed to
   * __alloc_pages() will include all nodes.  If the slab allocator
   * is passed an offline node, it will fall back to the local node.
   * See kmem_cache_alloc_node().
   */
6adef3ebe   Jack Steiner   cpusets: new roun...
2442
  static int cpuset_spread_node(int *rotor)
825a46af5   Paul Jackson   [PATCH] cpuset me...
2443
2444
  {
  	int node;
6adef3ebe   Jack Steiner   cpusets: new roun...
2445
  	node = next_node(*rotor, current->mems_allowed);
825a46af5   Paul Jackson   [PATCH] cpuset me...
2446
2447
  	if (node == MAX_NUMNODES)
  		node = first_node(current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2448
  	*rotor = node;
825a46af5   Paul Jackson   [PATCH] cpuset me...
2449
2450
  	return node;
  }
6adef3ebe   Jack Steiner   cpusets: new roun...
2451
2452
2453
  
  int cpuset_mem_spread_node(void)
  {
778d3b0ff   Michal Hocko   cpusets: randomiz...
2454
2455
2456
  	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
  		current->cpuset_mem_spread_rotor =
  			node_random(&current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2457
2458
2459
2460
2461
  	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
  }
  
  int cpuset_slab_spread_node(void)
  {
778d3b0ff   Michal Hocko   cpusets: randomiz...
2462
2463
2464
  	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
  		current->cpuset_slab_spread_rotor =
  			node_random(&current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2465
2466
  	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
  }
825a46af5   Paul Jackson   [PATCH] cpuset me...
2467
2468
2469
  EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
  
  /**
bbe373f2c   David Rientjes   oom: compare cpus...
2470
2471
2472
2473
2474
2475
2476
2477
   * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
   * @tsk1: pointer to task_struct of some task.
   * @tsk2: pointer to task_struct of some other task.
   *
   * Description: Return true if @tsk1's mems_allowed intersects the
   * mems_allowed of @tsk2.  Used by the OOM killer to determine if
   * one of the task's memory usage might impact the memory available
   * to the other.
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2478
   **/
bbe373f2c   David Rientjes   oom: compare cpus...
2479
2480
  int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  				   const struct task_struct *tsk2)
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2481
  {
bbe373f2c   David Rientjes   oom: compare cpus...
2482
  	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2483
  }
f440d98f8   Li Zefan   cpuset: use cgrou...
2484
  #define CPUSET_NODELIST_LEN	(256)
75aa19941   David Rientjes   oom: print trigge...
2485
2486
  /**
   * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
fc34ac1dc   Fabian Frederick   kernel/cpuset.c: ...
2487
   * @tsk: pointer to task_struct of some task.
75aa19941   David Rientjes   oom: print trigge...
2488
2489
   *
   * Description: Prints @task's name, cpuset name, and cached copy of its
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2490
   * mems_allowed to the kernel log.
75aa19941   David Rientjes   oom: print trigge...
2491
2492
2493
   */
  void cpuset_print_task_mems_allowed(struct task_struct *tsk)
  {
f440d98f8   Li Zefan   cpuset: use cgrou...
2494
2495
2496
  	 /* Statically allocated to prevent using excess stack. */
  	static char cpuset_nodelist[CPUSET_NODELIST_LEN];
  	static DEFINE_SPINLOCK(cpuset_buffer_lock);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2497
  	struct cgroup *cgrp;
75aa19941   David Rientjes   oom: print trigge...
2498

f440d98f8   Li Zefan   cpuset: use cgrou...
2499
  	spin_lock(&cpuset_buffer_lock);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2500
  	rcu_read_lock();
63f43f55c   Li Zefan   cpuset: fix cpuse...
2501

b8dadcb58   Li Zefan   cpuset: use rcu_r...
2502
  	cgrp = task_cs(tsk)->css.cgroup;
75aa19941   David Rientjes   oom: print trigge...
2503
2504
  	nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN,
  			   tsk->mems_allowed);
12d3089c1   Fabian Frederick   kernel/cpuset.c: ...
2505
  	pr_info("%s cpuset=", tsk->comm);
e61734c55   Tejun Heo   cgroup: remove cg...
2506
2507
2508
  	pr_cont_cgroup_name(cgrp);
  	pr_cont(" mems_allowed=%s
  ", cpuset_nodelist);
f440d98f8   Li Zefan   cpuset: use cgrou...
2509

cfb5966be   Li Zefan   cpuset: fix RCU l...
2510
  	rcu_read_unlock();
75aa19941   David Rientjes   oom: print trigge...
2511
2512
  	spin_unlock(&cpuset_buffer_lock);
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2513
  /*
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2514
2515
2516
2517
   * Collection of memory_pressure is suppressed unless
   * this flag is enabled by writing "1" to the special
   * cpuset file 'memory_pressure_enabled' in the root cpuset.
   */
c5b2aff89   Paul Jackson   [PATCH] cpuset: m...
2518
  int cpuset_memory_pressure_enabled __read_mostly;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
  
  /**
   * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
   *
   * Keep a running average of the rate of synchronous (direct)
   * page reclaim efforts initiated by tasks in each cpuset.
   *
   * This represents the rate at which some task in the cpuset
   * ran low on memory on all nodes it was allowed to use, and
   * had to enter the kernels page reclaim code in an effort to
   * create more free memory by tossing clean pages or swapping
   * or writing dirty pages.
   *
   * Display to user space in the per-cpuset read-only file
   * "memory_pressure".  Value displayed is an integer
   * representing the recent rate of entry into the synchronous
   * (direct) page reclaim by any task attached to the cpuset.
   **/
  
  void __cpuset_memory_pressure_bump(void)
  {
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2540
  	rcu_read_lock();
8793d854e   Paul Menage   Task Control Grou...
2541
  	fmeter_markevent(&task_cs(current)->fmeter);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2542
  	rcu_read_unlock();
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2543
  }
8793d854e   Paul Menage   Task Control Grou...
2544
  #ifdef CONFIG_PROC_PID_CPUSET
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2545
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2546
2547
2548
   * proc_cpuset_show()
   *  - Print tasks cpuset path into seq_file.
   *  - Used for /proc/<pid>/cpuset.
053199edf   Paul Jackson   [PATCH] cpusets: ...
2549
2550
   *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
   *    doesn't really matter if tsk->cpuset changes after we read it,
5d21cc2db   Tejun Heo   cpuset: replace c...
2551
   *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
2df167a30   Paul Menage   cgroups: update c...
2552
   *    anyway.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2553
   */
52de4779f   Zefan Li   cpuset: simplify ...
2554
2555
  int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
  		     struct pid *pid, struct task_struct *tsk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2556
  {
e61734c55   Tejun Heo   cgroup: remove cg...
2557
  	char *buf, *p;
8793d854e   Paul Menage   Task Control Grou...
2558
  	struct cgroup_subsys_state *css;
99f895518   Eric W. Biederman   [PATCH] proc: don...
2559
  	int retval;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2560

99f895518   Eric W. Biederman   [PATCH] proc: don...
2561
  	retval = -ENOMEM;
e61734c55   Tejun Heo   cgroup: remove cg...
2562
  	buf = kmalloc(PATH_MAX, GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2563
  	if (!buf)
99f895518   Eric W. Biederman   [PATCH] proc: don...
2564
  		goto out;
e61734c55   Tejun Heo   cgroup: remove cg...
2565
  	retval = -ENAMETOOLONG;
27e89ae5d   Li Zefan   cpuset: fix RCU l...
2566
  	rcu_read_lock();
073219e99   Tejun Heo   cgroup: clean up ...
2567
  	css = task_css(tsk, cpuset_cgrp_id);
e61734c55   Tejun Heo   cgroup: remove cg...
2568
  	p = cgroup_path(css->cgroup, buf, PATH_MAX);
27e89ae5d   Li Zefan   cpuset: fix RCU l...
2569
  	rcu_read_unlock();
e61734c55   Tejun Heo   cgroup: remove cg...
2570
  	if (!p)
52de4779f   Zefan Li   cpuset: simplify ...
2571
  		goto out_free;
e61734c55   Tejun Heo   cgroup: remove cg...
2572
  	seq_puts(m, p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2573
2574
  	seq_putc(m, '
  ');
e61734c55   Tejun Heo   cgroup: remove cg...
2575
  	retval = 0;
99f895518   Eric W. Biederman   [PATCH] proc: don...
2576
  out_free:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2577
  	kfree(buf);
99f895518   Eric W. Biederman   [PATCH] proc: don...
2578
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2579
2580
  	return retval;
  }
8793d854e   Paul Menage   Task Control Grou...
2581
  #endif /* CONFIG_PROC_PID_CPUSET */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2582

d01d48278   Heiko Carstens   sched: Always sho...
2583
  /* Display task mems_allowed in /proc/<pid>/status file. */
df5f8314c   Eric W. Biederman   proc: seqfile con...
2584
2585
  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
  {
fc34ac1dc   Fabian Frederick   kernel/cpuset.c: ...
2586
  	seq_puts(m, "Mems_allowed:\t");
30e8e1360   Lai Jiangshan   cpuset: use seq_*...
2587
  	seq_nodemask(m, &task->mems_allowed);
fc34ac1dc   Fabian Frederick   kernel/cpuset.c: ...
2588
2589
2590
  	seq_puts(m, "
  ");
  	seq_puts(m, "Mems_allowed_list:\t");
30e8e1360   Lai Jiangshan   cpuset: use seq_*...
2591
  	seq_nodemask_list(m, &task->mems_allowed);
fc34ac1dc   Fabian Frederick   kernel/cpuset.c: ...
2592
2593
  	seq_puts(m, "
  ");
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2594
  }