kernel/cpuset.c 75.2 KB
  /*
   *  kernel/cpuset.c
   *
   *  Processor and Memory placement constraints for sets of tasks.
   *
   *  Copyright (C) 2003 BULL SA.
   *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
   *  Copyright (C) 2006 Google, Inc
   *
   *  Portions derived from Patrick Mochel's sysfs code.
   *  sysfs is Copyright (c) 2001-3 Patrick Mochel
   *
   *  2003-10-10 Written by Simon Derr.
   *  2003-10-22 Updates by Stephen Hemminger.
   *  2004 May-July Rework by Paul Jackson.
   *  2006 Rework by Paul Menage to use generic cgroups
   *  2008 Rework of the scheduler domains and CPU hotplug handling
   *       by Max Krasnyansky
   *
   *  This file is subject to the terms and conditions of the GNU General Public
   *  License.  See the file COPYING in the main directory of the Linux
   *  distribution for more details.
   */
  #include <linux/cpu.h>
  #include <linux/cpumask.h>
  #include <linux/cpuset.h>
  #include <linux/err.h>
  #include <linux/errno.h>
  #include <linux/file.h>
  #include <linux/fs.h>
  #include <linux/init.h>
  #include <linux/interrupt.h>
  #include <linux/kernel.h>
  #include <linux/kmod.h>
  #include <linux/list.h>
  #include <linux/mempolicy.h>
  #include <linux/mm.h>
  #include <linux/memory.h>
  #include <linux/export.h>
  #include <linux/mount.h>
  #include <linux/namei.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
  #include <linux/rcupdate.h>
  #include <linux/sched.h>
  #include <linux/seq_file.h>
  #include <linux/security.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>
  #include <linux/stat.h>
  #include <linux/string.h>
  #include <linux/time.h>
  #include <linux/backing-dev.h>
  #include <linux/sort.h>

  #include <asm/uaccess.h>
  #include <linux/atomic.h>
  #include <linux/mutex.h>
  #include <linux/workqueue.h>
  #include <linux/cgroup.h>
  #include <linux/wait.h>

  struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;

  /* See "Frequency meter" comments, below. */
  
  struct fmeter {
  	int cnt;		/* unprocessed events count */
  	int val;		/* most recent output value */
  	time_t time;		/* clock (secs) when val computed */
  	spinlock_t lock;	/* guards read or write of above */
  };
  struct cpuset {
  	struct cgroup_subsys_state css;
  	unsigned long flags;		/* "unsigned long" so bitops work */

  	/*
  	 * On default hierarchy:
  	 *
  	 * The user-configured masks can only be changed by writing to
  	 * cpuset.cpus and cpuset.mems, and won't be limited by the
  	 * parent masks.
  	 *
   * The effective masks are the real masks that apply to the tasks
  	 * in the cpuset. They may be changed if the configured masks are
  	 * changed or hotplug happens.
  	 *
  	 * effective_mask == configured_mask & parent's effective_mask,
  	 * and if it ends up empty, it will inherit the parent's mask.
  	 *
  	 *
   * On legacy hierarchy:
  	 *
   * The user-configured masks are always the same as the effective masks.
  	 */
  	/* user-configured CPUs and Memory Nodes allowed to tasks */
  	cpumask_var_t cpus_allowed;
  	nodemask_t mems_allowed;

  	/* effective CPUs and Memory Nodes allowed to tasks */
  	cpumask_var_t effective_cpus;
  	nodemask_t effective_mems;

  	/*
  	 * These are the old Memory Nodes that tasks took on.
  	 *
  	 * - top_cpuset.old_mems_allowed is initialized to mems_allowed.
  	 * - A new cpuset's old_mems_allowed is initialized when some
  	 *   task is moved into it.
  	 * - old_mems_allowed is used in cpuset_migrate_mm() when we change
  	 *   cpuset.mems_allowed and have tasks' nodemask updated, and
  	 *   then old_mems_allowed is updated to mems_allowed.
  	 */
  	nodemask_t old_mems_allowed;
  	struct fmeter fmeter;		/* memory_pressure filter */

  	/*
  	 * Tasks are being attached to this cpuset.  Used to prevent
  	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
  	 */
  	int attach_in_progress;
  	/* partition number for rebuild_sched_domains() */
  	int pn;

  	/* for custom sched domain */
  	int relax_domain_level;
  };
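  /*
   * Illustrative sketch (hypothetical mask values) of the effective-mask
   * rule documented above, using the same helper that
   * update_cpumasks_hier() below relies on:
   *
   *	parent->effective_cpus: 0-3
   *	cs->cpus_allowed:       2-5   (user-configured)
   *
   *	cpumask_and(cs->effective_cpus, cs->cpus_allowed,
   *		    parent->effective_cpus);
   *
   * leaves cs->effective_cpus == 2-3.  On the default hierarchy, if the
   * result were empty, the cpuset would inherit the parent's effective
   * mask instead.
   */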
  static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
  {
  	return css ? container_of(css, struct cpuset, css) : NULL;
  }
  
  /* Retrieve the cpuset for a task */
  static inline struct cpuset *task_cs(struct task_struct *task)
  {
  	return css_cs(task_css(task, cpuset_cgrp_id));
  }

  static inline struct cpuset *parent_cs(struct cpuset *cs)
  {
  	return css_cs(cs->css.parent);
  }
  #ifdef CONFIG_NUMA
  static inline bool task_has_mempolicy(struct task_struct *task)
  {
  	return task->mempolicy;
  }
  #else
  static inline bool task_has_mempolicy(struct task_struct *task)
  {
  	return false;
  }
  #endif
  /* bits in struct cpuset flags field */
  typedef enum {
  	CS_ONLINE,
  	CS_CPU_EXCLUSIVE,
  	CS_MEM_EXCLUSIVE,
  	CS_MEM_HARDWALL,
  	CS_MEMORY_MIGRATE,
  	CS_SCHED_LOAD_BALANCE,
  	CS_SPREAD_PAGE,
  	CS_SPREAD_SLAB,
  } cpuset_flagbits_t;
  
  /* convenient tests for these bits */
  static inline bool is_cpuset_online(const struct cpuset *cs)
  {
  	return test_bit(CS_ONLINE, &cs->flags);
  }
  static inline int is_cpu_exclusive(const struct cpuset *cs)
  {
  	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
  }
  
  static inline int is_mem_exclusive(const struct cpuset *cs)
  {
  	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
  }
  static inline int is_mem_hardwall(const struct cpuset *cs)
  {
  	return test_bit(CS_MEM_HARDWALL, &cs->flags);
  }
  static inline int is_sched_load_balance(const struct cpuset *cs)
  {
  	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
  }
  static inline int is_memory_migrate(const struct cpuset *cs)
  {
  	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
  }
  static inline int is_spread_page(const struct cpuset *cs)
  {
  	return test_bit(CS_SPREAD_PAGE, &cs->flags);
  }
  
  static inline int is_spread_slab(const struct cpuset *cs)
  {
  	return test_bit(CS_SPREAD_SLAB, &cs->flags);
  }
  static struct cpuset top_cpuset = {
  	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
  		  (1 << CS_MEM_EXCLUSIVE)),
  };
  /**
   * cpuset_for_each_child - traverse online children of a cpuset
   * @child_cs: loop cursor pointing to the current child
   * @pos_css: used for iteration
   * @parent_cs: target cpuset to walk children of
   *
   * Walk @child_cs through the online children of @parent_cs.  Must be used
   * with RCU read locked.
   */
  #define cpuset_for_each_child(child_cs, pos_css, parent_cs)		\
  	css_for_each_child((pos_css), &(parent_cs)->css)		\
  		if (is_cpuset_online(((child_cs) = css_cs((pos_css)))))

  /**
   * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
   * @des_cs: loop cursor pointing to the current descendant
   * @pos_css: used for iteration
   * @root_cs: target cpuset to walk descendants of
   *
   * Walk @des_cs through the online descendants of @root_cs.  Must be used
   * with RCU read locked.  The caller may modify @pos_css by calling
   * css_rightmost_descendant() to skip subtree.  @root_cs is included in the
   * iteration and the first node to be visited.
   */
  #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	\
  	css_for_each_descendant_pre((pos_css), &(root_cs)->css)		\
  		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
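  /*
   * Usage sketch (hypothetical caller): both iterators are filtered
   * css_for_each_*() walks, so they must run under rcu_read_lock(), as
   * validate_change() and update_cpumasks_hier() below do:
   *
   *	struct cpuset *cp;
   *	struct cgroup_subsys_state *pos;
   *
   *	rcu_read_lock();
   *	cpuset_for_each_descendant_pre(cp, pos, &top_cpuset)
   *		pr_debug("cpuset %p is online\n", cp);
   *	rcu_read_unlock();
   */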

  /*
   * There are two global locks guarding cpuset structures - cpuset_mutex and
   * callback_lock. We also require taking task_lock() when dereferencing a
   * task's cpuset pointer. See "The task_lock() exception", at the end of this
   * comment.
   *
   * A task must hold both locks to modify cpusets.  If a task holds
   * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
   * is the only task able to also acquire callback_lock and be able to
   * modify cpusets.  It can perform various checks on the cpuset structure
   * first, knowing nothing will change.  It can also allocate memory while
   * just holding cpuset_mutex.  While it is performing these checks, various
   * callback routines can briefly acquire callback_lock to query cpusets.
   * Once it is ready to make the changes, it takes callback_lock, blocking
   * everyone else.
   *
   * Calls to the kernel memory allocator can not be made while holding
   * callback_lock, as that would risk double tripping on callback_lock
   * from one of the callbacks into the cpuset code from within
   * __alloc_pages().
   *
   * If a task is only holding callback_lock, then it has read-only
   * access to cpusets.
   *
   * Now, the task_struct fields mems_allowed and mempolicy may be changed
   * by another task, so we use alloc_lock in the task_struct to protect
   * them.
   *
   * The cpuset_common_file_read() handlers only hold callback_lock across
   * small pieces of code, such as when reading out possibly multi-word
   * cpumasks and nodemasks.
   *
   * Accessing a task's cpuset should be done in accordance with the
   * guidelines for accessing subsystem state in kernel/cgroup.c
   */
  static DEFINE_MUTEX(cpuset_mutex);
  static DEFINE_SPINLOCK(callback_lock);
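  /*
   * Minimal sketch of the locking rules above, following the pattern that
   * update_cpumask() uses later in this file: writers nest callback_lock
   * inside cpuset_mutex, while readers may take callback_lock alone.
   *
   *	mutex_lock(&cpuset_mutex);
   *	... validate the change, allocate memory if needed ...
   *	spin_lock_irq(&callback_lock);
   *	... publish the new masks ...
   *	spin_unlock_irq(&callback_lock);
   *	mutex_unlock(&cpuset_mutex);
   */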

  /*
   * CPU / memory hotplug is handled asynchronously.
   */
  static void cpuset_hotplug_workfn(struct work_struct *work);
  static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
  static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
  /*
   * This is ugly, but preserves the userspace API for existing cpuset
   * users. If someone tries to mount the "cpuset" filesystem, we
   * silently switch it to mount "cgroup" instead
   */
  static struct dentry *cpuset_mount(struct file_system_type *fs_type,
  			 int flags, const char *unused_dev_name, void *data)
  {
  	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
  	struct dentry *ret = ERR_PTR(-ENODEV);
  	if (cgroup_fs) {
  		char mountopts[] =
  			"cpuset,noprefix,"
  			"release_agent=/sbin/cpuset_release_agent";
  		ret = cgroup_fs->mount(cgroup_fs, flags,
  					   unused_dev_name, mountopts);
  		put_filesystem(cgroup_fs);
  	}
  	return ret;
  }

  static struct file_system_type cpuset_fs_type = {
  	.name = "cpuset",
  	.mount = cpuset_mount,
  };
  /*
   * Return in pmask the portion of a cpuset's cpus_allowed that
   * are online.  If none are online, walk up the cpuset hierarchy
   * until we find one that does have some online cpus.  The top
   * cpuset always has some cpus online.
   *
   * One way or another, we guarantee to return some non-empty subset
   * of cpu_online_mask.
   *
   * Call with callback_lock or cpuset_mutex held.
   */
  static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  {
  	while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
  		cs = parent_cs(cs);
  	cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
  }
  
  /*
   * Return in *pmask the portion of a cpuset's mems_allowed that
   * are online, with memory.  If none are online with memory, walk
   * up the cpuset hierarchy until we find one that does have some
   * online mems.  The top cpuset always has some mems online.
   *
   * One way or another, we guarantee to return some non-empty subset
   * of node_states[N_MEMORY].
   *
   * Call with callback_lock or cpuset_mutex held.
   */
  static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
  {
  	while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
  		cs = parent_cs(cs);
  	nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
  }
  /*
   * update task's spread flag if cpuset's page/slab spread flag is set
   *
   * Call with callback_lock or cpuset_mutex held.
   */
  static void cpuset_update_task_spread_flag(struct cpuset *cs,
  					struct task_struct *tsk)
  {
  	if (is_spread_page(cs))
  		task_set_spread_page(tsk);
  	else
  		task_clear_spread_page(tsk);
  	if (is_spread_slab(cs))
  		task_set_spread_slab(tsk);
  	else
  		task_clear_spread_slab(tsk);
  }
  /*
   * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
   *
   * One cpuset is a subset of another if all its allowed CPUs and
   * Memory Nodes are a subset of the other, and its exclusive flags
   * are only set if the other's are set.  Call holding cpuset_mutex.
   */

  static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  {
  	return	cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
  		nodes_subset(p->mems_allowed, q->mems_allowed) &&
  		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
  		is_mem_exclusive(p) <= is_mem_exclusive(q);
  }
  /**
   * alloc_trial_cpuset - allocate a trial cpuset
   * @cs: the cpuset that the trial cpuset duplicates
   */
  static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
  {
  	struct cpuset *trial;

  	trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL);
  	if (!trial)
  		return NULL;
  	if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
  		goto free_cs;
  	if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
  		goto free_cpus;

  	cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
  	cpumask_copy(trial->effective_cpus, cs->effective_cpus);
  	return trial;

  free_cpus:
  	free_cpumask_var(trial->cpus_allowed);
  free_cs:
  	kfree(trial);
  	return NULL;
  }

  /**
   * free_trial_cpuset - free the trial cpuset
   * @trial: the trial cpuset to be freed
   */
  static void free_trial_cpuset(struct cpuset *trial)
  {
  	free_cpumask_var(trial->effective_cpus);
  	free_cpumask_var(trial->cpus_allowed);
  	kfree(trial);
  }
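  /*
   * Sketch of a hypothetical caller of the two helpers above: duplicate
   * the cpuset, let update_cpumask() (below) edit and validate the copy,
   * then always free it:
   *
   *	struct cpuset *trialcs = alloc_trial_cpuset(cs);
   *
   *	if (!trialcs)
   *		return -ENOMEM;
   *	retval = update_cpumask(cs, trialcs, buf);
   *	free_trial_cpuset(trialcs);
   */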
  /*
   * validate_change() - Used to validate that any proposed cpuset change
   *		       follows the structural rules for cpusets.
   *
   * If we replaced the flag and mask values of the current cpuset
   * (cur) with those values in the trial cpuset (trial), would
   * our various subset and exclusive rules still be valid?  Presumes
   * cpuset_mutex held.
   *
   * 'cur' is the address of an actual, in-use cpuset.  Operations
   * such as list traversal that depend on the actual address of the
   * cpuset in the list must use cur below, not trial.
   *
   * 'trial' is the address of a bulk structure copy of cur, with
   * perhaps one or more of the fields cpus_allowed, mems_allowed,
   * or flags changed to new, trial values.
   *
   * Return 0 if valid, -errno if not.
   */
  static int validate_change(struct cpuset *cur, struct cpuset *trial)
  {
  	struct cgroup_subsys_state *css;
  	struct cpuset *c, *par;
  	int ret;

  	rcu_read_lock();

  	/* Each of our child cpusets must be a subset of us */
  	ret = -EBUSY;
  	cpuset_for_each_child(c, css, cur)
  		if (!is_cpuset_subset(c, trial))
  			goto out;

  	/* Remaining checks don't apply to root cpuset */
  	ret = 0;
  	if (cur == &top_cpuset)
  		goto out;

  	par = parent_cs(cur);

  	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
  	ret = -EACCES;
  	if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
  		goto out;

  	/*
  	 * If either I or some sibling (!= me) is exclusive, we can't
  	 * overlap
  	 */
  	ret = -EINVAL;
  	cpuset_for_each_child(c, css, par) {
  		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
  		    c != cur &&
  		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
  			goto out;
  		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
  		    c != cur &&
  		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
  			goto out;
  	}
  	/*
  	 * Cpusets with tasks - existing or newly being attached - can't
  	 * be changed to have empty cpus_allowed or mems_allowed.
  	 */
  	ret = -ENOSPC;
  	if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
  		if (!cpumask_empty(cur->cpus_allowed) &&
  		    cpumask_empty(trial->cpus_allowed))
  			goto out;
  		if (!nodes_empty(cur->mems_allowed) &&
  		    nodes_empty(trial->mems_allowed))
  			goto out;
  	}

  	/*
  	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
  	 * tasks.
  	 */
  	ret = -EBUSY;
  	if (is_cpu_exclusive(cur) &&
  	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
  				       trial->cpus_allowed))
  		goto out;
  	ret = 0;
  out:
  	rcu_read_unlock();
  	return ret;
  }
  #ifdef CONFIG_SMP
  /*
   * Helper routine for generate_sched_domains().
   * Do cpusets a, b have overlapping effective cpus_allowed masks?
   */
  static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
  {
  	return cpumask_intersects(a->effective_cpus, b->effective_cpus);
  }
  static void
  update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  {
  	if (dattr->relax_domain_level < c->relax_domain_level)
  		dattr->relax_domain_level = c->relax_domain_level;
  	return;
  }
  static void update_domain_attr_tree(struct sched_domain_attr *dattr,
  				    struct cpuset *root_cs)
  {
  	struct cpuset *cp;
  	struct cgroup_subsys_state *pos_css;

  	rcu_read_lock();
  	cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
  		/* skip the whole subtree if @cp doesn't have any CPU */
  		if (cpumask_empty(cp->cpus_allowed)) {
  			pos_css = css_rightmost_descendant(pos_css);
  			continue;
  		}

  		if (is_sched_load_balance(cp))
  			update_domain_attr(dattr, cp);
  	}
  	rcu_read_unlock();
  }
  /*
   * generate_sched_domains()
   *
   * This function builds a partial partition of the system's CPUs.
   * A 'partial partition' is a set of non-overlapping subsets whose
   * union is a subset of that set.
   * The output of this function needs to be passed to kernel/sched/core.c
   * partition_sched_domains() routine, which will rebuild the scheduler's
   * load balancing domains (sched domains) as specified by that partial
   * partition.
   *
   * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt
   * for a background explanation of this.
   *
   * Does not return errors, on the theory that the callers of this
   * routine would rather not worry about failures to rebuild sched
   * domains when operating in the severe memory shortage situations
   * that could cause allocation failures below.
   *
   * Must be called with cpuset_mutex held.
   *
   * The three key local variables below are:
   *    q  - a linked-list queue of cpuset pointers, used to implement a
   *	   top-down scan of all cpusets.  This scan loads a pointer
   *	   to each cpuset marked is_sched_load_balance into the
   *	   array 'csa'.  For our purposes, rebuilding the scheduler's
   *	   sched domains, we can ignore !is_sched_load_balance cpusets.
   *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
   *	   that need to be load balanced, for convenient iterative
   *	   access by the subsequent code that finds the best partition,
   *	   i.e. the set of domains (subsets) of CPUs such that the
   *	   cpus_allowed of every cpuset marked is_sched_load_balance
   *	   is a subset of one of these domains, while there are as
   *	   many such domains as possible, each as small as possible.
   * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
   *	   the kernel/sched/core.c routine partition_sched_domains() in a
   *	   convenient format, that can be easily compared to the prior
   *	   value to determine what partition elements (sched domains)
   *	   were changed (added or removed.)
   *
   * Finding the best partition (set of domains):
   *	The triple nested loops below over i, j, k scan over the
   *	load balanced cpusets (using the array of cpuset pointers in
   *	csa[]) looking for pairs of cpusets that have overlapping
   *	cpus_allowed, but which don't have the same 'pn' partition
   *	number, and puts them in the same partition number.  It keeps
   *	looping on the 'restart' label until it can no longer find
   *	any such pairs.
   *
   *	The union of the cpus_allowed masks from the set of
   *	all cpusets having the same 'pn' value then form the one
   *	element of the partition (one sched domain) to be passed to
   *	partition_sched_domains().
   */
  static int generate_sched_domains(cpumask_var_t **domains,
  			struct sched_domain_attr **attributes)
  {
  	struct cpuset *cp;	/* scans q */
  	struct cpuset **csa;	/* array of all cpuset ptrs */
  	int csn;		/* how many cpuset ptrs in csa so far */
  	int i, j, k;		/* indices for partition finding loops */
  	cpumask_var_t *doms;	/* resulting partition; i.e. sched domains */
  	cpumask_var_t non_isolated_cpus;  /* load balanced CPUs */
  	struct sched_domain_attr *dattr;  /* attributes for custom domains */
  	int ndoms = 0;		/* number of sched domains in result */
  	int nslot;		/* next empty doms[] struct cpumask slot */
  	struct cgroup_subsys_state *pos_css;

  	doms = NULL;
  	dattr = NULL;
  	csa = NULL;

  	if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
  		goto done;
  	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
  	/* Special case for the 99% of systems with one, full, sched domain */
  	if (is_sched_load_balance(&top_cpuset)) {
  		ndoms = 1;
  		doms = alloc_sched_domains(ndoms);
  		if (!doms)
  			goto done;
  		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
  		if (dattr) {
  			*dattr = SD_ATTR_INIT;
  			update_domain_attr_tree(dattr, &top_cpuset);
  		}
  		cpumask_and(doms[0], top_cpuset.effective_cpus,
  				     non_isolated_cpus);

  		goto done;
  	}
  	csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL);
  	if (!csa)
  		goto done;
  	csn = 0;
  	rcu_read_lock();
  	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
  		if (cp == &top_cpuset)
  			continue;
  		/*
  		 * Continue traversing beyond @cp iff @cp has some CPUs and
  		 * isn't load balancing.  The former is obvious.  The
  		 * latter: All child cpusets contain a subset of the
  		 * parent's cpus, so just skip them, and then we call
  		 * update_domain_attr_tree() to calc relax_domain_level of
  		 * the corresponding sched domain.
  		 */
  		if (!cpumask_empty(cp->cpus_allowed) &&
  		    !(is_sched_load_balance(cp) &&
  		      cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
  			continue;

  		if (is_sched_load_balance(cp))
  			csa[csn++] = cp;

  		/* skip @cp's subtree */
  		pos_css = css_rightmost_descendant(pos_css);
  	}
  	rcu_read_unlock();
  
  	for (i = 0; i < csn; i++)
  		csa[i]->pn = i;
  	ndoms = csn;
  
  restart:
  	/* Find the best partition (set of sched domains) */
  	for (i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		int apn = a->pn;
  
  		for (j = 0; j < csn; j++) {
  			struct cpuset *b = csa[j];
  			int bpn = b->pn;
  
  			if (apn != bpn && cpusets_overlap(a, b)) {
  				for (k = 0; k < csn; k++) {
  					struct cpuset *c = csa[k];
  
  					if (c->pn == bpn)
  						c->pn = apn;
  				}
  				ndoms--;	/* one less element */
  				goto restart;
  			}
  		}
  	}
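  	/*
  	 * Worked example (hypothetical masks): with csa[] = { A:0-1, B:1-2,
  	 * C:4-5 } and initial pn values 0, 1, 2, the scan above finds that
  	 * A and B overlap, relabels B's pn to A's, drops ndoms to 2 and
  	 * restarts.  No overlapping pair with different pn remains, so the
  	 * result is two sched domains: {A, B} covering 0-2 and {C}
  	 * covering 4-5.
  	 */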
  	/*
  	 * Now we know how many domains to create.
  	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
  	 */
  	doms = alloc_sched_domains(ndoms);
  	if (!doms)
  		goto done;

  	/*
  	 * The rest of the code, including the scheduler, can deal with
  	 * dattr==NULL case. No need to abort if alloc fails.
  	 */
  	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

  	for (nslot = 0, i = 0; i < csn; i++) {
  		struct cpuset *a = csa[i];
  		struct cpumask *dp;
  		int apn = a->pn;
  		if (apn < 0) {
  			/* Skip completed partitions */
  			continue;
  		}
  		dp = doms[nslot];

  		if (nslot == ndoms) {
  			static int warnings = 10;
  			if (warnings) {
  				pr_warn("rebuild_sched_domains confused: nslot %d, ndoms %d, csn %d, i %d, apn %d\n",
  					nslot, ndoms, csn, i, apn);
  				warnings--;
  			}
  			continue;
  		}

  		cpumask_clear(dp);
  		if (dattr)
  			*(dattr + nslot) = SD_ATTR_INIT;
  		for (j = i; j < csn; j++) {
  			struct cpuset *b = csa[j];

  			if (apn == b->pn) {
  				cpumask_or(dp, dp, b->effective_cpus);
  				cpumask_and(dp, dp, non_isolated_cpus);
  				if (dattr)
  					update_domain_attr_tree(dattr + nslot, b);

  				/* Done with this partition */
  				b->pn = -1;
  			}
  		}
  		nslot++;
  	}
  	BUG_ON(nslot != ndoms);
  done:
  	free_cpumask_var(non_isolated_cpus);
  	kfree(csa);
  	/*
  	 * Fallback to the default domain if kmalloc() failed.
  	 * See comments in partition_sched_domains().
  	 */
  	if (doms == NULL)
  		ndoms = 1;
  	*domains    = doms;
  	*attributes = dattr;
  	return ndoms;
  }
  
  /*
   * Rebuild scheduler domains.
   *
   * If the flag 'sched_load_balance' of any cpuset with non-empty
   * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
   * which has that flag enabled, or if any cpuset with a non-empty
   * 'cpus' is removed, then call this routine to rebuild the
   * scheduler's dynamic sched domains.
   *
   * Call with cpuset_mutex held.  Takes get_online_cpus().
   */
  static void rebuild_sched_domains_locked(void)
  {
  	struct sched_domain_attr *attr;
  	cpumask_var_t *doms;
  	int ndoms;
  	lockdep_assert_held(&cpuset_mutex);
  	get_online_cpus();

  	/*
  	 * We have raced with CPU hotplug. Don't do anything to avoid
  	 * passing doms with offlined cpu to partition_sched_domains().
  	 * Anyway, the hotplug work item will rebuild sched domains.
  	 */
  	if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
  		goto out;
  	/* Generate domain masks and attrs */
  	ndoms = generate_sched_domains(&doms, &attr);

  	/* Have scheduler rebuild the domains */
  	partition_sched_domains(ndoms, doms, attr);
  out:
  	put_online_cpus();
  }
  #else /* !CONFIG_SMP */
  static void rebuild_sched_domains_locked(void)
  {
  }
  #endif /* CONFIG_SMP */

  void rebuild_sched_domains(void)
  {
  	mutex_lock(&cpuset_mutex);
  	rebuild_sched_domains_locked();
  	mutex_unlock(&cpuset_mutex);
  }
  /**
   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
   *
   * Iterate through each task of @cs updating its cpus_allowed to the
   * effective cpuset's.  As this function is called with cpuset_mutex held,
   * cpuset membership stays stable.
   */
  static void update_tasks_cpumask(struct cpuset *cs)
  {
  	struct css_task_iter it;
  	struct task_struct *task;

  	css_task_iter_start(&cs->css, &it);
  	while ((task = css_task_iter_next(&it)))
  		set_cpus_allowed_ptr(task, cs->effective_cpus);
  	css_task_iter_end(&it);
  }
  /*
   * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
   * @cs: the cpuset to consider
   * @new_cpus: temp variable for calculating new effective_cpus
   *
   * When the configured cpumask is changed, the effective cpumasks of this
   * cpuset and all its descendants need to be updated.
   *
   * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
   *
   * Called with cpuset_mutex held
   */
  static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
  {
  	struct cpuset *cp;
  	struct cgroup_subsys_state *pos_css;
  	bool need_rebuild_sched_domains = false;

  	rcu_read_lock();
  	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
  		struct cpuset *parent = parent_cs(cp);
  
  		cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
  		/*
  		 * If it becomes empty, inherit the effective mask of the
  		 * parent, which is guaranteed to have some CPUs.
  		 */
  		if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
  			cpumask_copy(new_cpus, parent->effective_cpus);
  		/* Skip the whole subtree if the cpumask remains the same. */
  		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
  			pos_css = css_rightmost_descendant(pos_css);
  			continue;
  		}

  		if (!css_tryget_online(&cp->css))
  			continue;
  		rcu_read_unlock();
  		spin_lock_irq(&callback_lock);
  		cpumask_copy(cp->effective_cpus, new_cpus);
  		spin_unlock_irq(&callback_lock);

  		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
  			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
  		update_tasks_cpumask(cp);

  		/*
  		 * If the effective cpumask of any non-empty cpuset is changed,
  		 * we need to rebuild sched domains.
  		 */
  		if (!cpumask_empty(cp->cpus_allowed) &&
  		    is_sched_load_balance(cp))
  			need_rebuild_sched_domains = true;

  		rcu_read_lock();
  		css_put(&cp->css);
  	}
  	rcu_read_unlock();

  	if (need_rebuild_sched_domains)
  		rebuild_sched_domains_locked();
  }
  /**
   * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
   * @cs: the cpuset to consider
   * @trialcs: trial cpuset
   * @buf: buffer of cpu numbers written to this cpuset
   */
  static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  			  const char *buf)
  {
  	int retval;

  	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
  	if (cs == &top_cpuset)
  		return -EACCES;
  	/*
  	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
  	 * Since cpulist_parse() fails on an empty mask, we special case
  	 * that parsing.  The validate_change() call ensures that cpusets
  	 * with tasks have cpus.
  	 */
  	if (!*buf) {
  		cpumask_clear(trialcs->cpus_allowed);
  	} else {
  		retval = cpulist_parse(buf, trialcs->cpus_allowed);
  		if (retval < 0)
  			return retval;

  		if (!cpumask_subset(trialcs->cpus_allowed,
  				    top_cpuset.cpus_allowed))
  			return -EINVAL;
  	}

  	/* Nothing to do if the cpus didn't change */
  	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
  		return 0;

  	retval = validate_change(cs, trialcs);
  	if (retval < 0)
  		return retval;
  	spin_lock_irq(&callback_lock);
  	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
  	spin_unlock_irq(&callback_lock);

  	/* use trialcs->cpus_allowed as a temp variable */
  	update_cpumasks_hier(cs, trialcs->cpus_allowed);
  	return 0;
  }
  /*
   * cpuset_migrate_mm
   *
   *    Migrate memory region from one set of nodes to another.
   *
   *    Temporarily set tasks' mems_allowed to target nodes of migration,
   *    so that the migration code can allocate pages on these nodes.
   *
   *    While the mm_struct we are migrating is typically from some
   *    other task, the task_struct mems_allowed that we are hacking
   *    is for our current task, which must allocate new pages for that
   *    migrating memory region.
   */

  static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
  							const nodemask_t *to)
  {
  	struct task_struct *tsk = current;
  	tsk->mems_allowed = *to;

  	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
  	rcu_read_lock();
  	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
  	rcu_read_unlock();
  }
  /*
   * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
   * @tsk: the task to change
   * @newmems: new nodes that the task will be set to
   *
   * In order to avoid seeing no nodes if the old and new nodes are disjoint,
   * we structure updates as setting all new allowed nodes, then clearing newly
   * disallowed ones.
   */
  static void cpuset_change_task_nodemask(struct task_struct *tsk,
  					nodemask_t *newmems)
  {
b246272ec   David Rientjes   cpusets: stall wh...
952
  	bool need_loop;
89e8a244b   David Rientjes   cpusets: avoid lo...
953

c0ff7453b   Miao Xie   cpuset,mm: fix no...
954
955
956
957
958
959
960
961
962
963
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return;
  	if (current->flags & PF_EXITING) /* Let dying task have memory */
  		return;
  
  	task_lock(tsk);
b246272ec   David Rientjes   cpusets: stall wh...
964
965
  	/*
  	 * Determine if a loop is necessary if another thread is doing
d26914d11   Mel Gorman   mm: optimize put_...
966
  	 * read_mems_allowed_begin().  If at least one node remains unchanged and
b246272ec   David Rientjes   cpusets: stall wh...
967
968
969
970
971
  	 * tsk does not have a mempolicy, then an empty nodemask will not be
  	 * possible when mems_allowed is larger than a word.
  	 */
  	need_loop = task_has_mempolicy(tsk) ||
  			!nodes_intersects(*newmems, tsk->mems_allowed);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
972

0fc0287c9   Peter Zijlstra   cpuset: Fix memor...
973
974
  	if (need_loop) {
  		local_irq_disable();
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
975
  		write_seqcount_begin(&tsk->mems_allowed_seq);
0fc0287c9   Peter Zijlstra   cpuset: Fix memor...
976
  	}
c0ff7453b   Miao Xie   cpuset,mm: fix no...
977

cc9a6c877   Mel Gorman   cpuset: mm: reduc...
978
979
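  	/*
  	 * Worked example of the set-then-clear ordering described in the
  	 * function comment above: moving from mems {0} to {1}, the
  	 * nodes_or() below first makes tsk->mems_allowed {0,1} and the
  	 * final assignment leaves {1}, so the stored mask is never empty
  	 * at any instant.  The seqcount taken above (when need_loop) lets
  	 * read_mems_allowed_begin() readers retry rather than act on a
  	 * torn multi-word snapshot.
  	 */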
  	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
c0ff7453b   Miao Xie   cpuset,mm: fix no...
980
981
  
  	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
58568d2a8   Miao Xie   cpuset,mm: update...
982
  	tsk->mems_allowed = *newmems;
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
983

0fc0287c9   Peter Zijlstra   cpuset: Fix memor...
984
  	if (need_loop) {
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
985
  		write_seqcount_end(&tsk->mems_allowed_seq);
0fc0287c9   Peter Zijlstra   cpuset: Fix memor...
986
987
  		local_irq_enable();
  	}
cc9a6c877   Mel Gorman   cpuset: mm: reduc...
988

c0ff7453b   Miao Xie   cpuset,mm: fix no...
989
  	task_unlock(tsk);
58568d2a8   Miao Xie   cpuset,mm: update...
990
  }
8793d854e   Paul Menage   Task Control Grou...
991
  static void *cpuset_being_rebound;
0b2f630a2   Miao Xie   cpusets: restruct...
992
993
994
  /**
   * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
   * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
0b2f630a2   Miao Xie   cpusets: restruct...
995
   *
d66393e54   Tejun Heo   cpuset: use css_t...
996
997
998
   * Iterate through each task of @cs updating its mems_allowed to the
   * effective cpuset's.  As this function is called with cpuset_mutex held,
   * cpuset membership stays stable.
0b2f630a2   Miao Xie   cpusets: restruct...
999
   */
d66393e54   Tejun Heo   cpuset: use css_t...
1000
  static void update_tasks_nodemask(struct cpuset *cs)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1001
  {
33ad801df   Li Zefan   cpuset: record ol...
1002
  	static nodemask_t newmems;	/* protected by cpuset_mutex */
d66393e54   Tejun Heo   cpuset: use css_t...
1003
1004
  	struct css_task_iter it;
  	struct task_struct *task;
59dac16fb   Paul Jackson   [PATCH] cpuset: u...
1005

846a16bf0   Lee Schermerhorn   mempolicy: rename...
1006
  	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1007

ae1c80238   Li Zefan   cpuset: apply cs-...
1008
  	guarantee_online_mems(cs, &newmems);
33ad801df   Li Zefan   cpuset: record ol...
1009

4225399a6   Paul Jackson   [PATCH] cpuset: r...
1010
  	/*
3b6766fe6   Li Zefan   cpuset: rewrite u...
1011
1012
1013
1014
  	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
  	 * take while holding tasklist_lock.  Forks can happen - the
  	 * mpol_dup() cpuset_being_rebound check will catch such forks,
  	 * and rebind their vma mempolicies too.  Because we still hold
5d21cc2db   Tejun Heo   cpuset: replace c...
1015
  	 * the global cpuset_mutex, we know that no other rebind effort
3b6766fe6   Li Zefan   cpuset: rewrite u...
1016
  	 * will be contending for the global variable cpuset_being_rebound.
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1017
  	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
04c19fa6f   Paul Jackson   [PATCH] cpuset: m...
1018
  	 * is idempotent.  Also migrate pages in each mm to new nodes.
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1019
  	 */
d66393e54   Tejun Heo   cpuset: use css_t...
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
  	css_task_iter_start(&cs->css, &it);
  	while ((task = css_task_iter_next(&it))) {
  		struct mm_struct *mm;
  		bool migrate;
  
  		cpuset_change_task_nodemask(task, &newmems);
  
  		mm = get_task_mm(task);
  		if (!mm)
  			continue;
  
  		migrate = is_memory_migrate(cs);
  
  		mpol_rebind_mm(mm, &cs->mems_allowed);
  		if (migrate)
  			cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
  		mmput(mm);
  	}
  	css_task_iter_end(&it);
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1039

33ad801df   Li Zefan   cpuset: record ol...
1040
1041
1042
1043
1044
  	/*
  	 * All the tasks' nodemasks have been updated, update
  	 * cs->old_mems_allowed.
  	 */
  	cs->old_mems_allowed = newmems;
2df167a30   Paul Menage   cgroups: update c...
1045
  	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
8793d854e   Paul Menage   Task Control Grou...
1046
  	cpuset_being_rebound = NULL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1047
  }
0b2f630a2   Miao Xie   cpusets: restruct...
1048
  /*
734d45130   Li Zefan   cpuset: update cs...
1049
1050
1051
   * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
   * @cs: the cpuset to consider
   * @new_mems: a temp variable for calculating new effective_mems
5c5cc6232   Li Zefan   cpuset: allow to ...
1052
   *
734d45130   Li Zefan   cpuset: update cs...
1053
1054
   * When configured nodemask is changed, the effective nodemasks of this cpuset
   * and all its descendants need to be updated.
5c5cc6232   Li Zefan   cpuset: allow to ...
1055
   *
734d45130   Li Zefan   cpuset: update cs...
1056
   * On legacy hierarchy, effective_mems will be the same as mems_allowed.
5c5cc6232   Li Zefan   cpuset: allow to ...
1057
1058
1059
   *
   * Called with cpuset_mutex held
   */
734d45130   Li Zefan   cpuset: update cs...
1060
  static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
5c5cc6232   Li Zefan   cpuset: allow to ...
1061
1062
  {
  	struct cpuset *cp;
492eb21b9   Tejun Heo   cgroup: make hier...
1063
  	struct cgroup_subsys_state *pos_css;
5c5cc6232   Li Zefan   cpuset: allow to ...
1064
1065
  
  	rcu_read_lock();
734d45130   Li Zefan   cpuset: update cs...
1066
1067
1068
1069
  	cpuset_for_each_descendant_pre(cp, pos_css, cs) {
  		struct cpuset *parent = parent_cs(cp);
  
  		nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
554b0d1c8   Li Zefan   cpuset: inherit a...
1070
1071
1072
1073
  		/*
  		 * If it becomes empty, inherit the effective mask of the
  		 * parent, which is guaranteed to have some MEMs.
  		 */
79063bffc   Zefan Li   cpuset: fix a war...
1074
  		if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
554b0d1c8   Li Zefan   cpuset: inherit a...
1075
  			*new_mems = parent->effective_mems;
734d45130   Li Zefan   cpuset: update cs...
1076
1077
1078
1079
  		/* Skip the whole subtree if the nodemask remains the same. */
  		if (nodes_equal(*new_mems, cp->effective_mems)) {
  			pos_css = css_rightmost_descendant(pos_css);
  			continue;
5c5cc6232   Li Zefan   cpuset: allow to ...
1080
  		}
734d45130   Li Zefan   cpuset: update cs...
1081

ec903c0c8   Tejun Heo   cgroup: rename cs...
1082
  		if (!css_tryget_online(&cp->css))
5c5cc6232   Li Zefan   cpuset: allow to ...
1083
1084
  			continue;
  		rcu_read_unlock();
8447a0fee   Vladimir Davydov   cpuset: convert c...
1085
  		spin_lock_irq(&callback_lock);
734d45130   Li Zefan   cpuset: update cs...
1086
  		cp->effective_mems = *new_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
1087
  		spin_unlock_irq(&callback_lock);
734d45130   Li Zefan   cpuset: update cs...
1088
1089
  
  		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
a13812683   Li Zefan   cpuset: fix the W...
1090
  			!nodes_equal(cp->mems_allowed, cp->effective_mems));
734d45130   Li Zefan   cpuset: update cs...
1091

d66393e54   Tejun Heo   cpuset: use css_t...
1092
  		update_tasks_nodemask(cp);
5c5cc6232   Li Zefan   cpuset: allow to ...
1093
1094
1095
1096
1097
1098
1099
1100
  
  		rcu_read_lock();
  		css_put(&cp->css);
  	}
  	rcu_read_unlock();
  }
  
  /*
0b2f630a2   Miao Xie   cpusets: restruct...
1101
1102
   * Handle user request to change the 'mems' memory placement
   * of a cpuset.  Needs to validate the request, update the
58568d2a8   Miao Xie   cpuset,mm: update...
1103
1104
1105
1106
   * cpuset's mems_allowed, and for each task in the cpuset,
   * update mems_allowed and rebind task's mempolicy and any vma
   * mempolicies and if the cpuset is marked 'memory_migrate',
   * migrate the task's pages to the new memory.
0b2f630a2   Miao Xie   cpusets: restruct...
1107
   *
8447a0fee   Vladimir Davydov   cpuset: convert c...
1108
   * Call with cpuset_mutex held. May take callback_lock during call.
0b2f630a2   Miao Xie   cpusets: restruct...
1109
1110
1111
1112
   * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
   * lock each such task's mm->mmap_sem, scan its vma's and rebind
   * their mempolicies to the cpuset's new mems_allowed.
   */
645fcc9d2   Li Zefan   cpuset: don't all...
1113
1114
  static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
  			   const char *buf)
0b2f630a2   Miao Xie   cpusets: restruct...
1115
  {
0b2f630a2   Miao Xie   cpusets: restruct...
1116
1117
1118
  	int retval;
  
  	/*
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
1119
  	 * top_cpuset.mems_allowed tracks node_states[N_MEMORY];
0b2f630a2   Miao Xie   cpusets: restruct...
1120
1121
  	 * it's read-only
  	 */
53feb2976   Miao Xie   cpuset: alloc nod...
1122
1123
1124
1125
  	if (cs == &top_cpuset) {
  		retval = -EACCES;
  		goto done;
  	}
0b2f630a2   Miao Xie   cpusets: restruct...
1126

0b2f630a2   Miao Xie   cpusets: restruct...
1127
1128
1129
1130
1131
1132
1133
  	/*
  	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
  	 * Since nodelist_parse() fails on an empty mask, we special case
  	 * that parsing.  The validate_change() call ensures that cpusets
  	 * with tasks have memory.
  	 */
  	if (!*buf) {
645fcc9d2   Li Zefan   cpuset: don't all...
1134
  		nodes_clear(trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1135
  	} else {
645fcc9d2   Li Zefan   cpuset: don't all...
1136
  		retval = nodelist_parse(buf, trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1137
1138
  		if (retval < 0)
  			goto done;
645fcc9d2   Li Zefan   cpuset: don't all...
1139
  		if (!nodes_subset(trialcs->mems_allowed,
5d8ba82c3   Li Zefan   cpuset: allow wri...
1140
1141
  				  top_cpuset.mems_allowed)) {
  			retval = -EINVAL;
53feb2976   Miao Xie   cpuset: alloc nod...
1142
1143
  			goto done;
  		}
0b2f630a2   Miao Xie   cpusets: restruct...
1144
  	}
33ad801df   Li Zefan   cpuset: record ol...
1145
1146
  
  	if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) {
0b2f630a2   Miao Xie   cpusets: restruct...
1147
1148
1149
  		retval = 0;		/* Too easy - nothing to do */
  		goto done;
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1150
  	retval = validate_change(cs, trialcs);
0b2f630a2   Miao Xie   cpusets: restruct...
1151
1152
  	if (retval < 0)
  		goto done;
8447a0fee   Vladimir Davydov   cpuset: convert c...
1153
  	spin_lock_irq(&callback_lock);
645fcc9d2   Li Zefan   cpuset: don't all...
1154
  	cs->mems_allowed = trialcs->mems_allowed;
8447a0fee   Vladimir Davydov   cpuset: convert c...
1155
  	spin_unlock_irq(&callback_lock);
0b2f630a2   Miao Xie   cpusets: restruct...
1156

734d45130   Li Zefan   cpuset: update cs...
1157
  	/* use trialcs->mems_allowed as a temp variable */
2f9de0cc2   Alban Crequy   cpuset: use trial...
1158
  	update_nodemasks_hier(cs, &trialcs->mems_allowed);
0b2f630a2   Miao Xie   cpusets: restruct...
1159
1160
1161
  done:
  	return retval;
  }
8793d854e   Paul Menage   Task Control Grou...
1162
1163
  int current_cpuset_is_being_rebound(void)
  {
391acf970   Gu Zheng   cpuset,mempolicy:...
1164
1165
1166
1167
1168
1169
1170
  	int ret;
  
  	rcu_read_lock();
  	ret = task_cs(current) == cpuset_being_rebound;
  	rcu_read_unlock();
  
  	return ret;
8793d854e   Paul Menage   Task Control Grou...
1171
  }
5be7a4792   Paul Menage   Fix cpuset sched_...
1172
  static int update_relax_domain_level(struct cpuset *cs, s64 val)
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1173
  {
db7f47cf4   Paul Menage   cpusets: allow cp...
1174
  #ifdef CONFIG_SMP
60495e776   Peter Zijlstra   sched: Dynamic sc...
1175
  	if (val < -1 || val >= sched_domain_level_max)
30e0e1781   Li Zefan   cpuset: limit the...
1176
  		return -EINVAL;
db7f47cf4   Paul Menage   cpusets: allow cp...
1177
  #endif
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1178
1179
1180
  
  	if (val != cs->relax_domain_level) {
  		cs->relax_domain_level = val;
300ed6cbb   Li Zefan   cpuset: convert c...
1181
1182
  		if (!cpumask_empty(cs->cpus_allowed) &&
  		    is_sched_load_balance(cs))
699140ba8   Tejun Heo   cpuset: drop asyn...
1183
  			rebuild_sched_domains_locked();
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1184
1185
1186
1187
  	}
  
  	return 0;
  }
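
  /*
   * The level written above is a sched-domain depth (as described in the
   * cpusets documentation rather than in this file): -1 requests the system
   * default, 0 disables the idle-CPU search on wakeup/idle balancing, and
   * larger values let the scheduler search progressively wider domains
   * (SMT siblings, cores in a package, CPUs in a node, and so on).
   */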
72ec70299   Tejun Heo   cgroup: make task...
1188
  /**
950592f7b   Miao Xie   cpusets: update t...
1189
1190
   * update_tasks_flags - update the spread flags of tasks in the cpuset.
   * @cs: the cpuset in which each task's spread flags need to be changed
950592f7b   Miao Xie   cpusets: update t...
1191
   *
d66393e54   Tejun Heo   cpuset: use css_t...
1192
1193
1194
   * Iterate through each task of @cs updating its spread flags.  As this
   * function is called with cpuset_mutex held, cpuset membership stays
   * stable.
950592f7b   Miao Xie   cpusets: update t...
1195
   */
d66393e54   Tejun Heo   cpuset: use css_t...
1196
  static void update_tasks_flags(struct cpuset *cs)
950592f7b   Miao Xie   cpusets: update t...
1197
  {
d66393e54   Tejun Heo   cpuset: use css_t...
1198
1199
1200
1201
1202
1203
1204
  	struct css_task_iter it;
  	struct task_struct *task;
  
  	css_task_iter_start(&cs->css, &it);
  	while ((task = css_task_iter_next(&it)))
  		cpuset_update_task_spread_flag(cs, task);
  	css_task_iter_end(&it);
950592f7b   Miao Xie   cpusets: update t...
1205
1206
1207
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1208
   * update_flag - read a 0 or a 1 in a file and update associated flag
786083667   Paul Menage   Cpuset hardwall f...
1209
1210
1211
   * bit:		the bit to update (see cpuset_flagbits_t)
   * cs:		the cpuset to update
   * turning_on: 	whether the flag is being set or cleared
053199edf   Paul Jackson   [PATCH] cpusets: ...
1212
   *
5d21cc2db   Tejun Heo   cpuset: replace c...
1213
   * Call with cpuset_mutex held.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1214
   */
700fe1ab9   Paul Menage   CGroup API files:...
1215
1216
  static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
  		       int turning_on)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1217
  {
645fcc9d2   Li Zefan   cpuset: don't all...
1218
  	struct cpuset *trialcs;
40b6a7623   Rakib Mullick   cpuset.c: remove ...
1219
  	int balance_flag_changed;
950592f7b   Miao Xie   cpusets: update t...
1220
  	int spread_flag_changed;
950592f7b   Miao Xie   cpusets: update t...
1221
  	int err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1222

645fcc9d2   Li Zefan   cpuset: don't all...
1223
1224
1225
  	trialcs = alloc_trial_cpuset(cs);
  	if (!trialcs)
  		return -ENOMEM;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1226
  	if (turning_on)
645fcc9d2   Li Zefan   cpuset: don't all...
1227
  		set_bit(bit, &trialcs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1228
  	else
645fcc9d2   Li Zefan   cpuset: don't all...
1229
  		clear_bit(bit, &trialcs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1230

645fcc9d2   Li Zefan   cpuset: don't all...
1231
  	err = validate_change(cs, trialcs);
85d7b9498   Dinakar Guniguntala   [PATCH] Dynamic s...
1232
  	if (err < 0)
645fcc9d2   Li Zefan   cpuset: don't all...
1233
  		goto out;
029190c51   Paul Jackson   cpuset sched_load...
1234

029190c51   Paul Jackson   cpuset sched_load...
1235
  	balance_flag_changed = (is_sched_load_balance(cs) !=
645fcc9d2   Li Zefan   cpuset: don't all...
1236
  				is_sched_load_balance(trialcs));
029190c51   Paul Jackson   cpuset sched_load...
1237

950592f7b   Miao Xie   cpusets: update t...
1238
1239
  	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
  			|| (is_spread_page(cs) != is_spread_page(trialcs)));
8447a0fee   Vladimir Davydov   cpuset: convert c...
1240
  	spin_lock_irq(&callback_lock);
645fcc9d2   Li Zefan   cpuset: don't all...
1241
  	cs->flags = trialcs->flags;
8447a0fee   Vladimir Davydov   cpuset: convert c...
1242
  	spin_unlock_irq(&callback_lock);
85d7b9498   Dinakar Guniguntala   [PATCH] Dynamic s...
1243

300ed6cbb   Li Zefan   cpuset: convert c...
1244
  	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
699140ba8   Tejun Heo   cpuset: drop asyn...
1245
  		rebuild_sched_domains_locked();
029190c51   Paul Jackson   cpuset sched_load...
1246

950592f7b   Miao Xie   cpusets: update t...
1247
  	if (spread_flag_changed)
d66393e54   Tejun Heo   cpuset: use css_t...
1248
  		update_tasks_flags(cs);
645fcc9d2   Li Zefan   cpuset: don't all...
1249
1250
1251
  out:
  	free_trial_cpuset(trialcs);
  	return err;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1252
  }
053199edf   Paul Jackson   [PATCH] cpusets: ...
1253
  /*
80f7228b5   Adrian Bunk   typo fixes: occur...
1254
   * Frequency meter - How fast is some event occurring?
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
   *
   * These routines manage a digitally filtered, constant time based,
   * event frequency meter.  There are four routines:
   *   fmeter_init() - initialize a frequency meter.
   *   fmeter_markevent() - called each time the event happens.
   *   fmeter_getrate() - returns the recent rate of such events.
   *   fmeter_update() - internal routine used to update fmeter.
   *
   * A common data structure is passed to each of these routines,
   * which is used to keep track of the state required to manage the
   * frequency meter and its digital filter.
   *
   * The filter works on the number of events marked per unit time.
   * The filter is single-pole low-pass recursive (IIR).  The time unit
   * is 1 second.  Arithmetic is done using 32-bit integers scaled to
   * simulate 3 decimal digits of precision (multiplied by 1000).
   *
   * With an FM_COEF of 933, and a time base of 1 second, the filter
   * has a half-life of 10 seconds, meaning that if the events quit
   * happening, then the rate returned from the fmeter_getrate()
   * will be cut in half each 10 seconds, until it converges to zero.
   *
   * It is not worth doing a real infinitely recursive filter.  If more
   * than FM_MAXTICKS ticks have elapsed since the last filter event,
   * just compute FM_MAXTICKS ticks worth, by which point the level
   * will be stable.
   *
   * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
   * arithmetic overflow in the fmeter_update() routine.
   *
   * Given the simple 32 bit integer arithmetic used, this meter works
   * best for reporting rates between one per millisecond (msec) and
   * one per 32 (approx) seconds.  At constant rates faster than one
   * per msec it maxes out at values just under 1,000,000.  At constant
   * rates between one per msec, and one per second it will stabilize
   * to a value N*1000, where N is the rate of events per second.
   * At constant rates between one per second and one per 32 seconds,
   * it will be choppy, moving up on the seconds that have an event,
   * and then decaying until the next event.  At rates slower than
   * about one in 32 seconds, it decays all the way back to zero between
   * each event.
   */
  
  #define FM_COEF 933		/* coefficient for half-life of 10 secs */
  #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
  #define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
  #define FM_SCALE 1000		/* faux fixed point scale */
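
  /*
   * Quick arithmetic check of the constants above: each elapsed second
   * scales the filtered value by FM_COEF/FM_SCALE = 0.933, and 0.933^10 is
   * approximately 0.50, which is where the 10 second half-life quoted in
   * the comment block comes from.  At a steady rate of N events/sec the
   * update below adds (FM_SCALE - FM_COEF) * N = 67 * N each second, and
   * the fixed-point val converges to N * FM_SCALE, i.e. the N*1000 plateau
   * mentioned above.
   */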
  
  /* Initialize a frequency meter */
  static void fmeter_init(struct fmeter *fmp)
  {
  	fmp->cnt = 0;
  	fmp->val = 0;
  	fmp->time = 0;
  	spin_lock_init(&fmp->lock);
  }
  
  /* Internal meter update - process cnt events and update value */
  static void fmeter_update(struct fmeter *fmp)
  {
  	time_t now = get_seconds();
  	time_t ticks = now - fmp->time;
  
  	if (ticks == 0)
  		return;
  
  	ticks = min(FM_MAXTICKS, ticks);
  	while (ticks-- > 0)
  		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
  	fmp->time = now;
  
  	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
  	fmp->cnt = 0;
  }
  
  /* Process any previous ticks, then bump cnt by one (times scale). */
  static void fmeter_markevent(struct fmeter *fmp)
  {
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
  	spin_unlock(&fmp->lock);
  }
  
  /* Process any previous ticks, then return current value. */
  static int fmeter_getrate(struct fmeter *fmp)
  {
  	int val;
  
  	spin_lock(&fmp->lock);
  	fmeter_update(fmp);
  	val = fmp->val;
  	spin_unlock(&fmp->lock);
  	return val;
  }
57fce0a68   Tejun Heo   cpuset: don't use...
1350
  static struct cpuset *cpuset_attach_old_cs;
5d21cc2db   Tejun Heo   cpuset: replace c...
1351
  /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
eb95419b0   Tejun Heo   cgroup: pass arou...
1352
1353
  static int cpuset_can_attach(struct cgroup_subsys_state *css,
  			     struct cgroup_taskset *tset)
f780bdb7c   Ben Blum   cgroups: add per-...
1354
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1355
  	struct cpuset *cs = css_cs(css);
bb9d97b6d   Tejun Heo   cgroup: don't use...
1356
1357
  	struct task_struct *task;
  	int ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1358

57fce0a68   Tejun Heo   cpuset: don't use...
1359
1360
  	/* used later by cpuset_attach() */
  	cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
5d21cc2db   Tejun Heo   cpuset: replace c...
1361
  	mutex_lock(&cpuset_mutex);
aa6ec29be   Tejun Heo   cgroup: remove sa...
1362
  	/* allow moving tasks into an empty cpuset if on default hierarchy */
5d21cc2db   Tejun Heo   cpuset: replace c...
1363
  	ret = -ENOSPC;
aa6ec29be   Tejun Heo   cgroup: remove sa...
1364
  	if (!cgroup_on_dfl(css->cgroup) &&
88fa523bf   Li Zefan   cpuset: allow to ...
1365
  	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
5d21cc2db   Tejun Heo   cpuset: replace c...
1366
  		goto out_unlock;
9985b0bab   David Rientjes   sched: prevent bo...
1367

924f0d9a2   Tejun Heo   cgroup: drop @ski...
1368
  	cgroup_taskset_for_each(task, tset) {
7f51412a4   Juri Lelli   sched/deadline: F...
1369
1370
  		ret = task_can_attach(task, cs->cpus_allowed);
  		if (ret)
5d21cc2db   Tejun Heo   cpuset: replace c...
1371
1372
1373
1374
  			goto out_unlock;
  		ret = security_task_setscheduler(task);
  		if (ret)
  			goto out_unlock;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1375
  	}
f780bdb7c   Ben Blum   cgroups: add per-...
1376

452477fa6   Tejun Heo   cpuset: pin down ...
1377
1378
1379
1380
1381
  	/*
  	 * Mark attach is in progress.  This makes validate_change() fail
  	 * changes which zero cpus/mems_allowed.
  	 */
  	cs->attach_in_progress++;
5d21cc2db   Tejun Heo   cpuset: replace c...
1382
1383
1384
1385
  	ret = 0;
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
  	return ret;
8793d854e   Paul Menage   Task Control Grou...
1386
  }
f780bdb7c   Ben Blum   cgroups: add per-...
1387

eb95419b0   Tejun Heo   cgroup: pass arou...
1388
  static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
452477fa6   Tejun Heo   cpuset: pin down ...
1389
1390
  				 struct cgroup_taskset *tset)
  {
5d21cc2db   Tejun Heo   cpuset: replace c...
1391
  	mutex_lock(&cpuset_mutex);
eb95419b0   Tejun Heo   cgroup: pass arou...
1392
  	css_cs(css)->attach_in_progress--;
5d21cc2db   Tejun Heo   cpuset: replace c...
1393
  	mutex_unlock(&cpuset_mutex);
8793d854e   Paul Menage   Task Control Grou...
1394
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1395

4e4c9a140   Tejun Heo   cpuset: cleanup c...
1396
  /*
5d21cc2db   Tejun Heo   cpuset: replace c...
1397
   * Protected by cpuset_mutex.  cpus_attach is used only by cpuset_attach()
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1398
1399
1400
1401
   * but we can't allocate it dynamically there.  Define it global and
   * allocate from cpuset_init().
   */
  static cpumask_var_t cpus_attach;
eb95419b0   Tejun Heo   cgroup: pass arou...
1402
1403
  static void cpuset_attach(struct cgroup_subsys_state *css,
  			  struct cgroup_taskset *tset)
8793d854e   Paul Menage   Task Control Grou...
1404
  {
67bd2c598   Li Zefan   cpuset: remove un...
1405
  	/* static buf protected by cpuset_mutex */
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1406
  	static nodemask_t cpuset_attach_nodemask_to;
8793d854e   Paul Menage   Task Control Grou...
1407
  	struct mm_struct *mm;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1408
1409
  	struct task_struct *task;
  	struct task_struct *leader = cgroup_taskset_first(tset);
eb95419b0   Tejun Heo   cgroup: pass arou...
1410
  	struct cpuset *cs = css_cs(css);
57fce0a68   Tejun Heo   cpuset: don't use...
1411
  	struct cpuset *oldcs = cpuset_attach_old_cs;
22fb52dd7   David Quigley   [PATCH] SELinux: ...
1412

5d21cc2db   Tejun Heo   cpuset: replace c...
1413
  	mutex_lock(&cpuset_mutex);
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1414
1415
1416
1417
  	/* prepare for attach */
  	if (cs == &top_cpuset)
  		cpumask_copy(cpus_attach, cpu_possible_mask);
  	else
ae1c80238   Li Zefan   cpuset: apply cs-...
1418
  		guarantee_online_cpus(cs, cpus_attach);
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1419

ae1c80238   Li Zefan   cpuset: apply cs-...
1420
  	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
4e4c9a140   Tejun Heo   cpuset: cleanup c...
1421

924f0d9a2   Tejun Heo   cgroup: drop @ski...
1422
  	cgroup_taskset_for_each(task, tset) {
bb9d97b6d   Tejun Heo   cgroup: don't use...
1423
1424
1425
1426
1427
1428
1429
1430
1431
  		/*
  		 * can_attach beforehand should guarantee that this doesn't
  		 * fail.  TODO: have a better way to handle failure here
  		 */
  		WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
  
  		cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
  		cpuset_update_task_spread_flag(cs, task);
  	}
22fb52dd7   David Quigley   [PATCH] SELinux: ...
1432

f780bdb7c   Ben Blum   cgroups: add per-...
1433
1434
1435
1436
  	/*
  	 * Change mm, possibly for multiple threads in a threadgroup. This is
  	 * expensive and may sleep.
  	 */
ae1c80238   Li Zefan   cpuset: apply cs-...
1437
  	cpuset_attach_nodemask_to = cs->effective_mems;
bb9d97b6d   Tejun Heo   cgroup: don't use...
1438
  	mm = get_task_mm(leader);
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1439
  	if (mm) {
f780bdb7c   Ben Blum   cgroups: add per-...
1440
  		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
f047cecf2   Li Zefan   cpuset: fix to mi...
1441
1442
1443
1444
1445
1446
1447
1448
1449
  
  		/*
  		 * old_mems_allowed is the same as mems_allowed here, except
  		 * if this task is being moved automatically due to hotplug.
  		 * In that case @mems_allowed has been updated and is empty,
  		 * so @old_mems_allowed is the right nodeset to migrate the
  		 * mm from.
  		 */
  		if (is_memory_migrate(cs)) {
ae1c80238   Li Zefan   cpuset: apply cs-...
1450
  			cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
f780bdb7c   Ben Blum   cgroups: add per-...
1451
  					  &cpuset_attach_nodemask_to);
f047cecf2   Li Zefan   cpuset: fix to mi...
1452
  		}
4225399a6   Paul Jackson   [PATCH] cpuset: r...
1453
1454
  		mmput(mm);
  	}
452477fa6   Tejun Heo   cpuset: pin down ...
1455

33ad801df   Li Zefan   cpuset: record ol...
1456
  	cs->old_mems_allowed = cpuset_attach_nodemask_to;
02bb58637   Tejun Heo   cpuset: schedule ...
1457

452477fa6   Tejun Heo   cpuset: pin down ...
1458
  	cs->attach_in_progress--;
e44193d39   Li Zefan   cpuset: let hotpl...
1459
1460
  	if (!cs->attach_in_progress)
  		wake_up(&cpuset_attach_wq);
5d21cc2db   Tejun Heo   cpuset: replace c...
1461
1462
  
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1463
1464
1465
1466
1467
  }
  
  /* The various types of files and directories in a cpuset file system */
  
  typedef enum {
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1468
  	FILE_MEMORY_MIGRATE,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1469
1470
  	FILE_CPULIST,
  	FILE_MEMLIST,
afd1a8b3e   Li Zefan   cpuset: export ef...
1471
1472
  	FILE_EFFECTIVE_CPULIST,
  	FILE_EFFECTIVE_MEMLIST,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1473
1474
  	FILE_CPU_EXCLUSIVE,
  	FILE_MEM_EXCLUSIVE,
786083667   Paul Menage   Cpuset hardwall f...
1475
  	FILE_MEM_HARDWALL,
029190c51   Paul Jackson   cpuset sched_load...
1476
  	FILE_SCHED_LOAD_BALANCE,
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1477
  	FILE_SCHED_RELAX_DOMAIN_LEVEL,
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1478
1479
  	FILE_MEMORY_PRESSURE_ENABLED,
  	FILE_MEMORY_PRESSURE,
825a46af5   Paul Jackson   [PATCH] cpuset me...
1480
1481
  	FILE_SPREAD_PAGE,
  	FILE_SPREAD_SLAB,
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1482
  } cpuset_filetype_t;
182446d08   Tejun Heo   cgroup: pass arou...
1483
1484
  static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
  			    u64 val)
700fe1ab9   Paul Menage   CGroup API files:...
1485
  {
182446d08   Tejun Heo   cgroup: pass arou...
1486
  	struct cpuset *cs = css_cs(css);
700fe1ab9   Paul Menage   CGroup API files:...
1487
  	cpuset_filetype_t type = cft->private;
a903f0865   Li Zefan   cpuset: fix the r...
1488
  	int retval = 0;
700fe1ab9   Paul Menage   CGroup API files:...
1489

5d21cc2db   Tejun Heo   cpuset: replace c...
1490
  	mutex_lock(&cpuset_mutex);
a903f0865   Li Zefan   cpuset: fix the r...
1491
1492
  	if (!is_cpuset_online(cs)) {
  		retval = -ENODEV;
5d21cc2db   Tejun Heo   cpuset: replace c...
1493
  		goto out_unlock;
a903f0865   Li Zefan   cpuset: fix the r...
1494
  	}
700fe1ab9   Paul Menage   CGroup API files:...
1495
1496
  
  	switch (type) {
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1497
  	case FILE_CPU_EXCLUSIVE:
700fe1ab9   Paul Menage   CGroup API files:...
1498
  		retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1499
1500
  		break;
  	case FILE_MEM_EXCLUSIVE:
700fe1ab9   Paul Menage   CGroup API files:...
1501
  		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1502
  		break;
786083667   Paul Menage   Cpuset hardwall f...
1503
1504
1505
  	case FILE_MEM_HARDWALL:
  		retval = update_flag(CS_MEM_HARDWALL, cs, val);
  		break;
029190c51   Paul Jackson   cpuset sched_load...
1506
  	case FILE_SCHED_LOAD_BALANCE:
700fe1ab9   Paul Menage   CGroup API files:...
1507
  		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1508
  		break;
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1509
  	case FILE_MEMORY_MIGRATE:
700fe1ab9   Paul Menage   CGroup API files:...
1510
  		retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
45b07ef31   Paul Jackson   [PATCH] cpusets: ...
1511
  		break;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1512
  	case FILE_MEMORY_PRESSURE_ENABLED:
700fe1ab9   Paul Menage   CGroup API files:...
1513
  		cpuset_memory_pressure_enabled = !!val;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1514
1515
1516
1517
  		break;
  	case FILE_MEMORY_PRESSURE:
  		retval = -EACCES;
  		break;
825a46af5   Paul Jackson   [PATCH] cpuset me...
1518
  	case FILE_SPREAD_PAGE:
700fe1ab9   Paul Menage   CGroup API files:...
1519
  		retval = update_flag(CS_SPREAD_PAGE, cs, val);
825a46af5   Paul Jackson   [PATCH] cpuset me...
1520
1521
  		break;
  	case FILE_SPREAD_SLAB:
700fe1ab9   Paul Menage   CGroup API files:...
1522
  		retval = update_flag(CS_SPREAD_SLAB, cs, val);
825a46af5   Paul Jackson   [PATCH] cpuset me...
1523
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1524
1525
  	default:
  		retval = -EINVAL;
700fe1ab9   Paul Menage   CGroup API files:...
1526
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1527
  	}
5d21cc2db   Tejun Heo   cpuset: replace c...
1528
1529
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1530
1531
  	return retval;
  }
182446d08   Tejun Heo   cgroup: pass arou...
1532
1533
  static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
  			    s64 val)
5be7a4792   Paul Menage   Fix cpuset sched_...
1534
  {
182446d08   Tejun Heo   cgroup: pass arou...
1535
  	struct cpuset *cs = css_cs(css);
5be7a4792   Paul Menage   Fix cpuset sched_...
1536
  	cpuset_filetype_t type = cft->private;
5d21cc2db   Tejun Heo   cpuset: replace c...
1537
  	int retval = -ENODEV;
5be7a4792   Paul Menage   Fix cpuset sched_...
1538

5d21cc2db   Tejun Heo   cpuset: replace c...
1539
1540
1541
  	mutex_lock(&cpuset_mutex);
  	if (!is_cpuset_online(cs))
  		goto out_unlock;
e37123953   Paul Menage   cgroup files: rem...
1542

5be7a4792   Paul Menage   Fix cpuset sched_...
1543
1544
1545
1546
1547
1548
1549
1550
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		retval = update_relax_domain_level(cs, val);
  		break;
  	default:
  		retval = -EINVAL;
  		break;
  	}
5d21cc2db   Tejun Heo   cpuset: replace c...
1551
1552
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
5be7a4792   Paul Menage   Fix cpuset sched_...
1553
1554
  	return retval;
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1555
  /*
e37123953   Paul Menage   cgroup files: rem...
1556
1557
   * Common handling for a write to a "cpus" or "mems" file.
   */
451af504d   Tejun Heo   cgroup: replace c...
1558
1559
  static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
  				    char *buf, size_t nbytes, loff_t off)
e37123953   Paul Menage   cgroup files: rem...
1560
  {
451af504d   Tejun Heo   cgroup: replace c...
1561
  	struct cpuset *cs = css_cs(of_css(of));
645fcc9d2   Li Zefan   cpuset: don't all...
1562
  	struct cpuset *trialcs;
5d21cc2db   Tejun Heo   cpuset: replace c...
1563
  	int retval = -ENODEV;
e37123953   Paul Menage   cgroup files: rem...
1564

451af504d   Tejun Heo   cgroup: replace c...
1565
  	buf = strstrip(buf);
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
  	/*
  	 * CPU or memory hotunplug may leave @cs w/o any execution
  	 * resources, in which case the hotplug code asynchronously updates
  	 * configuration and transfers all tasks to the nearest ancestor
  	 * which can execute.
  	 *
  	 * As writes to "cpus" or "mems" may restore @cs's execution
  	 * resources, wait for the previously scheduled operations before
  	 * proceeding, so that we don't end up repeatedly removing tasks added
  	 * after execution capability is restored.
76bb5ab8f   Tejun Heo   cpuset: break ker...
1576
1577
1578
1579
1580
1581
1582
1583
  	 *
  	 * cpuset_hotplug_work calls back into cgroup core via
  	 * cgroup_transfer_tasks() and waiting for it from a cgroupfs
  	 * operation like this one can lead to a deadlock through kernfs
  	 * active_ref protection.  Let's break the protection.  Losing the
  	 * protection is okay as we check whether @cs is online after
  	 * grabbing cpuset_mutex anyway.  This only happens on the legacy
  	 * hierarchies.
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
1584
  	 */
76bb5ab8f   Tejun Heo   cpuset: break ker...
1585
1586
  	css_get(&cs->css);
  	kernfs_break_active_protection(of->kn);
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
1587
  	flush_work(&cpuset_hotplug_work);
5d21cc2db   Tejun Heo   cpuset: replace c...
1588
1589
1590
  	mutex_lock(&cpuset_mutex);
  	if (!is_cpuset_online(cs))
  		goto out_unlock;
e37123953   Paul Menage   cgroup files: rem...
1591

645fcc9d2   Li Zefan   cpuset: don't all...
1592
  	trialcs = alloc_trial_cpuset(cs);
b75f38d65   Li Zefan   cpuset: add a mis...
1593
1594
  	if (!trialcs) {
  		retval = -ENOMEM;
5d21cc2db   Tejun Heo   cpuset: replace c...
1595
  		goto out_unlock;
b75f38d65   Li Zefan   cpuset: add a mis...
1596
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1597

451af504d   Tejun Heo   cgroup: replace c...
1598
  	switch (of_cft(of)->private) {
e37123953   Paul Menage   cgroup files: rem...
1599
  	case FILE_CPULIST:
645fcc9d2   Li Zefan   cpuset: don't all...
1600
  		retval = update_cpumask(cs, trialcs, buf);
e37123953   Paul Menage   cgroup files: rem...
1601
1602
  		break;
  	case FILE_MEMLIST:
645fcc9d2   Li Zefan   cpuset: don't all...
1603
  		retval = update_nodemask(cs, trialcs, buf);
e37123953   Paul Menage   cgroup files: rem...
1604
1605
1606
1607
1608
  		break;
  	default:
  		retval = -EINVAL;
  		break;
  	}
645fcc9d2   Li Zefan   cpuset: don't all...
1609
1610
  
  	free_trial_cpuset(trialcs);
5d21cc2db   Tejun Heo   cpuset: replace c...
1611
1612
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
76bb5ab8f   Tejun Heo   cpuset: break ker...
1613
1614
  	kernfs_unbreak_active_protection(of->kn);
  	css_put(&cs->css);
451af504d   Tejun Heo   cgroup: replace c...
1615
  	return retval ?: nbytes;
e37123953   Paul Menage   cgroup files: rem...
1616
1617
1618
  }
  
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1619
1620
1621
1622
1623
1624
   * These ascii lists should be read in a single call, by using a user
   * buffer large enough to hold the entire map.  If read in smaller
   * chunks, there is no guarantee of atomicity.  Since the display format
   * used, list of ranges of sequential numbers, is variable length,
   * and since these maps can change value dynamically, one could read
   * gibberish by doing partial reads while a list was changing.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1625
   */
2da8ca822   Tejun Heo   cgroup: replace c...
1626
  static int cpuset_common_seq_show(struct seq_file *sf, void *v)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1627
  {
2da8ca822   Tejun Heo   cgroup: replace c...
1628
1629
  	struct cpuset *cs = css_cs(seq_css(sf));
  	cpuset_filetype_t type = seq_cft(sf)->private;
51ffe4117   Tejun Heo   cpuset: convert a...
1630
  	int ret = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1631

8447a0fee   Vladimir Davydov   cpuset: convert c...
1632
  	spin_lock_irq(&callback_lock);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1633
1634
1635
  
  	switch (type) {
  	case FILE_CPULIST:
e8e6d97c9   Tejun Heo   cpuset: use %*pb[...
1636
1637
  		seq_printf(sf, "%*pbl
  ", cpumask_pr_args(cs->cpus_allowed));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1638
1639
  		break;
  	case FILE_MEMLIST:
e8e6d97c9   Tejun Heo   cpuset: use %*pb[...
1640
1641
  		seq_printf(sf, "%*pbl
  ", nodemask_pr_args(&cs->mems_allowed));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1642
  		break;
afd1a8b3e   Li Zefan   cpuset: export ef...
1643
  	case FILE_EFFECTIVE_CPULIST:
e8e6d97c9   Tejun Heo   cpuset: use %*pb[...
1644
1645
  		seq_printf(sf, "%*pbl
  ", cpumask_pr_args(cs->effective_cpus));
afd1a8b3e   Li Zefan   cpuset: export ef...
1646
1647
  		break;
  	case FILE_EFFECTIVE_MEMLIST:
e8e6d97c9   Tejun Heo   cpuset: use %*pb[...
1648
1649
  		seq_printf(sf, "%*pbl
  ", nodemask_pr_args(&cs->effective_mems));
afd1a8b3e   Li Zefan   cpuset: export ef...
1650
  		break;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1651
  	default:
51ffe4117   Tejun Heo   cpuset: convert a...
1652
  		ret = -EINVAL;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1653
  	}
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1654

8447a0fee   Vladimir Davydov   cpuset: convert c...
1655
  	spin_unlock_irq(&callback_lock);
51ffe4117   Tejun Heo   cpuset: convert a...
1656
  	return ret;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1657
  }
182446d08   Tejun Heo   cgroup: pass arou...
1658
  static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
700fe1ab9   Paul Menage   CGroup API files:...
1659
  {
182446d08   Tejun Heo   cgroup: pass arou...
1660
  	struct cpuset *cs = css_cs(css);
700fe1ab9   Paul Menage   CGroup API files:...
1661
1662
1663
1664
1665
1666
  	cpuset_filetype_t type = cft->private;
  	switch (type) {
  	case FILE_CPU_EXCLUSIVE:
  		return is_cpu_exclusive(cs);
  	case FILE_MEM_EXCLUSIVE:
  		return is_mem_exclusive(cs);
786083667   Paul Menage   Cpuset hardwall f...
1667
1668
  	case FILE_MEM_HARDWALL:
  		return is_mem_hardwall(cs);
700fe1ab9   Paul Menage   CGroup API files:...
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
  	case FILE_SCHED_LOAD_BALANCE:
  		return is_sched_load_balance(cs);
  	case FILE_MEMORY_MIGRATE:
  		return is_memory_migrate(cs);
  	case FILE_MEMORY_PRESSURE_ENABLED:
  		return cpuset_memory_pressure_enabled;
  	case FILE_MEMORY_PRESSURE:
  		return fmeter_getrate(&cs->fmeter);
  	case FILE_SPREAD_PAGE:
  		return is_spread_page(cs);
  	case FILE_SPREAD_SLAB:
  		return is_spread_slab(cs);
  	default:
  		BUG();
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
1684
1685
1686
  
  	/* Unreachable but makes gcc happy */
  	return 0;
700fe1ab9   Paul Menage   CGroup API files:...
1687
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1688

182446d08   Tejun Heo   cgroup: pass arou...
1689
  static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
5be7a4792   Paul Menage   Fix cpuset sched_...
1690
  {
182446d08   Tejun Heo   cgroup: pass arou...
1691
  	struct cpuset *cs = css_cs(css);
5be7a4792   Paul Menage   Fix cpuset sched_...
1692
1693
1694
1695
1696
1697
1698
  	cpuset_filetype_t type = cft->private;
  	switch (type) {
  	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  		return cs->relax_domain_level;
  	default:
  		BUG();
  	}
cf417141c   Max Krasnyansky   sched, cpuset: re...
1699
1700
1701
  
  	/* Unreachable but makes gcc happy */
  	return 0;
5be7a4792   Paul Menage   Fix cpuset sched_...
1702
  }
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1703
1704
1705
1706
  
  /*
   * for the common functions, 'private' gives the type of file
   */
addf2c739   Paul Menage   Cpuset hardwall f...
1707
1708
1709
  static struct cftype files[] = {
  	{
  		.name = "cpus",
2da8ca822   Tejun Heo   cgroup: replace c...
1710
  		.seq_show = cpuset_common_seq_show,
451af504d   Tejun Heo   cgroup: replace c...
1711
  		.write = cpuset_write_resmask,
e37123953   Paul Menage   cgroup files: rem...
1712
  		.max_write_len = (100U + 6 * NR_CPUS),
addf2c739   Paul Menage   Cpuset hardwall f...
1713
1714
1715
1716
1717
  		.private = FILE_CPULIST,
  	},
  
  	{
  		.name = "mems",
2da8ca822   Tejun Heo   cgroup: replace c...
1718
  		.seq_show = cpuset_common_seq_show,
451af504d   Tejun Heo   cgroup: replace c...
1719
  		.write = cpuset_write_resmask,
e37123953   Paul Menage   cgroup files: rem...
1720
  		.max_write_len = (100U + 6 * MAX_NUMNODES),
addf2c739   Paul Menage   Cpuset hardwall f...
1721
1722
1723
1724
  		.private = FILE_MEMLIST,
  	},
  
  	{
afd1a8b3e   Li Zefan   cpuset: export ef...
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
  		.name = "effective_cpus",
  		.seq_show = cpuset_common_seq_show,
  		.private = FILE_EFFECTIVE_CPULIST,
  	},
  
  	{
  		.name = "effective_mems",
  		.seq_show = cpuset_common_seq_show,
  		.private = FILE_EFFECTIVE_MEMLIST,
  	},
  
  	{
addf2c739   Paul Menage   Cpuset hardwall f...
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
  		.name = "cpu_exclusive",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_CPU_EXCLUSIVE,
  	},
  
  	{
  		.name = "mem_exclusive",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEM_EXCLUSIVE,
  	},
  
  	{
786083667   Paul Menage   Cpuset hardwall f...
1751
1752
1753
1754
1755
1756
1757
  		.name = "mem_hardwall",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEM_HARDWALL,
  	},
  
  	{
addf2c739   Paul Menage   Cpuset hardwall f...
1758
1759
1760
1761
1762
1763
1764
1765
  		.name = "sched_load_balance",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SCHED_LOAD_BALANCE,
  	},
  
  	{
  		.name = "sched_relax_domain_level",
5be7a4792   Paul Menage   Fix cpuset sched_...
1766
1767
  		.read_s64 = cpuset_read_s64,
  		.write_s64 = cpuset_write_s64,
addf2c739   Paul Menage   Cpuset hardwall f...
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
  		.private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
  	},
  
  	{
  		.name = "memory_migrate",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_MIGRATE,
  	},
  
  	{
  		.name = "memory_pressure",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_PRESSURE,
099fca322   Li Zefan   cgroups: show cor...
1783
  		.mode = S_IRUGO,
addf2c739   Paul Menage   Cpuset hardwall f...
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
  	},
  
  	{
  		.name = "memory_spread_page",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SPREAD_PAGE,
  	},
  
  	{
  		.name = "memory_spread_slab",
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_SPREAD_SLAB,
  	},
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1799

4baf6e332   Tejun Heo   cgroup: convert a...
1800
1801
1802
1803
1804
1805
1806
  	{
  		.name = "memory_pressure_enabled",
  		.flags = CFTYPE_ONLY_ON_ROOT,
  		.read_u64 = cpuset_read_u64,
  		.write_u64 = cpuset_write_u64,
  		.private = FILE_MEMORY_PRESSURE_ENABLED,
  	},
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1807

4baf6e332   Tejun Heo   cgroup: convert a...
1808
1809
  	{ }	/* terminate */
  };
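
  /*
   * On the legacy (v1) hierarchy these entries show up in cgroupfs prefixed
   * with the subsystem name, e.g. "cpuset.cpus" and "cpuset.mems" (the
   * dedicated cpuset filesystem mounts without the prefix, giving plain
   * "cpus"/"mems").  A typical usage, with a hypothetical group path:
   *
   *	echo 0-3 > /sys/fs/cgroup/cpuset/mygroup/cpuset.cpus
   *
   * which is handled by cpuset_write_resmask() and then update_cpumask().
   */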
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1810
1811
  
  /*
92fb97487   Tejun Heo   cgroup: rename ->...
1812
   *	cpuset_css_alloc - allocate a cpuset css
c9e5fe66f   Li Zefan   cpuset: rename @c...
1813
   *	cgrp:	control group that the new cpuset will be part of
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1814
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
1815
1816
  static struct cgroup_subsys_state *
  cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1817
  {
c8f699bb5   Tejun Heo   cpuset: introduce...
1818
  	struct cpuset *cs;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1819

eb95419b0   Tejun Heo   cgroup: pass arou...
1820
  	if (!parent_css)
8793d854e   Paul Menage   Task Control Grou...
1821
  		return &top_cpuset.css;
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1822

c8f699bb5   Tejun Heo   cpuset: introduce...
1823
  	cs = kzalloc(sizeof(*cs), GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1824
  	if (!cs)
8793d854e   Paul Menage   Task Control Grou...
1825
  		return ERR_PTR(-ENOMEM);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1826
1827
1828
1829
  	if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
  		goto free_cs;
  	if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
  		goto free_cpus;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1830

029190c51   Paul Jackson   cpuset sched_load...
1831
  	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
300ed6cbb   Li Zefan   cpuset: convert c...
1832
  	cpumask_clear(cs->cpus_allowed);
f9a86fcbb   Mike Travis   cpuset: modify cp...
1833
  	nodes_clear(cs->mems_allowed);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1834
1835
  	cpumask_clear(cs->effective_cpus);
  	nodes_clear(cs->effective_mems);
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1836
  	fmeter_init(&cs->fmeter);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1837
  	cs->relax_domain_level = -1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1838

c8f699bb5   Tejun Heo   cpuset: introduce...
1839
  	return &cs->css;
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1840
1841
1842
1843
1844
1845
  
  free_cpus:
  	free_cpumask_var(cs->cpus_allowed);
  free_cs:
  	kfree(cs);
  	return ERR_PTR(-ENOMEM);
c8f699bb5   Tejun Heo   cpuset: introduce...
1846
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
1847
  static int cpuset_css_online(struct cgroup_subsys_state *css)
c8f699bb5   Tejun Heo   cpuset: introduce...
1848
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1849
  	struct cpuset *cs = css_cs(css);
c431069fe   Tejun Heo   cpuset: remove cp...
1850
  	struct cpuset *parent = parent_cs(cs);
ae8086ce1   Tejun Heo   cpuset: introduce...
1851
  	struct cpuset *tmp_cs;
492eb21b9   Tejun Heo   cgroup: make hier...
1852
  	struct cgroup_subsys_state *pos_css;
c8f699bb5   Tejun Heo   cpuset: introduce...
1853
1854
1855
  
  	if (!parent)
  		return 0;
5d21cc2db   Tejun Heo   cpuset: replace c...
1856
  	mutex_lock(&cpuset_mutex);
efeb77b2f   Tejun Heo   cpuset: introduce...
1857
  	set_bit(CS_ONLINE, &cs->flags);
c8f699bb5   Tejun Heo   cpuset: introduce...
1858
1859
1860
1861
  	if (is_spread_page(parent))
  		set_bit(CS_SPREAD_PAGE, &cs->flags);
  	if (is_spread_slab(parent))
  		set_bit(CS_SPREAD_SLAB, &cs->flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1862

664eeddee   Mel Gorman   mm: page_alloc: u...
1863
  	cpuset_inc();
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1864

8447a0fee   Vladimir Davydov   cpuset: convert c...
1865
  	spin_lock_irq(&callback_lock);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1866
1867
1868
1869
  	if (cgroup_on_dfl(cs->css.cgroup)) {
  		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
  		cs->effective_mems = parent->effective_mems;
  	}
8447a0fee   Vladimir Davydov   cpuset: convert c...
1870
  	spin_unlock_irq(&callback_lock);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1871

eb95419b0   Tejun Heo   cgroup: pass arou...
1872
  	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
5d21cc2db   Tejun Heo   cpuset: replace c...
1873
  		goto out_unlock;
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
  
  	/*
  	 * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
  	 * set.  This flag handling is implemented in cgroup core for
  	 * historical reasons - the flag may be specified during mount.
  	 *
  	 * Currently, if any sibling cpusets have exclusive cpus or mem, we
  	 * refuse to clone the configuration - thereby refusing to let the
  	 * task enter, and as a result failing the sys_unshare() or
  	 * clone() which initiated it.  If this becomes a problem for some
  	 * users who wish to allow that scenario, then this could be
  	 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
  	 * (and likewise for mems) to the new cgroup.
  	 */
ae8086ce1   Tejun Heo   cpuset: introduce...
1888
  	rcu_read_lock();
492eb21b9   Tejun Heo   cgroup: make hier...
1889
  	cpuset_for_each_child(tmp_cs, pos_css, parent) {
ae8086ce1   Tejun Heo   cpuset: introduce...
1890
1891
  		if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
  			rcu_read_unlock();
5d21cc2db   Tejun Heo   cpuset: replace c...
1892
  			goto out_unlock;
ae8086ce1   Tejun Heo   cpuset: introduce...
1893
  		}
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1894
  	}
ae8086ce1   Tejun Heo   cpuset: introduce...
1895
  	rcu_read_unlock();
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1896

8447a0fee   Vladimir Davydov   cpuset: convert c...
1897
  	spin_lock_irq(&callback_lock);
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1898
  	cs->mems_allowed = parent->mems_allowed;
790317e1b   Zefan Li   cpuset: initializ...
1899
  	cs->effective_mems = parent->mems_allowed;
033fa1c5f   Tejun Heo   cgroup, cpuset: r...
1900
  	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
790317e1b   Zefan Li   cpuset: initializ...
1901
  	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
cea74465e   Dan Carpenter   cpuset: lock vs u...
1902
  	spin_unlock_irq(&callback_lock);
5d21cc2db   Tejun Heo   cpuset: replace c...
1903
1904
  out_unlock:
  	mutex_unlock(&cpuset_mutex);
c8f699bb5   Tejun Heo   cpuset: introduce...
1905
1906
  	return 0;
  }
0b9e6965a   Zhao Hongjiang   cpuset: relocate ...
1907
1908
1909
1910
1911
  /*
   * If the cpuset being removed has its flag 'sched_load_balance'
   * enabled, then simulate turning sched_load_balance off, which
   * will call rebuild_sched_domains_locked().
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
1912
  static void cpuset_css_offline(struct cgroup_subsys_state *css)
c8f699bb5   Tejun Heo   cpuset: introduce...
1913
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1914
  	struct cpuset *cs = css_cs(css);
c8f699bb5   Tejun Heo   cpuset: introduce...
1915

5d21cc2db   Tejun Heo   cpuset: replace c...
1916
  	mutex_lock(&cpuset_mutex);
c8f699bb5   Tejun Heo   cpuset: introduce...
1917
1918
1919
  
  	if (is_sched_load_balance(cs))
  		update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
664eeddee   Mel Gorman   mm: page_alloc: u...
1920
  	cpuset_dec();
efeb77b2f   Tejun Heo   cpuset: introduce...
1921
  	clear_bit(CS_ONLINE, &cs->flags);
c8f699bb5   Tejun Heo   cpuset: introduce...
1922

5d21cc2db   Tejun Heo   cpuset: replace c...
1923
  	mutex_unlock(&cpuset_mutex);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1924
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
1925
  static void cpuset_css_free(struct cgroup_subsys_state *css)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1926
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
1927
  	struct cpuset *cs = css_cs(css);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1928

e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1929
  	free_cpumask_var(cs->effective_cpus);
300ed6cbb   Li Zefan   cpuset: convert c...
1930
  	free_cpumask_var(cs->cpus_allowed);
8793d854e   Paul Menage   Task Control Grou...
1931
  	kfree(cs);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1932
  }
39bd0d15e   Li Zefan   cpuset: initializ...
1933
1934
1935
  static void cpuset_bind(struct cgroup_subsys_state *root_css)
  {
  	mutex_lock(&cpuset_mutex);
8447a0fee   Vladimir Davydov   cpuset: convert c...
1936
  	spin_lock_irq(&callback_lock);
39bd0d15e   Li Zefan   cpuset: initializ...
1937
1938
1939
1940
1941
1942
1943
1944
1945
  
  	if (cgroup_on_dfl(root_css->cgroup)) {
  		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
  		top_cpuset.mems_allowed = node_possible_map;
  	} else {
  		cpumask_copy(top_cpuset.cpus_allowed,
  			     top_cpuset.effective_cpus);
  		top_cpuset.mems_allowed = top_cpuset.effective_mems;
  	}
8447a0fee   Vladimir Davydov   cpuset: convert c...
1946
  	spin_unlock_irq(&callback_lock);
39bd0d15e   Li Zefan   cpuset: initializ...
1947
1948
  	mutex_unlock(&cpuset_mutex);
  }
073219e99   Tejun Heo   cgroup: clean up ...
1949
  struct cgroup_subsys cpuset_cgrp_subsys = {
39bd0d15e   Li Zefan   cpuset: initializ...
1950
1951
1952
1953
1954
1955
1956
1957
  	.css_alloc	= cpuset_css_alloc,
  	.css_online	= cpuset_css_online,
  	.css_offline	= cpuset_css_offline,
  	.css_free	= cpuset_css_free,
  	.can_attach	= cpuset_can_attach,
  	.cancel_attach	= cpuset_cancel_attach,
  	.attach		= cpuset_attach,
  	.bind		= cpuset_bind,
5577964e6   Tejun Heo   cgroup: rename cg...
1958
  	.legacy_cftypes	= files,
39bd0d15e   Li Zefan   cpuset: initializ...
1959
  	.early_init	= 1,
8793d854e   Paul Menage   Task Control Grou...
1960
  };
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1961
1962
1963
1964
1965
1966
1967
1968
  /**
   * cpuset_init - initialize cpusets at system boot
   *
   * Description: Initialize top_cpuset and the cpuset internal file system,
   **/
  
  int __init cpuset_init(void)
  {
8793d854e   Paul Menage   Task Control Grou...
1969
  	int err = 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1970

58568d2a8   Miao Xie   cpuset,mm: update...
1971
1972
  	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
  		BUG();
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1973
1974
  	if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
  		BUG();
58568d2a8   Miao Xie   cpuset,mm: update...
1975

300ed6cbb   Li Zefan   cpuset: convert c...
1976
  	cpumask_setall(top_cpuset.cpus_allowed);
f9a86fcbb   Mike Travis   cpuset: modify cp...
1977
  	nodes_setall(top_cpuset.mems_allowed);
e2b9a3d7d   Li Zefan   cpuset: add cs->e...
1978
1979
  	cpumask_setall(top_cpuset.effective_cpus);
  	nodes_setall(top_cpuset.effective_mems);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1980

3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
1981
  	fmeter_init(&top_cpuset.fmeter);
029190c51   Paul Jackson   cpuset sched_load...
1982
  	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
1d3504fcf   Hidetoshi Seto   sched, cpuset: cu...
1983
  	top_cpuset.relax_domain_level = -1;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1984

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1985
1986
  	err = register_filesystem(&cpuset_fs_type);
  	if (err < 0)
8793d854e   Paul Menage   Task Control Grou...
1987
  		return err;
2341d1b65   Li Zefan   cpuset: convert c...
1988
1989
  	if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL))
  		BUG();
8793d854e   Paul Menage   Task Control Grou...
1990
  	return 0;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
1991
  }
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1992
  /*
cf417141c   Max Krasnyansky   sched, cpuset: re...
1993
   * If CPU and/or memory hotplug handlers, below, unplug any CPUs
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1994
1995
   * or memory nodes, we need to walk over the cpuset hierarchy,
   * removing that CPU or node from all cpusets.  If this removes the
956db3ca0   Cliff Wickman   hotplug cpu: move...
1996
1997
   * last CPU or node from a cpuset, then move the tasks in the empty
   * cpuset to its next-highest non-empty parent.
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
1998
   */
956db3ca0   Cliff Wickman   hotplug cpu: move...
1999
2000
2001
  static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  {
  	struct cpuset *parent;
c8d9c90c7   Paul Jackson   hotplug cpu: move...
2002
  	/*
956db3ca0   Cliff Wickman   hotplug cpu: move...
2003
2004
2005
  	 * Find its next-highest non-empty parent, (top cpuset
  	 * has online cpus, so can't be empty).
  	 */
c431069fe   Tejun Heo   cpuset: remove cp...
2006
  	parent = parent_cs(cs);
300ed6cbb   Li Zefan   cpuset: convert c...
2007
  	while (cpumask_empty(parent->cpus_allowed) ||
b45012955   Paul Jackson   hotplug cpu move ...
2008
  			nodes_empty(parent->mems_allowed))
c431069fe   Tejun Heo   cpuset: remove cp...
2009
  		parent = parent_cs(parent);
956db3ca0   Cliff Wickman   hotplug cpu: move...
2010

8cc993452   Tejun Heo   cgroup, cpuset: r...
2011
  	if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
12d3089c1   Fabian Frederick   kernel/cpuset.c: ...
2012
  		pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
e61734c55   Tejun Heo   cgroup: remove cg...
2013
2014
2015
  		pr_cont_cgroup_name(cs->css.cgroup);
  	pr_cont("\n");
8cc993452   Tejun Heo   cgroup, cpuset: r...
2016
  	}
956db3ca0   Cliff Wickman   hotplug cpu: move...
2017
  }
be4c9dd7a   Li Zefan   cpuset: enable on...
2018
2019
2020
2021
  static void
  hotplug_update_tasks_legacy(struct cpuset *cs,
  			    struct cpumask *new_cpus, nodemask_t *new_mems,
  			    bool cpus_updated, bool mems_updated)
390a36aad   Li Zefan   cpuset: refactor ...
2022
2023
  {
  	bool is_empty;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2024
  	spin_lock_irq(&callback_lock);
be4c9dd7a   Li Zefan   cpuset: enable on...
2025
2026
2027
2028
  	cpumask_copy(cs->cpus_allowed, new_cpus);
  	cpumask_copy(cs->effective_cpus, new_cpus);
  	cs->mems_allowed = *new_mems;
  	cs->effective_mems = *new_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2029
  	spin_unlock_irq(&callback_lock);
390a36aad   Li Zefan   cpuset: refactor ...
2030
2031
2032
2033
2034
  
  	/*
  	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
  	 * as the tasks will be migrated to an ancestor.
  	 */
be4c9dd7a   Li Zefan   cpuset: enable on...
2035
  	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
390a36aad   Li Zefan   cpuset: refactor ...
2036
  		update_tasks_cpumask(cs);
be4c9dd7a   Li Zefan   cpuset: enable on...
2037
  	if (mems_updated && !nodes_empty(cs->mems_allowed))
390a36aad   Li Zefan   cpuset: refactor ...
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
  		update_tasks_nodemask(cs);
  
  	is_empty = cpumask_empty(cs->cpus_allowed) ||
  		   nodes_empty(cs->mems_allowed);
  
  	mutex_unlock(&cpuset_mutex);
  
  	/*
  	 * Move tasks to the nearest ancestor with execution resources.
  	 * This is a full cgroup operation which will also call back into
  	 * cpuset.  Should be done outside any lock.
  	 */
  	if (is_empty)
  		remove_tasks_in_empty_cpuset(cs);
  
  	mutex_lock(&cpuset_mutex);
  }
be4c9dd7a   Li Zefan   cpuset: enable on...
2055
2056
2057
2058
  static void
  hotplug_update_tasks(struct cpuset *cs,
  		     struct cpumask *new_cpus, nodemask_t *new_mems,
  		     bool cpus_updated, bool mems_updated)
390a36aad   Li Zefan   cpuset: refactor ...
2059
  {
be4c9dd7a   Li Zefan   cpuset: enable on...
2060
2061
2062
2063
  	if (cpumask_empty(new_cpus))
  		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
  	if (nodes_empty(*new_mems))
  		*new_mems = parent_cs(cs)->effective_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2064
  	spin_lock_irq(&callback_lock);
be4c9dd7a   Li Zefan   cpuset: enable on...
2065
2066
  	cpumask_copy(cs->effective_cpus, new_cpus);
  	cs->effective_mems = *new_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2067
  	spin_unlock_irq(&callback_lock);
390a36aad   Li Zefan   cpuset: refactor ...
2068

be4c9dd7a   Li Zefan   cpuset: enable on...
2069
  	if (cpus_updated)
390a36aad   Li Zefan   cpuset: refactor ...
2070
  		update_tasks_cpumask(cs);
be4c9dd7a   Li Zefan   cpuset: enable on...
2071
  	if (mems_updated)
390a36aad   Li Zefan   cpuset: refactor ...
2072
2073
  		update_tasks_nodemask(cs);
  }
deb7aa308   Tejun Heo   cpuset: reorganiz...
2074
  /**
388afd854   Li Zefan   cpuset: remove as...
2075
   * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
deb7aa308   Tejun Heo   cpuset: reorganiz...
2076
   * @cs: cpuset in interest
956db3ca0   Cliff Wickman   hotplug cpu: move...
2077
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2078
2079
2080
   * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
   * offline, update @cs accordingly.  If @cs ends up with no CPU or memory,
   * all its tasks are moved to the nearest ancestor with both resources.
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
2081
   */
388afd854   Li Zefan   cpuset: remove as...
2082
  static void cpuset_hotplug_update_tasks(struct cpuset *cs)
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
2083
  {
be4c9dd7a   Li Zefan   cpuset: enable on...
2084
2085
2086
2087
  	static cpumask_t new_cpus;
  	static nodemask_t new_mems;
  	bool cpus_updated;
  	bool mems_updated;
e44193d39   Li Zefan   cpuset: let hotpl...
2088
2089
  retry:
  	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
2090

5d21cc2db   Tejun Heo   cpuset: replace c...
2091
  	mutex_lock(&cpuset_mutex);
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2092

e44193d39   Li Zefan   cpuset: let hotpl...
2093
2094
2095
2096
2097
2098
2099
2100
  	/*
  	 * We have raced with task attaching. We wait until attaching
  	 * is finished, so we won't attach a task to an empty cpuset.
  	 */
  	if (cs->attach_in_progress) {
  		mutex_unlock(&cpuset_mutex);
  		goto retry;
  	}
be4c9dd7a   Li Zefan   cpuset: enable on...
2101
2102
  	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
  	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
80d1fa646   Srivatsa S. Bhat   cpusets, hotplug:...
2103

be4c9dd7a   Li Zefan   cpuset: enable on...
2104
2105
  	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
  	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
deb7aa308   Tejun Heo   cpuset: reorganiz...
2106

390a36aad   Li Zefan   cpuset: refactor ...
2107
  	if (cgroup_on_dfl(cs->css.cgroup))
be4c9dd7a   Li Zefan   cpuset: enable on...
2108
2109
  		hotplug_update_tasks(cs, &new_cpus, &new_mems,
  				     cpus_updated, mems_updated);
390a36aad   Li Zefan   cpuset: refactor ...
2110
  	else
be4c9dd7a   Li Zefan   cpuset: enable on...
2111
2112
  		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
  					    cpus_updated, mems_updated);
8d0339487   Tejun Heo   cpuset: make CPU ...
2113

5d21cc2db   Tejun Heo   cpuset: replace c...
2114
  	mutex_unlock(&cpuset_mutex);
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2115
  }
deb7aa308   Tejun Heo   cpuset: reorganiz...
2116
  /**
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2117
   * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
956db3ca0   Cliff Wickman   hotplug cpu: move...
2118
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2119
2120
2121
2122
2123
   * This function is called after either CPU or memory configuration has
   * changed and updates cpuset accordingly.  The top_cpuset is always
   * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
   * order to make cpusets transparent (of no effect) on systems that are
   * actively using CPU hotplug but making no active use of cpusets.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2124
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2125
   * Non-root cpusets are only affected by offlining.  If any CPUs or memory
388afd854   Li Zefan   cpuset: remove as...
2126
2127
   * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
   * all descendants.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2128
   *
deb7aa308   Tejun Heo   cpuset: reorganiz...
2129
2130
   * Note that CPU offlining during suspend is ignored.  We don't modify
   * cpusets across suspend/resume cycles at all.
956db3ca0   Cliff Wickman   hotplug cpu: move...
2131
   */
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2132
  static void cpuset_hotplug_workfn(struct work_struct *work)
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2133
  {
5c5cc6232   Li Zefan   cpuset: allow to ...
2134
2135
  	static cpumask_t new_cpus;
  	static nodemask_t new_mems;
deb7aa308   Tejun Heo   cpuset: reorganiz...
2136
  	bool cpus_updated, mems_updated;
7e88291be   Li Zefan   cpuset: make cs->...
2137
  	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2138

5d21cc2db   Tejun Heo   cpuset: replace c...
2139
  	mutex_lock(&cpuset_mutex);
956db3ca0   Cliff Wickman   hotplug cpu: move...
2140

deb7aa308   Tejun Heo   cpuset: reorganiz...
2141
2142
2143
  	/* fetch the available cpus/mems and find out which changed how */
  	cpumask_copy(&new_cpus, cpu_active_mask);
  	new_mems = node_states[N_MEMORY];
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2144

7e88291be   Li Zefan   cpuset: make cs->...
2145
2146
  	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
  	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2147

deb7aa308   Tejun Heo   cpuset: reorganiz...
2148
2149
  	/* synchronize cpus_allowed to cpu_active_mask */
  	if (cpus_updated) {
8447a0fee   Vladimir Davydov   cpuset: convert c...
2150
  		spin_lock_irq(&callback_lock);
7e88291be   Li Zefan   cpuset: make cs->...
2151
2152
  		if (!on_dfl)
  			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
1344ab9c2   Li Zefan   cpuset: update cp...
2153
  		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
8447a0fee   Vladimir Davydov   cpuset: convert c...
2154
  		spin_unlock_irq(&callback_lock);
deb7aa308   Tejun Heo   cpuset: reorganiz...
2155
2156
  		/* we don't mess with cpumasks of tasks in top_cpuset */
  	}
b45012955   Paul Jackson   hotplug cpu move ...
2157

deb7aa308   Tejun Heo   cpuset: reorganiz...
2158
2159
  	/* synchronize mems_allowed to N_MEMORY */
  	if (mems_updated) {
8447a0fee   Vladimir Davydov   cpuset: convert c...
2160
  		spin_lock_irq(&callback_lock);
7e88291be   Li Zefan   cpuset: make cs->...
2161
2162
  		if (!on_dfl)
  			top_cpuset.mems_allowed = new_mems;
1344ab9c2   Li Zefan   cpuset: update cp...
2163
  		top_cpuset.effective_mems = new_mems;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2164
  		spin_unlock_irq(&callback_lock);
d66393e54   Tejun Heo   cpuset: use css_t...
2165
  		update_tasks_nodemask(&top_cpuset);
deb7aa308   Tejun Heo   cpuset: reorganiz...
2166
  	}
b45012955   Paul Jackson   hotplug cpu move ...
2167

388afd854   Li Zefan   cpuset: remove as...
2168
  	mutex_unlock(&cpuset_mutex);
5c5cc6232   Li Zefan   cpuset: allow to ...
2169
2170
  	/* if cpus or mems changed, we need to propagate to descendants */
  	if (cpus_updated || mems_updated) {
deb7aa308   Tejun Heo   cpuset: reorganiz...
2171
  		struct cpuset *cs;
492eb21b9   Tejun Heo   cgroup: make hier...
2172
  		struct cgroup_subsys_state *pos_css;
f9b4fb8da   Miao Xie   cpusets: update t...
2173

fc560a26a   Tejun Heo   cpuset: replace c...
2174
  		rcu_read_lock();
492eb21b9   Tejun Heo   cgroup: make hier...
2175
  		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
ec903c0c8   Tejun Heo   cgroup: rename cs...
2176
  			if (cs == &top_cpuset || !css_tryget_online(&cs->css))
388afd854   Li Zefan   cpuset: remove as...
2177
2178
  				continue;
  			rcu_read_unlock();
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2179

388afd854   Li Zefan   cpuset: remove as...
2180
  			cpuset_hotplug_update_tasks(cs);
b45012955   Paul Jackson   hotplug cpu move ...
2181

388afd854   Li Zefan   cpuset: remove as...
2182
2183
2184
2185
2186
  			rcu_read_lock();
  			css_put(&cs->css);
  		}
  		rcu_read_unlock();
  	}
8d0339487   Tejun Heo   cpuset: make CPU ...
2187

deb7aa308   Tejun Heo   cpuset: reorganiz...
2188
  	/* rebuild sched domains if cpus_allowed has changed */
e0e80a02e   Li Zhong   cpuset: use rebui...
2189
2190
  	if (cpus_updated)
  		rebuild_sched_domains();
b1aac8bb8   Paul Jackson   [PATCH] cpuset: h...
2191
  }
7ddf96b02   Srivatsa S. Bhat   cpusets, hotplug:...
2192
  void cpuset_update_active_cpus(bool cpu_online)
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2193
  {
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
  	/*
  	 * We're inside a CPU hotplug critical region which usually nests
  	 * inside cgroup synchronization.  Bounce actual hotplug processing
  	 * to a work item to avoid reverse locking order.
  	 *
  	 * We still need to do partition_sched_domains() synchronously;
  	 * otherwise, the scheduler will get confused and put tasks on the
  	 * dead CPU.  Fall back to the default single domain.
  	 * cpuset_hotplug_workfn() will rebuild it as necessary.
  	 */
  	partition_sched_domains(1, NULL, NULL);
  	schedule_work(&cpuset_hotplug_work);
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2206
  }
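  /*
   * Illustrative sketch, not part of cpuset.c: roughly how a CPU hotplug
   * callback on the scheduler side is expected to hand the event off to
   * cpusets.  The notifier below is an assumption for illustration only;
   * the real callers live in the scheduler's hotplug handling.
   */
  static int example_cpu_hotplug_cb(struct notifier_block *nb,
  				  unsigned long action, void *hcpu)
  {
  	switch (action & ~CPU_TASKS_FROZEN) {
  	case CPU_ONLINE:
  		cpuset_update_active_cpus(true);	/* a CPU came up */
  		break;
  	case CPU_DOWN_PREPARE:
  		cpuset_update_active_cpus(false);	/* a CPU is going away */
  		break;
  	default:
  		break;
  	}
  	return NOTIFY_OK;
  }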
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2207

38837fc75   Paul Jackson   [PATCH] cpuset: t...
2208
  /*
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2209
2210
   * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
   * Call this routine anytime after node_states[N_MEMORY] changes.
a1cd2b13f   Srivatsa S. Bhat   cpusets: Remove/u...
2211
   * See cpuset_update_active_cpus() for CPU hotplug handling.
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2212
   */
f481891fd   Miao Xie   cpuset: update to...
2213
2214
  static int cpuset_track_online_nodes(struct notifier_block *self,
  				unsigned long action, void *arg)
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2215
  {
3a5a6d0c2   Tejun Heo   cpuset: don't nes...
2216
  	schedule_work(&cpuset_hotplug_work);
f481891fd   Miao Xie   cpuset: update to...
2217
  	return NOTIFY_OK;
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2218
  }
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2219
2220
2221
2222
2223
  
  static struct notifier_block cpuset_track_online_nodes_nb = {
  	.notifier_call = cpuset_track_online_nodes,
  	.priority = 10,		/* ??! */
  };
38837fc75   Paul Jackson   [PATCH] cpuset: t...
2224

1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2225
2226
2227
2228
  /**
   * cpuset_init_smp - initialize cpus_allowed
   *
   * Description: Finish top cpuset after cpu, node maps are initialized
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2229
   */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2230
2231
  void __init cpuset_init_smp(void)
  {
6ad4c1888   Peter Zijlstra   sched: Fix balanc...
2232
  	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2233
  	top_cpuset.mems_allowed = node_states[N_MEMORY];
33ad801df   Li Zefan   cpuset: record ol...
2234
  	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
4c4d50f7b   Paul Jackson   [PATCH] cpuset: t...
2235

e2b9a3d7d   Li Zefan   cpuset: add cs->e...
2236
2237
  	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
  	top_cpuset.effective_mems = node_states[N_MEMORY];
d8f10cb3d   Andrew Morton   kernel/cpuset.c: ...
2238
  	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2239
2240
2241
  }
  
  /**
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2242
2243
   * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
   * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
6af866af3   Li Zefan   cpuset: remove re...
2244
   * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2245
   *
300ed6cbb   Li Zefan   cpuset: convert c...
2246
   * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2247
   * attached to the specified @tsk.  Guaranteed to return some non-empty
5f054e31c   Rusty Russell   documentation: re...
2248
   * subset of cpu_online_mask, even if this means going outside the
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2249
2250
   * task's cpuset.
   **/
6af866af3   Li Zefan   cpuset: remove re...
2251
  void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2252
  {
8447a0fee   Vladimir Davydov   cpuset: convert c...
2253
2254
2255
  	unsigned long flags;
  
  	spin_lock_irqsave(&callback_lock, flags);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2256
  	rcu_read_lock();
ae1c80238   Li Zefan   cpuset: apply cs-...
2257
  	guarantee_online_cpus(task_cs(tsk), pmask);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2258
  	rcu_read_unlock();
8447a0fee   Vladimir Davydov   cpuset: convert c...
2259
  	spin_unlock_irqrestore(&callback_lock, flags);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2260
  }
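  /*
   * Illustrative sketch, assumption only: how a caller in the style of
   * sched_setaffinity() can use cpuset_cpus_allowed() to clamp a
   * user-requested affinity mask to the task's cpuset before applying it.
   * The function and variable names here are hypothetical.
   */
  static int example_clamp_affinity(struct task_struct *p,
  				  const struct cpumask *requested)
  {
  	cpumask_var_t allowed, new_mask;
  	int ret;
  
  	if (!alloc_cpumask_var(&allowed, GFP_KERNEL))
  		return -ENOMEM;
  	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
  		free_cpumask_var(allowed);
  		return -ENOMEM;
  	}
  
  	cpuset_cpus_allowed(p, allowed);	/* guaranteed non-empty */
  	cpumask_and(new_mask, requested, allowed);
  	if (cpumask_empty(new_mask))
  		ret = -EINVAL;			/* request entirely outside the cpuset */
  	else
  		ret = set_cpus_allowed_ptr(p, new_mask);
  
  	free_cpumask_var(new_mask);
  	free_cpumask_var(allowed);
  	return ret;
  }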
2baab4e90   Peter Zijlstra   sched: Fix select...
2261
  void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
9084bb824   Oleg Nesterov   sched: Make selec...
2262
  {
9084bb824   Oleg Nesterov   sched: Make selec...
2263
  	rcu_read_lock();
ae1c80238   Li Zefan   cpuset: apply cs-...
2264
  	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
9084bb824   Oleg Nesterov   sched: Make selec...
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
  	rcu_read_unlock();
  
  	/*
  	 * We own tsk->cpus_allowed, nobody can change it under us.
  	 *
  	 * But we used cs && cs->cpus_allowed lockless and thus can
  	 * race with cgroup_attach_task() or update_cpumask() and get
  	 * the wrong tsk->cpus_allowed. However, both cases imply the
  	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
  	 * which takes task_rq_lock().
  	 *
  	 * If we are called after it dropped the lock we must see all
  	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
  	 * set any mask even if it is not right from task_cs() pov,
  	 * the pending set_cpus_allowed_ptr() will fix things.
2baab4e90   Peter Zijlstra   sched: Fix select...
2280
2281
2282
  	 *
  	 * select_fallback_rq() will fix things up and set cpu_possible_mask
  	 * if required.
9084bb824   Oleg Nesterov   sched: Make selec...
2283
  	 */
9084bb824   Oleg Nesterov   sched: Make selec...
2284
  }
8f4ab07f4   Rasmus Villemoes   kernel/cpuset.c: ...
2285
  void __init cpuset_init_current_mems_allowed(void)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2286
  {
f9a86fcbb   Mike Travis   cpuset: modify cp...
2287
  	nodes_setall(current->mems_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2288
  }
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2289
  /**
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2290
2291
2292
2293
2294
   * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
   * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
   *
   * Description: Returns the nodemask_t mems_allowed of the cpuset
   * attached to the specified @tsk.  Guaranteed to return some non-empty
38d7bee9d   Lai Jiangshan   cpuset: use N_MEM...
2295
   * subset of node_states[N_MEMORY], even if this means going outside the
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2296
2297
2298
2299
2300
2301
   * task's cpuset.
   **/
  
  nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
  {
  	nodemask_t mask;
8447a0fee   Vladimir Davydov   cpuset: convert c...
2302
  	unsigned long flags;
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2303

8447a0fee   Vladimir Davydov   cpuset: convert c...
2304
  	spin_lock_irqsave(&callback_lock, flags);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2305
  	rcu_read_lock();
ae1c80238   Li Zefan   cpuset: apply cs-...
2306
  	guarantee_online_mems(task_cs(tsk), &mask);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2307
  	rcu_read_unlock();
8447a0fee   Vladimir Davydov   cpuset: convert c...
2308
  	spin_unlock_irqrestore(&callback_lock, flags);
909d75a3b   Paul Jackson   [PATCH] cpuset: i...
2309
2310
2311
2312
2313
  
  	return mask;
  }
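  /*
   * Illustrative sketch, assumption only: a caller that asks which memory
   * nodes a task may use and picks one of them, e.g. as a preferred node
   * for an allocation done on that task's behalf.
   */
  static int example_preferred_node_for(struct task_struct *tsk)
  {
  	nodemask_t allowed = cpuset_mems_allowed(tsk);
  
  	/* The returned mask is guaranteed non-empty, so this is valid. */
  	return first_node(allowed);
  }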
  
  /**
19770b326   Mel Gorman   mm: filter based ...
2314
2315
   * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
   * @nodemask: the nodemask to be checked
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2316
   *
19770b326   Mel Gorman   mm: filter based ...
2317
   * Are any of the nodes in the nodemask allowed in current->mems_allowed?
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2318
   */
19770b326   Mel Gorman   mm: filter based ...
2319
  int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2320
  {
19770b326   Mel Gorman   mm: filter based ...
2321
  	return nodes_intersects(*nodemask, current->mems_allowed);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2322
  }
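  /*
   * Illustrative sketch, assumption only: rejecting a caller-supplied
   * nodemask (say, from a memory policy) when none of its nodes are
   * usable by current.  The helper name is hypothetical.
   */
  static bool example_nodemask_usable(nodemask_t *requested)
  {
  	if (!requested)
  		return true;	/* no extra restriction supplied */
  	return cpuset_nodemask_valid_mems_allowed(requested) != 0;
  }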
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2323
  /*
786083667   Paul Menage   Cpuset hardwall f...
2324
2325
   * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
   * mem_hardwall ancestor to the specified cpuset.  Call holding
8447a0fee   Vladimir Davydov   cpuset: convert c...
2326
   * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
786083667   Paul Menage   Cpuset hardwall f...
2327
   * (an unusual configuration), then returns the root cpuset.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2328
   */
c9710d801   Tejun Heo   cpuset: drop "con...
2329
  static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2330
  {
c431069fe   Tejun Heo   cpuset: remove cp...
2331
2332
  	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
  		cs = parent_cs(cs);
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2333
2334
  	return cs;
  }
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2335
  /**
344736f29   Vladimir Davydov   cpuset: simplify ...
2336
   * cpuset_node_allowed - Can we allocate on a memory node?
a1bc5a4ee   David Rientjes   cpusets: replace ...
2337
   * @node: is this an allowed node?
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2338
   * @gfp_mask: memory allocation flags
d9fd8a6d4   Randy Dunlap   [PATCH] kernel/cp...
2339
   *
6e276d2a5   David Rientjes   kernel, cpuset: r...
2340
2341
2342
2343
   * If we're in interrupt, yes, we can always allocate.  If @node is set in
   * current's mems_allowed, yes.  If it's not a __GFP_HARDWALL request and this
   * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
   * yes.  If current has access to memory reserves due to TIF_MEMDIE, yes.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2344
2345
2346
   * Otherwise, no.
   *
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
c596d9f32   David Rientjes   cpusets: allow TI...
2347
2348
   * and do not allow allocations outside the current task's cpuset
   * unless the task has been OOM killed and is marked TIF_MEMDIE.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2349
   * GFP_KERNEL allocations are not so marked, so can escape to the
786083667   Paul Menage   Cpuset hardwall f...
2350
   * nearest enclosing hardwalled ancestor cpuset.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2351
   *
8447a0fee   Vladimir Davydov   cpuset: convert c...
2352
   * Scanning up parent cpusets requires callback_lock.  The
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2353
2354
2355
2356
   * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
   * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
   * current task's mems_allowed came up empty on the first pass over
   * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
8447a0fee   Vladimir Davydov   cpuset: convert c...
2357
   * cpuset are short of memory, might require taking the callback_lock.
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2358
   *
36be57ffe   Paul Jackson   [PATCH] cpuset: u...
2359
   * The first call here from mm/page_alloc:get_page_from_freelist()
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2360
2361
2362
   * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
   * so no allocation on a node outside the cpuset is allowed (unless
   * in interrupt, of course).
36be57ffe   Paul Jackson   [PATCH] cpuset: u...
2363
2364
2365
2366
2367
2368
   *
   * The second pass through get_page_from_freelist() doesn't even call
   * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
   * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
   * in alloc_flags.  That logic and the checks below have the combined
   * effect that:
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2369
2370
   *	in_interrupt - any node ok (current task context irrelevant)
   *	GFP_ATOMIC   - any node ok
c596d9f32   David Rientjes   cpusets: allow TI...
2371
   *	TIF_MEMDIE   - any node ok
786083667   Paul Menage   Cpuset hardwall f...
2372
   *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2373
   *	GFP_USER     - only nodes in current tasks mems allowed ok.
02a0e53d8   Paul Jackson   [PATCH] cpuset: r...
2374
   */
344736f29   Vladimir Davydov   cpuset: simplify ...
2375
  int __cpuset_node_allowed(int node, gfp_t gfp_mask)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2376
  {
c9710d801   Tejun Heo   cpuset: drop "con...
2377
  	struct cpuset *cs;		/* current cpuset ancestors */
29afd49b7   Paul Jackson   [PATCH] cpuset: r...
2378
  	int allowed;			/* is allocation on @node allowed? */
8447a0fee   Vladimir Davydov   cpuset: convert c...
2379
  	unsigned long flags;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2380

6e276d2a5   David Rientjes   kernel, cpuset: r...
2381
  	if (in_interrupt())
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2382
  		return 1;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2383
2384
  	if (node_isset(node, current->mems_allowed))
  		return 1;
c596d9f32   David Rientjes   cpusets: allow TI...
2385
2386
2387
2388
2389
2390
  	/*
  	 * Allow tasks that have access to memory reserves because they have
  	 * been OOM killed to get memory anywhere.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)))
  		return 1;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2391
2392
  	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
  		return 0;
5563e7707   Bob Picco   [PATCH] cpuset: f...
2393
2394
  	if (current->flags & PF_EXITING) /* Let dying task have memory */
  		return 1;
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2395
  	/* Not hardwall and node outside mems_allowed: scan up cpusets */
8447a0fee   Vladimir Davydov   cpuset: convert c...
2396
  	spin_lock_irqsave(&callback_lock, flags);
053199edf   Paul Jackson   [PATCH] cpusets: ...
2397

b8dadcb58   Li Zefan   cpuset: use rcu_r...
2398
  	rcu_read_lock();
786083667   Paul Menage   Cpuset hardwall f...
2399
  	cs = nearest_hardwall_ancestor(task_cs(current));
99afb0fd5   Li Zefan   cpuset: fix a rac...
2400
  	allowed = node_isset(node, cs->mems_allowed);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2401
  	rcu_read_unlock();
053199edf   Paul Jackson   [PATCH] cpusets: ...
2402

8447a0fee   Vladimir Davydov   cpuset: convert c...
2403
  	spin_unlock_irqrestore(&callback_lock, flags);
9bf2229f8   Paul Jackson   [PATCH] cpusets: ...
2404
  	return allowed;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2405
  }
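  /*
   * Illustrative sketch, assumption only: a simplified allocator-style scan
   * that uses the check above to find the first memory node this task may
   * allocate from.  The real gating sits in the page allocator's zonelist
   * walk; this loop is only a stand-in to show the calling convention.
   */
  static int example_pick_allowed_node(gfp_t gfp_mask)
  {
  	int nid;
  
  	for_each_node_state(nid, N_MEMORY) {
  		if (__cpuset_node_allowed(nid, gfp_mask))
  			return nid;	/* first node this task may use */
  	}
  	return NUMA_NO_NODE;		/* nothing allowed; not expected */
  }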
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2406
  /**
6adef3ebe   Jack Steiner   cpusets: new roun...
2407
2408
   * cpuset_mem_spread_node() - On which node to begin search for a file page
   * cpuset_slab_spread_node() - On which node to begin search for a slab page
825a46af5   Paul Jackson   [PATCH] cpuset me...
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
   *
   * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
   * tasks in a cpuset with is_spread_page or is_spread_slab set),
   * and if the memory allocation used cpuset_mem_spread_node()
   * to determine on which node to start looking, as it will for
   * certain page cache or slab cache pages such as used for file
   * system buffers and inode caches, then instead of starting on the
   * local node to look for a free page, rather spread the starting
   * node around the task's mems_allowed nodes.
   *
   * We don't have to worry about the returned node being offline
   * because "it can't happen", and even if it did, it would be ok.
   *
   * The routines calling guarantee_online_mems() are careful to
   * only set nodes in task->mems_allowed that are online.  So it
   * should not be possible for the following code to return an
   * offline node.  But if it did, that would be ok, as this routine
   * is not returning the node where the allocation must be, only
   * the node where the search should start.  The zonelist passed to
   * __alloc_pages() will include all nodes.  If the slab allocator
   * is passed an offline node, it will fall back to the local node.
   * See kmem_cache_alloc_node().
   */
6adef3ebe   Jack Steiner   cpusets: new roun...
2432
  static int cpuset_spread_node(int *rotor)
825a46af5   Paul Jackson   [PATCH] cpuset me...
2433
2434
  {
  	int node;
6adef3ebe   Jack Steiner   cpusets: new roun...
2435
  	node = next_node(*rotor, current->mems_allowed);
825a46af5   Paul Jackson   [PATCH] cpuset me...
2436
2437
  	if (node == MAX_NUMNODES)
  		node = first_node(current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2438
  	*rotor = node;
825a46af5   Paul Jackson   [PATCH] cpuset me...
2439
2440
  	return node;
  }
6adef3ebe   Jack Steiner   cpusets: new roun...
2441
2442
2443
  
  int cpuset_mem_spread_node(void)
  {
778d3b0ff   Michal Hocko   cpusets: randomiz...
2444
2445
2446
  	if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
  		current->cpuset_mem_spread_rotor =
  			node_random(&current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2447
2448
2449
2450
2451
  	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
  }
  
  int cpuset_slab_spread_node(void)
  {
778d3b0ff   Michal Hocko   cpusets: randomiz...
2452
2453
2454
  	if (current->cpuset_slab_spread_rotor == NUMA_NO_NODE)
  		current->cpuset_slab_spread_rotor =
  			node_random(&current->mems_allowed);
6adef3ebe   Jack Steiner   cpusets: new roun...
2455
2456
  	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
  }
825a46af5   Paul Jackson   [PATCH] cpuset me...
2457
2458
2459
  EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
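  /*
   * Illustrative sketch, assumption only: how a page-cache style caller can
   * honour memory spreading by starting the allocation on the node returned
   * by cpuset_mem_spread_node() instead of the local node.  It assumes the
   * cpuset_do_page_mem_spread() helper from linux/cpuset.h.
   */
  static struct page *example_spread_alloc(gfp_t gfp)
  {
  	int nid = numa_node_id();		/* default: local node */
  
  	if (cpuset_do_page_mem_spread())	/* task spreads page cache */
  		nid = cpuset_mem_spread_node();
  
  	return alloc_pages_node(nid, gfp, 0);	/* order-0 page on @nid */
  }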
  
  /**
bbe373f2c   David Rientjes   oom: compare cpus...
2460
2461
2462
2463
2464
2465
2466
2467
   * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
   * @tsk1: pointer to task_struct of some task.
   * @tsk2: pointer to task_struct of some other task.
   *
   * Description: Return true if @tsk1's mems_allowed intersects the
   * mems_allowed of @tsk2.  Used by the OOM killer to determine if
   * one of the task's memory usage might impact the memory available
   * to the other.
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2468
   **/
bbe373f2c   David Rientjes   oom: compare cpus...
2469
2470
  int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  				   const struct task_struct *tsk2)
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2471
  {
bbe373f2c   David Rientjes   oom: compare cpus...
2472
  	return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
ef08e3b49   Paul Jackson   [PATCH] cpusets: ...
2473
  }
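  /*
   * Illustrative sketch, assumption only: an OOM-killer style filter that
   * skips candidate tasks whose memory placement cannot relieve the
   * pressure felt by the triggering task.
   */
  static bool example_oom_candidate(struct task_struct *victim,
  				  struct task_struct *trigger)
  {
  	/* Killing @victim only helps if it allocates where @trigger can. */
  	return cpuset_mems_allowed_intersects(victim, trigger);
  }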
75aa19941   David Rientjes   oom: print trigge...
2474
2475
  /**
   * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
fc34ac1dc   Fabian Frederick   kernel/cpuset.c: ...
2476
   * @tsk: pointer to task_struct of some task.
75aa19941   David Rientjes   oom: print trigge...
2477
2478
   *
   * Description: Prints @tsk's name, cpuset name, and cached copy of its
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2479
   * mems_allowed to the kernel log.
75aa19941   David Rientjes   oom: print trigge...
2480
2481
2482
   */
  void cpuset_print_task_mems_allowed(struct task_struct *tsk)
  {
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2483
  	struct cgroup *cgrp;
75aa19941   David Rientjes   oom: print trigge...
2484

b8dadcb58   Li Zefan   cpuset: use rcu_r...
2485
  	rcu_read_lock();
63f43f55c   Li Zefan   cpuset: fix cpuse...
2486

b8dadcb58   Li Zefan   cpuset: use rcu_r...
2487
  	cgrp = task_cs(tsk)->css.cgroup;
12d3089c1   Fabian Frederick   kernel/cpuset.c: ...
2488
  	pr_info("%s cpuset=", tsk->comm);
e61734c55   Tejun Heo   cgroup: remove cg...
2489
  	pr_cont_cgroup_name(cgrp);
e8e6d97c9   Tejun Heo   cpuset: use %*pb[...
2490
2491
  	pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
f440d98f8   Li Zefan   cpuset: use cgrou...
2492

cfb5966be   Li Zefan   cpuset: fix RCU l...
2493
  	rcu_read_unlock();
75aa19941   David Rientjes   oom: print trigge...
2494
  }
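  /*
   * Illustrative sketch, assumption only: an error-reporting path (for
   * example an allocation-failure or OOM dump) logging the current task's
   * cpuset placement alongside its own message.
   */
  static void example_dump_alloc_context(void)
  {
  	pr_warn("allocation failure in %s (pid %d)\n",
  		current->comm, task_pid_nr(current));
  	cpuset_print_task_mems_allowed(current);
  }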
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2495
  /*
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2496
2497
2498
2499
   * Collection of memory_pressure is suppressed unless
   * this flag is enabled by writing "1" to the special
   * cpuset file 'memory_pressure_enabled' in the root cpuset.
   */
c5b2aff89   Paul Jackson   [PATCH] cpuset: m...
2500
  int cpuset_memory_pressure_enabled __read_mostly;
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
  
  /**
   * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
   *
   * Keep a running average of the rate of synchronous (direct)
   * page reclaim efforts initiated by tasks in each cpuset.
   *
   * This represents the rate at which some task in the cpuset
   * ran low on memory on all nodes it was allowed to use, and
   * had to enter the kernel's page reclaim code in an effort to
   * create more free memory by tossing clean pages or swapping
   * or writing dirty pages.
   *
   * Display to user space in the per-cpuset read-only file
   * "memory_pressure".  Value displayed is an integer
   * representing the recent rate of entry into the synchronous
   * (direct) page reclaim by any task attached to the cpuset.
   **/
  
  void __cpuset_memory_pressure_bump(void)
  {
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2522
  	rcu_read_lock();
8793d854e   Paul Menage   Task Control Grou...
2523
  	fmeter_markevent(&task_cs(current)->fmeter);
b8dadcb58   Li Zefan   cpuset: use rcu_r...
2524
  	rcu_read_unlock();
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2525
  }
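  /*
   * Illustrative sketch, assumption only: a direct-reclaim entry point is
   * expected to bump the meter only when collection has been enabled via
   * the root cpuset's 'memory_pressure_enabled' file, so the fmeter work
   * is skipped in the common case.
   */
  static void example_enter_direct_reclaim(void)
  {
  	if (cpuset_memory_pressure_enabled)
  		__cpuset_memory_pressure_bump();	/* record one reclaim event */
  
  	/* ... proceed with synchronous page reclaim ... */
  }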
8793d854e   Paul Menage   Task Control Grou...
2526
  #ifdef CONFIG_PROC_PID_CPUSET
3e0d98b9f   Paul Jackson   [PATCH] cpuset: m...
2527
  /*
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2528
2529
2530
   * proc_cpuset_show()
   *  - Print task's cpuset path into seq_file.
   *  - Used for /proc/<pid>/cpuset.
053199edf   Paul Jackson   [PATCH] cpusets: ...
2531
2532
   *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
   *    doesn't really matter if tsk->cpuset changes after we read it,
5d21cc2db   Tejun Heo   cpuset: replace c...
2533
   *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
2df167a30   Paul Menage   cgroups: update c...
2534
   *    anyway.
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2535
   */
52de4779f   Zefan Li   cpuset: simplify ...
2536
2537
  int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
  		     struct pid *pid, struct task_struct *tsk)
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2538
  {
e61734c55   Tejun Heo   cgroup: remove cg...
2539
  	char *buf, *p;
8793d854e   Paul Menage   Task Control Grou...
2540
  	struct cgroup_subsys_state *css;
99f895518   Eric W. Biederman   [PATCH] proc: don...
2541
  	int retval;
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2542

99f895518   Eric W. Biederman   [PATCH] proc: don...
2543
  	retval = -ENOMEM;
e61734c55   Tejun Heo   cgroup: remove cg...
2544
  	buf = kmalloc(PATH_MAX, GFP_KERNEL);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2545
  	if (!buf)
99f895518   Eric W. Biederman   [PATCH] proc: don...
2546
  		goto out;
e61734c55   Tejun Heo   cgroup: remove cg...
2547
  	retval = -ENAMETOOLONG;
27e89ae5d   Li Zefan   cpuset: fix RCU l...
2548
  	rcu_read_lock();
073219e99   Tejun Heo   cgroup: clean up ...
2549
  	css = task_css(tsk, cpuset_cgrp_id);
e61734c55   Tejun Heo   cgroup: remove cg...
2550
  	p = cgroup_path(css->cgroup, buf, PATH_MAX);
27e89ae5d   Li Zefan   cpuset: fix RCU l...
2551
  	rcu_read_unlock();
e61734c55   Tejun Heo   cgroup: remove cg...
2552
  	if (!p)
52de4779f   Zefan Li   cpuset: simplify ...
2553
  		goto out_free;
e61734c55   Tejun Heo   cgroup: remove cg...
2554
  	seq_puts(m, p);
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2555
2556
  	seq_putc(m, '\n');
e61734c55   Tejun Heo   cgroup: remove cg...
2557
  	retval = 0;
99f895518   Eric W. Biederman   [PATCH] proc: don...
2558
  out_free:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2559
  	kfree(buf);
99f895518   Eric W. Biederman   [PATCH] proc: don...
2560
  out:
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2561
2562
  	return retval;
  }
8793d854e   Paul Menage   Task Control Grou...
2563
  #endif /* CONFIG_PROC_PID_CPUSET */
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2564

d01d48278   Heiko Carstens   sched: Always sho...
2565
  /* Display task mems_allowed in /proc/<pid>/status file. */
df5f8314c   Eric W. Biederman   proc: seqfile con...
2566
2567
  void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
  {
e8e6d97c9   Tejun Heo   cpuset: use %*pb[...
2568
2569
2570
2571
2572
2573
  	seq_printf(m, "Mems_allowed:\t%*pb\n",
  		   nodemask_pr_args(&task->mems_allowed));
  	seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
  		   nodemask_pr_args(&task->mems_allowed));
1da177e4c   Linus Torvalds   Linux-2.6.12-rc2
2574
  }
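  /*
   * Illustrative sketch, assumption only: a /proc "status"-style show
   * routine calling the helper above; the real caller lives in the procfs
   * code.  The surrounding fields shown here are hypothetical.
   */
  static int example_proc_status_show(struct seq_file *m, struct task_struct *task)
  {
  	seq_printf(m, "Name:\t%s\n", task->comm);
  	cpuset_task_status_allowed(m, task);	/* Mems_allowed[_list] lines */
  	return 0;
  }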