Commit 28a42b9ea7e42e1efb02cc2dcacba0b6af234e1b

Authored by Paul Jackson
Committed by Linus Torvalds
1 parent 18a19cb304

[PATCH] cpusets: confine pdflush to its cpuset

This patch keeps pdflush daemons on the same cpuset as their parent, the
kthread daemon.

Some large NUMA configurations put as much as they can of kernel threads
and other classic Unix load in what's called a bootcpuset, keeping the rest
of the system free for dedicated jobs.

This effort is thwarted by pdflush, which dynamically destroys and
recreates pdflush daemons depending on load.

It's easy enough to force the originally created pdflush daemons into the
bootcpuset, at system boottime.  But the pdflush threads created later were
allowed to run freely across the system, due to the necessary line in their
startup kthread():

        set_cpus_allowed(current, CPU_MASK_ALL);

By simply coding pdflush to start its threads with the cpus_allowed
restrictions of its cpuset (inherited from kthread, its parent) we can
ensure that dynamically created pdflush threads are also kept in the
bootcpuset.

On systems w/o cpusets, or w/o a bootcpuset implementation, the following
will have no effect, leaving pdflush to run on any CPU, as before.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 1 changed file with 13 additions and 0 deletions Inline Diff

1 /* 1 /*
2 * mm/pdflush.c - worker threads for writing back filesystem data 2 * mm/pdflush.c - worker threads for writing back filesystem data
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * 09Apr2002 akpm@zip.com.au 6 * 09Apr2002 akpm@zip.com.au
7 * Initial version 7 * Initial version
8 * 29Feb2004 kaos@sgi.com 8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing 9 * Move worker thread creation to kthread to avoid chewing
10 * up stack space with nested calls to kernel_thread. 10 * up stack space with nested calls to kernel_thread.
11 */ 11 */
12 12
13 #include <linux/sched.h> 13 #include <linux/sched.h>
14 #include <linux/list.h> 14 #include <linux/list.h>
15 #include <linux/signal.h> 15 #include <linux/signal.h>
16 #include <linux/spinlock.h> 16 #include <linux/spinlock.h>
17 #include <linux/gfp.h> 17 #include <linux/gfp.h>
18 #include <linux/init.h> 18 #include <linux/init.h>
19 #include <linux/module.h> 19 #include <linux/module.h>
20 #include <linux/fs.h> // Needed by writeback.h 20 #include <linux/fs.h> // Needed by writeback.h
21 #include <linux/writeback.h> // Prototypes pdflush_operation() 21 #include <linux/writeback.h> // Prototypes pdflush_operation()
22 #include <linux/kthread.h> 22 #include <linux/kthread.h>
23 #include <linux/cpuset.h>
23 24
24 25
25 /* 26 /*
26 * Minimum and maximum number of pdflush instances 27 * Minimum and maximum number of pdflush instances
27 */ 28 */
28 #define MIN_PDFLUSH_THREADS 2 29 #define MIN_PDFLUSH_THREADS 2
29 #define MAX_PDFLUSH_THREADS 8 30 #define MAX_PDFLUSH_THREADS 8
30 31
31 static void start_one_pdflush_thread(void); 32 static void start_one_pdflush_thread(void);
32 33
33 34
34 /* 35 /*
35 * The pdflush threads are worker threads for writing back dirty data. 36 * The pdflush threads are worker threads for writing back dirty data.
36 * Ideally, we'd like one thread per active disk spindle. But the disk 37 * Ideally, we'd like one thread per active disk spindle. But the disk
37 * topology is very hard to divine at this level. Instead, we take 38 * topology is very hard to divine at this level. Instead, we take
38 * care in various places to prevent more than one pdflush thread from 39 * care in various places to prevent more than one pdflush thread from
39 * performing writeback against a single filesystem. pdflush threads 40 * performing writeback against a single filesystem. pdflush threads
40 * have the PF_FLUSHER flag set in current->flags to aid in this. 41 * have the PF_FLUSHER flag set in current->flags to aid in this.
41 */ 42 */
42 43
43 /* 44 /*
44 * All the pdflush threads. Protected by pdflush_lock 45 * All the pdflush threads. Protected by pdflush_lock
45 */ 46 */
46 static LIST_HEAD(pdflush_list); 47 static LIST_HEAD(pdflush_list);
47 static DEFINE_SPINLOCK(pdflush_lock); 48 static DEFINE_SPINLOCK(pdflush_lock);
48 49
49 /* 50 /*
50 * The count of currently-running pdflush threads. Protected 51 * The count of currently-running pdflush threads. Protected
51 * by pdflush_lock. 52 * by pdflush_lock.
52 * 53 *
53 * Readable by sysctl, but not writable. Published to userspace at 54 * Readable by sysctl, but not writable. Published to userspace at
54 * /proc/sys/vm/nr_pdflush_threads. 55 * /proc/sys/vm/nr_pdflush_threads.
55 */ 56 */
56 int nr_pdflush_threads = 0; 57 int nr_pdflush_threads = 0;
57 58
58 /* 59 /*
59 * The time at which the pdflush thread pool last went empty 60 * The time at which the pdflush thread pool last went empty
60 */ 61 */
61 static unsigned long last_empty_jifs; 62 static unsigned long last_empty_jifs;
62 63
63 /* 64 /*
64 * The pdflush thread. 65 * The pdflush thread.
65 * 66 *
66 * Thread pool management algorithm: 67 * Thread pool management algorithm:
67 * 68 *
68 * - The minimum and maximum number of pdflush instances are bound 69 * - The minimum and maximum number of pdflush instances are bound
69 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. 70 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
70 * 71 *
71 * - If there have been no idle pdflush instances for 1 second, create 72 * - If there have been no idle pdflush instances for 1 second, create
72 * a new one. 73 * a new one.
73 * 74 *
74 * - If the least-recently-went-to-sleep pdflush thread has been asleep 75 * - If the least-recently-went-to-sleep pdflush thread has been asleep
75 * for more than one second, terminate a thread. 76 * for more than one second, terminate a thread.
76 */ 77 */
77 78
78 /* 79 /*
79 * A structure for passing work to a pdflush thread. Also for passing 80 * A structure for passing work to a pdflush thread. Also for passing
80 * state information between pdflush threads. Protected by pdflush_lock. 81 * state information between pdflush threads. Protected by pdflush_lock.
81 */ 82 */
82 struct pdflush_work { 83 struct pdflush_work {
83 struct task_struct *who; /* The thread */ 84 struct task_struct *who; /* The thread */
84 void (*fn)(unsigned long); /* A callback function */ 85 void (*fn)(unsigned long); /* A callback function */
85 unsigned long arg0; /* An argument to the callback */ 86 unsigned long arg0; /* An argument to the callback */
86 struct list_head list; /* On pdflush_list, when idle */ 87 struct list_head list; /* On pdflush_list, when idle */
87 unsigned long when_i_went_to_sleep; 88 unsigned long when_i_went_to_sleep;
88 }; 89 };
89 90
90 static int __pdflush(struct pdflush_work *my_work) 91 static int __pdflush(struct pdflush_work *my_work)
91 { 92 {
92 current->flags |= PF_FLUSHER; 93 current->flags |= PF_FLUSHER;
93 my_work->fn = NULL; 94 my_work->fn = NULL;
94 my_work->who = current; 95 my_work->who = current;
95 INIT_LIST_HEAD(&my_work->list); 96 INIT_LIST_HEAD(&my_work->list);
96 97
97 spin_lock_irq(&pdflush_lock); 98 spin_lock_irq(&pdflush_lock);
98 nr_pdflush_threads++; 99 nr_pdflush_threads++;
99 for ( ; ; ) { 100 for ( ; ; ) {
100 struct pdflush_work *pdf; 101 struct pdflush_work *pdf;
101 102
102 set_current_state(TASK_INTERRUPTIBLE); 103 set_current_state(TASK_INTERRUPTIBLE);
103 list_move(&my_work->list, &pdflush_list); 104 list_move(&my_work->list, &pdflush_list);
104 my_work->when_i_went_to_sleep = jiffies; 105 my_work->when_i_went_to_sleep = jiffies;
105 spin_unlock_irq(&pdflush_lock); 106 spin_unlock_irq(&pdflush_lock);
106 107
107 schedule(); 108 schedule();
108 if (try_to_freeze()) { 109 if (try_to_freeze()) {
109 spin_lock_irq(&pdflush_lock); 110 spin_lock_irq(&pdflush_lock);
110 continue; 111 continue;
111 } 112 }
112 113
113 spin_lock_irq(&pdflush_lock); 114 spin_lock_irq(&pdflush_lock);
114 if (!list_empty(&my_work->list)) { 115 if (!list_empty(&my_work->list)) {
115 printk("pdflush: bogus wakeup!\n"); 116 printk("pdflush: bogus wakeup!\n");
116 my_work->fn = NULL; 117 my_work->fn = NULL;
117 continue; 118 continue;
118 } 119 }
119 if (my_work->fn == NULL) { 120 if (my_work->fn == NULL) {
120 printk("pdflush: NULL work function\n"); 121 printk("pdflush: NULL work function\n");
121 continue; 122 continue;
122 } 123 }
123 spin_unlock_irq(&pdflush_lock); 124 spin_unlock_irq(&pdflush_lock);
124 125
125 (*my_work->fn)(my_work->arg0); 126 (*my_work->fn)(my_work->arg0);
126 127
127 /* 128 /*
128 * Thread creation: For how long have there been zero 129 * Thread creation: For how long have there been zero
129 * available threads? 130 * available threads?
130 */ 131 */
131 if (jiffies - last_empty_jifs > 1 * HZ) { 132 if (jiffies - last_empty_jifs > 1 * HZ) {
132 /* unlocked list_empty() test is OK here */ 133 /* unlocked list_empty() test is OK here */
133 if (list_empty(&pdflush_list)) { 134 if (list_empty(&pdflush_list)) {
134 /* unlocked test is OK here */ 135 /* unlocked test is OK here */
135 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) 136 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS)
136 start_one_pdflush_thread(); 137 start_one_pdflush_thread();
137 } 138 }
138 } 139 }
139 140
140 spin_lock_irq(&pdflush_lock); 141 spin_lock_irq(&pdflush_lock);
141 my_work->fn = NULL; 142 my_work->fn = NULL;
142 143
143 /* 144 /*
144 * Thread destruction: For how long has the sleepiest 145 * Thread destruction: For how long has the sleepiest
145 * thread slept? 146 * thread slept?
146 */ 147 */
147 if (list_empty(&pdflush_list)) 148 if (list_empty(&pdflush_list))
148 continue; 149 continue;
149 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 150 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
150 continue; 151 continue;
151 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 152 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
152 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 153 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) {
153 /* Limit exit rate */ 154 /* Limit exit rate */
154 pdf->when_i_went_to_sleep = jiffies; 155 pdf->when_i_went_to_sleep = jiffies;
155 break; /* exeunt */ 156 break; /* exeunt */
156 } 157 }
157 } 158 }
158 nr_pdflush_threads--; 159 nr_pdflush_threads--;
159 spin_unlock_irq(&pdflush_lock); 160 spin_unlock_irq(&pdflush_lock);
160 return 0; 161 return 0;
161 } 162 }
162 163
163 /* 164 /*
164 * Of course, my_work wants to be just a local in __pdflush(). It is 165 * Of course, my_work wants to be just a local in __pdflush(). It is
165 * separated out in this manner to hopefully prevent the compiler from 166 * separated out in this manner to hopefully prevent the compiler from
166 * performing unfortunate optimisations against the auto variables. Because 167 * performing unfortunate optimisations against the auto variables. Because
167 * these are visible to other tasks and CPUs. (No problem has actually 168 * these are visible to other tasks and CPUs. (No problem has actually
168 * been observed. This is just paranoia). 169 * been observed. This is just paranoia).
169 */ 170 */
170 static int pdflush(void *dummy) 171 static int pdflush(void *dummy)
171 { 172 {
172 struct pdflush_work my_work; 173 struct pdflush_work my_work;
174 cpumask_t cpus_allowed;
173 175
174 /* 176 /*
175 * pdflush can spend a lot of time doing encryption via dm-crypt. We 177 * pdflush can spend a lot of time doing encryption via dm-crypt. We
176 * don't want to do that at keventd's priority. 178 * don't want to do that at keventd's priority.
177 */ 179 */
178 set_user_nice(current, 0); 180 set_user_nice(current, 0);
181
182 /*
183 * Some configs put our parent kthread in a limited cpuset,
184 * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
185 * Our needs are more modest - cut back to our cpusets cpus_allowed.
186 * This is needed as pdflush's are dynamically created and destroyed.
187 * The boottime pdflush's are easily placed w/o these 2 lines.
188 */
189 cpus_allowed = cpuset_cpus_allowed(current);
190 set_cpus_allowed(current, cpus_allowed);
191
179 return __pdflush(&my_work); 192 return __pdflush(&my_work);
180 } 193 }
181 194
182 /* 195 /*
183 * Attempt to wake up a pdflush thread, and get it to do some work for you. 196 * Attempt to wake up a pdflush thread, and get it to do some work for you.
184 * Returns zero if it indeed managed to find a worker thread, and passed your 197 * Returns zero if it indeed managed to find a worker thread, and passed your
185 * payload to it. 198 * payload to it.
186 */ 199 */
187 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) 200 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
188 { 201 {
189 unsigned long flags; 202 unsigned long flags;
190 int ret = 0; 203 int ret = 0;
191 204
192 if (fn == NULL) 205 if (fn == NULL)
193 BUG(); /* Hard to diagnose if it's deferred */ 206 BUG(); /* Hard to diagnose if it's deferred */
194 207
195 spin_lock_irqsave(&pdflush_lock, flags); 208 spin_lock_irqsave(&pdflush_lock, flags);
196 if (list_empty(&pdflush_list)) { 209 if (list_empty(&pdflush_list)) {
197 spin_unlock_irqrestore(&pdflush_lock, flags); 210 spin_unlock_irqrestore(&pdflush_lock, flags);
198 ret = -1; 211 ret = -1;
199 } else { 212 } else {
200 struct pdflush_work *pdf; 213 struct pdflush_work *pdf;
201 214
202 pdf = list_entry(pdflush_list.next, struct pdflush_work, list); 215 pdf = list_entry(pdflush_list.next, struct pdflush_work, list);
203 list_del_init(&pdf->list); 216 list_del_init(&pdf->list);
204 if (list_empty(&pdflush_list)) 217 if (list_empty(&pdflush_list))
205 last_empty_jifs = jiffies; 218 last_empty_jifs = jiffies;
206 pdf->fn = fn; 219 pdf->fn = fn;
207 pdf->arg0 = arg0; 220 pdf->arg0 = arg0;
208 wake_up_process(pdf->who); 221 wake_up_process(pdf->who);
209 spin_unlock_irqrestore(&pdflush_lock, flags); 222 spin_unlock_irqrestore(&pdflush_lock, flags);
210 } 223 }
211 return ret; 224 return ret;
212 } 225 }
213 226
214 static void start_one_pdflush_thread(void) 227 static void start_one_pdflush_thread(void)
215 { 228 {
216 kthread_run(pdflush, NULL, "pdflush"); 229 kthread_run(pdflush, NULL, "pdflush");
217 } 230 }
218 231
219 static int __init pdflush_init(void) 232 static int __init pdflush_init(void)
220 { 233 {
221 int i; 234 int i;
222 235
223 for (i = 0; i < MIN_PDFLUSH_THREADS; i++) 236 for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
224 start_one_pdflush_thread(); 237 start_one_pdflush_thread();
225 return 0; 238 return 0;
226 } 239 }
227 240
228 module_init(pdflush_init); 241 module_init(pdflush_init);
229 242