Commit 66e1707bc34609f626e2e7b4fe7e454c9748bad5
Committed by Linus Torvalds
1 parent 67e465a77b
Exists in master and in 20 other branches
Memory controller: add per cgroup LRU and reclaim
Add the page_cgroup to the per cgroup LRU. The reclaim algorithm has been modified to make isolate_lru_pages() a pluggable component. The scan_control data structure now accepts the cgroup on behalf of which reclaims are carried out. try_to_free_pages() has been extended to become cgroup aware.

[akpm@linux-foundation.org: fix warning]
[Lee.Schermerhorn@hp.com: initialize all scan_control's isolate_pages member]
[bunk@kernel.org: make do_try_to_free_pages() static]
[hugh@veritas.com: memcgroup: fix try_to_free order]
[kamezawa.hiroyu@jp.fujitsu.com: this unlock_page_cgroup() is unnecessary]

Signed-off-by: Pavel Emelianov <xemul@openvz.org>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Kirill Korotaev <dev@sw.ru>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: David Rientjes <rientjes@google.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 6 changed files with 286 additions and 30 deletions
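The central mechanism of this commit is that scan_control now carries both the target mem_cgroup and an isolate_pages callback, so the same reclaim core can pull pages either from a zone's global LRU (mem_cgroup == NULL, isolate_pages_global) or from a cgroup's private LRU (mem_cgroup_isolate_pages). Below is a minimal userspace sketch of that pattern; the names mirror the diff, but the types are simplified stand-ins rather than the real kernel API.

```c
#include <stdio.h>

/* Simplified stand-ins for the kernel types used in the diff. */
struct zone { const char *name; };
struct mem_cgroup { const char *name; };

/*
 * Mirrors the new scan_control fields: which cgroup to reclaim from
 * (NULL means global reclaim) and which isolator the core should call.
 */
struct scan_control {
	struct mem_cgroup *mem_cgroup;
	unsigned long (*isolate_pages)(unsigned long nr, struct zone *z,
				       struct mem_cgroup *mem, int active);
};

/* Global reclaim path: scan the zone's own active/inactive lists. */
static unsigned long isolate_pages_global(unsigned long nr, struct zone *z,
					  struct mem_cgroup *mem, int active)
{
	(void)mem;
	printf("global: %lu pages from %s %s list\n",
	       nr, z->name, active ? "active" : "inactive");
	return nr;
}

/* Cgroup reclaim path: scan the cgroup's private LRU instead. */
static unsigned long mem_cgroup_isolate_pages(unsigned long nr, struct zone *z,
					      struct mem_cgroup *mem, int active)
{
	printf("cgroup %s: %lu pages (zone %s, %s list)\n",
	       mem->name, nr, z->name, active ? "active" : "inactive");
	return nr;
}

/* The shared reclaim core only ever goes through sc->isolate_pages(). */
static void shrink_zone(struct scan_control *sc, struct zone *z)
{
	sc->isolate_pages(32, z, sc->mem_cgroup, 0);
}

int main(void)
{
	struct zone z = { "Normal" };
	struct mem_cgroup grp = { "container-A" };

	struct scan_control global = {
		.mem_cgroup = NULL,
		.isolate_pages = isolate_pages_global,
	};
	struct scan_control cgroup = {
		.mem_cgroup = &grp,
		.isolate_pages = mem_cgroup_isolate_pages,
	};

	shrink_zone(&global, &z);
	shrink_zone(&cgroup, &z);
	return 0;
}
```

This is why do_try_to_free_pages() can stay a single worker in the diff below: the global and cgroup entry points only differ in how they fill in scan_control.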
include/linux/memcontrol.h
... | ... | @@ -32,6 +32,13 @@ |
32 | 32 | extern struct page_cgroup *page_get_page_cgroup(struct page *page); |
33 | 33 | extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm); |
34 | 34 | extern void mem_cgroup_uncharge(struct page_cgroup *pc); |
35 | +extern void mem_cgroup_move_lists(struct page_cgroup *pc, bool active); | |
36 | +extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |
37 | + struct list_head *dst, | |
38 | + unsigned long *scanned, int order, | |
39 | + int mode, struct zone *z, | |
40 | + struct mem_cgroup *mem_cont, | |
41 | + int active); | |
35 | 42 | |
36 | 43 | static inline void mem_cgroup_uncharge_page(struct page *page) |
37 | 44 | { |
... | ... | @@ -68,6 +75,11 @@ |
68 | 75 | } |
69 | 76 | |
70 | 77 | static inline void mem_cgroup_uncharge_page(struct page *page) |
78 | +{ | |
79 | +} | |
80 | + | |
81 | +static inline void mem_cgroup_move_lists(struct page_cgroup *pc, | |
82 | + bool active) | |
71 | 83 | { |
72 | 84 | } |
73 | 85 |
include/linux/res_counter.h
... | ... | @@ -99,5 +99,28 @@ |
99 | 99 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val); |
100 | 100 | void res_counter_uncharge(struct res_counter *counter, unsigned long val); |
101 | 101 | |
102 | +static inline bool res_counter_limit_check_locked(struct res_counter *cnt) | |
103 | +{ | |
104 | + if (cnt->usage < cnt->limit) | |
105 | + return true; | |
106 | + | |
107 | + return false; | |
108 | +} | |
109 | + | |
110 | +/* | |
111 | + * Helper function to detect if the cgroup is within it's limit or | |
112 | + * not. It's currently called from cgroup_rss_prepare() | |
113 | + */ | |
114 | +static inline bool res_counter_check_under_limit(struct res_counter *cnt) | |
115 | +{ | |
116 | + bool ret; | |
117 | + unsigned long flags; | |
118 | + | |
119 | + spin_lock_irqsave(&cnt->lock, flags); | |
120 | + ret = res_counter_limit_check_locked(cnt); | |
121 | + spin_unlock_irqrestore(&cnt->lock, flags); | |
122 | + return ret; | |
123 | +} | |
124 | + | |
102 | 125 | #endif |
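The res_counter.h hunk adds a lock-free check for callers that already hold cnt->lock, plus a wrapper that takes the irq-safe spinlock itself before checking. A rough userspace model of that locked/unlocked split follows, with a pthread mutex standing in for the kernel spinlock; the names mirror the diff but this is an illustration, not the kernel implementation.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Userspace stand-in for struct res_counter: usage/limit guarded by a lock. */
struct res_counter {
	unsigned long usage;
	unsigned long limit;
	pthread_mutex_t lock;	/* plays the role of the kernel spinlock */
};

/* Caller must already hold cnt->lock, like res_counter_limit_check_locked(). */
static bool res_counter_limit_check_locked(struct res_counter *cnt)
{
	return cnt->usage < cnt->limit;
}

/* Self-locking wrapper, like res_counter_check_under_limit(). */
static bool res_counter_check_under_limit(struct res_counter *cnt)
{
	bool ret;

	pthread_mutex_lock(&cnt->lock);
	ret = res_counter_limit_check_locked(cnt);
	pthread_mutex_unlock(&cnt->lock);
	return ret;
}

int main(void)
{
	struct res_counter rc = {
		.usage = 90, .limit = 100,
		.lock = PTHREAD_MUTEX_INITIALIZER,
	};

	printf("under limit: %s\n",
	       res_counter_check_under_limit(&rc) ? "yes" : "no");
	rc.usage = 100;
	printf("under limit: %s\n",
	       res_counter_check_under_limit(&rc) ? "yes" : "no");
	return 0;
}
```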
include/linux/swap.h
... | ... | @@ -5,6 +5,7 @@ |
5 | 5 | #include <linux/linkage.h> |
6 | 6 | #include <linux/mmzone.h> |
7 | 7 | #include <linux/list.h> |
8 | +#include <linux/memcontrol.h> | |
8 | 9 | #include <linux/sched.h> |
9 | 10 | |
10 | 11 | #include <asm/atomic.h> |
... | ... | @@ -182,6 +183,8 @@ |
182 | 183 | /* linux/mm/vmscan.c */ |
183 | 184 | extern unsigned long try_to_free_pages(struct zone **zones, int order, |
184 | 185 | gfp_t gfp_mask); |
186 | +extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem); | |
187 | +extern int __isolate_lru_page(struct page *page, int mode); | |
185 | 188 | extern unsigned long shrink_all_memory(unsigned long nr_pages); |
186 | 189 | extern int vm_swappiness; |
187 | 190 | extern int remove_mapping(struct address_space *mapping, struct page *page); |
mm/memcontrol.c
... | ... | @@ -22,10 +22,15 @@ |
22 | 22 | #include <linux/cgroup.h> |
23 | 23 | #include <linux/mm.h> |
24 | 24 | #include <linux/page-flags.h> |
25 | +#include <linux/backing-dev.h> | |
25 | 26 | #include <linux/bit_spinlock.h> |
26 | 27 | #include <linux/rcupdate.h> |
28 | +#include <linux/swap.h> | |
29 | +#include <linux/spinlock.h> | |
30 | +#include <linux/fs.h> | |
27 | 31 | |
28 | 32 | struct cgroup_subsys mem_cgroup_subsys; |
33 | +static const int MEM_CGROUP_RECLAIM_RETRIES = 5; | |
29 | 34 | |
30 | 35 | /* |
31 | 36 | * The memory controller data structure. The memory controller controls both |
... | ... | @@ -51,6 +56,10 @@ |
51 | 56 | */ |
52 | 57 | struct list_head active_list; |
53 | 58 | struct list_head inactive_list; |
59 | + /* | |
60 | + * spin_lock to protect the per cgroup LRU | |
61 | + */ | |
62 | + spinlock_t lru_lock; | |
54 | 63 | }; |
55 | 64 | |
56 | 65 | /* |
57 | 66 | |
... | ... | @@ -141,7 +150,95 @@ |
141 | 150 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); |
142 | 151 | } |
143 | 152 | |
153 | +void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | |
154 | +{ | |
155 | + if (active) | |
156 | + list_move(&pc->lru, &pc->mem_cgroup->active_list); | |
157 | + else | |
158 | + list_move(&pc->lru, &pc->mem_cgroup->inactive_list); | |
159 | +} | |
160 | + | |
144 | 161 | /* |
162 | + * This routine assumes that the appropriate zone's lru lock is already held | |
163 | + */ | |
164 | +void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) | |
165 | +{ | |
166 | + struct mem_cgroup *mem; | |
167 | + if (!pc) | |
168 | + return; | |
169 | + | |
170 | + mem = pc->mem_cgroup; | |
171 | + | |
172 | + spin_lock(&mem->lru_lock); | |
173 | + __mem_cgroup_move_lists(pc, active); | |
174 | + spin_unlock(&mem->lru_lock); | |
175 | +} | |
176 | + | |
177 | +unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |
178 | + struct list_head *dst, | |
179 | + unsigned long *scanned, int order, | |
180 | + int mode, struct zone *z, | |
181 | + struct mem_cgroup *mem_cont, | |
182 | + int active) | |
183 | +{ | |
184 | + unsigned long nr_taken = 0; | |
185 | + struct page *page; | |
186 | + unsigned long scan; | |
187 | + LIST_HEAD(pc_list); | |
188 | + struct list_head *src; | |
189 | + struct page_cgroup *pc; | |
190 | + | |
191 | + if (active) | |
192 | + src = &mem_cont->active_list; | |
193 | + else | |
194 | + src = &mem_cont->inactive_list; | |
195 | + | |
196 | + spin_lock(&mem_cont->lru_lock); | |
197 | + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { | |
198 | + pc = list_entry(src->prev, struct page_cgroup, lru); | |
199 | + page = pc->page; | |
200 | + VM_BUG_ON(!pc); | |
201 | + | |
202 | + if (PageActive(page) && !active) { | |
203 | + __mem_cgroup_move_lists(pc, true); | |
204 | + scan--; | |
205 | + continue; | |
206 | + } | |
207 | + if (!PageActive(page) && active) { | |
208 | + __mem_cgroup_move_lists(pc, false); | |
209 | + scan--; | |
210 | + continue; | |
211 | + } | |
212 | + | |
213 | + /* | |
214 | + * Reclaim, per zone | |
215 | + * TODO: make the active/inactive lists per zone | |
216 | + */ | |
217 | + if (page_zone(page) != z) | |
218 | + continue; | |
219 | + | |
220 | + /* | |
221 | + * Check if the meta page went away from under us | |
222 | + */ | |
223 | + if (!list_empty(&pc->lru)) | |
224 | + list_move(&pc->lru, &pc_list); | |
225 | + else | |
226 | + continue; | |
227 | + | |
228 | + if (__isolate_lru_page(page, mode) == 0) { | |
229 | + list_move(&page->lru, dst); | |
230 | + nr_taken++; | |
231 | + } | |
232 | + } | |
233 | + | |
234 | + list_splice(&pc_list, src); | |
235 | + spin_unlock(&mem_cont->lru_lock); | |
236 | + | |
237 | + *scanned = scan; | |
238 | + return nr_taken; | |
239 | +} | |
240 | + | |
241 | +/* | |
145 | 242 | * Charge the memory controller for page usage. |
146 | 243 | * Return |
147 | 244 | * 0 if the charge was successful |
... | ... | @@ -151,6 +248,8 @@ |
151 | 248 | { |
152 | 249 | struct mem_cgroup *mem; |
153 | 250 | struct page_cgroup *pc, *race_pc; |
251 | + unsigned long flags; | |
252 | + unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | |
154 | 253 | |
155 | 254 | /* |
156 | 255 | * Should page_cgroup's go to their own slab? |
157 | 256 | |
... | ... | @@ -159,14 +258,20 @@ |
159 | 258 | * to see if the cgroup page already has a page_cgroup associated |
160 | 259 | * with it |
161 | 260 | */ |
261 | +retry: | |
162 | 262 | lock_page_cgroup(page); |
163 | 263 | pc = page_get_page_cgroup(page); |
164 | 264 | /* |
165 | 265 | * The page_cgroup exists and the page has already been accounted |
166 | 266 | */ |
167 | 267 | if (pc) { |
168 | - atomic_inc(&pc->ref_cnt); | |
169 | - goto done; | |
268 | + if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { | |
269 | + /* this page is under being uncharged ? */ | |
270 | + unlock_page_cgroup(page); | |
271 | + cpu_relax(); | |
272 | + goto retry; | |
273 | + } else | |
274 | + goto done; | |
170 | 275 | } |
171 | 276 | |
172 | 277 | unlock_page_cgroup(page); |
... | ... | @@ -197,7 +302,32 @@ |
197 | 302 | * If we created the page_cgroup, we should free it on exceeding |
198 | 303 | * the cgroup limit. |
199 | 304 | */ |
200 | - if (res_counter_charge(&mem->res, 1)) { | |
305 | + while (res_counter_charge(&mem->res, 1)) { | |
306 | + if (try_to_free_mem_cgroup_pages(mem)) | |
307 | + continue; | |
308 | + | |
309 | + /* | |
310 | + * try_to_free_mem_cgroup_pages() might not give us a full | |
311 | + * picture of reclaim. Some pages are reclaimed and might be | |
312 | + * moved to swap cache or just unmapped from the cgroup. | |
313 | + * Check the limit again to see if the reclaim reduced the | |
314 | + * current usage of the cgroup before giving up | |
315 | + */ | |
316 | + if (res_counter_check_under_limit(&mem->res)) | |
317 | + continue; | |
318 | + /* | |
319 | + * Since we control both RSS and cache, we end up with a | |
320 | + * very interesting scenario where we end up reclaiming | |
321 | + * memory (essentially RSS), since the memory is pushed | |
322 | + * to swap cache, we eventually end up adding those | |
323 | + * pages back to our list. Hence we give ourselves a | |
324 | + * few chances before we fail | |
325 | + */ | |
326 | + else if (nr_retries--) { | |
327 | + congestion_wait(WRITE, HZ/10); | |
328 | + continue; | |
329 | + } | |
330 | + | |
201 | 331 | css_put(&mem->css); |
202 | 332 | goto free_pc; |
203 | 333 | } |
204 | 334 | |
205 | 335 | |
... | ... | @@ -221,14 +351,16 @@ |
221 | 351 | pc->page = page; |
222 | 352 | page_assign_page_cgroup(page, pc); |
223 | 353 | |
354 | + spin_lock_irqsave(&mem->lru_lock, flags); | |
355 | + list_add(&pc->lru, &mem->active_list); | |
356 | + spin_unlock_irqrestore(&mem->lru_lock, flags); | |
357 | + | |
224 | 358 | done: |
225 | 359 | unlock_page_cgroup(page); |
226 | 360 | return 0; |
227 | 361 | free_pc: |
228 | 362 | kfree(pc); |
229 | - return -ENOMEM; | |
230 | 363 | err: |
231 | - unlock_page_cgroup(page); | |
232 | 364 | return -ENOMEM; |
233 | 365 | } |
234 | 366 | |
... | ... | @@ -240,6 +372,7 @@ |
240 | 372 | { |
241 | 373 | struct mem_cgroup *mem; |
242 | 374 | struct page *page; |
375 | + unsigned long flags; | |
243 | 376 | |
244 | 377 | if (!pc) |
245 | 378 | return; |
... | ... | @@ -252,6 +385,10 @@ |
252 | 385 | page_assign_page_cgroup(page, NULL); |
253 | 386 | unlock_page_cgroup(page); |
254 | 387 | res_counter_uncharge(&mem->res, 1); |
388 | + | |
389 | + spin_lock_irqsave(&mem->lru_lock, flags); | |
390 | + list_del_init(&pc->lru); | |
391 | + spin_unlock_irqrestore(&mem->lru_lock, flags); | |
255 | 392 | kfree(pc); |
256 | 393 | } |
257 | 394 | } |
... | ... | @@ -310,6 +447,7 @@ |
310 | 447 | res_counter_init(&mem->res); |
311 | 448 | INIT_LIST_HEAD(&mem->active_list); |
312 | 449 | INIT_LIST_HEAD(&mem->inactive_list); |
450 | + spin_lock_init(&mem->lru_lock); | |
313 | 451 | return &mem->css; |
314 | 452 | } |
315 | 453 |
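The charge path above loops while res_counter_charge() fails: it reclaims from the cgroup, re-checks the limit (reclaimed pages may only have moved to swap cache, lowering usage without try_to_free_mem_cgroup_pages() reporting success), and allows itself MEM_CGROUP_RECLAIM_RETRIES attempts with a short congestion wait before giving up. A condensed, compilable model of that retry policy is sketched below; the stub charge/reclaim functions are placeholders for the kernel ones.

```c
#include <stdbool.h>
#include <stdio.h>

#define MEM_CGROUP_RECLAIM_RETRIES 5

static unsigned long usage, limit = 4;

/* Stand-in for res_counter_charge(): 0 on success, nonzero if over limit. */
static int res_counter_charge(void)
{
	if (usage + 1 > limit)
		return 1;
	usage++;
	return 0;
}

/* Stand-in for try_to_free_mem_cgroup_pages(): pretend reclaim fails here. */
static bool try_to_free_mem_cgroup_pages(void)
{
	return false;
}

static bool res_counter_check_under_limit(void)
{
	return usage < limit;
}

/* Mirrors the retry policy in mem_cgroup_charge() above. */
static int mem_cgroup_charge(void)
{
	unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	while (res_counter_charge()) {
		if (try_to_free_mem_cgroup_pages())
			continue;	/* reclaim made progress, retry */
		if (res_counter_check_under_limit())
			continue;	/* pages freed behind our back, retry */
		if (nr_retries--) {
			/* the kernel calls congestion_wait(WRITE, HZ/10) here */
			continue;
		}
		return -1;		/* -ENOMEM in the kernel */
	}
	return 0;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("charge %d -> %d (usage %lu/%lu)\n",
		       i, mem_cgroup_charge(), usage, limit);
	return 0;
}
```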
mm/swap.c
... | ... | @@ -29,6 +29,7 @@ |
29 | 29 | #include <linux/cpu.h> |
30 | 30 | #include <linux/notifier.h> |
31 | 31 | #include <linux/backing-dev.h> |
32 | +#include <linux/memcontrol.h> | |
32 | 33 | |
33 | 34 | /* How many pages do we try to swap or page in/out together? */ |
34 | 35 | int page_cluster; |
... | ... | @@ -175,6 +176,7 @@ |
175 | 176 | SetPageActive(page); |
176 | 177 | add_page_to_active_list(zone, page); |
177 | 178 | __count_vm_event(PGACTIVATE); |
179 | + mem_cgroup_move_lists(page_get_page_cgroup(page), true); | |
178 | 180 | } |
179 | 181 | spin_unlock_irq(&zone->lru_lock); |
180 | 182 | } |
mm/vmscan.c
... | ... | @@ -37,6 +37,7 @@ |
37 | 37 | #include <linux/delay.h> |
38 | 38 | #include <linux/kthread.h> |
39 | 39 | #include <linux/freezer.h> |
40 | +#include <linux/memcontrol.h> | |
40 | 41 | |
41 | 42 | #include <asm/tlbflush.h> |
42 | 43 | #include <asm/div64.h> |
... | ... | @@ -68,6 +69,15 @@ |
68 | 69 | int all_unreclaimable; |
69 | 70 | |
70 | 71 | int order; |
72 | + | |
73 | + /* Which cgroup do we reclaim from */ | |
74 | + struct mem_cgroup *mem_cgroup; | |
75 | + | |
76 | + /* Pluggable isolate pages callback */ | |
77 | + unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, | |
78 | + unsigned long *scanned, int order, int mode, | |
79 | + struct zone *z, struct mem_cgroup *mem_cont, | |
80 | + int active); | |
71 | 81 | }; |
72 | 82 | |
73 | 83 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
... | ... | @@ -626,7 +636,7 @@ |
626 | 636 | * |
627 | 637 | * returns 0 on success, -ve errno on failure. |
628 | 638 | */ |
629 | -static int __isolate_lru_page(struct page *page, int mode) | |
639 | +int __isolate_lru_page(struct page *page, int mode) | |
630 | 640 | { |
631 | 641 | int ret = -EINVAL; |
632 | 642 | |
... | ... | @@ -760,6 +770,21 @@ |
760 | 770 | return nr_taken; |
761 | 771 | } |
762 | 772 | |
773 | +static unsigned long isolate_pages_global(unsigned long nr, | |
774 | + struct list_head *dst, | |
775 | + unsigned long *scanned, int order, | |
776 | + int mode, struct zone *z, | |
777 | + struct mem_cgroup *mem_cont, | |
778 | + int active) | |
779 | +{ | |
780 | + if (active) | |
781 | + return isolate_lru_pages(nr, &z->active_list, dst, | |
782 | + scanned, order, mode); | |
783 | + else | |
784 | + return isolate_lru_pages(nr, &z->inactive_list, dst, | |
785 | + scanned, order, mode); | |
786 | +} | |
787 | + | |
763 | 788 | /* |
764 | 789 | * clear_active_flags() is a helper for shrink_active_list(), clearing |
765 | 790 | * any active bits from the pages in the list. |
766 | 791 | |
... | ... | @@ -801,11 +826,11 @@ |
801 | 826 | unsigned long nr_freed; |
802 | 827 | unsigned long nr_active; |
803 | 828 | |
804 | - nr_taken = isolate_lru_pages(sc->swap_cluster_max, | |
805 | - &zone->inactive_list, | |
829 | + nr_taken = sc->isolate_pages(sc->swap_cluster_max, | |
806 | 830 | &page_list, &nr_scan, sc->order, |
807 | 831 | (sc->order > PAGE_ALLOC_COSTLY_ORDER)? |
808 | - ISOLATE_BOTH : ISOLATE_INACTIVE); | |
832 | + ISOLATE_BOTH : ISOLATE_INACTIVE, | |
833 | + zone, sc->mem_cgroup, 0); | |
809 | 834 | nr_active = clear_active_flags(&page_list); |
810 | 835 | __count_vm_events(PGDEACTIVATE, nr_active); |
811 | 836 | |
... | ... | @@ -1018,8 +1043,9 @@ |
1018 | 1043 | |
1019 | 1044 | lru_add_drain(); |
1020 | 1045 | spin_lock_irq(&zone->lru_lock); |
1021 | - pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, | |
1022 | - &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE); | |
1046 | + pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, | |
1047 | + ISOLATE_ACTIVE, zone, | |
1048 | + sc->mem_cgroup, 1); | |
1023 | 1049 | zone->pages_scanned += pgscanned; |
1024 | 1050 | __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); |
1025 | 1051 | spin_unlock_irq(&zone->lru_lock); |
... | ... | @@ -1051,6 +1077,7 @@ |
1051 | 1077 | ClearPageActive(page); |
1052 | 1078 | |
1053 | 1079 | list_move(&page->lru, &zone->inactive_list); |
1080 | + mem_cgroup_move_lists(page_get_page_cgroup(page), false); | |
1054 | 1081 | pgmoved++; |
1055 | 1082 | if (!pagevec_add(&pvec, page)) { |
1056 | 1083 | __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); |
... | ... | @@ -1079,6 +1106,7 @@ |
1079 | 1106 | SetPageLRU(page); |
1080 | 1107 | VM_BUG_ON(!PageActive(page)); |
1081 | 1108 | list_move(&page->lru, &zone->active_list); |
1109 | + mem_cgroup_move_lists(page_get_page_cgroup(page), true); | |
1082 | 1110 | pgmoved++; |
1083 | 1111 | if (!pagevec_add(&pvec, page)) { |
1084 | 1112 | __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); |
... | ... | @@ -1206,7 +1234,8 @@ |
1206 | 1234 | * holds filesystem locks which prevent writeout this might not work, and the |
1207 | 1235 | * allocation attempt will fail. |
1208 | 1236 | */ |
1209 | -unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | |
1237 | +static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, | |
1238 | + struct scan_control *sc) | |
1210 | 1239 | { |
1211 | 1240 | int priority; |
1212 | 1241 | int ret = 0; |
... | ... | @@ -1215,14 +1244,6 @@ |
1215 | 1244 | struct reclaim_state *reclaim_state = current->reclaim_state; |
1216 | 1245 | unsigned long lru_pages = 0; |
1217 | 1246 | int i; |
1218 | - struct scan_control sc = { | |
1219 | - .gfp_mask = gfp_mask, | |
1220 | - .may_writepage = !laptop_mode, | |
1221 | - .swap_cluster_max = SWAP_CLUSTER_MAX, | |
1222 | - .may_swap = 1, | |
1223 | - .swappiness = vm_swappiness, | |
1224 | - .order = order, | |
1225 | - }; | |
1226 | 1247 | |
1227 | 1248 | count_vm_event(ALLOCSTALL); |
1228 | 1249 | |
1229 | 1250 | |
1230 | 1251 | |
... | ... | @@ -1237,17 +1258,22 @@ |
1237 | 1258 | } |
1238 | 1259 | |
1239 | 1260 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
1240 | - sc.nr_scanned = 0; | |
1261 | + sc->nr_scanned = 0; | |
1241 | 1262 | if (!priority) |
1242 | 1263 | disable_swap_token(); |
1243 | - nr_reclaimed += shrink_zones(priority, zones, &sc); | |
1244 | - shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | |
1264 | + nr_reclaimed += shrink_zones(priority, zones, sc); | |
1265 | + /* | |
1266 | + * Don't shrink slabs when reclaiming memory from | |
1267 | + * over limit cgroups | |
1268 | + */ | |
1269 | + if (sc->mem_cgroup == NULL) | |
1270 | + shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); | |
1245 | 1271 | if (reclaim_state) { |
1246 | 1272 | nr_reclaimed += reclaim_state->reclaimed_slab; |
1247 | 1273 | reclaim_state->reclaimed_slab = 0; |
1248 | 1274 | } |
1249 | - total_scanned += sc.nr_scanned; | |
1250 | - if (nr_reclaimed >= sc.swap_cluster_max) { | |
1275 | + total_scanned += sc->nr_scanned; | |
1276 | + if (nr_reclaimed >= sc->swap_cluster_max) { | |
1251 | 1277 | ret = 1; |
1252 | 1278 | goto out; |
1253 | 1279 | } |
1254 | 1280 | |
1255 | 1281 | |
1256 | 1282 | |
... | ... | @@ -1259,18 +1285,18 @@ |
1259 | 1285 | * that's undesirable in laptop mode, where we *want* lumpy |
1260 | 1286 | * writeout. So in laptop mode, write out the whole world. |
1261 | 1287 | */ |
1262 | - if (total_scanned > sc.swap_cluster_max + | |
1263 | - sc.swap_cluster_max / 2) { | |
1288 | + if (total_scanned > sc->swap_cluster_max + | |
1289 | + sc->swap_cluster_max / 2) { | |
1264 | 1290 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); |
1265 | - sc.may_writepage = 1; | |
1291 | + sc->may_writepage = 1; | |
1266 | 1292 | } |
1267 | 1293 | |
1268 | 1294 | /* Take a nap, wait for some writeback to complete */ |
1269 | - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) | |
1295 | + if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | |
1270 | 1296 | congestion_wait(WRITE, HZ/10); |
1271 | 1297 | } |
1272 | 1298 | /* top priority shrink_caches still had more to do? don't OOM, then */ |
1273 | - if (!sc.all_unreclaimable) | |
1299 | + if (!sc->all_unreclaimable && sc->mem_cgroup == NULL) | |
1274 | 1300 | ret = 1; |
1275 | 1301 | out: |
1276 | 1302 | /* |
... | ... | @@ -1293,6 +1319,54 @@ |
1293 | 1319 | return ret; |
1294 | 1320 | } |
1295 | 1321 | |
1322 | +unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) | |
1323 | +{ | |
1324 | + struct scan_control sc = { | |
1325 | + .gfp_mask = gfp_mask, | |
1326 | + .may_writepage = !laptop_mode, | |
1327 | + .swap_cluster_max = SWAP_CLUSTER_MAX, | |
1328 | + .may_swap = 1, | |
1329 | + .swappiness = vm_swappiness, | |
1330 | + .order = order, | |
1331 | + .mem_cgroup = NULL, | |
1332 | + .isolate_pages = isolate_pages_global, | |
1333 | + }; | |
1334 | + | |
1335 | + return do_try_to_free_pages(zones, gfp_mask, &sc); | |
1336 | +} | |
1337 | + | |
1338 | +#ifdef CONFIG_CGROUP_MEM_CONT | |
1339 | + | |
1340 | +#ifdef CONFIG_HIGHMEM | |
1341 | +#define ZONE_USERPAGES ZONE_HIGHMEM | |
1342 | +#else | |
1343 | +#define ZONE_USERPAGES ZONE_NORMAL | |
1344 | +#endif | |
1345 | + | |
1346 | +unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont) | |
1347 | +{ | |
1348 | + struct scan_control sc = { | |
1349 | + .gfp_mask = GFP_KERNEL, | |
1350 | + .may_writepage = !laptop_mode, | |
1351 | + .may_swap = 1, | |
1352 | + .swap_cluster_max = SWAP_CLUSTER_MAX, | |
1353 | + .swappiness = vm_swappiness, | |
1354 | + .order = 0, | |
1355 | + .mem_cgroup = mem_cont, | |
1356 | + .isolate_pages = mem_cgroup_isolate_pages, | |
1357 | + }; | |
1358 | + int node; | |
1359 | + struct zone **zones; | |
1360 | + | |
1361 | + for_each_online_node(node) { | |
1362 | + zones = NODE_DATA(node)->node_zonelists[ZONE_USERPAGES].zones; | |
1363 | + if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) | |
1364 | + return 1; | |
1365 | + } | |
1366 | + return 0; | |
1367 | +} | |
1368 | +#endif | |
1369 | + | |
1296 | 1370 | /* |
1297 | 1371 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1298 | 1372 | * they are all at pages_high. |
... | ... | @@ -1328,6 +1402,8 @@ |
1328 | 1402 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1329 | 1403 | .swappiness = vm_swappiness, |
1330 | 1404 | .order = order, |
1405 | + .mem_cgroup = NULL, | |
1406 | + .isolate_pages = isolate_pages_global, | |
1331 | 1407 | }; |
1332 | 1408 | /* |
1333 | 1409 | * temp_priority is used to remember the scanning priority at which |
... | ... | @@ -1649,6 +1725,7 @@ |
1649 | 1725 | .swap_cluster_max = nr_pages, |
1650 | 1726 | .may_writepage = 1, |
1651 | 1727 | .swappiness = vm_swappiness, |
1728 | + .isolate_pages = isolate_pages_global, | |
1652 | 1729 | }; |
1653 | 1730 | |
1654 | 1731 | current->reclaim_state = &reclaim_state; |
... | ... | @@ -1834,6 +1911,7 @@ |
1834 | 1911 | SWAP_CLUSTER_MAX), |
1835 | 1912 | .gfp_mask = gfp_mask, |
1836 | 1913 | .swappiness = vm_swappiness, |
1914 | + .isolate_pages = isolate_pages_global, | |
1837 | 1915 | }; |
1838 | 1916 | unsigned long slab_reclaimable; |
1839 | 1917 |