Commit 7db8889ab05b57200158432755af318fb68854a2
Committed by: Linus Torvalds
Parent: ab21588487
Exists in: smarc-l5.0.0_1.0.0-ga and 5 other branches
mm: have order > 0 compaction start off where it left
Order > 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times.

However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems.

The obvious solution is to have isolate_freepages remember where it left off last time, and continue at that point the next time it gets invoked for an order > 0 compaction. This could cause compaction to fail if cc->free_pfn and cc->migrate_pfn are close together initially; in that case we restart from the end of the zone and try once more.

Forced full (order == -1) compactions are left alone.

[akpm@linux-foundation.org: checkpatch fixes]
[akpm@linux-foundation.org: s/laste/last/, use 80 cols]
Signed-off-by: Rik van Riel <riel@redhat.com>
Reported-by: Jim Schutt <jaschut@sandia.gov>
Tested-by: Jim Schutt <jaschut@sandia.gov>
Cc: Minchan Kim <minchan.kim@gmail.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
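To make the caching scheme concrete, here is a minimal, self-contained C sketch of the idea described above. It is a userspace toy model, not the kernel code: the zone_model and compact_state types, the start_free_scanner() helper and the "enough pages found" check are invented for illustration; only the concept (remember where the free scanner stopped for order > 0 compaction, leave forced order == -1 compaction alone, and restart from the end of the zone if the scanners meet) follows the commit message.

#include <stdbool.h>
#include <stdio.h>

struct zone_model {
	unsigned long start_pfn;        /* first pfn of the zone */
	unsigned long end_pfn;          /* one past the last pfn */
	unsigned long cached_free_pfn;  /* where the free scanner last stopped */
};

struct compact_state {
	unsigned long migrate_pfn;      /* migration scanner, walks upward */
	unsigned long free_pfn;         /* free scanner, walks downward */
	int order;                      /* -1 models a forced full compaction */
};

/* Pick the starting point for the free scanner. */
static void start_free_scanner(struct zone_model *zone, struct compact_state *cc)
{
	if (cc->order > 0 &&
	    zone->cached_free_pfn > zone->start_pfn &&
	    zone->cached_free_pfn < zone->end_pfn) {
		/* Resume where the previous order > 0 compaction left off. */
		cc->free_pfn = zone->cached_free_pfn;
	} else {
		/* Forced (order == -1) compaction always starts at the top. */
		cc->free_pfn = zone->end_pfn - 1;
	}
}

/*
 * One simplified compaction pass: the two scanners walk toward each other,
 * and the free scanner's position is recorded after every step so the next
 * order > 0 invocation can resume there instead of rescanning the zone end.
 */
static bool compact_zone(struct zone_model *zone, struct compact_state *cc)
{
	cc->migrate_pfn = zone->start_pfn;
	start_free_scanner(zone, cc);

	/* Cached position leaves no room to work: retry once from the top. */
	if (cc->free_pfn <= cc->migrate_pfn) {
		zone->cached_free_pfn = zone->end_pfn - 1;
		cc->free_pfn = zone->end_pfn - 1;
	}

	while (cc->migrate_pfn < cc->free_pfn) {
		cc->migrate_pfn++;
		cc->free_pfn--;
		if (cc->order > 0)
			zone->cached_free_pfn = cc->free_pfn;
		/* Stand-in for "enough free pages of this order coalesced". */
		if (cc->free_pfn % 64 == 0)
			return true;
	}
	return false;
}

int main(void)
{
	struct zone_model zone = { .start_pfn = 0, .end_pfn = 1024,
				   .cached_free_pfn = 0 };
	struct compact_state cc = { .order = 3 };

	for (int pass = 0; pass < 3; pass++) {
		compact_zone(&zone, &cc);
		printf("pass %d: free scanner stopped at pfn %lu\n",
		       pass, zone.cached_free_pfn);
	}
	return 0;
}

In the actual patch the cached position lives in struct zone as compact_cached_free_pfn ("pfn where the last incremental compaction isolated free pages"), as the hunk below shows; the compaction code updates it as it scans.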
Showing 4 changed files with 73 additions and 5 deletions (inline diff)
include/linux/mmzone.h
1 | #ifndef _LINUX_MMZONE_H | 1 | #ifndef _LINUX_MMZONE_H |
2 | #define _LINUX_MMZONE_H | 2 | #define _LINUX_MMZONE_H |
3 | 3 | ||
4 | #ifndef __ASSEMBLY__ | 4 | #ifndef __ASSEMBLY__ |
5 | #ifndef __GENERATING_BOUNDS_H | 5 | #ifndef __GENERATING_BOUNDS_H |
6 | 6 | ||
7 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
8 | #include <linux/list.h> | 8 | #include <linux/list.h> |
9 | #include <linux/wait.h> | 9 | #include <linux/wait.h> |
10 | #include <linux/bitops.h> | 10 | #include <linux/bitops.h> |
11 | #include <linux/cache.h> | 11 | #include <linux/cache.h> |
12 | #include <linux/threads.h> | 12 | #include <linux/threads.h> |
13 | #include <linux/numa.h> | 13 | #include <linux/numa.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/seqlock.h> | 15 | #include <linux/seqlock.h> |
16 | #include <linux/nodemask.h> | 16 | #include <linux/nodemask.h> |
17 | #include <linux/pageblock-flags.h> | 17 | #include <linux/pageblock-flags.h> |
18 | #include <generated/bounds.h> | 18 | #include <generated/bounds.h> |
19 | #include <linux/atomic.h> | 19 | #include <linux/atomic.h> |
20 | #include <asm/page.h> | 20 | #include <asm/page.h> |
21 | 21 | ||
22 | /* Free memory management - zoned buddy allocator. */ | 22 | /* Free memory management - zoned buddy allocator. */ |
23 | #ifndef CONFIG_FORCE_MAX_ZONEORDER | 23 | #ifndef CONFIG_FORCE_MAX_ZONEORDER |
24 | #define MAX_ORDER 11 | 24 | #define MAX_ORDER 11 |
25 | #else | 25 | #else |
26 | #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER | 26 | #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER |
27 | #endif | 27 | #endif |
28 | #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) | 28 | #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) |
29 | 29 | ||
30 | /* | 30 | /* |
31 | * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed | 31 | * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed |
32 | * costly to service. That is between allocation orders which should | 32 | * costly to service. That is between allocation orders which should |
33 | * coalesce naturally under reasonable reclaim pressure and those which | 33 | * coalesce naturally under reasonable reclaim pressure and those which |
34 | * will not. | 34 | * will not. |
35 | */ | 35 | */ |
36 | #define PAGE_ALLOC_COSTLY_ORDER 3 | 36 | #define PAGE_ALLOC_COSTLY_ORDER 3 |
37 | 37 | ||
38 | enum { | 38 | enum { |
39 | MIGRATE_UNMOVABLE, | 39 | MIGRATE_UNMOVABLE, |
40 | MIGRATE_RECLAIMABLE, | 40 | MIGRATE_RECLAIMABLE, |
41 | MIGRATE_MOVABLE, | 41 | MIGRATE_MOVABLE, |
42 | MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ | 42 | MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ |
43 | MIGRATE_RESERVE = MIGRATE_PCPTYPES, | 43 | MIGRATE_RESERVE = MIGRATE_PCPTYPES, |
44 | #ifdef CONFIG_CMA | 44 | #ifdef CONFIG_CMA |
45 | /* | 45 | /* |
46 | * MIGRATE_CMA migration type is designed to mimic the way | 46 | * MIGRATE_CMA migration type is designed to mimic the way |
47 | * ZONE_MOVABLE works. Only movable pages can be allocated | 47 | * ZONE_MOVABLE works. Only movable pages can be allocated |
48 | * from MIGRATE_CMA pageblocks and page allocator never | 48 | * from MIGRATE_CMA pageblocks and page allocator never |
49 | * implicitly change migration type of MIGRATE_CMA pageblock. | 49 | * implicitly change migration type of MIGRATE_CMA pageblock. |
50 | * | 50 | * |
51 | * The way to use it is to change migratetype of a range of | 51 | * The way to use it is to change migratetype of a range of |
52 | * pageblocks to MIGRATE_CMA which can be done by | 52 | * pageblocks to MIGRATE_CMA which can be done by |
53 | * __free_pageblock_cma() function. What is important though | 53 | * __free_pageblock_cma() function. What is important though |
54 | * is that a range of pageblocks must be aligned to | 54 | * is that a range of pageblocks must be aligned to |
55 | * MAX_ORDER_NR_PAGES should biggest page be bigger then | 55 | * MAX_ORDER_NR_PAGES should biggest page be bigger then |
56 | * a single pageblock. | 56 | * a single pageblock. |
57 | */ | 57 | */ |
58 | MIGRATE_CMA, | 58 | MIGRATE_CMA, |
59 | #endif | 59 | #endif |
60 | MIGRATE_ISOLATE, /* can't allocate from here */ | 60 | MIGRATE_ISOLATE, /* can't allocate from here */ |
61 | MIGRATE_TYPES | 61 | MIGRATE_TYPES |
62 | }; | 62 | }; |
63 | 63 | ||
64 | #ifdef CONFIG_CMA | 64 | #ifdef CONFIG_CMA |
65 | # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) | 65 | # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) |
66 | # define cma_wmark_pages(zone) zone->min_cma_pages | 66 | # define cma_wmark_pages(zone) zone->min_cma_pages |
67 | #else | 67 | #else |
68 | # define is_migrate_cma(migratetype) false | 68 | # define is_migrate_cma(migratetype) false |
69 | # define cma_wmark_pages(zone) 0 | 69 | # define cma_wmark_pages(zone) 0 |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | #define for_each_migratetype_order(order, type) \ | 72 | #define for_each_migratetype_order(order, type) \ |
73 | for (order = 0; order < MAX_ORDER; order++) \ | 73 | for (order = 0; order < MAX_ORDER; order++) \ |
74 | for (type = 0; type < MIGRATE_TYPES; type++) | 74 | for (type = 0; type < MIGRATE_TYPES; type++) |
75 | 75 | ||
76 | extern int page_group_by_mobility_disabled; | 76 | extern int page_group_by_mobility_disabled; |
77 | 77 | ||
78 | static inline int get_pageblock_migratetype(struct page *page) | 78 | static inline int get_pageblock_migratetype(struct page *page) |
79 | { | 79 | { |
80 | return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); | 80 | return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); |
81 | } | 81 | } |
82 | 82 | ||
83 | struct free_area { | 83 | struct free_area { |
84 | struct list_head free_list[MIGRATE_TYPES]; | 84 | struct list_head free_list[MIGRATE_TYPES]; |
85 | unsigned long nr_free; | 85 | unsigned long nr_free; |
86 | }; | 86 | }; |
87 | 87 | ||
88 | struct pglist_data; | 88 | struct pglist_data; |
89 | 89 | ||
90 | /* | 90 | /* |
91 | * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. | 91 | * zone->lock and zone->lru_lock are two of the hottest locks in the kernel. |
92 | * So add a wild amount of padding here to ensure that they fall into separate | 92 | * So add a wild amount of padding here to ensure that they fall into separate |
93 | * cachelines. There are very few zone structures in the machine, so space | 93 | * cachelines. There are very few zone structures in the machine, so space |
94 | * consumption is not a concern here. | 94 | * consumption is not a concern here. |
95 | */ | 95 | */ |
96 | #if defined(CONFIG_SMP) | 96 | #if defined(CONFIG_SMP) |
97 | struct zone_padding { | 97 | struct zone_padding { |
98 | char x[0]; | 98 | char x[0]; |
99 | } ____cacheline_internodealigned_in_smp; | 99 | } ____cacheline_internodealigned_in_smp; |
100 | #define ZONE_PADDING(name) struct zone_padding name; | 100 | #define ZONE_PADDING(name) struct zone_padding name; |
101 | #else | 101 | #else |
102 | #define ZONE_PADDING(name) | 102 | #define ZONE_PADDING(name) |
103 | #endif | 103 | #endif |
104 | 104 | ||
105 | enum zone_stat_item { | 105 | enum zone_stat_item { |
106 | /* First 128 byte cacheline (assuming 64 bit words) */ | 106 | /* First 128 byte cacheline (assuming 64 bit words) */ |
107 | NR_FREE_PAGES, | 107 | NR_FREE_PAGES, |
108 | NR_LRU_BASE, | 108 | NR_LRU_BASE, |
109 | NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ | 109 | NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ |
110 | NR_ACTIVE_ANON, /* " " " " " */ | 110 | NR_ACTIVE_ANON, /* " " " " " */ |
111 | NR_INACTIVE_FILE, /* " " " " " */ | 111 | NR_INACTIVE_FILE, /* " " " " " */ |
112 | NR_ACTIVE_FILE, /* " " " " " */ | 112 | NR_ACTIVE_FILE, /* " " " " " */ |
113 | NR_UNEVICTABLE, /* " " " " " */ | 113 | NR_UNEVICTABLE, /* " " " " " */ |
114 | NR_MLOCK, /* mlock()ed pages found and moved off LRU */ | 114 | NR_MLOCK, /* mlock()ed pages found and moved off LRU */ |
115 | NR_ANON_PAGES, /* Mapped anonymous pages */ | 115 | NR_ANON_PAGES, /* Mapped anonymous pages */ |
116 | NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. | 116 | NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. |
117 | only modified from process context */ | 117 | only modified from process context */ |
118 | NR_FILE_PAGES, | 118 | NR_FILE_PAGES, |
119 | NR_FILE_DIRTY, | 119 | NR_FILE_DIRTY, |
120 | NR_WRITEBACK, | 120 | NR_WRITEBACK, |
121 | NR_SLAB_RECLAIMABLE, | 121 | NR_SLAB_RECLAIMABLE, |
122 | NR_SLAB_UNRECLAIMABLE, | 122 | NR_SLAB_UNRECLAIMABLE, |
123 | NR_PAGETABLE, /* used for pagetables */ | 123 | NR_PAGETABLE, /* used for pagetables */ |
124 | NR_KERNEL_STACK, | 124 | NR_KERNEL_STACK, |
125 | /* Second 128 byte cacheline */ | 125 | /* Second 128 byte cacheline */ |
126 | NR_UNSTABLE_NFS, /* NFS unstable pages */ | 126 | NR_UNSTABLE_NFS, /* NFS unstable pages */ |
127 | NR_BOUNCE, | 127 | NR_BOUNCE, |
128 | NR_VMSCAN_WRITE, | 128 | NR_VMSCAN_WRITE, |
129 | NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ | 129 | NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ |
130 | NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ | 130 | NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ |
131 | NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ | 131 | NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ |
132 | NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ | 132 | NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ |
133 | NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ | 133 | NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ |
134 | NR_DIRTIED, /* page dirtyings since bootup */ | 134 | NR_DIRTIED, /* page dirtyings since bootup */ |
135 | NR_WRITTEN, /* page writings since bootup */ | 135 | NR_WRITTEN, /* page writings since bootup */ |
136 | #ifdef CONFIG_NUMA | 136 | #ifdef CONFIG_NUMA |
137 | NUMA_HIT, /* allocated in intended node */ | 137 | NUMA_HIT, /* allocated in intended node */ |
138 | NUMA_MISS, /* allocated in non intended node */ | 138 | NUMA_MISS, /* allocated in non intended node */ |
139 | NUMA_FOREIGN, /* was intended here, hit elsewhere */ | 139 | NUMA_FOREIGN, /* was intended here, hit elsewhere */ |
140 | NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ | 140 | NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ |
141 | NUMA_LOCAL, /* allocation from local node */ | 141 | NUMA_LOCAL, /* allocation from local node */ |
142 | NUMA_OTHER, /* allocation from other node */ | 142 | NUMA_OTHER, /* allocation from other node */ |
143 | #endif | 143 | #endif |
144 | NR_ANON_TRANSPARENT_HUGEPAGES, | 144 | NR_ANON_TRANSPARENT_HUGEPAGES, |
145 | NR_VM_ZONE_STAT_ITEMS }; | 145 | NR_VM_ZONE_STAT_ITEMS }; |
146 | 146 | ||
147 | /* | 147 | /* |
148 | * We do arithmetic on the LRU lists in various places in the code, | 148 | * We do arithmetic on the LRU lists in various places in the code, |
149 | * so it is important to keep the active lists LRU_ACTIVE higher in | 149 | * so it is important to keep the active lists LRU_ACTIVE higher in |
150 | * the array than the corresponding inactive lists, and to keep | 150 | * the array than the corresponding inactive lists, and to keep |
151 | * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. | 151 | * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. |
152 | * | 152 | * |
153 | * This has to be kept in sync with the statistics in zone_stat_item | 153 | * This has to be kept in sync with the statistics in zone_stat_item |
154 | * above and the descriptions in vmstat_text in mm/vmstat.c | 154 | * above and the descriptions in vmstat_text in mm/vmstat.c |
155 | */ | 155 | */ |
156 | #define LRU_BASE 0 | 156 | #define LRU_BASE 0 |
157 | #define LRU_ACTIVE 1 | 157 | #define LRU_ACTIVE 1 |
158 | #define LRU_FILE 2 | 158 | #define LRU_FILE 2 |
159 | 159 | ||
160 | enum lru_list { | 160 | enum lru_list { |
161 | LRU_INACTIVE_ANON = LRU_BASE, | 161 | LRU_INACTIVE_ANON = LRU_BASE, |
162 | LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, | 162 | LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, |
163 | LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, | 163 | LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, |
164 | LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, | 164 | LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, |
165 | LRU_UNEVICTABLE, | 165 | LRU_UNEVICTABLE, |
166 | NR_LRU_LISTS | 166 | NR_LRU_LISTS |
167 | }; | 167 | }; |
168 | 168 | ||
169 | #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) | 169 | #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) |
170 | 170 | ||
171 | #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) | 171 | #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) |
172 | 172 | ||
173 | static inline int is_file_lru(enum lru_list lru) | 173 | static inline int is_file_lru(enum lru_list lru) |
174 | { | 174 | { |
175 | return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); | 175 | return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); |
176 | } | 176 | } |
177 | 177 | ||
178 | static inline int is_active_lru(enum lru_list lru) | 178 | static inline int is_active_lru(enum lru_list lru) |
179 | { | 179 | { |
180 | return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); | 180 | return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); |
181 | } | 181 | } |
182 | 182 | ||
183 | static inline int is_unevictable_lru(enum lru_list lru) | 183 | static inline int is_unevictable_lru(enum lru_list lru) |
184 | { | 184 | { |
185 | return (lru == LRU_UNEVICTABLE); | 185 | return (lru == LRU_UNEVICTABLE); |
186 | } | 186 | } |
187 | 187 | ||
188 | struct zone_reclaim_stat { | 188 | struct zone_reclaim_stat { |
189 | /* | 189 | /* |
190 | * The pageout code in vmscan.c keeps track of how many of the | 190 | * The pageout code in vmscan.c keeps track of how many of the |
191 | * mem/swap backed and file backed pages are referenced. | 191 | * mem/swap backed and file backed pages are referenced. |
192 | * The higher the rotated/scanned ratio, the more valuable | 192 | * The higher the rotated/scanned ratio, the more valuable |
193 | * that cache is. | 193 | * that cache is. |
194 | * | 194 | * |
195 | * The anon LRU stats live in [0], file LRU stats in [1] | 195 | * The anon LRU stats live in [0], file LRU stats in [1] |
196 | */ | 196 | */ |
197 | unsigned long recent_rotated[2]; | 197 | unsigned long recent_rotated[2]; |
198 | unsigned long recent_scanned[2]; | 198 | unsigned long recent_scanned[2]; |
199 | }; | 199 | }; |
200 | 200 | ||
201 | struct lruvec { | 201 | struct lruvec { |
202 | struct list_head lists[NR_LRU_LISTS]; | 202 | struct list_head lists[NR_LRU_LISTS]; |
203 | struct zone_reclaim_stat reclaim_stat; | 203 | struct zone_reclaim_stat reclaim_stat; |
204 | #ifdef CONFIG_MEMCG | 204 | #ifdef CONFIG_MEMCG |
205 | struct zone *zone; | 205 | struct zone *zone; |
206 | #endif | 206 | #endif |
207 | }; | 207 | }; |
208 | 208 | ||
209 | /* Mask used at gathering information at once (see memcontrol.c) */ | 209 | /* Mask used at gathering information at once (see memcontrol.c) */ |
210 | #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) | 210 | #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) |
211 | #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) | 211 | #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) |
212 | #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) | 212 | #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) |
213 | 213 | ||
214 | /* Isolate clean file */ | 214 | /* Isolate clean file */ |
215 | #define ISOLATE_CLEAN ((__force isolate_mode_t)0x1) | 215 | #define ISOLATE_CLEAN ((__force isolate_mode_t)0x1) |
216 | /* Isolate unmapped file */ | 216 | /* Isolate unmapped file */ |
217 | #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) | 217 | #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) |
218 | /* Isolate for asynchronous migration */ | 218 | /* Isolate for asynchronous migration */ |
219 | #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) | 219 | #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) |
220 | 220 | ||
221 | /* LRU Isolation modes. */ | 221 | /* LRU Isolation modes. */ |
222 | typedef unsigned __bitwise__ isolate_mode_t; | 222 | typedef unsigned __bitwise__ isolate_mode_t; |
223 | 223 | ||
224 | enum zone_watermarks { | 224 | enum zone_watermarks { |
225 | WMARK_MIN, | 225 | WMARK_MIN, |
226 | WMARK_LOW, | 226 | WMARK_LOW, |
227 | WMARK_HIGH, | 227 | WMARK_HIGH, |
228 | NR_WMARK | 228 | NR_WMARK |
229 | }; | 229 | }; |
230 | 230 | ||
231 | #define min_wmark_pages(z) (z->watermark[WMARK_MIN]) | 231 | #define min_wmark_pages(z) (z->watermark[WMARK_MIN]) |
232 | #define low_wmark_pages(z) (z->watermark[WMARK_LOW]) | 232 | #define low_wmark_pages(z) (z->watermark[WMARK_LOW]) |
233 | #define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) | 233 | #define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) |
234 | 234 | ||
235 | struct per_cpu_pages { | 235 | struct per_cpu_pages { |
236 | int count; /* number of pages in the list */ | 236 | int count; /* number of pages in the list */ |
237 | int high; /* high watermark, emptying needed */ | 237 | int high; /* high watermark, emptying needed */ |
238 | int batch; /* chunk size for buddy add/remove */ | 238 | int batch; /* chunk size for buddy add/remove */ |
239 | 239 | ||
240 | /* Lists of pages, one per migrate type stored on the pcp-lists */ | 240 | /* Lists of pages, one per migrate type stored on the pcp-lists */ |
241 | struct list_head lists[MIGRATE_PCPTYPES]; | 241 | struct list_head lists[MIGRATE_PCPTYPES]; |
242 | }; | 242 | }; |
243 | 243 | ||
244 | struct per_cpu_pageset { | 244 | struct per_cpu_pageset { |
245 | struct per_cpu_pages pcp; | 245 | struct per_cpu_pages pcp; |
246 | #ifdef CONFIG_NUMA | 246 | #ifdef CONFIG_NUMA |
247 | s8 expire; | 247 | s8 expire; |
248 | #endif | 248 | #endif |
249 | #ifdef CONFIG_SMP | 249 | #ifdef CONFIG_SMP |
250 | s8 stat_threshold; | 250 | s8 stat_threshold; |
251 | s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; | 251 | s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; |
252 | #endif | 252 | #endif |
253 | }; | 253 | }; |
254 | 254 | ||
255 | #endif /* !__GENERATING_BOUNDS.H */ | 255 | #endif /* !__GENERATING_BOUNDS.H */ |
256 | 256 | ||
257 | enum zone_type { | 257 | enum zone_type { |
258 | #ifdef CONFIG_ZONE_DMA | 258 | #ifdef CONFIG_ZONE_DMA |
259 | /* | 259 | /* |
260 | * ZONE_DMA is used when there are devices that are not able | 260 | * ZONE_DMA is used when there are devices that are not able |
261 | * to do DMA to all of addressable memory (ZONE_NORMAL). Then we | 261 | * to do DMA to all of addressable memory (ZONE_NORMAL). Then we |
262 | * carve out the portion of memory that is needed for these devices. | 262 | * carve out the portion of memory that is needed for these devices. |
263 | * The range is arch specific. | 263 | * The range is arch specific. |
264 | * | 264 | * |
265 | * Some examples | 265 | * Some examples |
266 | * | 266 | * |
267 | * Architecture Limit | 267 | * Architecture Limit |
268 | * --------------------------- | 268 | * --------------------------- |
269 | * parisc, ia64, sparc <4G | 269 | * parisc, ia64, sparc <4G |
270 | * s390 <2G | 270 | * s390 <2G |
271 | * arm Various | 271 | * arm Various |
272 | * alpha Unlimited or 0-16MB. | 272 | * alpha Unlimited or 0-16MB. |
273 | * | 273 | * |
274 | * i386, x86_64 and multiple other arches | 274 | * i386, x86_64 and multiple other arches |
275 | * <16M. | 275 | * <16M. |
276 | */ | 276 | */ |
277 | ZONE_DMA, | 277 | ZONE_DMA, |
278 | #endif | 278 | #endif |
279 | #ifdef CONFIG_ZONE_DMA32 | 279 | #ifdef CONFIG_ZONE_DMA32 |
280 | /* | 280 | /* |
281 | * x86_64 needs two ZONE_DMAs because it supports devices that are | 281 | * x86_64 needs two ZONE_DMAs because it supports devices that are |
282 | * only able to do DMA to the lower 16M but also 32 bit devices that | 282 | * only able to do DMA to the lower 16M but also 32 bit devices that |
283 | * can only do DMA areas below 4G. | 283 | * can only do DMA areas below 4G. |
284 | */ | 284 | */ |
285 | ZONE_DMA32, | 285 | ZONE_DMA32, |
286 | #endif | 286 | #endif |
287 | /* | 287 | /* |
288 | * Normal addressable memory is in ZONE_NORMAL. DMA operations can be | 288 | * Normal addressable memory is in ZONE_NORMAL. DMA operations can be |
289 | * performed on pages in ZONE_NORMAL if the DMA devices support | 289 | * performed on pages in ZONE_NORMAL if the DMA devices support |
290 | * transfers to all addressable memory. | 290 | * transfers to all addressable memory. |
291 | */ | 291 | */ |
292 | ZONE_NORMAL, | 292 | ZONE_NORMAL, |
293 | #ifdef CONFIG_HIGHMEM | 293 | #ifdef CONFIG_HIGHMEM |
294 | /* | 294 | /* |
295 | * A memory area that is only addressable by the kernel through | 295 | * A memory area that is only addressable by the kernel through |
296 | * mapping portions into its own address space. This is for example | 296 | * mapping portions into its own address space. This is for example |
297 | * used by i386 to allow the kernel to address the memory beyond | 297 | * used by i386 to allow the kernel to address the memory beyond |
298 | * 900MB. The kernel will set up special mappings (page | 298 | * 900MB. The kernel will set up special mappings (page |
299 | * table entries on i386) for each page that the kernel needs to | 299 | * table entries on i386) for each page that the kernel needs to |
300 | * access. | 300 | * access. |
301 | */ | 301 | */ |
302 | ZONE_HIGHMEM, | 302 | ZONE_HIGHMEM, |
303 | #endif | 303 | #endif |
304 | ZONE_MOVABLE, | 304 | ZONE_MOVABLE, |
305 | __MAX_NR_ZONES | 305 | __MAX_NR_ZONES |
306 | }; | 306 | }; |
307 | 307 | ||
308 | #ifndef __GENERATING_BOUNDS_H | 308 | #ifndef __GENERATING_BOUNDS_H |
309 | 309 | ||
310 | /* | 310 | /* |
311 | * When a memory allocation must conform to specific limitations (such | 311 | * When a memory allocation must conform to specific limitations (such |
312 | * as being suitable for DMA) the caller will pass in hints to the | 312 | * as being suitable for DMA) the caller will pass in hints to the |
313 | * allocator in the gfp_mask, in the zone modifier bits. These bits | 313 | * allocator in the gfp_mask, in the zone modifier bits. These bits |
314 | * are used to select a priority ordered list of memory zones which | 314 | * are used to select a priority ordered list of memory zones which |
315 | * match the requested limits. See gfp_zone() in include/linux/gfp.h | 315 | * match the requested limits. See gfp_zone() in include/linux/gfp.h |
316 | */ | 316 | */ |
317 | 317 | ||
318 | #if MAX_NR_ZONES < 2 | 318 | #if MAX_NR_ZONES < 2 |
319 | #define ZONES_SHIFT 0 | 319 | #define ZONES_SHIFT 0 |
320 | #elif MAX_NR_ZONES <= 2 | 320 | #elif MAX_NR_ZONES <= 2 |
321 | #define ZONES_SHIFT 1 | 321 | #define ZONES_SHIFT 1 |
322 | #elif MAX_NR_ZONES <= 4 | 322 | #elif MAX_NR_ZONES <= 4 |
323 | #define ZONES_SHIFT 2 | 323 | #define ZONES_SHIFT 2 |
324 | #else | 324 | #else |
325 | #error ZONES_SHIFT -- too many zones configured adjust calculation | 325 | #error ZONES_SHIFT -- too many zones configured adjust calculation |
326 | #endif | 326 | #endif |
327 | 327 | ||
328 | struct zone { | 328 | struct zone { |
329 | /* Fields commonly accessed by the page allocator */ | 329 | /* Fields commonly accessed by the page allocator */ |
330 | 330 | ||
331 | /* zone watermarks, access with *_wmark_pages(zone) macros */ | 331 | /* zone watermarks, access with *_wmark_pages(zone) macros */ |
332 | unsigned long watermark[NR_WMARK]; | 332 | unsigned long watermark[NR_WMARK]; |
333 | 333 | ||
334 | /* | 334 | /* |
335 | * When free pages are below this point, additional steps are taken | 335 | * When free pages are below this point, additional steps are taken |
336 | * when reading the number of free pages to avoid per-cpu counter | 336 | * when reading the number of free pages to avoid per-cpu counter |
337 | * drift allowing watermarks to be breached | 337 | * drift allowing watermarks to be breached |
338 | */ | 338 | */ |
339 | unsigned long percpu_drift_mark; | 339 | unsigned long percpu_drift_mark; |
340 | 340 | ||
341 | /* | 341 | /* |
342 | * We don't know if the memory that we're going to allocate will be freeable | 342 | * We don't know if the memory that we're going to allocate will be freeable |
343 | * or/and it will be released eventually, so to avoid totally wasting several | 343 | * or/and it will be released eventually, so to avoid totally wasting several |
344 | * GB of ram we must reserve some of the lower zone memory (otherwise we risk | 344 | * GB of ram we must reserve some of the lower zone memory (otherwise we risk |
345 | * to run OOM on the lower zones despite there's tons of freeable ram | 345 | * to run OOM on the lower zones despite there's tons of freeable ram |
346 | * on the higher zones). This array is recalculated at runtime if the | 346 | * on the higher zones). This array is recalculated at runtime if the |
347 | * sysctl_lowmem_reserve_ratio sysctl changes. | 347 | * sysctl_lowmem_reserve_ratio sysctl changes. |
348 | */ | 348 | */ |
349 | unsigned long lowmem_reserve[MAX_NR_ZONES]; | 349 | unsigned long lowmem_reserve[MAX_NR_ZONES]; |
350 | 350 | ||
351 | /* | 351 | /* |
352 | * This is a per-zone reserve of pages that should not be | 352 | * This is a per-zone reserve of pages that should not be |
353 | * considered dirtyable memory. | 353 | * considered dirtyable memory. |
354 | */ | 354 | */ |
355 | unsigned long dirty_balance_reserve; | 355 | unsigned long dirty_balance_reserve; |
356 | 356 | ||
357 | #ifdef CONFIG_NUMA | 357 | #ifdef CONFIG_NUMA |
358 | int node; | 358 | int node; |
359 | /* | 359 | /* |
360 | * zone reclaim becomes active if more unmapped pages exist. | 360 | * zone reclaim becomes active if more unmapped pages exist. |
361 | */ | 361 | */ |
362 | unsigned long min_unmapped_pages; | 362 | unsigned long min_unmapped_pages; |
363 | unsigned long min_slab_pages; | 363 | unsigned long min_slab_pages; |
364 | #endif | 364 | #endif |
365 | struct per_cpu_pageset __percpu *pageset; | 365 | struct per_cpu_pageset __percpu *pageset; |
366 | /* | 366 | /* |
367 | * free areas of different sizes | 367 | * free areas of different sizes |
368 | */ | 368 | */ |
369 | spinlock_t lock; | 369 | spinlock_t lock; |
370 | int all_unreclaimable; /* All pages pinned */ | 370 | int all_unreclaimable; /* All pages pinned */ |
371 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
372 | /* pfn where the last incremental compaction isolated free pages */ | ||
373 | unsigned long compact_cached_free_pfn; | ||
374 | #endif | ||
371 | #ifdef CONFIG_MEMORY_HOTPLUG | 375 | #ifdef CONFIG_MEMORY_HOTPLUG |
372 | /* see spanned/present_pages for more description */ | 376 | /* see spanned/present_pages for more description */ |
373 | seqlock_t span_seqlock; | 377 | seqlock_t span_seqlock; |
374 | #endif | 378 | #endif |
375 | #ifdef CONFIG_CMA | 379 | #ifdef CONFIG_CMA |
376 | /* | 380 | /* |
377 | * CMA needs to increase watermark levels during the allocation | 381 | * CMA needs to increase watermark levels during the allocation |
378 | * process to make sure that the system is not starved. | 382 | * process to make sure that the system is not starved. |
379 | */ | 383 | */ |
380 | unsigned long min_cma_pages; | 384 | unsigned long min_cma_pages; |
381 | #endif | 385 | #endif |
382 | struct free_area free_area[MAX_ORDER]; | 386 | struct free_area free_area[MAX_ORDER]; |
383 | 387 | ||
384 | #ifndef CONFIG_SPARSEMEM | 388 | #ifndef CONFIG_SPARSEMEM |
385 | /* | 389 | /* |
386 | * Flags for a pageblock_nr_pages block. See pageblock-flags.h. | 390 | * Flags for a pageblock_nr_pages block. See pageblock-flags.h. |
387 | * In SPARSEMEM, this map is stored in struct mem_section | 391 | * In SPARSEMEM, this map is stored in struct mem_section |
388 | */ | 392 | */ |
389 | unsigned long *pageblock_flags; | 393 | unsigned long *pageblock_flags; |
390 | #endif /* CONFIG_SPARSEMEM */ | 394 | #endif /* CONFIG_SPARSEMEM */ |
391 | 395 | ||
392 | #ifdef CONFIG_COMPACTION | 396 | #ifdef CONFIG_COMPACTION |
393 | /* | 397 | /* |
394 | * On compaction failure, 1<<compact_defer_shift compactions | 398 | * On compaction failure, 1<<compact_defer_shift compactions |
395 | * are skipped before trying again. The number attempted since | 399 | * are skipped before trying again. The number attempted since |
396 | * last failure is tracked with compact_considered. | 400 | * last failure is tracked with compact_considered. |
397 | */ | 401 | */ |
398 | unsigned int compact_considered; | 402 | unsigned int compact_considered; |
399 | unsigned int compact_defer_shift; | 403 | unsigned int compact_defer_shift; |
400 | int compact_order_failed; | 404 | int compact_order_failed; |
401 | #endif | 405 | #endif |
402 | 406 | ||
403 | ZONE_PADDING(_pad1_) | 407 | ZONE_PADDING(_pad1_) |
404 | 408 | ||
405 | /* Fields commonly accessed by the page reclaim scanner */ | 409 | /* Fields commonly accessed by the page reclaim scanner */ |
406 | spinlock_t lru_lock; | 410 | spinlock_t lru_lock; |
407 | struct lruvec lruvec; | 411 | struct lruvec lruvec; |
408 | 412 | ||
409 | unsigned long pages_scanned; /* since last reclaim */ | 413 | unsigned long pages_scanned; /* since last reclaim */ |
410 | unsigned long flags; /* zone flags, see below */ | 414 | unsigned long flags; /* zone flags, see below */ |
411 | 415 | ||
412 | /* Zone statistics */ | 416 | /* Zone statistics */ |
413 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | 417 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; |
414 | 418 | ||
415 | /* | 419 | /* |
416 | * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on | 420 | * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on |
417 | * this zone's LRU. Maintained by the pageout code. | 421 | * this zone's LRU. Maintained by the pageout code. |
418 | */ | 422 | */ |
419 | unsigned int inactive_ratio; | 423 | unsigned int inactive_ratio; |
420 | 424 | ||
421 | 425 | ||
422 | ZONE_PADDING(_pad2_) | 426 | ZONE_PADDING(_pad2_) |
423 | /* Rarely used or read-mostly fields */ | 427 | /* Rarely used or read-mostly fields */ |
424 | 428 | ||
425 | /* | 429 | /* |
426 | * wait_table -- the array holding the hash table | 430 | * wait_table -- the array holding the hash table |
427 | * wait_table_hash_nr_entries -- the size of the hash table array | 431 | * wait_table_hash_nr_entries -- the size of the hash table array |
428 | * wait_table_bits -- wait_table_size == (1 << wait_table_bits) | 432 | * wait_table_bits -- wait_table_size == (1 << wait_table_bits) |
429 | * | 433 | * |
430 | * The purpose of all these is to keep track of the people | 434 | * The purpose of all these is to keep track of the people |
431 | * waiting for a page to become available and make them | 435 | * waiting for a page to become available and make them |
432 | * runnable again when possible. The trouble is that this | 436 | * runnable again when possible. The trouble is that this |
433 | * consumes a lot of space, especially when so few things | 437 | * consumes a lot of space, especially when so few things |
434 | * wait on pages at a given time. So instead of using | 438 | * wait on pages at a given time. So instead of using |
435 | * per-page waitqueues, we use a waitqueue hash table. | 439 | * per-page waitqueues, we use a waitqueue hash table. |
436 | * | 440 | * |
437 | * The bucket discipline is to sleep on the same queue when | 441 | * The bucket discipline is to sleep on the same queue when |
438 | * colliding and wake all in that wait queue when removing. | 442 | * colliding and wake all in that wait queue when removing. |
439 | * When something wakes, it must check to be sure its page is | 443 | * When something wakes, it must check to be sure its page is |
440 | * truly available, a la thundering herd. The cost of a | 444 | * truly available, a la thundering herd. The cost of a |
441 | * collision is great, but given the expected load of the | 445 | * collision is great, but given the expected load of the |
442 | * table, they should be so rare as to be outweighed by the | 446 | * table, they should be so rare as to be outweighed by the |
443 | * benefits from the saved space. | 447 | * benefits from the saved space. |
444 | * | 448 | * |
445 | * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the | 449 | * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the |
446 | * primary users of these fields, and in mm/page_alloc.c | 450 | * primary users of these fields, and in mm/page_alloc.c |
447 | * free_area_init_core() performs the initialization of them. | 451 | * free_area_init_core() performs the initialization of them. |
448 | */ | 452 | */ |
449 | wait_queue_head_t * wait_table; | 453 | wait_queue_head_t * wait_table; |
450 | unsigned long wait_table_hash_nr_entries; | 454 | unsigned long wait_table_hash_nr_entries; |
451 | unsigned long wait_table_bits; | 455 | unsigned long wait_table_bits; |
452 | 456 | ||
453 | /* | 457 | /* |
454 | * Discontig memory support fields. | 458 | * Discontig memory support fields. |
455 | */ | 459 | */ |
456 | struct pglist_data *zone_pgdat; | 460 | struct pglist_data *zone_pgdat; |
457 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ | 461 | /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ |
458 | unsigned long zone_start_pfn; | 462 | unsigned long zone_start_pfn; |
459 | 463 | ||
460 | /* | 464 | /* |
461 | * zone_start_pfn, spanned_pages and present_pages are all | 465 | * zone_start_pfn, spanned_pages and present_pages are all |
462 | * protected by span_seqlock. It is a seqlock because it has | 466 | * protected by span_seqlock. It is a seqlock because it has |
463 | * to be read outside of zone->lock, and it is done in the main | 467 | * to be read outside of zone->lock, and it is done in the main |
464 | * allocator path. But, it is written quite infrequently. | 468 | * allocator path. But, it is written quite infrequently. |
465 | * | 469 | * |
466 | * The lock is declared along with zone->lock because it is | 470 | * The lock is declared along with zone->lock because it is |
467 | * frequently read in proximity to zone->lock. It's good to | 471 | * frequently read in proximity to zone->lock. It's good to |
468 | * give them a chance of being in the same cacheline. | 472 | * give them a chance of being in the same cacheline. |
469 | */ | 473 | */ |
470 | unsigned long spanned_pages; /* total size, including holes */ | 474 | unsigned long spanned_pages; /* total size, including holes */ |
471 | unsigned long present_pages; /* amount of memory (excluding holes) */ | 475 | unsigned long present_pages; /* amount of memory (excluding holes) */ |
472 | 476 | ||
473 | /* | 477 | /* |
474 | * rarely used fields: | 478 | * rarely used fields: |
475 | */ | 479 | */ |
476 | const char *name; | 480 | const char *name; |
477 | } ____cacheline_internodealigned_in_smp; | 481 | } ____cacheline_internodealigned_in_smp; |
478 | 482 | ||
479 | typedef enum { | 483 | typedef enum { |
480 | ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ | 484 | ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ |
481 | ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ | 485 | ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ |
482 | ZONE_CONGESTED, /* zone has many dirty pages backed by | 486 | ZONE_CONGESTED, /* zone has many dirty pages backed by |
483 | * a congested BDI | 487 | * a congested BDI |
484 | */ | 488 | */ |
485 | } zone_flags_t; | 489 | } zone_flags_t; |
486 | 490 | ||
487 | static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) | 491 | static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) |
488 | { | 492 | { |
489 | set_bit(flag, &zone->flags); | 493 | set_bit(flag, &zone->flags); |
490 | } | 494 | } |
491 | 495 | ||
492 | static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag) | 496 | static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag) |
493 | { | 497 | { |
494 | return test_and_set_bit(flag, &zone->flags); | 498 | return test_and_set_bit(flag, &zone->flags); |
495 | } | 499 | } |
496 | 500 | ||
497 | static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag) | 501 | static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag) |
498 | { | 502 | { |
499 | clear_bit(flag, &zone->flags); | 503 | clear_bit(flag, &zone->flags); |
500 | } | 504 | } |
501 | 505 | ||
502 | static inline int zone_is_reclaim_congested(const struct zone *zone) | 506 | static inline int zone_is_reclaim_congested(const struct zone *zone) |
503 | { | 507 | { |
504 | return test_bit(ZONE_CONGESTED, &zone->flags); | 508 | return test_bit(ZONE_CONGESTED, &zone->flags); |
505 | } | 509 | } |
506 | 510 | ||
507 | static inline int zone_is_reclaim_locked(const struct zone *zone) | 511 | static inline int zone_is_reclaim_locked(const struct zone *zone) |
508 | { | 512 | { |
509 | return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); | 513 | return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); |
510 | } | 514 | } |
511 | 515 | ||
512 | static inline int zone_is_oom_locked(const struct zone *zone) | 516 | static inline int zone_is_oom_locked(const struct zone *zone) |
513 | { | 517 | { |
514 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); | 518 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); |
515 | } | 519 | } |
516 | 520 | ||
517 | /* | 521 | /* |
518 | * The "priority" of VM scanning is how much of the queues we will scan in one | 522 | * The "priority" of VM scanning is how much of the queues we will scan in one |
519 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the | 523 | * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the |
520 | * queues ("queue_length >> 12") during an aging round. | 524 | * queues ("queue_length >> 12") during an aging round. |
521 | */ | 525 | */ |
522 | #define DEF_PRIORITY 12 | 526 | #define DEF_PRIORITY 12 |
523 | 527 | ||
524 | /* Maximum number of zones on a zonelist */ | 528 | /* Maximum number of zones on a zonelist */ |
525 | #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) | 529 | #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) |
526 | 530 | ||
527 | #ifdef CONFIG_NUMA | 531 | #ifdef CONFIG_NUMA |
528 | 532 | ||
529 | /* | 533 | /* |
530 | * The NUMA zonelists are doubled because we need zonelists that restrict the | 534 | * The NUMA zonelists are doubled because we need zonelists that restrict the |
531 | * allocations to a single node for GFP_THISNODE. | 535 | * allocations to a single node for GFP_THISNODE. |
532 | * | 536 | * |
533 | * [0] : Zonelist with fallback | 537 | * [0] : Zonelist with fallback |
534 | * [1] : No fallback (GFP_THISNODE) | 538 | * [1] : No fallback (GFP_THISNODE) |
535 | */ | 539 | */ |
536 | #define MAX_ZONELISTS 2 | 540 | #define MAX_ZONELISTS 2 |
537 | 541 | ||
538 | 542 | ||
539 | /* | 543 | /* |
540 | * We cache key information from each zonelist for smaller cache | 544 | * We cache key information from each zonelist for smaller cache |
541 | * footprint when scanning for free pages in get_page_from_freelist(). | 545 | * footprint when scanning for free pages in get_page_from_freelist(). |
542 | * | 546 | * |
543 | * 1) The BITMAP fullzones tracks which zones in a zonelist have come | 547 | * 1) The BITMAP fullzones tracks which zones in a zonelist have come |
544 | * up short of free memory since the last time (last_fullzone_zap) | 548 | * up short of free memory since the last time (last_fullzone_zap) |
545 | * we zero'd fullzones. | 549 | * we zero'd fullzones. |
546 | * 2) The array z_to_n[] maps each zone in the zonelist to its node | 550 | * 2) The array z_to_n[] maps each zone in the zonelist to its node |
547 | * id, so that we can efficiently evaluate whether that node is | 551 | * id, so that we can efficiently evaluate whether that node is |
548 | * set in the current tasks mems_allowed. | 552 | * set in the current tasks mems_allowed. |
549 | * | 553 | * |
550 | * Both fullzones and z_to_n[] are one-to-one with the zonelist, | 554 | * Both fullzones and z_to_n[] are one-to-one with the zonelist, |
551 | * indexed by a zones offset in the zonelist zones[] array. | 555 | * indexed by a zones offset in the zonelist zones[] array. |
552 | * | 556 | * |
553 | * The get_page_from_freelist() routine does two scans. During the | 557 | * The get_page_from_freelist() routine does two scans. During the |
554 | * first scan, we skip zones whose corresponding bit in 'fullzones' | 558 | * first scan, we skip zones whose corresponding bit in 'fullzones' |
555 | * is set or whose corresponding node in current->mems_allowed (which | 559 | * is set or whose corresponding node in current->mems_allowed (which |
556 | * comes from cpusets) is not set. During the second scan, we bypass | 560 | * comes from cpusets) is not set. During the second scan, we bypass |
557 | * this zonelist_cache, to ensure we look methodically at each zone. | 561 | * this zonelist_cache, to ensure we look methodically at each zone. |
558 | * | 562 | * |
559 | * Once per second, we zero out (zap) fullzones, forcing us to | 563 | * Once per second, we zero out (zap) fullzones, forcing us to |
560 | * reconsider nodes that might have regained more free memory. | 564 | * reconsider nodes that might have regained more free memory. |
561 | * The field last_full_zap is the time we last zapped fullzones. | 565 | * The field last_full_zap is the time we last zapped fullzones. |
562 | * | 566 | * |
563 | * This mechanism reduces the amount of time we waste repeatedly | 567 | * This mechanism reduces the amount of time we waste repeatedly |
564 | * reexaming zones for free memory when they just came up low on | 568 | * reexaming zones for free memory when they just came up low on |
565 | * memory momentarilly ago. | 569 | * memory momentarilly ago. |
566 | * | 570 | * |
567 | * The zonelist_cache struct members logically belong in struct | 571 | * The zonelist_cache struct members logically belong in struct |
568 | * zonelist. However, the mempolicy zonelists constructed for | 572 | * zonelist. However, the mempolicy zonelists constructed for |
569 | * MPOL_BIND are intentionally variable length (and usually much | 573 | * MPOL_BIND are intentionally variable length (and usually much |
570 | * shorter). A general purpose mechanism for handling structs with | 574 | * shorter). A general purpose mechanism for handling structs with |
571 | * multiple variable length members is more mechanism than we want | 575 | * multiple variable length members is more mechanism than we want |
572 | * here. We resort to some special case hackery instead. | 576 | * here. We resort to some special case hackery instead. |
573 | * | 577 | * |
574 | * The MPOL_BIND zonelists don't need this zonelist_cache (in good | 578 | * The MPOL_BIND zonelists don't need this zonelist_cache (in good |
575 | * part because they are shorter), so we put the fixed length stuff | 579 | * part because they are shorter), so we put the fixed length stuff |
576 | * at the front of the zonelist struct, ending in a variable length | 580 | * at the front of the zonelist struct, ending in a variable length |
577 | * zones[], as is needed by MPOL_BIND. | 581 | * zones[], as is needed by MPOL_BIND. |
578 | * | 582 | * |
579 | * Then we put the optional zonelist cache on the end of the zonelist | 583 | * Then we put the optional zonelist cache on the end of the zonelist |
580 | * struct. This optional stuff is found by a 'zlcache_ptr' pointer in | 584 | * struct. This optional stuff is found by a 'zlcache_ptr' pointer in |
581 | * the fixed length portion at the front of the struct. This pointer | 585 | * the fixed length portion at the front of the struct. This pointer |
582 | * both enables us to find the zonelist cache, and in the case of | 586 | * both enables us to find the zonelist cache, and in the case of |
583 | * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL) | 587 | * MPOL_BIND zonelists, (which will just set the zlcache_ptr to NULL) |
584 | * to know that the zonelist cache is not there. | 588 | * to know that the zonelist cache is not there. |
585 | * | 589 | * |
586 | * The end result is that struct zonelists come in two flavors: | 590 | * The end result is that struct zonelists come in two flavors: |
587 | * 1) The full, fixed length version, shown below, and | 591 | * 1) The full, fixed length version, shown below, and |
588 | * 2) The custom zonelists for MPOL_BIND. | 592 | * 2) The custom zonelists for MPOL_BIND. |
589 | * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache. | 593 | * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache. |
590 | * | 594 | * |
591 | * Even though there may be multiple CPU cores on a node modifying | 595 | * Even though there may be multiple CPU cores on a node modifying |
592 | * fullzones or last_full_zap in the same zonelist_cache at the same | 596 | * fullzones or last_full_zap in the same zonelist_cache at the same |
593 | * time, we don't lock it. This is just hint data - if it is wrong now | 597 | * time, we don't lock it. This is just hint data - if it is wrong now |
594 | * and then, the allocator will still function, perhaps a bit slower. | 598 | * and then, the allocator will still function, perhaps a bit slower. |
595 | */ | 599 | */ |
596 | 600 | ||
597 | 601 | ||
598 | struct zonelist_cache { | 602 | struct zonelist_cache { |
599 | unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */ | 603 | unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */ |
600 | DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */ | 604 | DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */ |
601 | unsigned long last_full_zap; /* when last zap'd (jiffies) */ | 605 | unsigned long last_full_zap; /* when last zap'd (jiffies) */ |
602 | }; | 606 | }; |
603 | #else | 607 | #else |
604 | #define MAX_ZONELISTS 1 | 608 | #define MAX_ZONELISTS 1 |
605 | struct zonelist_cache; | 609 | struct zonelist_cache; |
606 | #endif | 610 | #endif |
607 | 611 | ||
608 | /* | 612 | /* |
609 | * This struct contains information about a zone in a zonelist. It is stored | 613 | * This struct contains information about a zone in a zonelist. It is stored |
610 | * here to avoid dereferences into large structures and lookups of tables | 614 | * here to avoid dereferences into large structures and lookups of tables |
611 | */ | 615 | */ |
612 | struct zoneref { | 616 | struct zoneref { |
613 | struct zone *zone; /* Pointer to actual zone */ | 617 | struct zone *zone; /* Pointer to actual zone */ |
614 | int zone_idx; /* zone_idx(zoneref->zone) */ | 618 | int zone_idx; /* zone_idx(zoneref->zone) */ |
615 | }; | 619 | }; |
616 | 620 | ||
617 | /* | 621 | /* |
618 | * One allocation request operates on a zonelist. A zonelist | 622 | * One allocation request operates on a zonelist. A zonelist |
619 | * is a list of zones, the first one is the 'goal' of the | 623 | * is a list of zones, the first one is the 'goal' of the |
620 | * allocation, the other zones are fallback zones, in decreasing | 624 | * allocation, the other zones are fallback zones, in decreasing |
621 | * priority. | 625 | * priority. |
622 | * | 626 | * |
623 | * If zlcache_ptr is not NULL, then it is just the address of zlcache, | 627 | * If zlcache_ptr is not NULL, then it is just the address of zlcache, |
624 | * as explained above. If zlcache_ptr is NULL, there is no zlcache. | 628 | * as explained above. If zlcache_ptr is NULL, there is no zlcache. |
625 | * * | 629 | * * |
626 | * To speed the reading of the zonelist, the zonerefs contain the zone index | 630 | * To speed the reading of the zonelist, the zonerefs contain the zone index |
627 | * of the entry being read. Helper functions to access information given | 631 | * of the entry being read. Helper functions to access information given |
628 | * a struct zoneref are | 632 | * a struct zoneref are |
629 | * | 633 | * |
630 | * zonelist_zone() - Return the struct zone * for an entry in _zonerefs | 634 | * zonelist_zone() - Return the struct zone * for an entry in _zonerefs |
631 | * zonelist_zone_idx() - Return the index of the zone for an entry | 635 | * zonelist_zone_idx() - Return the index of the zone for an entry |
632 | * zonelist_node_idx() - Return the index of the node for an entry | 636 | * zonelist_node_idx() - Return the index of the node for an entry |
633 | */ | 637 | */ |
634 | struct zonelist { | 638 | struct zonelist { |
635 | struct zonelist_cache *zlcache_ptr; // NULL or &zlcache | 639 | struct zonelist_cache *zlcache_ptr; // NULL or &zlcache |
636 | struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; | 640 | struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; |
637 | #ifdef CONFIG_NUMA | 641 | #ifdef CONFIG_NUMA |
638 | struct zonelist_cache zlcache; // optional ... | 642 | struct zonelist_cache zlcache; // optional ... |
639 | #endif | 643 | #endif |
640 | }; | 644 | }; |
641 | 645 | ||
642 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 646 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
643 | struct node_active_region { | 647 | struct node_active_region { |
644 | unsigned long start_pfn; | 648 | unsigned long start_pfn; |
645 | unsigned long end_pfn; | 649 | unsigned long end_pfn; |
646 | int nid; | 650 | int nid; |
647 | }; | 651 | }; |
648 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 652 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
649 | 653 | ||
650 | #ifndef CONFIG_DISCONTIGMEM | 654 | #ifndef CONFIG_DISCONTIGMEM |
651 | /* The array of struct pages - for discontigmem use pgdat->lmem_map */ | 655 | /* The array of struct pages - for discontigmem use pgdat->lmem_map */ |
652 | extern struct page *mem_map; | 656 | extern struct page *mem_map; |
653 | #endif | 657 | #endif |
654 | 658 | ||
655 | /* | 659 | /* |
656 | * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM | 660 | * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM |
657 | * (mostly NUMA machines?) to denote a higher-level memory zone than the | 661 | * (mostly NUMA machines?) to denote a higher-level memory zone than the |
658 | * zone denotes. | 662 | * zone denotes. |
659 | * | 663 | * |
660 | * On NUMA machines, each NUMA node would have a pg_data_t to describe | 664 | * On NUMA machines, each NUMA node would have a pg_data_t to describe |
661 | * it's memory layout. | 665 | * it's memory layout. |
662 | * | 666 | * |
663 | * Memory statistics and page replacement data structures are maintained on a | 667 | * Memory statistics and page replacement data structures are maintained on a |
664 | * per-zone basis. | 668 | * per-zone basis. |
665 | */ | 669 | */ |
666 | struct bootmem_data; | 670 | struct bootmem_data; |
667 | typedef struct pglist_data { | 671 | typedef struct pglist_data { |
668 | struct zone node_zones[MAX_NR_ZONES]; | 672 | struct zone node_zones[MAX_NR_ZONES]; |
669 | struct zonelist node_zonelists[MAX_ZONELISTS]; | 673 | struct zonelist node_zonelists[MAX_ZONELISTS]; |
670 | int nr_zones; | 674 | int nr_zones; |
671 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ | 675 | #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ |
672 | struct page *node_mem_map; | 676 | struct page *node_mem_map; |
673 | #ifdef CONFIG_MEMCG | 677 | #ifdef CONFIG_MEMCG |
674 | struct page_cgroup *node_page_cgroup; | 678 | struct page_cgroup *node_page_cgroup; |
675 | #endif | 679 | #endif |
676 | #endif | 680 | #endif |
677 | #ifndef CONFIG_NO_BOOTMEM | 681 | #ifndef CONFIG_NO_BOOTMEM |
678 | struct bootmem_data *bdata; | 682 | struct bootmem_data *bdata; |
679 | #endif | 683 | #endif |
680 | #ifdef CONFIG_MEMORY_HOTPLUG | 684 | #ifdef CONFIG_MEMORY_HOTPLUG |
681 | /* | 685 | /* |
682 | * Must be held any time you expect node_start_pfn, node_present_pages | 686 | * Must be held any time you expect node_start_pfn, node_present_pages |
683 | * or node_spanned_pages stay constant. Holding this will also | 687 | * or node_spanned_pages stay constant. Holding this will also |
684 | * guarantee that any pfn_valid() stays that way. | 688 | * guarantee that any pfn_valid() stays that way. |
685 | * | 689 | * |
686 | * Nests above zone->lock and zone->size_seqlock. | 690 | * Nests above zone->lock and zone->size_seqlock. |
687 | */ | 691 | */ |
688 | spinlock_t node_size_lock; | 692 | spinlock_t node_size_lock; |
689 | #endif | 693 | #endif |
690 | unsigned long node_start_pfn; | 694 | unsigned long node_start_pfn; |
691 | unsigned long node_present_pages; /* total number of physical pages */ | 695 | unsigned long node_present_pages; /* total number of physical pages */ |
692 | unsigned long node_spanned_pages; /* total size of physical page | 696 | unsigned long node_spanned_pages; /* total size of physical page |
693 | range, including holes */ | 697 | range, including holes */ |
694 | int node_id; | 698 | int node_id; |
695 | wait_queue_head_t kswapd_wait; | 699 | wait_queue_head_t kswapd_wait; |
696 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ | 700 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ |
697 | int kswapd_max_order; | 701 | int kswapd_max_order; |
698 | enum zone_type classzone_idx; | 702 | enum zone_type classzone_idx; |
699 | } pg_data_t; | 703 | } pg_data_t; |
700 | 704 | ||
701 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) | 705 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) |
702 | #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) | 706 | #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) |
703 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 707 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
704 | #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) | 708 | #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) |
705 | #else | 709 | #else |
706 | #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) | 710 | #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) |
707 | #endif | 711 | #endif |
708 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) | 712 | #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) |
709 | 713 | ||
710 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) | 714 | #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) |
711 | 715 | ||
712 | #define node_end_pfn(nid) ({\ | 716 | #define node_end_pfn(nid) ({\ |
713 | pg_data_t *__pgdat = NODE_DATA(nid);\ | 717 | pg_data_t *__pgdat = NODE_DATA(nid);\ |
714 | __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ | 718 | __pgdat->node_start_pfn + __pgdat->node_spanned_pages;\ |
715 | }) | 719 | }) |
716 | 720 | ||
717 | #include <linux/memory_hotplug.h> | 721 | #include <linux/memory_hotplug.h> |
718 | 722 | ||
719 | extern struct mutex zonelists_mutex; | 723 | extern struct mutex zonelists_mutex; |
720 | void build_all_zonelists(void *data); | 724 | void build_all_zonelists(void *data); |
721 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); | 725 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); |
722 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 726 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
723 | int classzone_idx, int alloc_flags); | 727 | int classzone_idx, int alloc_flags); |
724 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 728 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, |
725 | int classzone_idx, int alloc_flags); | 729 | int classzone_idx, int alloc_flags); |
726 | enum memmap_context { | 730 | enum memmap_context { |
727 | MEMMAP_EARLY, | 731 | MEMMAP_EARLY, |
728 | MEMMAP_HOTPLUG, | 732 | MEMMAP_HOTPLUG, |
729 | }; | 733 | }; |
730 | extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, | 734 | extern int init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, |
731 | unsigned long size, | 735 | unsigned long size, |
732 | enum memmap_context context); | 736 | enum memmap_context context); |
733 | 737 | ||
734 | extern void lruvec_init(struct lruvec *lruvec, struct zone *zone); | 738 | extern void lruvec_init(struct lruvec *lruvec, struct zone *zone); |
735 | 739 | ||
736 | static inline struct zone *lruvec_zone(struct lruvec *lruvec) | 740 | static inline struct zone *lruvec_zone(struct lruvec *lruvec) |
737 | { | 741 | { |
738 | #ifdef CONFIG_MEMCG | 742 | #ifdef CONFIG_MEMCG |
739 | return lruvec->zone; | 743 | return lruvec->zone; |
740 | #else | 744 | #else |
741 | return container_of(lruvec, struct zone, lruvec); | 745 | return container_of(lruvec, struct zone, lruvec); |
742 | #endif | 746 | #endif |
743 | } | 747 | } |
744 | 748 | ||
745 | #ifdef CONFIG_HAVE_MEMORY_PRESENT | 749 | #ifdef CONFIG_HAVE_MEMORY_PRESENT |
746 | void memory_present(int nid, unsigned long start, unsigned long end); | 750 | void memory_present(int nid, unsigned long start, unsigned long end); |
747 | #else | 751 | #else |
748 | static inline void memory_present(int nid, unsigned long start, unsigned long end) {} | 752 | static inline void memory_present(int nid, unsigned long start, unsigned long end) {} |
749 | #endif | 753 | #endif |
750 | 754 | ||
751 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 755 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
752 | int local_memory_node(int node_id); | 756 | int local_memory_node(int node_id); |
753 | #else | 757 | #else |
754 | static inline int local_memory_node(int node_id) { return node_id; }; | 758 | static inline int local_memory_node(int node_id) { return node_id; }; |
755 | #endif | 759 | #endif |
756 | 760 | ||
757 | #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE | 761 | #ifdef CONFIG_NEED_NODE_MEMMAP_SIZE |
758 | unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); | 762 | unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); |
759 | #endif | 763 | #endif |
760 | 764 | ||
761 | /* | 765 | /* |
762 | * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. | 766 | * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. |
763 | */ | 767 | */ |
764 | #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) | 768 | #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) |
765 | 769 | ||
766 | static inline int populated_zone(struct zone *zone) | 770 | static inline int populated_zone(struct zone *zone) |
767 | { | 771 | { |
768 | return (!!zone->present_pages); | 772 | return (!!zone->present_pages); |
769 | } | 773 | } |
770 | 774 | ||
771 | extern int movable_zone; | 775 | extern int movable_zone; |
772 | 776 | ||
773 | static inline int zone_movable_is_highmem(void) | 777 | static inline int zone_movable_is_highmem(void) |
774 | { | 778 | { |
775 | #if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE) | 779 | #if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE) |
776 | return movable_zone == ZONE_HIGHMEM; | 780 | return movable_zone == ZONE_HIGHMEM; |
777 | #else | 781 | #else |
778 | return 0; | 782 | return 0; |
779 | #endif | 783 | #endif |
780 | } | 784 | } |
781 | 785 | ||
782 | static inline int is_highmem_idx(enum zone_type idx) | 786 | static inline int is_highmem_idx(enum zone_type idx) |
783 | { | 787 | { |
784 | #ifdef CONFIG_HIGHMEM | 788 | #ifdef CONFIG_HIGHMEM |
785 | return (idx == ZONE_HIGHMEM || | 789 | return (idx == ZONE_HIGHMEM || |
786 | (idx == ZONE_MOVABLE && zone_movable_is_highmem())); | 790 | (idx == ZONE_MOVABLE && zone_movable_is_highmem())); |
787 | #else | 791 | #else |
788 | return 0; | 792 | return 0; |
789 | #endif | 793 | #endif |
790 | } | 794 | } |
791 | 795 | ||
792 | static inline int is_normal_idx(enum zone_type idx) | 796 | static inline int is_normal_idx(enum zone_type idx) |
793 | { | 797 | { |
794 | return (idx == ZONE_NORMAL); | 798 | return (idx == ZONE_NORMAL); |
795 | } | 799 | } |
796 | 800 | ||
797 | /** | 801 | /** |
798 | * is_highmem - helper function to quickly check if a struct zone is a | 802 | * is_highmem - helper function to quickly check if a struct zone is a |
799 | * highmem zone or not. This is an attempt to keep references | 803 | * highmem zone or not. This is an attempt to keep references |
800 | * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. | 804 | * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. |
801 | * @zone - pointer to struct zone variable | 805 | * @zone - pointer to struct zone variable |
802 | */ | 806 | */ |
803 | static inline int is_highmem(struct zone *zone) | 807 | static inline int is_highmem(struct zone *zone) |
804 | { | 808 | { |
805 | #ifdef CONFIG_HIGHMEM | 809 | #ifdef CONFIG_HIGHMEM |
806 | int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; | 810 | int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones; |
807 | return zone_off == ZONE_HIGHMEM * sizeof(*zone) || | 811 | return zone_off == ZONE_HIGHMEM * sizeof(*zone) || |
808 | (zone_off == ZONE_MOVABLE * sizeof(*zone) && | 812 | (zone_off == ZONE_MOVABLE * sizeof(*zone) && |
809 | zone_movable_is_highmem()); | 813 | zone_movable_is_highmem()); |
810 | #else | 814 | #else |
811 | return 0; | 815 | return 0; |
812 | #endif | 816 | #endif |
813 | } | 817 | } |
814 | 818 | ||
815 | static inline int is_normal(struct zone *zone) | 819 | static inline int is_normal(struct zone *zone) |
816 | { | 820 | { |
817 | return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; | 821 | return zone == zone->zone_pgdat->node_zones + ZONE_NORMAL; |
818 | } | 822 | } |
819 | 823 | ||
820 | static inline int is_dma32(struct zone *zone) | 824 | static inline int is_dma32(struct zone *zone) |
821 | { | 825 | { |
822 | #ifdef CONFIG_ZONE_DMA32 | 826 | #ifdef CONFIG_ZONE_DMA32 |
823 | return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; | 827 | return zone == zone->zone_pgdat->node_zones + ZONE_DMA32; |
824 | #else | 828 | #else |
825 | return 0; | 829 | return 0; |
826 | #endif | 830 | #endif |
827 | } | 831 | } |
828 | 832 | ||
829 | static inline int is_dma(struct zone *zone) | 833 | static inline int is_dma(struct zone *zone) |
830 | { | 834 | { |
831 | #ifdef CONFIG_ZONE_DMA | 835 | #ifdef CONFIG_ZONE_DMA |
832 | return zone == zone->zone_pgdat->node_zones + ZONE_DMA; | 836 | return zone == zone->zone_pgdat->node_zones + ZONE_DMA; |
833 | #else | 837 | #else |
834 | return 0; | 838 | return 0; |
835 | #endif | 839 | #endif |
836 | } | 840 | } |
837 | 841 | ||
838 | /* These two functions are used to setup the per zone pages min values */ | 842 | /* These two functions are used to setup the per zone pages min values */ |
839 | struct ctl_table; | 843 | struct ctl_table; |
840 | int min_free_kbytes_sysctl_handler(struct ctl_table *, int, | 844 | int min_free_kbytes_sysctl_handler(struct ctl_table *, int, |
841 | void __user *, size_t *, loff_t *); | 845 | void __user *, size_t *, loff_t *); |
842 | extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; | 846 | extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; |
843 | int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, | 847 | int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, |
844 | void __user *, size_t *, loff_t *); | 848 | void __user *, size_t *, loff_t *); |
845 | int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, | 849 | int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, |
846 | void __user *, size_t *, loff_t *); | 850 | void __user *, size_t *, loff_t *); |
847 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, | 851 | int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, |
848 | void __user *, size_t *, loff_t *); | 852 | void __user *, size_t *, loff_t *); |
849 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, | 853 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, |
850 | void __user *, size_t *, loff_t *); | 854 | void __user *, size_t *, loff_t *); |
851 | 855 | ||
852 | extern int numa_zonelist_order_handler(struct ctl_table *, int, | 856 | extern int numa_zonelist_order_handler(struct ctl_table *, int, |
853 | void __user *, size_t *, loff_t *); | 857 | void __user *, size_t *, loff_t *); |
854 | extern char numa_zonelist_order[]; | 858 | extern char numa_zonelist_order[]; |
855 | #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ | 859 | #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ |
856 | 860 | ||
857 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 861 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
858 | 862 | ||
859 | extern struct pglist_data contig_page_data; | 863 | extern struct pglist_data contig_page_data; |
860 | #define NODE_DATA(nid) (&contig_page_data) | 864 | #define NODE_DATA(nid) (&contig_page_data) |
861 | #define NODE_MEM_MAP(nid) mem_map | 865 | #define NODE_MEM_MAP(nid) mem_map |
862 | 866 | ||
863 | #else /* CONFIG_NEED_MULTIPLE_NODES */ | 867 | #else /* CONFIG_NEED_MULTIPLE_NODES */ |
864 | 868 | ||
865 | #include <asm/mmzone.h> | 869 | #include <asm/mmzone.h> |
866 | 870 | ||
867 | #endif /* !CONFIG_NEED_MULTIPLE_NODES */ | 871 | #endif /* !CONFIG_NEED_MULTIPLE_NODES */ |
868 | 872 | ||
869 | extern struct pglist_data *first_online_pgdat(void); | 873 | extern struct pglist_data *first_online_pgdat(void); |
870 | extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); | 874 | extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); |
871 | extern struct zone *next_zone(struct zone *zone); | 875 | extern struct zone *next_zone(struct zone *zone); |
872 | 876 | ||
873 | /** | 877 | /** |
874 | * for_each_online_pgdat - helper macro to iterate over all online nodes | 878 | * for_each_online_pgdat - helper macro to iterate over all online nodes |
875 | * @pgdat - pointer to a pg_data_t variable | 879 | * @pgdat - pointer to a pg_data_t variable |
876 | */ | 880 | */ |
877 | #define for_each_online_pgdat(pgdat) \ | 881 | #define for_each_online_pgdat(pgdat) \ |
878 | for (pgdat = first_online_pgdat(); \ | 882 | for (pgdat = first_online_pgdat(); \ |
879 | pgdat; \ | 883 | pgdat; \ |
880 | pgdat = next_online_pgdat(pgdat)) | 884 | pgdat = next_online_pgdat(pgdat)) |
881 | /** | 885 | /** |
882 | * for_each_zone - helper macro to iterate over all memory zones | 886 | * for_each_zone - helper macro to iterate over all memory zones |
883 | * @zone - pointer to struct zone variable | 887 | * @zone - pointer to struct zone variable |
884 | * | 888 | * |
885 | * The user only needs to declare the zone variable; for_each_zone | 889 | * The user only needs to declare the zone variable; for_each_zone |
886 | * fills it in. | 890 | * fills it in. |
887 | */ | 891 | */ |
888 | #define for_each_zone(zone) \ | 892 | #define for_each_zone(zone) \ |
889 | for (zone = (first_online_pgdat())->node_zones; \ | 893 | for (zone = (first_online_pgdat())->node_zones; \ |
890 | zone; \ | 894 | zone; \ |
891 | zone = next_zone(zone)) | 895 | zone = next_zone(zone)) |
892 | 896 | ||
893 | #define for_each_populated_zone(zone) \ | 897 | #define for_each_populated_zone(zone) \ |
894 | for (zone = (first_online_pgdat())->node_zones; \ | 898 | for (zone = (first_online_pgdat())->node_zones; \ |
895 | zone; \ | 899 | zone; \ |
896 | zone = next_zone(zone)) \ | 900 | zone = next_zone(zone)) \ |
897 | if (!populated_zone(zone)) \ | 901 | if (!populated_zone(zone)) \ |
898 | ; /* do nothing */ \ | 902 | ; /* do nothing */ \ |
899 | else | 903 | else |
900 | 904 | ||
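As a usage sketch only (not part of this header or of this commit), a walker that restricts itself to populated zones could look like the snippet below; the function name and the printk message are illustrative assumptions.

/* Hypothetical example: report present pages for each populated zone. */
#include <linux/mmzone.h>
#include <linux/printk.h>

static void report_populated_zones(void)
{
	struct zone *zone;

	/* for_each_populated_zone() skips zones with no present pages. */
	for_each_populated_zone(zone)
		printk(KERN_INFO "%s: %lu pages present\n",
		       zone->name, zone->present_pages);
}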
901 | static inline struct zone *zonelist_zone(struct zoneref *zoneref) | 905 | static inline struct zone *zonelist_zone(struct zoneref *zoneref) |
902 | { | 906 | { |
903 | return zoneref->zone; | 907 | return zoneref->zone; |
904 | } | 908 | } |
905 | 909 | ||
906 | static inline int zonelist_zone_idx(struct zoneref *zoneref) | 910 | static inline int zonelist_zone_idx(struct zoneref *zoneref) |
907 | { | 911 | { |
908 | return zoneref->zone_idx; | 912 | return zoneref->zone_idx; |
909 | } | 913 | } |
910 | 914 | ||
911 | static inline int zonelist_node_idx(struct zoneref *zoneref) | 915 | static inline int zonelist_node_idx(struct zoneref *zoneref) |
912 | { | 916 | { |
913 | #ifdef CONFIG_NUMA | 917 | #ifdef CONFIG_NUMA |
914 | /* zone_to_nid not available in this context */ | 918 | /* zone_to_nid not available in this context */ |
915 | return zoneref->zone->node; | 919 | return zoneref->zone->node; |
916 | #else | 920 | #else |
917 | return 0; | 921 | return 0; |
918 | #endif /* CONFIG_NUMA */ | 922 | #endif /* CONFIG_NUMA */ |
919 | } | 923 | } |
920 | 924 | ||
921 | /** | 925 | /** |
922 | * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point | 926 | * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point |
923 | * @z - The cursor used as a starting point for the search | 927 | * @z - The cursor used as a starting point for the search |
924 | * @highest_zoneidx - The zone index of the highest zone to return | 928 | * @highest_zoneidx - The zone index of the highest zone to return |
925 | * @nodes - An optional nodemask to filter the zonelist with | 929 | * @nodes - An optional nodemask to filter the zonelist with |
926 | * @zone - The first suitable zone found is returned via this parameter | 930 | * @zone - The first suitable zone found is returned via this parameter |
927 | * | 931 | * |
928 | * This function returns the next zone at or below a given zone index that is | 932 | * This function returns the next zone at or below a given zone index that is |
929 | * within the allowed nodemask using a cursor as the starting point for the | 933 | * within the allowed nodemask using a cursor as the starting point for the |
930 | * search. The zoneref returned is a cursor that represents the current zone | 934 | * search. The zoneref returned is a cursor that represents the current zone |
931 | * being examined. It should be advanced by one before calling | 935 | * being examined. It should be advanced by one before calling |
932 | * next_zones_zonelist again. | 936 | * next_zones_zonelist again. |
933 | */ | 937 | */ |
934 | struct zoneref *next_zones_zonelist(struct zoneref *z, | 938 | struct zoneref *next_zones_zonelist(struct zoneref *z, |
935 | enum zone_type highest_zoneidx, | 939 | enum zone_type highest_zoneidx, |
936 | nodemask_t *nodes, | 940 | nodemask_t *nodes, |
937 | struct zone **zone); | 941 | struct zone **zone); |
938 | 942 | ||
939 | /** | 943 | /** |
940 | * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist | 944 | * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist |
941 | * @zonelist - The zonelist to search for a suitable zone | 945 | * @zonelist - The zonelist to search for a suitable zone |
942 | * @highest_zoneidx - The zone index of the highest zone to return | 946 | * @highest_zoneidx - The zone index of the highest zone to return |
943 | * @nodes - An optional nodemask to filter the zonelist with | 947 | * @nodes - An optional nodemask to filter the zonelist with |
944 | * @zone - The first suitable zone found is returned via this parameter | 948 | * @zone - The first suitable zone found is returned via this parameter |
945 | * | 949 | * |
946 | * This function returns the first zone at or below a given zone index that is | 950 | * This function returns the first zone at or below a given zone index that is |
947 | * within the allowed nodemask. The zoneref returned is a cursor that can be | 951 | * within the allowed nodemask. The zoneref returned is a cursor that can be |
948 | * used to iterate the zonelist with next_zones_zonelist by advancing it by | 952 | * used to iterate the zonelist with next_zones_zonelist by advancing it by |
949 | * one before calling. | 953 | * one before calling. |
950 | */ | 954 | */ |
951 | static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, | 955 | static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, |
952 | enum zone_type highest_zoneidx, | 956 | enum zone_type highest_zoneidx, |
953 | nodemask_t *nodes, | 957 | nodemask_t *nodes, |
954 | struct zone **zone) | 958 | struct zone **zone) |
955 | { | 959 | { |
956 | return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, | 960 | return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes, |
957 | zone); | 961 | zone); |
958 | } | 962 | } |
959 | 963 | ||
960 | /** | 964 | /** |
961 | * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask | 965 | * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask |
962 | * @zone - The current zone in the iterator | 966 | * @zone - The current zone in the iterator |
963 | * @z - The current pointer within zonelist->zones being iterated | 967 | * @z - The current pointer within zonelist->zones being iterated |
964 | * @zlist - The zonelist being iterated | 968 | * @zlist - The zonelist being iterated |
965 | * @highidx - The zone index of the highest zone to return | 969 | * @highidx - The zone index of the highest zone to return |
966 | * @nodemask - Nodemask allowed by the allocator | 970 | * @nodemask - Nodemask allowed by the allocator |
967 | * | 971 | * |
968 | * This iterator iterates through all zones at or below a given zone index and | 972 | * This iterator iterates through all zones at or below a given zone index and |
969 | * within a given nodemask. | 973 | * within a given nodemask. |
970 | */ | 974 | */ |
971 | #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ | 975 | #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ |
972 | for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ | 976 | for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ |
973 | zone; \ | 977 | zone; \ |
974 | z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ | 978 | z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \ |
975 | 979 | ||
976 | /** | 980 | /** |
977 | * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index | 981 | * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index |
978 | * @zone - The current zone in the iterator | 982 | * @zone - The current zone in the iterator |
979 | * @z - The current pointer within zonelist->zones being iterated | 983 | * @z - The current pointer within zonelist->zones being iterated |
980 | * @zlist - The zonelist being iterated | 984 | * @zlist - The zonelist being iterated |
981 | * @highidx - The zone index of the highest zone to return | 985 | * @highidx - The zone index of the highest zone to return |
982 | * | 986 | * |
983 | * This iterator iterates through all zones at or below a given zone index. | 987 | * This iterator iterates through all zones at or below a given zone index. |
984 | */ | 988 | */ |
985 | #define for_each_zone_zonelist(zone, z, zlist, highidx) \ | 989 | #define for_each_zone_zonelist(zone, z, zlist, highidx) \ |
986 | for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) | 990 | for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) |
987 | 991 | ||
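For illustration, a caller might combine this iterator with the zone_watermark_ok() declaration earlier in the header to find the first zone in a zonelist that can satisfy an allocation. The helper below is a hedged sketch, not code from this commit; the alloc_flags value of 0 is an assumption.

/* Hypothetical sketch: first zone at or below high_zoneidx meeting a watermark. */
#include <linux/mmzone.h>

static struct zone *first_suitable_zone(struct zonelist *zonelist,
					enum zone_type high_zoneidx,
					int order, unsigned long mark)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
		if (zone_watermark_ok(zone, order, mark,
				      zonelist_zone_idx(z), 0))
			return zone;
	}
	return NULL;
}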
988 | #ifdef CONFIG_SPARSEMEM | 992 | #ifdef CONFIG_SPARSEMEM |
989 | #include <asm/sparsemem.h> | 993 | #include <asm/sparsemem.h> |
990 | #endif | 994 | #endif |
991 | 995 | ||
992 | #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ | 996 | #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ |
993 | !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) | 997 | !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) |
994 | static inline unsigned long early_pfn_to_nid(unsigned long pfn) | 998 | static inline unsigned long early_pfn_to_nid(unsigned long pfn) |
995 | { | 999 | { |
996 | return 0; | 1000 | return 0; |
997 | } | 1001 | } |
998 | #endif | 1002 | #endif |
999 | 1003 | ||
1000 | #ifdef CONFIG_FLATMEM | 1004 | #ifdef CONFIG_FLATMEM |
1001 | #define pfn_to_nid(pfn) (0) | 1005 | #define pfn_to_nid(pfn) (0) |
1002 | #endif | 1006 | #endif |
1003 | 1007 | ||
1004 | #ifdef CONFIG_SPARSEMEM | 1008 | #ifdef CONFIG_SPARSEMEM |
1005 | 1009 | ||
1006 | /* | 1010 | /* |
1007 | * SECTION_SHIFT #bits space required to store a section # | 1011 | * SECTION_SHIFT #bits space required to store a section # |
1008 | * | 1012 | * |
1009 | * PA_SECTION_SHIFT physical address to/from section number | 1013 | * PA_SECTION_SHIFT physical address to/from section number |
1010 | * PFN_SECTION_SHIFT pfn to/from section number | 1014 | * PFN_SECTION_SHIFT pfn to/from section number |
1011 | */ | 1015 | */ |
1012 | #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) | 1016 | #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS) |
1013 | 1017 | ||
1014 | #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) | 1018 | #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) |
1015 | #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) | 1019 | #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) |
1016 | 1020 | ||
1017 | #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) | 1021 | #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) |
1018 | 1022 | ||
1019 | #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) | 1023 | #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) |
1020 | #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) | 1024 | #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) |
1021 | 1025 | ||
1022 | #define SECTION_BLOCKFLAGS_BITS \ | 1026 | #define SECTION_BLOCKFLAGS_BITS \ |
1023 | ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) | 1027 | ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) |
1024 | 1028 | ||
1025 | #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS | 1029 | #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS |
1026 | #error Allocator MAX_ORDER exceeds SECTION_SIZE | 1030 | #error Allocator MAX_ORDER exceeds SECTION_SIZE |
1027 | #endif | 1031 | #endif |
1028 | 1032 | ||
1029 | #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) | 1033 | #define pfn_to_section_nr(pfn) ((pfn) >> PFN_SECTION_SHIFT) |
1030 | #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) | 1034 | #define section_nr_to_pfn(sec) ((sec) << PFN_SECTION_SHIFT) |
1031 | 1035 | ||
1032 | #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) | 1036 | #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) |
1033 | #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) | 1037 | #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) |
1034 | 1038 | ||
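As a concrete illustration of the arithmetic above, assuming SECTION_SIZE_BITS of 27 and PAGE_SHIFT of 12 (a common x86_64 configuration; other architectures differ):

/*
 * Worked example, not part of the header:
 *
 *   PFN_SECTION_SHIFT   = 27 - 12   = 15
 *   PAGES_PER_SECTION   = 1UL << 15 = 32768
 *   pfn_to_section_nr(0x12345)  = 0x12345 >> 15       = 2
 *   SECTION_ALIGN_DOWN(0x12345) = 0x12345 & ~0x7fffUL = 0x10000
 *   SECTION_ALIGN_UP(0x12345)   = 0x18000
 */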
1035 | struct page; | 1039 | struct page; |
1036 | struct page_cgroup; | 1040 | struct page_cgroup; |
1037 | struct mem_section { | 1041 | struct mem_section { |
1038 | /* | 1042 | /* |
1039 | * This is, logically, a pointer to an array of struct | 1043 | * This is, logically, a pointer to an array of struct |
1040 | * pages. However, it is stored with some other magic. | 1044 | * pages. However, it is stored with some other magic. |
1041 | * (see sparse.c::sparse_init_one_section()) | 1045 | * (see sparse.c::sparse_init_one_section()) |
1042 | * | 1046 | * |
1043 | * Additionally during early boot we encode node id of | 1047 | * Additionally during early boot we encode node id of |
1044 | * the location of the section here to guide allocation. | 1048 | * the location of the section here to guide allocation. |
1045 | * (see sparse.c::memory_present()) | 1049 | * (see sparse.c::memory_present()) |
1046 | * | 1050 | * |
1047 | * Making it a UL at least makes someone do a cast | 1051 | * Making it a UL at least makes someone do a cast |
1048 | * before using it wrong. | 1052 | * before using it wrong. |
1049 | */ | 1053 | */ |
1050 | unsigned long section_mem_map; | 1054 | unsigned long section_mem_map; |
1051 | 1055 | ||
1052 | /* See declaration of similar field in struct zone */ | 1056 | /* See declaration of similar field in struct zone */ |
1053 | unsigned long *pageblock_flags; | 1057 | unsigned long *pageblock_flags; |
1054 | #ifdef CONFIG_MEMCG | 1058 | #ifdef CONFIG_MEMCG |
1055 | /* | 1059 | /* |
1056 | * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use | 1060 | * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use |
1057 | * section. (see memcontrol.h/page_cgroup.h about this.) | 1061 | * section. (see memcontrol.h/page_cgroup.h about this.) |
1058 | */ | 1062 | */ |
1059 | struct page_cgroup *page_cgroup; | 1063 | struct page_cgroup *page_cgroup; |
1060 | unsigned long pad; | 1064 | unsigned long pad; |
1061 | #endif | 1065 | #endif |
1062 | }; | 1066 | }; |
1063 | 1067 | ||
1064 | #ifdef CONFIG_SPARSEMEM_EXTREME | 1068 | #ifdef CONFIG_SPARSEMEM_EXTREME |
1065 | #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) | 1069 | #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) |
1066 | #else | 1070 | #else |
1067 | #define SECTIONS_PER_ROOT 1 | 1071 | #define SECTIONS_PER_ROOT 1 |
1068 | #endif | 1072 | #endif |
1069 | 1073 | ||
1070 | #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) | 1074 | #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) |
1071 | #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) | 1075 | #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) |
1072 | #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) | 1076 | #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) |
1073 | 1077 | ||
1074 | #ifdef CONFIG_SPARSEMEM_EXTREME | 1078 | #ifdef CONFIG_SPARSEMEM_EXTREME |
1075 | extern struct mem_section *mem_section[NR_SECTION_ROOTS]; | 1079 | extern struct mem_section *mem_section[NR_SECTION_ROOTS]; |
1076 | #else | 1080 | #else |
1077 | extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; | 1081 | extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; |
1078 | #endif | 1082 | #endif |
1079 | 1083 | ||
1080 | static inline struct mem_section *__nr_to_section(unsigned long nr) | 1084 | static inline struct mem_section *__nr_to_section(unsigned long nr) |
1081 | { | 1085 | { |
1082 | if (!mem_section[SECTION_NR_TO_ROOT(nr)]) | 1086 | if (!mem_section[SECTION_NR_TO_ROOT(nr)]) |
1083 | return NULL; | 1087 | return NULL; |
1084 | return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; | 1088 | return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; |
1085 | } | 1089 | } |
1086 | extern int __section_nr(struct mem_section* ms); | 1090 | extern int __section_nr(struct mem_section* ms); |
1087 | extern unsigned long usemap_size(void); | 1091 | extern unsigned long usemap_size(void); |
1088 | 1092 | ||
1089 | /* | 1093 | /* |
1090 | * We use the lower bits of the mem_map pointer to store | 1094 | * We use the lower bits of the mem_map pointer to store |
1091 | * a little bit of information. There should be at least | 1095 | * a little bit of information. There should be at least |
1092 | * 3 bits here due to 32-bit alignment. | 1096 | * 3 bits here due to 32-bit alignment. |
1093 | */ | 1097 | */ |
1094 | #define SECTION_MARKED_PRESENT (1UL<<0) | 1098 | #define SECTION_MARKED_PRESENT (1UL<<0) |
1095 | #define SECTION_HAS_MEM_MAP (1UL<<1) | 1099 | #define SECTION_HAS_MEM_MAP (1UL<<1) |
1096 | #define SECTION_MAP_LAST_BIT (1UL<<2) | 1100 | #define SECTION_MAP_LAST_BIT (1UL<<2) |
1097 | #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) | 1101 | #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) |
1098 | #define SECTION_NID_SHIFT 2 | 1102 | #define SECTION_NID_SHIFT 2 |
1099 | 1103 | ||
1100 | static inline struct page *__section_mem_map_addr(struct mem_section *section) | 1104 | static inline struct page *__section_mem_map_addr(struct mem_section *section) |
1101 | { | 1105 | { |
1102 | unsigned long map = section->section_mem_map; | 1106 | unsigned long map = section->section_mem_map; |
1103 | map &= SECTION_MAP_MASK; | 1107 | map &= SECTION_MAP_MASK; |
1104 | return (struct page *)map; | 1108 | return (struct page *)map; |
1105 | } | 1109 | } |
1106 | 1110 | ||
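The struct comment above also notes that, during early boot, the node id is stashed in the same word. A hedged sketch of that encoding is shown below with illustrative names; the real helpers live in mm/sparse.c.

/* Hypothetical sketch of the early-boot node id encoding. */
static inline unsigned long example_encode_early_nid(int nid)
{
	return (unsigned long)nid << SECTION_NID_SHIFT;
}

static inline int example_early_section_nid(struct mem_section *section)
{
	return section->section_mem_map >> SECTION_NID_SHIFT;
}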
1107 | static inline int present_section(struct mem_section *section) | 1111 | static inline int present_section(struct mem_section *section) |
1108 | { | 1112 | { |
1109 | return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); | 1113 | return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); |
1110 | } | 1114 | } |
1111 | 1115 | ||
1112 | static inline int present_section_nr(unsigned long nr) | 1116 | static inline int present_section_nr(unsigned long nr) |
1113 | { | 1117 | { |
1114 | return present_section(__nr_to_section(nr)); | 1118 | return present_section(__nr_to_section(nr)); |
1115 | } | 1119 | } |
1116 | 1120 | ||
1117 | static inline int valid_section(struct mem_section *section) | 1121 | static inline int valid_section(struct mem_section *section) |
1118 | { | 1122 | { |
1119 | return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); | 1123 | return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); |
1120 | } | 1124 | } |
1121 | 1125 | ||
1122 | static inline int valid_section_nr(unsigned long nr) | 1126 | static inline int valid_section_nr(unsigned long nr) |
1123 | { | 1127 | { |
1124 | return valid_section(__nr_to_section(nr)); | 1128 | return valid_section(__nr_to_section(nr)); |
1125 | } | 1129 | } |
1126 | 1130 | ||
1127 | static inline struct mem_section *__pfn_to_section(unsigned long pfn) | 1131 | static inline struct mem_section *__pfn_to_section(unsigned long pfn) |
1128 | { | 1132 | { |
1129 | return __nr_to_section(pfn_to_section_nr(pfn)); | 1133 | return __nr_to_section(pfn_to_section_nr(pfn)); |
1130 | } | 1134 | } |
1131 | 1135 | ||
1132 | #ifndef CONFIG_HAVE_ARCH_PFN_VALID | 1136 | #ifndef CONFIG_HAVE_ARCH_PFN_VALID |
1133 | static inline int pfn_valid(unsigned long pfn) | 1137 | static inline int pfn_valid(unsigned long pfn) |
1134 | { | 1138 | { |
1135 | if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) | 1139 | if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) |
1136 | return 0; | 1140 | return 0; |
1137 | return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); | 1141 | return valid_section(__nr_to_section(pfn_to_section_nr(pfn))); |
1138 | } | 1142 | } |
1139 | #endif | 1143 | #endif |
1140 | 1144 | ||
1141 | static inline int pfn_present(unsigned long pfn) | 1145 | static inline int pfn_present(unsigned long pfn) |
1142 | { | 1146 | { |
1143 | if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) | 1147 | if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) |
1144 | return 0; | 1148 | return 0; |
1145 | return present_section(__nr_to_section(pfn_to_section_nr(pfn))); | 1149 | return present_section(__nr_to_section(pfn_to_section_nr(pfn))); |
1146 | } | 1150 | } |
1147 | 1151 | ||
1148 | /* | 1152 | /* |
1149 | * These are _only_ used during initialisation, therefore they | 1153 | * These are _only_ used during initialisation, therefore they |
1150 | * can use __initdata ... They could have names to indicate | 1154 | * can use __initdata ... They could have names to indicate |
1151 | * this restriction. | 1155 | * this restriction. |
1152 | */ | 1156 | */ |
1153 | #ifdef CONFIG_NUMA | 1157 | #ifdef CONFIG_NUMA |
1154 | #define pfn_to_nid(pfn) \ | 1158 | #define pfn_to_nid(pfn) \ |
1155 | ({ \ | 1159 | ({ \ |
1156 | unsigned long __pfn_to_nid_pfn = (pfn); \ | 1160 | unsigned long __pfn_to_nid_pfn = (pfn); \ |
1157 | page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ | 1161 | page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ |
1158 | }) | 1162 | }) |
1159 | #else | 1163 | #else |
1160 | #define pfn_to_nid(pfn) (0) | 1164 | #define pfn_to_nid(pfn) (0) |
1161 | #endif | 1165 | #endif |
1162 | 1166 | ||
1163 | #define early_pfn_valid(pfn) pfn_valid(pfn) | 1167 | #define early_pfn_valid(pfn) pfn_valid(pfn) |
1164 | void sparse_init(void); | 1168 | void sparse_init(void); |
1165 | #else | 1169 | #else |
1166 | #define sparse_init() do {} while (0) | 1170 | #define sparse_init() do {} while (0) |
1167 | #define sparse_index_init(_sec, _nid) do {} while (0) | 1171 | #define sparse_index_init(_sec, _nid) do {} while (0) |
1168 | #endif /* CONFIG_SPARSEMEM */ | 1172 | #endif /* CONFIG_SPARSEMEM */ |
1169 | 1173 | ||
1170 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES | 1174 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES |
1171 | bool early_pfn_in_nid(unsigned long pfn, int nid); | 1175 | bool early_pfn_in_nid(unsigned long pfn, int nid); |
1172 | #else | 1176 | #else |
1173 | #define early_pfn_in_nid(pfn, nid) (1) | 1177 | #define early_pfn_in_nid(pfn, nid) (1) |
1174 | #endif | 1178 | #endif |
1175 | 1179 | ||
1176 | #ifndef early_pfn_valid | 1180 | #ifndef early_pfn_valid |
1177 | #define early_pfn_valid(pfn) (1) | 1181 | #define early_pfn_valid(pfn) (1) |
1178 | #endif | 1182 | #endif |
1179 | 1183 | ||
1180 | void memory_present(int nid, unsigned long start, unsigned long end); | 1184 | void memory_present(int nid, unsigned long start, unsigned long end); |
1181 | unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); | 1185 | unsigned long __init node_memmap_size_bytes(int, unsigned long, unsigned long); |
1182 | 1186 | ||
1183 | /* | 1187 | /* |
1184 | * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we | 1188 | * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we |
1185 | * need to check pfn validity within that MAX_ORDER_NR_PAGES block. | 1189 | * need to check pfn validity within that MAX_ORDER_NR_PAGES block. |
1186 | * pfn_valid_within() should be used in this case; we optimise this away | 1190 | * pfn_valid_within() should be used in this case; we optimise this away |
1187 | * when we have no holes within a MAX_ORDER_NR_PAGES block. | 1191 | * when we have no holes within a MAX_ORDER_NR_PAGES block. |
1188 | */ | 1192 | */ |
1189 | #ifdef CONFIG_HOLES_IN_ZONE | 1193 | #ifdef CONFIG_HOLES_IN_ZONE |
1190 | #define pfn_valid_within(pfn) pfn_valid(pfn) | 1194 | #define pfn_valid_within(pfn) pfn_valid(pfn) |
1191 | #else | 1195 | #else |
1192 | #define pfn_valid_within(pfn) (1) | 1196 | #define pfn_valid_within(pfn) (1) |
1193 | #endif | 1197 | #endif |
1194 | 1198 | ||
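A typical pattern, and the one the compaction code below follows, is to validate a whole MAX_ORDER block with pfn_valid() and then rely on the cheap per-pfn pfn_valid_within() check inside it. The counting helper below is a sketch under that assumption, with an illustrative name; it is racy without zone->lock and is for illustration only.

/* Hypothetical sketch: count free (buddy) pages in one MAX_ORDER-aligned block. */
#include <linux/mm.h>
#include <linux/mmzone.h>

static unsigned long count_free_in_block(unsigned long block_start_pfn)
{
	unsigned long pfn, nr_free = 0;

	if (!pfn_valid(block_start_pfn))
		return 0;

	for (pfn = block_start_pfn;
	     pfn < block_start_pfn + MAX_ORDER_NR_PAGES; pfn++) {
		if (!pfn_valid_within(pfn))	/* often compile-time true */
			continue;
		if (PageBuddy(pfn_to_page(pfn)))
			nr_free++;
	}
	return nr_free;
}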
1195 | #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL | 1199 | #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL |
1196 | /* | 1200 | /* |
1197 | * pfn_valid() is meant to be able to tell if a given PFN has valid memmap | 1201 | * pfn_valid() is meant to be able to tell if a given PFN has valid memmap |
1198 | * associated with it or not. In FLATMEM, it is expected that holes always | 1202 | * associated with it or not. In FLATMEM, it is expected that holes always |
1199 | * have valid memmap as long as there are valid PFNs on either side of the hole. | 1203 | * have valid memmap as long as there are valid PFNs on either side of the hole. |
1200 | * In SPARSEMEM, it is assumed that a valid section has a memmap for the | 1204 | * In SPARSEMEM, it is assumed that a valid section has a memmap for the |
1201 | * entire section. | 1205 | * entire section. |
1202 | * | 1206 | * |
1203 | * However, ARM, and maybe other embedded architectures in the future, | 1207 | * However, ARM, and maybe other embedded architectures in the future, |
1204 | * free the memmap backing holes to save memory on the assumption the memmap is | 1208 | * free the memmap backing holes to save memory on the assumption the memmap is |
1205 | * never used. The page_zone linkages are then broken even though pfn_valid() | 1209 | * never used. The page_zone linkages are then broken even though pfn_valid() |
1206 | * returns true. A walker of the full memmap must then do this additional | 1210 | * returns true. A walker of the full memmap must then do this additional |
1207 | * check to ensure the memmap they are looking at is sane by making sure | 1211 | * check to ensure the memmap they are looking at is sane by making sure |
1208 | * the zone and PFN linkages are still valid. This is expensive, but walkers | 1212 | * the zone and PFN linkages are still valid. This is expensive, but walkers |
1209 | * of the full memmap are extremely rare. | 1213 | * of the full memmap are extremely rare. |
1210 | */ | 1214 | */ |
1211 | int memmap_valid_within(unsigned long pfn, | 1215 | int memmap_valid_within(unsigned long pfn, |
1212 | struct page *page, struct zone *zone); | 1216 | struct page *page, struct zone *zone); |
1213 | #else | 1217 | #else |
1214 | static inline int memmap_valid_within(unsigned long pfn, | 1218 | static inline int memmap_valid_within(unsigned long pfn, |
1215 | struct page *page, struct zone *zone) | 1219 | struct page *page, struct zone *zone) |
1216 | { | 1220 | { |
1217 | return 1; | 1221 | return 1; |
1218 | } | 1222 | } |
1219 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | 1223 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ |
1220 | 1224 | ||
1221 | #endif /* !__GENERATING_BOUNDS.H */ | 1225 | #endif /* !__GENERATING_BOUNDS.H */ |
1222 | #endif /* !__ASSEMBLY__ */ | 1226 | #endif /* !__ASSEMBLY__ */ |
1223 | #endif /* _LINUX_MMZONE_H */ | 1227 | #endif /* _LINUX_MMZONE_H */ |
1224 | 1228 |
mm/compaction.c
1 | /* | 1 | /* |
2 | * linux/mm/compaction.c | 2 | * linux/mm/compaction.c |
3 | * | 3 | * |
4 | * Memory compaction for the reduction of external fragmentation. Note that | 4 | * Memory compaction for the reduction of external fragmentation. Note that |
5 | * this heavily depends upon page migration to do all the real heavy | 5 | * this heavily depends upon page migration to do all the real heavy |
6 | * lifting | 6 | * lifting |
7 | * | 7 | * |
8 | * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> | 8 | * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> |
9 | */ | 9 | */ |
10 | #include <linux/swap.h> | 10 | #include <linux/swap.h> |
11 | #include <linux/migrate.h> | 11 | #include <linux/migrate.h> |
12 | #include <linux/compaction.h> | 12 | #include <linux/compaction.h> |
13 | #include <linux/mm_inline.h> | 13 | #include <linux/mm_inline.h> |
14 | #include <linux/backing-dev.h> | 14 | #include <linux/backing-dev.h> |
15 | #include <linux/sysctl.h> | 15 | #include <linux/sysctl.h> |
16 | #include <linux/sysfs.h> | 16 | #include <linux/sysfs.h> |
17 | #include "internal.h" | 17 | #include "internal.h" |
18 | 18 | ||
19 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 19 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
20 | 20 | ||
21 | #define CREATE_TRACE_POINTS | 21 | #define CREATE_TRACE_POINTS |
22 | #include <trace/events/compaction.h> | 22 | #include <trace/events/compaction.h> |
23 | 23 | ||
24 | static unsigned long release_freepages(struct list_head *freelist) | 24 | static unsigned long release_freepages(struct list_head *freelist) |
25 | { | 25 | { |
26 | struct page *page, *next; | 26 | struct page *page, *next; |
27 | unsigned long count = 0; | 27 | unsigned long count = 0; |
28 | 28 | ||
29 | list_for_each_entry_safe(page, next, freelist, lru) { | 29 | list_for_each_entry_safe(page, next, freelist, lru) { |
30 | list_del(&page->lru); | 30 | list_del(&page->lru); |
31 | __free_page(page); | 31 | __free_page(page); |
32 | count++; | 32 | count++; |
33 | } | 33 | } |
34 | 34 | ||
35 | return count; | 35 | return count; |
36 | } | 36 | } |
37 | 37 | ||
38 | static void map_pages(struct list_head *list) | 38 | static void map_pages(struct list_head *list) |
39 | { | 39 | { |
40 | struct page *page; | 40 | struct page *page; |
41 | 41 | ||
42 | list_for_each_entry(page, list, lru) { | 42 | list_for_each_entry(page, list, lru) { |
43 | arch_alloc_page(page, 0); | 43 | arch_alloc_page(page, 0); |
44 | kernel_map_pages(page, 1, 1); | 44 | kernel_map_pages(page, 1, 1); |
45 | } | 45 | } |
46 | } | 46 | } |
47 | 47 | ||
48 | static inline bool migrate_async_suitable(int migratetype) | 48 | static inline bool migrate_async_suitable(int migratetype) |
49 | { | 49 | { |
50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | 50 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
51 | } | 51 | } |
52 | 52 | ||
53 | /* | 53 | /* |
54 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 54 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
55 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 55 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
56 | * pages inside of the pageblock (even though it may still end up isolating | 56 | * pages inside of the pageblock (even though it may still end up isolating |
57 | * some pages). | 57 | * some pages). |
58 | */ | 58 | */ |
59 | static unsigned long isolate_freepages_block(unsigned long blockpfn, | 59 | static unsigned long isolate_freepages_block(unsigned long blockpfn, |
60 | unsigned long end_pfn, | 60 | unsigned long end_pfn, |
61 | struct list_head *freelist, | 61 | struct list_head *freelist, |
62 | bool strict) | 62 | bool strict) |
63 | { | 63 | { |
64 | int nr_scanned = 0, total_isolated = 0; | 64 | int nr_scanned = 0, total_isolated = 0; |
65 | struct page *cursor; | 65 | struct page *cursor; |
66 | 66 | ||
67 | cursor = pfn_to_page(blockpfn); | 67 | cursor = pfn_to_page(blockpfn); |
68 | 68 | ||
69 | /* Isolate free pages. This assumes the block is valid */ | 69 | /* Isolate free pages. This assumes the block is valid */ |
70 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { | 70 | for (; blockpfn < end_pfn; blockpfn++, cursor++) { |
71 | int isolated, i; | 71 | int isolated, i; |
72 | struct page *page = cursor; | 72 | struct page *page = cursor; |
73 | 73 | ||
74 | if (!pfn_valid_within(blockpfn)) { | 74 | if (!pfn_valid_within(blockpfn)) { |
75 | if (strict) | 75 | if (strict) |
76 | return 0; | 76 | return 0; |
77 | continue; | 77 | continue; |
78 | } | 78 | } |
79 | nr_scanned++; | 79 | nr_scanned++; |
80 | 80 | ||
81 | if (!PageBuddy(page)) { | 81 | if (!PageBuddy(page)) { |
82 | if (strict) | 82 | if (strict) |
83 | return 0; | 83 | return 0; |
84 | continue; | 84 | continue; |
85 | } | 85 | } |
86 | 86 | ||
87 | /* Found a free page, break it into order-0 pages */ | 87 | /* Found a free page, break it into order-0 pages */ |
88 | isolated = split_free_page(page); | 88 | isolated = split_free_page(page); |
89 | if (!isolated && strict) | 89 | if (!isolated && strict) |
90 | return 0; | 90 | return 0; |
91 | total_isolated += isolated; | 91 | total_isolated += isolated; |
92 | for (i = 0; i < isolated; i++) { | 92 | for (i = 0; i < isolated; i++) { |
93 | list_add(&page->lru, freelist); | 93 | list_add(&page->lru, freelist); |
94 | page++; | 94 | page++; |
95 | } | 95 | } |
96 | 96 | ||
97 | /* If a page was split, advance to the end of it */ | 97 | /* If a page was split, advance to the end of it */ |
98 | if (isolated) { | 98 | if (isolated) { |
99 | blockpfn += isolated - 1; | 99 | blockpfn += isolated - 1; |
100 | cursor += isolated - 1; | 100 | cursor += isolated - 1; |
101 | } | 101 | } |
102 | } | 102 | } |
103 | 103 | ||
104 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | 104 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); |
105 | return total_isolated; | 105 | return total_isolated; |
106 | } | 106 | } |
107 | 107 | ||
108 | /** | 108 | /** |
109 | * isolate_freepages_range() - isolate free pages. | 109 | * isolate_freepages_range() - isolate free pages. |
110 | * @start_pfn: The first PFN to start isolating. | 110 | * @start_pfn: The first PFN to start isolating. |
111 | * @end_pfn: The one-past-last PFN. | 111 | * @end_pfn: The one-past-last PFN. |
112 | * | 112 | * |
113 | * Non-free pages, invalid PFNs, or zone boundaries within the | 113 | * Non-free pages, invalid PFNs, or zone boundaries within the |
114 | * [start_pfn, end_pfn) range are considered errors, cause function to | 114 | * [start_pfn, end_pfn) range are considered errors, cause function to |
115 | * undo its actions and return zero. | 115 | * undo its actions and return zero. |
116 | * | 116 | * |
117 | * Otherwise, the function returns the one-past-the-last PFN of the isolated | 117 | * Otherwise, the function returns the one-past-the-last PFN of the isolated |
118 | * pages (which may be greater than end_pfn if the end fell in the middle of | 118 | * pages (which may be greater than end_pfn if the end fell in the middle of |
119 | * a free page). | 119 | * a free page). |
120 | */ | 120 | */ |
121 | unsigned long | 121 | unsigned long |
122 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) | 122 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn) |
123 | { | 123 | { |
124 | unsigned long isolated, pfn, block_end_pfn, flags; | 124 | unsigned long isolated, pfn, block_end_pfn, flags; |
125 | struct zone *zone = NULL; | 125 | struct zone *zone = NULL; |
126 | LIST_HEAD(freelist); | 126 | LIST_HEAD(freelist); |
127 | 127 | ||
128 | if (pfn_valid(start_pfn)) | 128 | if (pfn_valid(start_pfn)) |
129 | zone = page_zone(pfn_to_page(start_pfn)); | 129 | zone = page_zone(pfn_to_page(start_pfn)); |
130 | 130 | ||
131 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { | 131 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { |
132 | if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) | 132 | if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn))) |
133 | break; | 133 | break; |
134 | 134 | ||
135 | /* | 135 | /* |
136 | * On subsequent iterations ALIGN() is actually not needed, | 136 | * On subsequent iterations ALIGN() is actually not needed, |
137 | * but we keep it so as not to complicate the code. | 137 | * but we keep it so as not to complicate the code. |
138 | */ | 138 | */ |
139 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | 139 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
140 | block_end_pfn = min(block_end_pfn, end_pfn); | 140 | block_end_pfn = min(block_end_pfn, end_pfn); |
141 | 141 | ||
142 | spin_lock_irqsave(&zone->lock, flags); | 142 | spin_lock_irqsave(&zone->lock, flags); |
143 | isolated = isolate_freepages_block(pfn, block_end_pfn, | 143 | isolated = isolate_freepages_block(pfn, block_end_pfn, |
144 | &freelist, true); | 144 | &freelist, true); |
145 | spin_unlock_irqrestore(&zone->lock, flags); | 145 | spin_unlock_irqrestore(&zone->lock, flags); |
146 | 146 | ||
147 | /* | 147 | /* |
148 | * In strict mode, isolate_freepages_block() returns 0 if | 148 | * In strict mode, isolate_freepages_block() returns 0 if |
149 | * there are any holes in the block (ie. invalid PFNs or | 149 | * there are any holes in the block (ie. invalid PFNs or |
150 | * non-free pages). | 150 | * non-free pages). |
151 | */ | 151 | */ |
152 | if (!isolated) | 152 | if (!isolated) |
153 | break; | 153 | break; |
154 | 154 | ||
155 | /* | 155 | /* |
156 | * If we managed to isolate pages, it is always (1 << n) * | 156 | * If we managed to isolate pages, it is always (1 << n) * |
157 | * pageblock_nr_pages for some non-negative n. (Max order | 157 | * pageblock_nr_pages for some non-negative n. (Max order |
158 | * page may span two pageblocks). | 158 | * page may span two pageblocks). |
159 | */ | 159 | */ |
160 | } | 160 | } |
161 | 161 | ||
162 | /* split_free_page does not map the pages */ | 162 | /* split_free_page does not map the pages */ |
163 | map_pages(&freelist); | 163 | map_pages(&freelist); |
164 | 164 | ||
165 | if (pfn < end_pfn) { | 165 | if (pfn < end_pfn) { |
166 | /* Loop terminated early, cleanup. */ | 166 | /* Loop terminated early, cleanup. */ |
167 | release_freepages(&freelist); | 167 | release_freepages(&freelist); |
168 | return 0; | 168 | return 0; |
169 | } | 169 | } |
170 | 170 | ||
171 | /* We don't use freelists for anything. */ | 171 | /* We don't use freelists for anything. */ |
172 | return pfn; | 172 | return pfn; |
173 | } | 173 | } |
174 | 174 | ||
175 | /* Update the number of anon and file isolated pages in the zone */ | 175 | /* Update the number of anon and file isolated pages in the zone */ |
176 | static void acct_isolated(struct zone *zone, struct compact_control *cc) | 176 | static void acct_isolated(struct zone *zone, struct compact_control *cc) |
177 | { | 177 | { |
178 | struct page *page; | 178 | struct page *page; |
179 | unsigned int count[2] = { 0, }; | 179 | unsigned int count[2] = { 0, }; |
180 | 180 | ||
181 | list_for_each_entry(page, &cc->migratepages, lru) | 181 | list_for_each_entry(page, &cc->migratepages, lru) |
182 | count[!!page_is_file_cache(page)]++; | 182 | count[!!page_is_file_cache(page)]++; |
183 | 183 | ||
184 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | 184 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); |
185 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | 185 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); |
186 | } | 186 | } |
187 | 187 | ||
188 | /* Similar to reclaim, but different enough that they don't share logic */ | 188 | /* Similar to reclaim, but different enough that they don't share logic */ |
189 | static bool too_many_isolated(struct zone *zone) | 189 | static bool too_many_isolated(struct zone *zone) |
190 | { | 190 | { |
191 | unsigned long active, inactive, isolated; | 191 | unsigned long active, inactive, isolated; |
192 | 192 | ||
193 | inactive = zone_page_state(zone, NR_INACTIVE_FILE) + | 193 | inactive = zone_page_state(zone, NR_INACTIVE_FILE) + |
194 | zone_page_state(zone, NR_INACTIVE_ANON); | 194 | zone_page_state(zone, NR_INACTIVE_ANON); |
195 | active = zone_page_state(zone, NR_ACTIVE_FILE) + | 195 | active = zone_page_state(zone, NR_ACTIVE_FILE) + |
196 | zone_page_state(zone, NR_ACTIVE_ANON); | 196 | zone_page_state(zone, NR_ACTIVE_ANON); |
197 | isolated = zone_page_state(zone, NR_ISOLATED_FILE) + | 197 | isolated = zone_page_state(zone, NR_ISOLATED_FILE) + |
198 | zone_page_state(zone, NR_ISOLATED_ANON); | 198 | zone_page_state(zone, NR_ISOLATED_ANON); |
199 | 199 | ||
200 | return isolated > (inactive + active) / 2; | 200 | return isolated > (inactive + active) / 2; |
201 | } | 201 | } |
202 | 202 | ||
203 | /** | 203 | /** |
204 | * isolate_migratepages_range() - isolate all migrate-able pages in range. | 204 | * isolate_migratepages_range() - isolate all migrate-able pages in range. |
205 | * @zone: Zone pages are in. | 205 | * @zone: Zone pages are in. |
206 | * @cc: Compaction control structure. | 206 | * @cc: Compaction control structure. |
207 | * @low_pfn: The first PFN of the range. | 207 | * @low_pfn: The first PFN of the range. |
208 | * @end_pfn: The one-past-the-last PFN of the range. | 208 | * @end_pfn: The one-past-the-last PFN of the range. |
209 | * | 209 | * |
210 | * Isolate all pages that can be migrated from the range specified by | 210 | * Isolate all pages that can be migrated from the range specified by |
211 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal | 211 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal |
212 | * pending, otherwise the PFN of the first page that was not scanned | 212 | * pending, otherwise the PFN of the first page that was not scanned |
213 | * (which may be less than, equal to, or greater than end_pfn). | 213 | * (which may be less than, equal to, or greater than end_pfn). |
214 | * | 214 | * |
215 | * Assumes that cc->migratepages is empty and cc->nr_migratepages is | 215 | * Assumes that cc->migratepages is empty and cc->nr_migratepages is |
216 | * zero. | 216 | * zero. |
217 | * | 217 | * |
218 | * Apart from cc->migratepages and cc->nr_migratepages this function | 218 | * Apart from cc->migratepages and cc->nr_migratepages this function |
219 | * does not modify any cc's fields, in particular it does not modify | 219 | * does not modify any cc's fields, in particular it does not modify |
220 | * (or read for that matter) cc->migrate_pfn. | 220 | * (or read for that matter) cc->migrate_pfn. |
221 | */ | 221 | */ |
222 | unsigned long | 222 | unsigned long |
223 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 223 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
224 | unsigned long low_pfn, unsigned long end_pfn) | 224 | unsigned long low_pfn, unsigned long end_pfn) |
225 | { | 225 | { |
226 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 226 | unsigned long last_pageblock_nr = 0, pageblock_nr; |
227 | unsigned long nr_scanned = 0, nr_isolated = 0; | 227 | unsigned long nr_scanned = 0, nr_isolated = 0; |
228 | struct list_head *migratelist = &cc->migratepages; | 228 | struct list_head *migratelist = &cc->migratepages; |
229 | isolate_mode_t mode = 0; | 229 | isolate_mode_t mode = 0; |
230 | struct lruvec *lruvec; | 230 | struct lruvec *lruvec; |
231 | 231 | ||
232 | /* | 232 | /* |
233 | * Ensure that there are not too many pages isolated from the LRU | 233 | * Ensure that there are not too many pages isolated from the LRU |
234 | * list by either parallel reclaimers or compaction. If there are, | 234 | * list by either parallel reclaimers or compaction. If there are, |
235 | * delay for some time until fewer pages are isolated | 235 | * delay for some time until fewer pages are isolated |
236 | */ | 236 | */ |
237 | while (unlikely(too_many_isolated(zone))) { | 237 | while (unlikely(too_many_isolated(zone))) { |
238 | /* async migration should just abort */ | 238 | /* async migration should just abort */ |
239 | if (!cc->sync) | 239 | if (!cc->sync) |
240 | return 0; | 240 | return 0; |
241 | 241 | ||
242 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 242 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
243 | 243 | ||
244 | if (fatal_signal_pending(current)) | 244 | if (fatal_signal_pending(current)) |
245 | return 0; | 245 | return 0; |
246 | } | 246 | } |
247 | 247 | ||
248 | /* Time to isolate some pages for migration */ | 248 | /* Time to isolate some pages for migration */ |
249 | cond_resched(); | 249 | cond_resched(); |
250 | spin_lock_irq(&zone->lru_lock); | 250 | spin_lock_irq(&zone->lru_lock); |
251 | for (; low_pfn < end_pfn; low_pfn++) { | 251 | for (; low_pfn < end_pfn; low_pfn++) { |
252 | struct page *page; | 252 | struct page *page; |
253 | bool locked = true; | 253 | bool locked = true; |
254 | 254 | ||
255 | /* give a chance to irqs before checking need_resched() */ | 255 | /* give a chance to irqs before checking need_resched() */ |
256 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { | 256 | if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { |
257 | spin_unlock_irq(&zone->lru_lock); | 257 | spin_unlock_irq(&zone->lru_lock); |
258 | locked = false; | 258 | locked = false; |
259 | } | 259 | } |
260 | if (need_resched() || spin_is_contended(&zone->lru_lock)) { | 260 | if (need_resched() || spin_is_contended(&zone->lru_lock)) { |
261 | if (locked) | 261 | if (locked) |
262 | spin_unlock_irq(&zone->lru_lock); | 262 | spin_unlock_irq(&zone->lru_lock); |
263 | cond_resched(); | 263 | cond_resched(); |
264 | spin_lock_irq(&zone->lru_lock); | 264 | spin_lock_irq(&zone->lru_lock); |
265 | if (fatal_signal_pending(current)) | 265 | if (fatal_signal_pending(current)) |
266 | break; | 266 | break; |
267 | } else if (!locked) | 267 | } else if (!locked) |
268 | spin_lock_irq(&zone->lru_lock); | 268 | spin_lock_irq(&zone->lru_lock); |
269 | 269 | ||
270 | /* | 270 | /* |
271 | * migrate_pfn does not necessarily start aligned to a | 271 | * migrate_pfn does not necessarily start aligned to a |
272 | * pageblock. Ensure that pfn_valid is called when moving | 272 | * pageblock. Ensure that pfn_valid is called when moving |
273 | * into a new MAX_ORDER_NR_PAGES range in case of large | 273 | * into a new MAX_ORDER_NR_PAGES range in case of large |
274 | * memory holes within the zone | 274 | * memory holes within the zone |
275 | */ | 275 | */ |
276 | if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { | 276 | if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { |
277 | if (!pfn_valid(low_pfn)) { | 277 | if (!pfn_valid(low_pfn)) { |
278 | low_pfn += MAX_ORDER_NR_PAGES - 1; | 278 | low_pfn += MAX_ORDER_NR_PAGES - 1; |
279 | continue; | 279 | continue; |
280 | } | 280 | } |
281 | } | 281 | } |
282 | 282 | ||
283 | if (!pfn_valid_within(low_pfn)) | 283 | if (!pfn_valid_within(low_pfn)) |
284 | continue; | 284 | continue; |
285 | nr_scanned++; | 285 | nr_scanned++; |
286 | 286 | ||
287 | /* | 287 | /* |
288 | * Get the page and ensure the page is within the same zone. | 288 | * Get the page and ensure the page is within the same zone. |
289 | * See the comment in isolate_freepages about overlapping | 289 | * See the comment in isolate_freepages about overlapping |
290 | * nodes. It is deliberate that the new zone lock is not taken | 290 | * nodes. It is deliberate that the new zone lock is not taken |
291 | * as memory compaction should not move pages between nodes. | 291 | * as memory compaction should not move pages between nodes. |
292 | */ | 292 | */ |
293 | page = pfn_to_page(low_pfn); | 293 | page = pfn_to_page(low_pfn); |
294 | if (page_zone(page) != zone) | 294 | if (page_zone(page) != zone) |
295 | continue; | 295 | continue; |
296 | 296 | ||
297 | /* Skip if free */ | 297 | /* Skip if free */ |
298 | if (PageBuddy(page)) | 298 | if (PageBuddy(page)) |
299 | continue; | 299 | continue; |
300 | 300 | ||
301 | /* | 301 | /* |
302 | * For async migration, also only scan in MOVABLE blocks. Async | 302 | * For async migration, also only scan in MOVABLE blocks. Async |
303 | * migration is optimistic to see if the minimum amount of work | 303 | * migration is optimistic to see if the minimum amount of work |
304 | * satisfies the allocation | 304 | * satisfies the allocation |
305 | */ | 305 | */ |
306 | pageblock_nr = low_pfn >> pageblock_order; | 306 | pageblock_nr = low_pfn >> pageblock_order; |
307 | if (!cc->sync && last_pageblock_nr != pageblock_nr && | 307 | if (!cc->sync && last_pageblock_nr != pageblock_nr && |
308 | !migrate_async_suitable(get_pageblock_migratetype(page))) { | 308 | !migrate_async_suitable(get_pageblock_migratetype(page))) { |
309 | low_pfn += pageblock_nr_pages; | 309 | low_pfn += pageblock_nr_pages; |
310 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; | 310 | low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; |
311 | last_pageblock_nr = pageblock_nr; | 311 | last_pageblock_nr = pageblock_nr; |
312 | continue; | 312 | continue; |
313 | } | 313 | } |
314 | 314 | ||
315 | if (!PageLRU(page)) | 315 | if (!PageLRU(page)) |
316 | continue; | 316 | continue; |
317 | 317 | ||
318 | /* | 318 | /* |
319 | * PageLRU is set, and lru_lock excludes isolation, | 319 | * PageLRU is set, and lru_lock excludes isolation, |
320 | * splitting and collapsing (collapsing has already | 320 | * splitting and collapsing (collapsing has already |
321 | * happened if PageLRU is set). | 321 | * happened if PageLRU is set). |
322 | */ | 322 | */ |
323 | if (PageTransHuge(page)) { | 323 | if (PageTransHuge(page)) { |
324 | low_pfn += (1 << compound_order(page)) - 1; | 324 | low_pfn += (1 << compound_order(page)) - 1; |
325 | continue; | 325 | continue; |
326 | } | 326 | } |
327 | 327 | ||
328 | if (!cc->sync) | 328 | if (!cc->sync) |
329 | mode |= ISOLATE_ASYNC_MIGRATE; | 329 | mode |= ISOLATE_ASYNC_MIGRATE; |
330 | 330 | ||
331 | lruvec = mem_cgroup_page_lruvec(page, zone); | 331 | lruvec = mem_cgroup_page_lruvec(page, zone); |
332 | 332 | ||
333 | /* Try isolate the page */ | 333 | /* Try isolate the page */ |
334 | if (__isolate_lru_page(page, mode) != 0) | 334 | if (__isolate_lru_page(page, mode) != 0) |
335 | continue; | 335 | continue; |
336 | 336 | ||
337 | VM_BUG_ON(PageTransCompound(page)); | 337 | VM_BUG_ON(PageTransCompound(page)); |
338 | 338 | ||
339 | /* Successfully isolated */ | 339 | /* Successfully isolated */ |
340 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 340 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
341 | list_add(&page->lru, migratelist); | 341 | list_add(&page->lru, migratelist); |
342 | cc->nr_migratepages++; | 342 | cc->nr_migratepages++; |
343 | nr_isolated++; | 343 | nr_isolated++; |
344 | 344 | ||
345 | /* Avoid isolating too much */ | 345 | /* Avoid isolating too much */ |
346 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { | 346 | if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { |
347 | ++low_pfn; | 347 | ++low_pfn; |
348 | break; | 348 | break; |
349 | } | 349 | } |
350 | } | 350 | } |
351 | 351 | ||
352 | acct_isolated(zone, cc); | 352 | acct_isolated(zone, cc); |
353 | 353 | ||
354 | spin_unlock_irq(&zone->lru_lock); | 354 | spin_unlock_irq(&zone->lru_lock); |
355 | 355 | ||
356 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 356 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
357 | 357 | ||
358 | return low_pfn; | 358 | return low_pfn; |
359 | } | 359 | } |
360 | 360 | ||
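The loop above deliberately drops zone->lru_lock every SWAP_CLUSTER_MAX pfns so pending interrupts and other lock waiters get a chance to run before the scan continues. Below is a minimal standalone sketch of that cadence (plain userspace C, not kernel code); the value 32 for SWAP_CLUSTER_MAX is an assumption that mirrors the usual kernel default.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32	/* assumed: mirrors the usual kernel default */

int main(void)
{
	unsigned long low_pfn, release_points = 0;
	unsigned long start = 1000, end = start + 256;

	for (low_pfn = start; low_pfn < end; low_pfn++) {
		/* same cadence test as in isolate_migratepages_range() */
		if (!((low_pfn + 1) % SWAP_CLUSTER_MAX))
			release_points++;
	}
	printf("scanned %lu pfns, hit %lu lock-release points\n",
	       end - start, release_points);
	return 0;
}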
361 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 361 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
362 | #ifdef CONFIG_COMPACTION | 362 | #ifdef CONFIG_COMPACTION |
363 | 363 | ||
364 | /* Returns true if the page is within a block suitable for migration to */ | 364 | /* Returns true if the page is within a block suitable for migration to */ |
365 | static bool suitable_migration_target(struct page *page) | 365 | static bool suitable_migration_target(struct page *page) |
366 | { | 366 | { |
367 | 367 | ||
368 | int migratetype = get_pageblock_migratetype(page); | 368 | int migratetype = get_pageblock_migratetype(page); |
369 | 369 | ||
370 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ | 370 | /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ |
371 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) | 371 | if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) |
372 | return false; | 372 | return false; |
373 | 373 | ||
374 | /* If the page is a large free page, then allow migration */ | 374 | /* If the page is a large free page, then allow migration */ |
375 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | 375 | if (PageBuddy(page) && page_order(page) >= pageblock_order) |
376 | return true; | 376 | return true; |
377 | 377 | ||
378 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | 378 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ |
379 | if (migrate_async_suitable(migratetype)) | 379 | if (migrate_async_suitable(migratetype)) |
380 | return true; | 380 | return true; |
381 | 381 | ||
382 | /* Otherwise skip the block */ | 382 | /* Otherwise skip the block */ |
383 | return false; | 383 | return false; |
384 | } | 384 | } |
385 | 385 | ||
386 | /* | 386 | /* |
387 | * Based on information in the current compact_control, find blocks | 387 | * Based on information in the current compact_control, find blocks |
388 | * suitable for isolating free pages from and then isolate them. | 388 | * suitable for isolating free pages from and then isolate them. |
389 | */ | 389 | */ |
390 | static void isolate_freepages(struct zone *zone, | 390 | static void isolate_freepages(struct zone *zone, |
391 | struct compact_control *cc) | 391 | struct compact_control *cc) |
392 | { | 392 | { |
393 | struct page *page; | 393 | struct page *page; |
394 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; | 394 | unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn; |
395 | unsigned long flags; | 395 | unsigned long flags; |
396 | int nr_freepages = cc->nr_freepages; | 396 | int nr_freepages = cc->nr_freepages; |
397 | struct list_head *freelist = &cc->freepages; | 397 | struct list_head *freelist = &cc->freepages; |
398 | 398 | ||
399 | /* | 399 | /* |
400 | * Initialise the free scanner. The starting point is where we last | 400 | * Initialise the free scanner. The starting point is where we last |
401 | * scanned from (or the end of the zone if starting). The low point | 401 | * scanned from (or the end of the zone if starting). The low point |
402 | * is the end of the pageblock the migration scanner is using. | 402 | * is the end of the pageblock the migration scanner is using. |
403 | */ | 403 | */ |
404 | pfn = cc->free_pfn; | 404 | pfn = cc->free_pfn; |
405 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; | 405 | low_pfn = cc->migrate_pfn + pageblock_nr_pages; |
406 | 406 | ||
407 | /* | 407 | /* |
408 | * Take care that if the migration scanner is at the end of the zone | 408 | * Take care that if the migration scanner is at the end of the zone |
409 | * that the free scanner does not accidentally move to the next zone | 409 | * that the free scanner does not accidentally move to the next zone |
410 | * in the next isolation cycle. | 410 | * in the next isolation cycle. |
411 | */ | 411 | */ |
412 | high_pfn = min(low_pfn, pfn); | 412 | high_pfn = min(low_pfn, pfn); |
413 | 413 | ||
414 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; | 414 | zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; |
415 | 415 | ||
416 | /* | 416 | /* |
417 | * Isolate free pages until enough are available to migrate the | 417 | * Isolate free pages until enough are available to migrate the |
418 | * pages on cc->migratepages. We stop searching if the migrate | 418 | * pages on cc->migratepages. We stop searching if the migrate |
419 | * and free page scanners meet or enough free pages are isolated. | 419 | * and free page scanners meet or enough free pages are isolated. |
420 | */ | 420 | */ |
421 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; | 421 | for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; |
422 | pfn -= pageblock_nr_pages) { | 422 | pfn -= pageblock_nr_pages) { |
423 | unsigned long isolated; | 423 | unsigned long isolated; |
424 | 424 | ||
425 | /* | ||
426 | * Skip ahead if another thread is compacting in the area | ||
427 | * simultaneously. If we wrapped around, we can only skip | ||
428 | * ahead if zone->compact_cached_free_pfn also wrapped to | ||
429 | * above our starting point. | ||
430 | */ | ||
431 | if (cc->order > 0 && (!cc->wrapped || | ||
432 | zone->compact_cached_free_pfn > | ||
433 | cc->start_free_pfn)) | ||
434 | pfn = min(pfn, zone->compact_cached_free_pfn); | ||
435 | |||
425 | if (!pfn_valid(pfn)) | 436 | if (!pfn_valid(pfn)) |
426 | continue; | 437 | continue; |
427 | 438 | ||
428 | /* | 439 | /* |
429 | * Check for overlapping nodes/zones. It's possible on some | 440 | * Check for overlapping nodes/zones. It's possible on some |
430 | * configurations to have a setup like | 441 | * configurations to have a setup like |
431 | * node0 node1 node0 | 442 | * node0 node1 node0 |
432 | * i.e. it's possible that all pages within a zone's range of | 443 | * i.e. it's possible that all pages within a zone's range of |
433 | * pages do not belong to a single zone. | 444 | * pages do not belong to a single zone. |
434 | */ | 445 | */ |
435 | page = pfn_to_page(pfn); | 446 | page = pfn_to_page(pfn); |
436 | if (page_zone(page) != zone) | 447 | if (page_zone(page) != zone) |
437 | continue; | 448 | continue; |
438 | 449 | ||
439 | /* Check the block is suitable for migration */ | 450 | /* Check the block is suitable for migration */ |
440 | if (!suitable_migration_target(page)) | 451 | if (!suitable_migration_target(page)) |
441 | continue; | 452 | continue; |
442 | 453 | ||
443 | /* | 454 | /* |
444 | * Found a block suitable for isolating free pages from. Now | 455 | * Found a block suitable for isolating free pages from. Now |
445 | * we disabled interrupts, double check things are ok and | 456 | * we disabled interrupts, double check things are ok and |
446 | * isolate the pages. This is to minimise the time IRQs | 457 | * isolate the pages. This is to minimise the time IRQs |
447 | * are disabled | 458 | * are disabled |
448 | */ | 459 | */ |
449 | isolated = 0; | 460 | isolated = 0; |
450 | spin_lock_irqsave(&zone->lock, flags); | 461 | spin_lock_irqsave(&zone->lock, flags); |
451 | if (suitable_migration_target(page)) { | 462 | if (suitable_migration_target(page)) { |
452 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); | 463 | end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); |
453 | isolated = isolate_freepages_block(pfn, end_pfn, | 464 | isolated = isolate_freepages_block(pfn, end_pfn, |
454 | freelist, false); | 465 | freelist, false); |
455 | nr_freepages += isolated; | 466 | nr_freepages += isolated; |
456 | } | 467 | } |
457 | spin_unlock_irqrestore(&zone->lock, flags); | 468 | spin_unlock_irqrestore(&zone->lock, flags); |
458 | 469 | ||
459 | /* | 470 | /* |
460 | * Record the highest PFN we isolated pages from. When next | 471 | * Record the highest PFN we isolated pages from. When next |
461 | * looking for free pages, the search will restart here as | 472 | * looking for free pages, the search will restart here as |
462 | * page migration may have returned some pages to the allocator | 473 | * page migration may have returned some pages to the allocator |
463 | */ | 474 | */ |
464 | if (isolated) | 475 | if (isolated) { |
465 | high_pfn = max(high_pfn, pfn); | 476 | high_pfn = max(high_pfn, pfn); |
477 | if (cc->order > 0) | ||
478 | zone->compact_cached_free_pfn = high_pfn; | ||
479 | } | ||
466 | } | 480 | } |
467 | 481 | ||
468 | /* split_free_page does not map the pages */ | 482 | /* split_free_page does not map the pages */ |
469 | map_pages(freelist); | 483 | map_pages(freelist); |
470 | 484 | ||
471 | cc->free_pfn = high_pfn; | 485 | cc->free_pfn = high_pfn; |
472 | cc->nr_freepages = nr_freepages; | 486 | cc->nr_freepages = nr_freepages; |
473 | } | 487 | } |
474 | 488 | ||
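The hunk added to the loop above lets the free scanner jump ahead to zone->compact_cached_free_pfn when a previous order > 0 compaction already isolated from the tail of the zone, but only while we have not wrapped, or once the cached value has itself wrapped above our starting point. A minimal standalone sketch of that rule follows (userspace C, not kernel code; the pfn values in main() are illustrative only).

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirrors: if (cc->order > 0 && (!cc->wrapped ||
 *                zone->compact_cached_free_pfn > cc->start_free_pfn))
 *                  pfn = min(pfn, zone->compact_cached_free_pfn);
 */
static unsigned long skip_ahead(unsigned long pfn,
				unsigned long cached_free_pfn,
				unsigned long start_free_pfn,
				int order, bool wrapped)
{
	if (order > 0 && (!wrapped || cached_free_pfn > start_free_pfn))
		return pfn < cached_free_pfn ? pfn : cached_free_pfn;
	return pfn;
}

int main(void)
{
	/* Not yet wrapped: jump from pfn 9000 down to the cached 4096. */
	printf("%lu\n", skip_ahead(9000, 4096, 9216, 1, false));
	/* Wrapped, cached pfn has not passed our start: stay at 9000. */
	printf("%lu\n", skip_ahead(9000, 4096, 8192, 1, true));
	return 0;
}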
475 | /* | 489 | /* |
476 | * This is a migrate-callback that "allocates" freepages by taking pages | 490 | * This is a migrate-callback that "allocates" freepages by taking pages |
477 | * from the isolated freelists in the block we are migrating to. | 491 | * from the isolated freelists in the block we are migrating to. |
478 | */ | 492 | */ |
479 | static struct page *compaction_alloc(struct page *migratepage, | 493 | static struct page *compaction_alloc(struct page *migratepage, |
480 | unsigned long data, | 494 | unsigned long data, |
481 | int **result) | 495 | int **result) |
482 | { | 496 | { |
483 | struct compact_control *cc = (struct compact_control *)data; | 497 | struct compact_control *cc = (struct compact_control *)data; |
484 | struct page *freepage; | 498 | struct page *freepage; |
485 | 499 | ||
486 | /* Isolate free pages if necessary */ | 500 | /* Isolate free pages if necessary */ |
487 | if (list_empty(&cc->freepages)) { | 501 | if (list_empty(&cc->freepages)) { |
488 | isolate_freepages(cc->zone, cc); | 502 | isolate_freepages(cc->zone, cc); |
489 | 503 | ||
490 | if (list_empty(&cc->freepages)) | 504 | if (list_empty(&cc->freepages)) |
491 | return NULL; | 505 | return NULL; |
492 | } | 506 | } |
493 | 507 | ||
494 | freepage = list_entry(cc->freepages.next, struct page, lru); | 508 | freepage = list_entry(cc->freepages.next, struct page, lru); |
495 | list_del(&freepage->lru); | 509 | list_del(&freepage->lru); |
496 | cc->nr_freepages--; | 510 | cc->nr_freepages--; |
497 | 511 | ||
498 | return freepage; | 512 | return freepage; |
499 | } | 513 | } |
500 | 514 | ||
501 | /* | 515 | /* |
502 | * We cannot control nr_migratepages and nr_freepages fully when migration is | 516 | * We cannot control nr_migratepages and nr_freepages fully when migration is |
503 | * running as migrate_pages() has no knowledge of compact_control. When | 517 | * running as migrate_pages() has no knowledge of compact_control. When |
504 | * migration is complete, we count the number of pages on the lists by hand. | 518 | * migration is complete, we count the number of pages on the lists by hand. |
505 | */ | 519 | */ |
506 | static void update_nr_listpages(struct compact_control *cc) | 520 | static void update_nr_listpages(struct compact_control *cc) |
507 | { | 521 | { |
508 | int nr_migratepages = 0; | 522 | int nr_migratepages = 0; |
509 | int nr_freepages = 0; | 523 | int nr_freepages = 0; |
510 | struct page *page; | 524 | struct page *page; |
511 | 525 | ||
512 | list_for_each_entry(page, &cc->migratepages, lru) | 526 | list_for_each_entry(page, &cc->migratepages, lru) |
513 | nr_migratepages++; | 527 | nr_migratepages++; |
514 | list_for_each_entry(page, &cc->freepages, lru) | 528 | list_for_each_entry(page, &cc->freepages, lru) |
515 | nr_freepages++; | 529 | nr_freepages++; |
516 | 530 | ||
517 | cc->nr_migratepages = nr_migratepages; | 531 | cc->nr_migratepages = nr_migratepages; |
518 | cc->nr_freepages = nr_freepages; | 532 | cc->nr_freepages = nr_freepages; |
519 | } | 533 | } |
520 | 534 | ||
521 | /* possible outcome of isolate_migratepages */ | 535 | /* possible outcome of isolate_migratepages */ |
522 | typedef enum { | 536 | typedef enum { |
523 | ISOLATE_ABORT, /* Abort compaction now */ | 537 | ISOLATE_ABORT, /* Abort compaction now */ |
524 | ISOLATE_NONE, /* No pages isolated, continue scanning */ | 538 | ISOLATE_NONE, /* No pages isolated, continue scanning */ |
525 | ISOLATE_SUCCESS, /* Pages isolated, migrate */ | 539 | ISOLATE_SUCCESS, /* Pages isolated, migrate */ |
526 | } isolate_migrate_t; | 540 | } isolate_migrate_t; |
527 | 541 | ||
528 | /* | 542 | /* |
529 | * Isolate all pages that can be migrated from the block pointed to by | 543 | * Isolate all pages that can be migrated from the block pointed to by |
530 | * the migrate scanner within compact_control. | 544 | * the migrate scanner within compact_control. |
531 | */ | 545 | */ |
532 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | 546 | static isolate_migrate_t isolate_migratepages(struct zone *zone, |
533 | struct compact_control *cc) | 547 | struct compact_control *cc) |
534 | { | 548 | { |
535 | unsigned long low_pfn, end_pfn; | 549 | unsigned long low_pfn, end_pfn; |
536 | 550 | ||
537 | /* Do not scan outside zone boundaries */ | 551 | /* Do not scan outside zone boundaries */ |
538 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 552 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); |
539 | 553 | ||
540 | /* Only scan within a pageblock boundary */ | 554 | /* Only scan within a pageblock boundary */ |
541 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); | 555 | end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); |
542 | 556 | ||
543 | /* Do not cross the free scanner or scan within a memory hole */ | 557 | /* Do not cross the free scanner or scan within a memory hole */ |
544 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | 558 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { |
545 | cc->migrate_pfn = end_pfn; | 559 | cc->migrate_pfn = end_pfn; |
546 | return ISOLATE_NONE; | 560 | return ISOLATE_NONE; |
547 | } | 561 | } |
548 | 562 | ||
549 | /* Perform the isolation */ | 563 | /* Perform the isolation */ |
550 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); | 564 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn); |
551 | if (!low_pfn) | 565 | if (!low_pfn) |
552 | return ISOLATE_ABORT; | 566 | return ISOLATE_ABORT; |
553 | 567 | ||
554 | cc->migrate_pfn = low_pfn; | 568 | cc->migrate_pfn = low_pfn; |
555 | 569 | ||
556 | return ISOLATE_SUCCESS; | 570 | return ISOLATE_SUCCESS; |
557 | } | 571 | } |
558 | 572 | ||
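isolate_migratepages() above bounds each scan to end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages), i.e. the first pageblock boundary at or beyond one block past the current position. A standalone sketch of that round-up (userspace C; the 512-page pageblock size is an assumption, the real value is 1 << pageblock_order).

#include <stdio.h>

#define ALIGN(x, a)		(((x) + (a) - 1) & ~((a) - 1))
#define PAGEBLOCK_NR_PAGES	512UL	/* assumed; arch dependent */

int main(void)
{
	unsigned long low_pfn = 1000;	/* migrate scanner mid-pageblock */
	unsigned long end_pfn = ALIGN(low_pfn + PAGEBLOCK_NR_PAGES,
				      PAGEBLOCK_NR_PAGES);

	/* Window covers pfns 1000..1535 and ends on a block boundary. */
	printf("scan window: [%lu, %lu)\n", low_pfn, end_pfn);
	return 0;
}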
573 | /* | ||
574 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
575 | * point for full compaction of a zone. Compaction searches for free pages from | ||
576 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
577 | * page block. | ||
578 | */ | ||
579 | static unsigned long start_free_pfn(struct zone *zone) | ||
580 | { | ||
581 | unsigned long free_pfn; | ||
582 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
583 | free_pfn &= ~(pageblock_nr_pages-1); | ||
584 | return free_pfn; | ||
585 | } | ||
586 | |||
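The new start_free_pfn() helper rounds the zone end down so the free scanner always starts on a whole pageblock. A standalone sketch of that rounding, with made-up zone geometry and an assumed 512-page pageblock:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* assumed; really 1 << pageblock_order */

int main(void)
{
	unsigned long zone_start_pfn = 4096;
	unsigned long spanned_pages  = 10000;	/* zone end is not block aligned */
	unsigned long free_pfn;

	free_pfn = zone_start_pfn + spanned_pages;	/* 14096 */
	free_pfn &= ~(PAGEBLOCK_NR_PAGES - 1);		/* 13824, a block boundary */
	printf("full compaction would start the free scanner at pfn %lu\n",
	       free_pfn);
	return 0;
}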
559 | static int compact_finished(struct zone *zone, | 587 | static int compact_finished(struct zone *zone, |
560 | struct compact_control *cc) | 588 | struct compact_control *cc) |
561 | { | 589 | { |
562 | unsigned int order; | 590 | unsigned int order; |
563 | unsigned long watermark; | 591 | unsigned long watermark; |
564 | 592 | ||
565 | if (fatal_signal_pending(current)) | 593 | if (fatal_signal_pending(current)) |
566 | return COMPACT_PARTIAL; | 594 | return COMPACT_PARTIAL; |
567 | 595 | ||
568 | /* Compaction run completes if the migrate and free scanner meet */ | 596 | /* |
569 | if (cc->free_pfn <= cc->migrate_pfn) | 597 | * A full (order == -1) compaction run starts at the beginning and |
598 | * end of a zone; it completes when the migrate and free scanner meet. | ||
599 | * A partial (order > 0) compaction can start with the free scanner | ||
600 | * at a random point in the zone, and may have to restart. | ||
601 | */ | ||
602 | if (cc->free_pfn <= cc->migrate_pfn) { | ||
603 | if (cc->order > 0 && !cc->wrapped) { | ||
604 | /* We started partway through; restart at the end. */ | ||
605 | unsigned long free_pfn = start_free_pfn(zone); | ||
606 | zone->compact_cached_free_pfn = free_pfn; | ||
607 | cc->free_pfn = free_pfn; | ||
608 | cc->wrapped = 1; | ||
609 | return COMPACT_CONTINUE; | ||
610 | } | ||
570 | return COMPACT_COMPLETE; | 611 | return COMPACT_COMPLETE; |
612 | } | ||
571 | 613 | ||
614 | /* We wrapped around and ended up where we started. */ | ||
615 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
616 | return COMPACT_COMPLETE; | ||
617 | |||
572 | /* | 618 | /* |
573 | * order == -1 is expected when compacting via | 619 | * order == -1 is expected when compacting via |
574 | * /proc/sys/vm/compact_memory | 620 | * /proc/sys/vm/compact_memory |
575 | */ | 621 | */ |
576 | if (cc->order == -1) | 622 | if (cc->order == -1) |
577 | return COMPACT_CONTINUE; | 623 | return COMPACT_CONTINUE; |
578 | 624 | ||
579 | /* Compaction run is not finished if the watermark is not met */ | 625 | /* Compaction run is not finished if the watermark is not met */ |
580 | watermark = low_wmark_pages(zone); | 626 | watermark = low_wmark_pages(zone); |
581 | watermark += (1 << cc->order); | 627 | watermark += (1 << cc->order); |
582 | 628 | ||
583 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) | 629 | if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) |
584 | return COMPACT_CONTINUE; | 630 | return COMPACT_CONTINUE; |
585 | 631 | ||
586 | /* Direct compactor: Is a suitable page free? */ | 632 | /* Direct compactor: Is a suitable page free? */ |
587 | for (order = cc->order; order < MAX_ORDER; order++) { | 633 | for (order = cc->order; order < MAX_ORDER; order++) { |
588 | /* Job done if page is free of the right migratetype */ | 634 | /* Job done if page is free of the right migratetype */ |
589 | if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) | 635 | if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) |
590 | return COMPACT_PARTIAL; | 636 | return COMPACT_PARTIAL; |
591 | 637 | ||
592 | /* Job done if allocation would set block type */ | 638 | /* Job done if allocation would set block type */ |
593 | if (order >= pageblock_order && zone->free_area[order].nr_free) | 639 | if (order >= pageblock_order && zone->free_area[order].nr_free) |
594 | return COMPACT_PARTIAL; | 640 | return COMPACT_PARTIAL; |
595 | } | 641 | } |
596 | 642 | ||
597 | return COMPACT_CONTINUE; | 643 | return COMPACT_CONTINUE; |
598 | } | 644 | } |
599 | 645 | ||
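The wrap handling added to compact_finished() above is the core of the change: when the scanners meet, an order > 0 run that began partway through the zone restarts once from the zone end, and a wrapped run completes when its free scanner gets back to where it originally started. A standalone sketch of that decision (userspace C, not kernel code; struct cc and the pfn values are illustrative stand-ins).

#include <stdbool.h>
#include <stdio.h>

enum outcome { CONTINUE, COMPLETE };

struct cc {
	unsigned long free_pfn, migrate_pfn, start_free_pfn;
	int order;
	bool wrapped;
};

static enum outcome scanners_met(struct cc *cc, unsigned long zone_end_block)
{
	if (cc->free_pfn <= cc->migrate_pfn) {
		if (cc->order > 0 && !cc->wrapped) {
			cc->free_pfn = zone_end_block;	/* restart at zone end */
			cc->wrapped = true;
			return CONTINUE;
		}
		return COMPLETE;
	}
	if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
		return COMPLETE;
	return CONTINUE;
}

int main(void)
{
	struct cc cc = { .free_pfn = 2048, .migrate_pfn = 2048,
			 .start_free_pfn = 2048, .order = 4, .wrapped = false };

	/* First meeting: wrap to the last pageblock and keep going (prints 0). */
	printf("%d\n", scanners_met(&cc, 16384));
	/* After wrapping, the free scanner is back at its start (prints 1). */
	cc.free_pfn = 2000;
	cc.migrate_pfn = 1024;
	printf("%d\n", scanners_met(&cc, 16384));
	return 0;
}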
600 | /* | 646 | /* |
601 | * compaction_suitable: Is this suitable to run compaction on this zone now? | 647 | * compaction_suitable: Is this suitable to run compaction on this zone now? |
602 | * Returns | 648 | * Returns |
603 | * COMPACT_SKIPPED - If there are too few free pages for compaction | 649 | * COMPACT_SKIPPED - If there are too few free pages for compaction |
604 | * COMPACT_PARTIAL - If the allocation would succeed without compaction | 650 | * COMPACT_PARTIAL - If the allocation would succeed without compaction |
605 | * COMPACT_CONTINUE - If compaction should run now | 651 | * COMPACT_CONTINUE - If compaction should run now |
606 | */ | 652 | */ |
607 | unsigned long compaction_suitable(struct zone *zone, int order) | 653 | unsigned long compaction_suitable(struct zone *zone, int order) |
608 | { | 654 | { |
609 | int fragindex; | 655 | int fragindex; |
610 | unsigned long watermark; | 656 | unsigned long watermark; |
611 | 657 | ||
612 | /* | 658 | /* |
613 | * order == -1 is expected when compacting via | 659 | * order == -1 is expected when compacting via |
614 | * /proc/sys/vm/compact_memory | 660 | * /proc/sys/vm/compact_memory |
615 | */ | 661 | */ |
616 | if (order == -1) | 662 | if (order == -1) |
617 | return COMPACT_CONTINUE; | 663 | return COMPACT_CONTINUE; |
618 | 664 | ||
619 | /* | 665 | /* |
620 | * Watermarks for order-0 must be met for compaction. Note the 2UL. | 666 | * Watermarks for order-0 must be met for compaction. Note the 2UL. |
621 | * This is because during migration, copies of pages need to be | 667 | * This is because during migration, copies of pages need to be |
622 | * allocated and for a short time, the footprint is higher | 668 | * allocated and for a short time, the footprint is higher |
623 | */ | 669 | */ |
624 | watermark = low_wmark_pages(zone) + (2UL << order); | 670 | watermark = low_wmark_pages(zone) + (2UL << order); |
625 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 671 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
626 | return COMPACT_SKIPPED; | 672 | return COMPACT_SKIPPED; |
627 | 673 | ||
628 | /* | 674 | /* |
629 | * fragmentation index determines if allocation failures are due to | 675 | * fragmentation index determines if allocation failures are due to |
630 | * low memory or external fragmentation | 676 | * low memory or external fragmentation |
631 | * | 677 | * |
632 | * index of -1000 implies allocations might succeed depending on | 678 | * index of -1000 implies allocations might succeed depending on |
633 | * watermarks | 679 | * watermarks |
634 | * index towards 0 implies failure is due to lack of memory | 680 | * index towards 0 implies failure is due to lack of memory |
635 | * index towards 1000 implies failure is due to fragmentation | 681 | * index towards 1000 implies failure is due to fragmentation |
636 | * | 682 | * |
637 | * Only compact if a failure would be due to fragmentation. | 683 | * Only compact if a failure would be due to fragmentation. |
638 | */ | 684 | */ |
639 | fragindex = fragmentation_index(zone, order); | 685 | fragindex = fragmentation_index(zone, order); |
640 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) | 686 | if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) |
641 | return COMPACT_SKIPPED; | 687 | return COMPACT_SKIPPED; |
642 | 688 | ||
643 | if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, | 689 | if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark, |
644 | 0, 0)) | 690 | 0, 0)) |
645 | return COMPACT_PARTIAL; | 691 | return COMPACT_PARTIAL; |
646 | 692 | ||
647 | return COMPACT_CONTINUE; | 693 | return COMPACT_CONTINUE; |
648 | } | 694 | } |
649 | 695 | ||
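compaction_suitable() requires 2 << order pages of headroom above the low watermark because, while pages are being migrated, both the old and the new copy exist for a short time. A standalone sketch of that order-0 test with made-up numbers (the real check goes through zone_watermark_ok()):

#include <stdbool.h>
#include <stdio.h>

static bool enough_free_for_compaction(unsigned long nr_free_pages,
				       unsigned long low_wmark, int order)
{
	unsigned long watermark = low_wmark + (2UL << order);

	return nr_free_pages >= watermark;	/* stand-in for zone_watermark_ok() */
}

int main(void)
{
	/* order-9 request: needs low_wmark + 1024 free pages (prints 1, then 0). */
	printf("%d\n", enough_free_for_compaction(6000, 4096, 9));
	printf("%d\n", enough_free_for_compaction(6000, 5120, 9));
	return 0;
}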
650 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 696 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
651 | { | 697 | { |
652 | int ret; | 698 | int ret; |
653 | 699 | ||
654 | ret = compaction_suitable(zone, cc->order); | 700 | ret = compaction_suitable(zone, cc->order); |
655 | switch (ret) { | 701 | switch (ret) { |
656 | case COMPACT_PARTIAL: | 702 | case COMPACT_PARTIAL: |
657 | case COMPACT_SKIPPED: | 703 | case COMPACT_SKIPPED: |
658 | /* Compaction is likely to fail */ | 704 | /* Compaction is likely to fail */ |
659 | return ret; | 705 | return ret; |
660 | case COMPACT_CONTINUE: | 706 | case COMPACT_CONTINUE: |
661 | /* Fall through to compaction */ | 707 | /* Fall through to compaction */ |
662 | ; | 708 | ; |
663 | } | 709 | } |
664 | 710 | ||
665 | /* Setup to move all movable pages to the end of the zone */ | 711 | /* Setup to move all movable pages to the end of the zone */ |
666 | cc->migrate_pfn = zone->zone_start_pfn; | 712 | cc->migrate_pfn = zone->zone_start_pfn; |
667 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 713 | |
668 | cc->free_pfn &= ~(pageblock_nr_pages-1); | 714 | if (cc->order > 0) { |
715 | /* Incremental compaction. Start where the last one stopped. */ | ||
716 | cc->free_pfn = zone->compact_cached_free_pfn; | ||
717 | cc->start_free_pfn = cc->free_pfn; | ||
718 | } else { | ||
719 | /* Order == -1 starts at the end of the zone. */ | ||
720 | cc->free_pfn = start_free_pfn(zone); | ||
721 | } | ||
669 | 722 | ||
670 | migrate_prep_local(); | 723 | migrate_prep_local(); |
671 | 724 | ||
672 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 725 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { |
673 | unsigned long nr_migrate, nr_remaining; | 726 | unsigned long nr_migrate, nr_remaining; |
674 | int err; | 727 | int err; |
675 | 728 | ||
676 | switch (isolate_migratepages(zone, cc)) { | 729 | switch (isolate_migratepages(zone, cc)) { |
677 | case ISOLATE_ABORT: | 730 | case ISOLATE_ABORT: |
678 | ret = COMPACT_PARTIAL; | 731 | ret = COMPACT_PARTIAL; |
679 | goto out; | 732 | goto out; |
680 | case ISOLATE_NONE: | 733 | case ISOLATE_NONE: |
681 | continue; | 734 | continue; |
682 | case ISOLATE_SUCCESS: | 735 | case ISOLATE_SUCCESS: |
683 | ; | 736 | ; |
684 | } | 737 | } |
685 | 738 | ||
686 | nr_migrate = cc->nr_migratepages; | 739 | nr_migrate = cc->nr_migratepages; |
687 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 740 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
688 | (unsigned long)cc, false, | 741 | (unsigned long)cc, false, |
689 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); | 742 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); |
690 | update_nr_listpages(cc); | 743 | update_nr_listpages(cc); |
691 | nr_remaining = cc->nr_migratepages; | 744 | nr_remaining = cc->nr_migratepages; |
692 | 745 | ||
693 | count_vm_event(COMPACTBLOCKS); | 746 | count_vm_event(COMPACTBLOCKS); |
694 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | 747 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); |
695 | if (nr_remaining) | 748 | if (nr_remaining) |
696 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | 749 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); |
697 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 750 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, |
698 | nr_remaining); | 751 | nr_remaining); |
699 | 752 | ||
700 | /* Release LRU pages not migrated */ | 753 | /* Release LRU pages not migrated */ |
701 | if (err) { | 754 | if (err) { |
702 | putback_lru_pages(&cc->migratepages); | 755 | putback_lru_pages(&cc->migratepages); |
703 | cc->nr_migratepages = 0; | 756 | cc->nr_migratepages = 0; |
704 | if (err == -ENOMEM) { | 757 | if (err == -ENOMEM) { |
705 | ret = COMPACT_PARTIAL; | 758 | ret = COMPACT_PARTIAL; |
706 | goto out; | 759 | goto out; |
707 | } | 760 | } |
708 | } | 761 | } |
709 | } | 762 | } |
710 | 763 | ||
711 | out: | 764 | out: |
712 | /* Release free pages and check accounting */ | 765 | /* Release free pages and check accounting */ |
713 | cc->nr_freepages -= release_freepages(&cc->freepages); | 766 | cc->nr_freepages -= release_freepages(&cc->freepages); |
714 | VM_BUG_ON(cc->nr_freepages != 0); | 767 | VM_BUG_ON(cc->nr_freepages != 0); |
715 | 768 | ||
716 | return ret; | 769 | return ret; |
717 | } | 770 | } |
718 | 771 | ||
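compact_zone() now picks the free scanner's starting point by order: a partial (order > 0) run resumes from zone->compact_cached_free_pfn and records where it started, while a forced full (order == -1) run still begins at the last whole pageblock of the zone. A standalone sketch of that choice (userspace C; zone_state and scan_setup are made-up stand-ins for the relevant zone and compact_control fields, and the kernel only uses start_free_pfn for partial runs).

#include <stdio.h>

struct zone_state {
	unsigned long zone_start_pfn, spanned_pages, compact_cached_free_pfn;
};

struct scan_setup {
	unsigned long free_pfn, start_free_pfn;
};

static struct scan_setup setup_free_scanner(const struct zone_state *z,
					    int order,
					    unsigned long pageblock_nr_pages)
{
	struct scan_setup s;

	if (order > 0) {
		/* Incremental compaction: resume where the last run stopped. */
		s.free_pfn = z->compact_cached_free_pfn;
	} else {
		/* Full compaction: last whole pageblock of the zone. */
		s.free_pfn = (z->zone_start_pfn + z->spanned_pages) &
			     ~(pageblock_nr_pages - 1);
	}
	s.start_free_pfn = s.free_pfn;
	return s;
}

int main(void)
{
	struct zone_state z = { 0, 262144, 131072 };

	printf("order 4 resumes at pfn %lu\n",
	       setup_free_scanner(&z, 4, 512).free_pfn);
	printf("order -1 starts at pfn %lu\n",
	       setup_free_scanner(&z, -1, 512).free_pfn);
	return 0;
}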
719 | static unsigned long compact_zone_order(struct zone *zone, | 772 | static unsigned long compact_zone_order(struct zone *zone, |
720 | int order, gfp_t gfp_mask, | 773 | int order, gfp_t gfp_mask, |
721 | bool sync) | 774 | bool sync) |
722 | { | 775 | { |
723 | struct compact_control cc = { | 776 | struct compact_control cc = { |
724 | .nr_freepages = 0, | 777 | .nr_freepages = 0, |
725 | .nr_migratepages = 0, | 778 | .nr_migratepages = 0, |
726 | .order = order, | 779 | .order = order, |
727 | .migratetype = allocflags_to_migratetype(gfp_mask), | 780 | .migratetype = allocflags_to_migratetype(gfp_mask), |
728 | .zone = zone, | 781 | .zone = zone, |
729 | .sync = sync, | 782 | .sync = sync, |
730 | }; | 783 | }; |
731 | INIT_LIST_HEAD(&cc.freepages); | 784 | INIT_LIST_HEAD(&cc.freepages); |
732 | INIT_LIST_HEAD(&cc.migratepages); | 785 | INIT_LIST_HEAD(&cc.migratepages); |
733 | 786 | ||
734 | return compact_zone(zone, &cc); | 787 | return compact_zone(zone, &cc); |
735 | } | 788 | } |
736 | 789 | ||
737 | int sysctl_extfrag_threshold = 500; | 790 | int sysctl_extfrag_threshold = 500; |
738 | 791 | ||
739 | /** | 792 | /** |
740 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation | 793 | * try_to_compact_pages - Direct compact to satisfy a high-order allocation |
741 | * @zonelist: The zonelist used for the current allocation | 794 | * @zonelist: The zonelist used for the current allocation |
742 | * @order: The order of the current allocation | 795 | * @order: The order of the current allocation |
743 | * @gfp_mask: The GFP mask of the current allocation | 796 | * @gfp_mask: The GFP mask of the current allocation |
744 | * @nodemask: The allowed nodes to allocate from | 797 | * @nodemask: The allowed nodes to allocate from |
745 | * @sync: Whether migration is synchronous or not | 798 | * @sync: Whether migration is synchronous or not |
746 | * | 799 | * |
747 | * This is the main entry point for direct page compaction. | 800 | * This is the main entry point for direct page compaction. |
748 | */ | 801 | */ |
749 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 802 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
750 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 803 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
751 | bool sync) | 804 | bool sync) |
752 | { | 805 | { |
753 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 806 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
754 | int may_enter_fs = gfp_mask & __GFP_FS; | 807 | int may_enter_fs = gfp_mask & __GFP_FS; |
755 | int may_perform_io = gfp_mask & __GFP_IO; | 808 | int may_perform_io = gfp_mask & __GFP_IO; |
756 | struct zoneref *z; | 809 | struct zoneref *z; |
757 | struct zone *zone; | 810 | struct zone *zone; |
758 | int rc = COMPACT_SKIPPED; | 811 | int rc = COMPACT_SKIPPED; |
759 | 812 | ||
760 | /* | 813 | /* |
761 | * Check whether it is worth even starting compaction. The order check is | 814 | * Check whether it is worth even starting compaction. The order check is |
762 | * made because an assumption is made that the page allocator can satisfy | 815 | * made because an assumption is made that the page allocator can satisfy |
763 | * the "cheaper" orders without taking special steps | 816 | * the "cheaper" orders without taking special steps |
764 | */ | 817 | */ |
765 | if (!order || !may_enter_fs || !may_perform_io) | 818 | if (!order || !may_enter_fs || !may_perform_io) |
766 | return rc; | 819 | return rc; |
767 | 820 | ||
768 | count_vm_event(COMPACTSTALL); | 821 | count_vm_event(COMPACTSTALL); |
769 | 822 | ||
770 | /* Compact each zone in the list */ | 823 | /* Compact each zone in the list */ |
771 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 824 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
772 | nodemask) { | 825 | nodemask) { |
773 | int status; | 826 | int status; |
774 | 827 | ||
775 | status = compact_zone_order(zone, order, gfp_mask, sync); | 828 | status = compact_zone_order(zone, order, gfp_mask, sync); |
776 | rc = max(status, rc); | 829 | rc = max(status, rc); |
777 | 830 | ||
778 | /* If a normal allocation would succeed, stop compacting */ | 831 | /* If a normal allocation would succeed, stop compacting */ |
779 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | 832 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) |
780 | break; | 833 | break; |
781 | } | 834 | } |
782 | 835 | ||
783 | return rc; | 836 | return rc; |
784 | } | 837 | } |
785 | 838 | ||
786 | 839 | ||
787 | /* Compact all zones within a node */ | 840 | /* Compact all zones within a node */ |
788 | static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | 841 | static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) |
789 | { | 842 | { |
790 | int zoneid; | 843 | int zoneid; |
791 | struct zone *zone; | 844 | struct zone *zone; |
792 | 845 | ||
793 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { | 846 | for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { |
794 | 847 | ||
795 | zone = &pgdat->node_zones[zoneid]; | 848 | zone = &pgdat->node_zones[zoneid]; |
796 | if (!populated_zone(zone)) | 849 | if (!populated_zone(zone)) |
797 | continue; | 850 | continue; |
798 | 851 | ||
799 | cc->nr_freepages = 0; | 852 | cc->nr_freepages = 0; |
800 | cc->nr_migratepages = 0; | 853 | cc->nr_migratepages = 0; |
801 | cc->zone = zone; | 854 | cc->zone = zone; |
802 | INIT_LIST_HEAD(&cc->freepages); | 855 | INIT_LIST_HEAD(&cc->freepages); |
803 | INIT_LIST_HEAD(&cc->migratepages); | 856 | INIT_LIST_HEAD(&cc->migratepages); |
804 | 857 | ||
805 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) | 858 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) |
806 | compact_zone(zone, cc); | 859 | compact_zone(zone, cc); |
807 | 860 | ||
808 | if (cc->order > 0) { | 861 | if (cc->order > 0) { |
809 | int ok = zone_watermark_ok(zone, cc->order, | 862 | int ok = zone_watermark_ok(zone, cc->order, |
810 | low_wmark_pages(zone), 0, 0); | 863 | low_wmark_pages(zone), 0, 0); |
811 | if (ok && cc->order > zone->compact_order_failed) | 864 | if (ok && cc->order > zone->compact_order_failed) |
812 | zone->compact_order_failed = cc->order + 1; | 865 | zone->compact_order_failed = cc->order + 1; |
813 | /* Currently async compaction is never deferred. */ | 866 | /* Currently async compaction is never deferred. */ |
814 | else if (!ok && cc->sync) | 867 | else if (!ok && cc->sync) |
815 | defer_compaction(zone, cc->order); | 868 | defer_compaction(zone, cc->order); |
816 | } | 869 | } |
817 | 870 | ||
818 | VM_BUG_ON(!list_empty(&cc->freepages)); | 871 | VM_BUG_ON(!list_empty(&cc->freepages)); |
819 | VM_BUG_ON(!list_empty(&cc->migratepages)); | 872 | VM_BUG_ON(!list_empty(&cc->migratepages)); |
820 | } | 873 | } |
821 | 874 | ||
822 | return 0; | 875 | return 0; |
823 | } | 876 | } |
824 | 877 | ||
825 | int compact_pgdat(pg_data_t *pgdat, int order) | 878 | int compact_pgdat(pg_data_t *pgdat, int order) |
826 | { | 879 | { |
827 | struct compact_control cc = { | 880 | struct compact_control cc = { |
828 | .order = order, | 881 | .order = order, |
829 | .sync = false, | 882 | .sync = false, |
830 | }; | 883 | }; |
831 | 884 | ||
832 | return __compact_pgdat(pgdat, &cc); | 885 | return __compact_pgdat(pgdat, &cc); |
833 | } | 886 | } |
834 | 887 | ||
835 | static int compact_node(int nid) | 888 | static int compact_node(int nid) |
836 | { | 889 | { |
837 | struct compact_control cc = { | 890 | struct compact_control cc = { |
838 | .order = -1, | 891 | .order = -1, |
839 | .sync = true, | 892 | .sync = true, |
840 | }; | 893 | }; |
841 | 894 | ||
842 | return __compact_pgdat(NODE_DATA(nid), &cc); | 895 | return __compact_pgdat(NODE_DATA(nid), &cc); |
843 | } | 896 | } |
844 | 897 | ||
845 | /* Compact all nodes in the system */ | 898 | /* Compact all nodes in the system */ |
846 | static int compact_nodes(void) | 899 | static int compact_nodes(void) |
847 | { | 900 | { |
848 | int nid; | 901 | int nid; |
849 | 902 | ||
850 | /* Flush pending updates to the LRU lists */ | 903 | /* Flush pending updates to the LRU lists */ |
851 | lru_add_drain_all(); | 904 | lru_add_drain_all(); |
852 | 905 | ||
853 | for_each_online_node(nid) | 906 | for_each_online_node(nid) |
854 | compact_node(nid); | 907 | compact_node(nid); |
855 | 908 | ||
856 | return COMPACT_COMPLETE; | 909 | return COMPACT_COMPLETE; |
857 | } | 910 | } |
858 | 911 | ||
859 | /* The written value is actually unused, all memory is compacted */ | 912 | /* The written value is actually unused, all memory is compacted */ |
860 | int sysctl_compact_memory; | 913 | int sysctl_compact_memory; |
861 | 914 | ||
862 | /* This is the entry point for compacting all nodes via /proc/sys/vm */ | 915 | /* This is the entry point for compacting all nodes via /proc/sys/vm */ |
863 | int sysctl_compaction_handler(struct ctl_table *table, int write, | 916 | int sysctl_compaction_handler(struct ctl_table *table, int write, |
864 | void __user *buffer, size_t *length, loff_t *ppos) | 917 | void __user *buffer, size_t *length, loff_t *ppos) |
865 | { | 918 | { |
866 | if (write) | 919 | if (write) |
867 | return compact_nodes(); | 920 | return compact_nodes(); |
868 | 921 | ||
869 | return 0; | 922 | return 0; |
870 | } | 923 | } |
871 | 924 | ||
872 | int sysctl_extfrag_handler(struct ctl_table *table, int write, | 925 | int sysctl_extfrag_handler(struct ctl_table *table, int write, |
873 | void __user *buffer, size_t *length, loff_t *ppos) | 926 | void __user *buffer, size_t *length, loff_t *ppos) |
874 | { | 927 | { |
875 | proc_dointvec_minmax(table, write, buffer, length, ppos); | 928 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
876 | 929 | ||
877 | return 0; | 930 | return 0; |
878 | } | 931 | } |
879 | 932 | ||
880 | #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) | 933 | #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) |
881 | ssize_t sysfs_compact_node(struct device *dev, | 934 | ssize_t sysfs_compact_node(struct device *dev, |
882 | struct device_attribute *attr, | 935 | struct device_attribute *attr, |
883 | const char *buf, size_t count) | 936 | const char *buf, size_t count) |
884 | { | 937 | { |
885 | int nid = dev->id; | 938 | int nid = dev->id; |
886 | 939 | ||
887 | if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { | 940 | if (nid >= 0 && nid < nr_node_ids && node_online(nid)) { |
888 | /* Flush pending updates to the LRU lists */ | 941 | /* Flush pending updates to the LRU lists */ |
889 | lru_add_drain_all(); | 942 | lru_add_drain_all(); |
890 | 943 | ||
891 | compact_node(nid); | 944 | compact_node(nid); |
892 | } | 945 | } |
893 | 946 | ||
894 | return count; | 947 | return count; |
895 | } | 948 | } |
896 | static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); | 949 | static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); |
897 | 950 | ||
898 | int compaction_register_node(struct node *node) | 951 | int compaction_register_node(struct node *node) |
899 | { | 952 | { |
900 | return device_create_file(&node->dev, &dev_attr_compact); | 953 | return device_create_file(&node->dev, &dev_attr_compact); |
901 | } | 954 | } |
902 | 955 | ||
903 | void compaction_unregister_node(struct node *node) | 956 | void compaction_unregister_node(struct node *node) |
904 | { | 957 | { |
905 | return device_remove_file(&node->dev, &dev_attr_compact); | 958 | return device_remove_file(&node->dev, &dev_attr_compact); |
906 | } | 959 | } |
907 | #endif /* CONFIG_SYSFS && CONFIG_NUMA */ | 960 | #endif /* CONFIG_SYSFS && CONFIG_NUMA */ |
908 | 961 | ||
909 | #endif /* CONFIG_COMPACTION */ | 962 | #endif /* CONFIG_COMPACTION */ |
910 | 963 |
mm/internal.h
1 | /* internal.h: mm/ internal definitions | 1 | /* internal.h: mm/ internal definitions |
2 | * | 2 | * |
3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. | 3 | * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. |
4 | * Written by David Howells (dhowells@redhat.com) | 4 | * Written by David Howells (dhowells@redhat.com) |
5 | * | 5 | * |
6 | * This program is free software; you can redistribute it and/or | 6 | * This program is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License | 7 | * modify it under the terms of the GNU General Public License |
8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
10 | */ | 10 | */ |
11 | #ifndef __MM_INTERNAL_H | 11 | #ifndef __MM_INTERNAL_H |
12 | #define __MM_INTERNAL_H | 12 | #define __MM_INTERNAL_H |
13 | 13 | ||
14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
15 | 15 | ||
16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
17 | unsigned long floor, unsigned long ceiling); | 17 | unsigned long floor, unsigned long ceiling); |
18 | 18 | ||
19 | static inline void set_page_count(struct page *page, int v) | 19 | static inline void set_page_count(struct page *page, int v) |
20 | { | 20 | { |
21 | atomic_set(&page->_count, v); | 21 | atomic_set(&page->_count, v); |
22 | } | 22 | } |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * Turn a non-refcounted page (->_count == 0) into refcounted with | 25 | * Turn a non-refcounted page (->_count == 0) into refcounted with |
26 | * a count of one. | 26 | * a count of one. |
27 | */ | 27 | */ |
28 | static inline void set_page_refcounted(struct page *page) | 28 | static inline void set_page_refcounted(struct page *page) |
29 | { | 29 | { |
30 | VM_BUG_ON(PageTail(page)); | 30 | VM_BUG_ON(PageTail(page)); |
31 | VM_BUG_ON(atomic_read(&page->_count)); | 31 | VM_BUG_ON(atomic_read(&page->_count)); |
32 | set_page_count(page, 1); | 32 | set_page_count(page, 1); |
33 | } | 33 | } |
34 | 34 | ||
35 | static inline void __put_page(struct page *page) | 35 | static inline void __put_page(struct page *page) |
36 | { | 36 | { |
37 | atomic_dec(&page->_count); | 37 | atomic_dec(&page->_count); |
38 | } | 38 | } |
39 | 39 | ||
40 | static inline void __get_page_tail_foll(struct page *page, | 40 | static inline void __get_page_tail_foll(struct page *page, |
41 | bool get_page_head) | 41 | bool get_page_head) |
42 | { | 42 | { |
43 | /* | 43 | /* |
44 | * If we're getting a tail page, the elevated page->_count is | 44 | * If we're getting a tail page, the elevated page->_count is |
45 | * required only in the head page and we will elevate the head | 45 | * required only in the head page and we will elevate the head |
46 | * page->_count and tail page->_mapcount. | 46 | * page->_count and tail page->_mapcount. |
47 | * | 47 | * |
48 | * We elevate page_tail->_mapcount for tail pages to force | 48 | * We elevate page_tail->_mapcount for tail pages to force |
49 | * page_tail->_count to be zero at all times to avoid getting | 49 | * page_tail->_count to be zero at all times to avoid getting |
50 | * false positives from get_page_unless_zero() with | 50 | * false positives from get_page_unless_zero() with |
51 | * speculative page access (like in | 51 | * speculative page access (like in |
52 | * page_cache_get_speculative()) on tail pages. | 52 | * page_cache_get_speculative()) on tail pages. |
53 | */ | 53 | */ |
54 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); | 54 | VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); |
55 | VM_BUG_ON(atomic_read(&page->_count) != 0); | 55 | VM_BUG_ON(atomic_read(&page->_count) != 0); |
56 | VM_BUG_ON(page_mapcount(page) < 0); | 56 | VM_BUG_ON(page_mapcount(page) < 0); |
57 | if (get_page_head) | 57 | if (get_page_head) |
58 | atomic_inc(&page->first_page->_count); | 58 | atomic_inc(&page->first_page->_count); |
59 | atomic_inc(&page->_mapcount); | 59 | atomic_inc(&page->_mapcount); |
60 | } | 60 | } |
61 | 61 | ||
62 | /* | 62 | /* |
63 | * This is meant to be called as the FOLL_GET operation of | 63 | * This is meant to be called as the FOLL_GET operation of |
64 | * follow_page() and it must be called while holding the proper PT | 64 | * follow_page() and it must be called while holding the proper PT |
65 | * lock while the pte (or pmd_trans_huge) is still mapping the page. | 65 | * lock while the pte (or pmd_trans_huge) is still mapping the page. |
66 | */ | 66 | */ |
67 | static inline void get_page_foll(struct page *page) | 67 | static inline void get_page_foll(struct page *page) |
68 | { | 68 | { |
69 | if (unlikely(PageTail(page))) | 69 | if (unlikely(PageTail(page))) |
70 | /* | 70 | /* |
71 | * This is safe only because | 71 | * This is safe only because |
72 | * __split_huge_page_refcount() can't run under | 72 | * __split_huge_page_refcount() can't run under |
73 | * get_page_foll() because we hold the proper PT lock. | 73 | * get_page_foll() because we hold the proper PT lock. |
74 | */ | 74 | */ |
75 | __get_page_tail_foll(page, true); | 75 | __get_page_tail_foll(page, true); |
76 | else { | 76 | else { |
77 | /* | 77 | /* |
78 | * Getting a normal page or the head of a compound page | 78 | * Getting a normal page or the head of a compound page |
79 | * requires to already have an elevated page->_count. | 79 | * requires to already have an elevated page->_count. |
80 | */ | 80 | */ |
81 | VM_BUG_ON(atomic_read(&page->_count) <= 0); | 81 | VM_BUG_ON(atomic_read(&page->_count) <= 0); |
82 | atomic_inc(&page->_count); | 82 | atomic_inc(&page->_count); |
83 | } | 83 | } |
84 | } | 84 | } |
85 | 85 | ||
86 | extern unsigned long highest_memmap_pfn; | 86 | extern unsigned long highest_memmap_pfn; |
87 | 87 | ||
88 | /* | 88 | /* |
89 | * in mm/vmscan.c: | 89 | * in mm/vmscan.c: |
90 | */ | 90 | */ |
91 | extern int isolate_lru_page(struct page *page); | 91 | extern int isolate_lru_page(struct page *page); |
92 | extern void putback_lru_page(struct page *page); | 92 | extern void putback_lru_page(struct page *page); |
93 | 93 | ||
94 | /* | 94 | /* |
95 | * in mm/page_alloc.c | 95 | * in mm/page_alloc.c |
96 | */ | 96 | */ |
97 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 97 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
98 | extern void prep_compound_page(struct page *page, unsigned long order); | 98 | extern void prep_compound_page(struct page *page, unsigned long order); |
99 | #ifdef CONFIG_MEMORY_FAILURE | 99 | #ifdef CONFIG_MEMORY_FAILURE |
100 | extern bool is_free_buddy_page(struct page *page); | 100 | extern bool is_free_buddy_page(struct page *page); |
101 | #endif | 101 | #endif |
102 | 102 | ||
103 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | 103 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA |
104 | 104 | ||
105 | /* | 105 | /* |
106 | * in mm/compaction.c | 106 | * in mm/compaction.c |
107 | */ | 107 | */ |
108 | /* | 108 | /* |
109 | * compact_control is used to track pages being migrated and the free pages | 109 | * compact_control is used to track pages being migrated and the free pages |
110 | * they are being migrated to during memory compaction. The free_pfn starts | 110 | * they are being migrated to during memory compaction. The free_pfn starts |
111 | * at the end of a zone and migrate_pfn begins at the start. Movable pages | 111 | * at the end of a zone and migrate_pfn begins at the start. Movable pages |
112 | * are moved to the end of a zone during a compaction run and the run | 112 | * are moved to the end of a zone during a compaction run and the run |
113 | * completes when free_pfn <= migrate_pfn | 113 | * completes when free_pfn <= migrate_pfn |
114 | */ | 114 | */ |
115 | struct compact_control { | 115 | struct compact_control { |
116 | struct list_head freepages; /* List of free pages to migrate to */ | 116 | struct list_head freepages; /* List of free pages to migrate to */ |
117 | struct list_head migratepages; /* List of pages being migrated */ | 117 | struct list_head migratepages; /* List of pages being migrated */ |
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
122 | bool sync; /* Synchronous migration */ | 123 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | ||
125 | incremental, once free_pfn | ||
126 | and migrate_pfn meet, we restart | ||
127 | from the top of the zone; | ||
128 | remember we wrapped around. */ | ||
123 | 129 | ||
124 | int order; /* order a direct compactor needs */ | 130 | int order; /* order a direct compactor needs */ |
125 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
126 | struct zone *zone; | 132 | struct zone *zone; |
127 | }; | 133 | }; |
128 | 134 | ||
129 | unsigned long | 135 | unsigned long |
130 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); | 136 | isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn); |
131 | unsigned long | 137 | unsigned long |
132 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 138 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, |
133 | unsigned long low_pfn, unsigned long end_pfn); | 139 | unsigned long low_pfn, unsigned long end_pfn); |
134 | 140 | ||
135 | #endif | 141 | #endif |
136 | 142 | ||
137 | /* | 143 | /* |
138 | * function for dealing with page's order in buddy system. | 144 | * function for dealing with page's order in buddy system. |
139 | * zone->lock is already acquired when we use these. | 145 | * zone->lock is already acquired when we use these. |
140 | * So, we don't need atomic page->flags operations here. | 146 | * So, we don't need atomic page->flags operations here. |
141 | */ | 147 | */ |
142 | static inline unsigned long page_order(struct page *page) | 148 | static inline unsigned long page_order(struct page *page) |
143 | { | 149 | { |
144 | /* PageBuddy() must be checked by the caller */ | 150 | /* PageBuddy() must be checked by the caller */ |
145 | return page_private(page); | 151 | return page_private(page); |
146 | } | 152 | } |
147 | 153 | ||
148 | /* mm/util.c */ | 154 | /* mm/util.c */ |
149 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | 155 | void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, |
150 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 156 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
151 | 157 | ||
152 | #ifdef CONFIG_MMU | 158 | #ifdef CONFIG_MMU |
153 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 159 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, |
154 | unsigned long start, unsigned long end); | 160 | unsigned long start, unsigned long end); |
155 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 161 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
156 | unsigned long start, unsigned long end); | 162 | unsigned long start, unsigned long end); |
157 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | 163 | static inline void munlock_vma_pages_all(struct vm_area_struct *vma) |
158 | { | 164 | { |
159 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); | 165 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); |
160 | } | 166 | } |
161 | 167 | ||
162 | /* | 168 | /* |
163 | * Called only in fault path via page_evictable() for a new page | 169 | * Called only in fault path via page_evictable() for a new page |
164 | * to determine if it's being mapped into a LOCKED vma. | 170 | * to determine if it's being mapped into a LOCKED vma. |
165 | * If so, mark page as mlocked. | 171 | * If so, mark page as mlocked. |
166 | */ | 172 | */ |
167 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, | 173 | static inline int mlocked_vma_newpage(struct vm_area_struct *vma, |
168 | struct page *page) | 174 | struct page *page) |
169 | { | 175 | { |
170 | VM_BUG_ON(PageLRU(page)); | 176 | VM_BUG_ON(PageLRU(page)); |
171 | 177 | ||
172 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) | 178 | if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) |
173 | return 0; | 179 | return 0; |
174 | 180 | ||
175 | if (!TestSetPageMlocked(page)) { | 181 | if (!TestSetPageMlocked(page)) { |
176 | inc_zone_page_state(page, NR_MLOCK); | 182 | inc_zone_page_state(page, NR_MLOCK); |
177 | count_vm_event(UNEVICTABLE_PGMLOCKED); | 183 | count_vm_event(UNEVICTABLE_PGMLOCKED); |
178 | } | 184 | } |
179 | return 1; | 185 | return 1; |
180 | } | 186 | } |
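The `(vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED` test above only accepts vmas that are mlocked and not special. A tiny standalone demo of that bitmask pattern follows; the flag values are placeholders, not the kernel's actual definitions:

```c
/*
 * Small demonstration of the flag test used above: a vma counts as
 * "purely mlocked" only when VM_LOCKED is set and none of the
 * VM_SPECIAL bits are.  Flag values here are illustrative.
 */
#include <stdio.h>

#define VM_LOCKED	0x01UL
#define VM_SPECIAL	0x06UL	/* stand-in for the special-vma bits */

static int counts_as_mlocked(unsigned long vm_flags)
{
	return (vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
}

int main(void)
{
	printf("locked only      -> %d\n", counts_as_mlocked(VM_LOCKED));
	printf("locked + special -> %d\n",
	       counts_as_mlocked(VM_LOCKED | 0x02UL));
	printf("unlocked         -> %d\n", counts_as_mlocked(0));
	return 0;
}
```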
181 | 187 | ||
182 | /* | 188 | /* |
183 | * must be called with vma's mmap_sem held for read or write, and page locked. | 189 | * must be called with vma's mmap_sem held for read or write, and page locked. |
184 | */ | 190 | */ |
185 | extern void mlock_vma_page(struct page *page); | 191 | extern void mlock_vma_page(struct page *page); |
186 | extern void munlock_vma_page(struct page *page); | 192 | extern void munlock_vma_page(struct page *page); |
187 | 193 | ||
188 | /* | 194 | /* |
189 | * Clear the page's PageMlocked(). This can be useful in a situation where | 195 | * Clear the page's PageMlocked(). This can be useful in a situation where |
190 | * we want to unconditionally remove a page from the pagecache -- e.g., | 196 | * we want to unconditionally remove a page from the pagecache -- e.g., |
191 | * on truncation or freeing. | 197 | * on truncation or freeing. |
192 | * | 198 | * |
193 | * It is legal to call this function for any page, mlocked or not. | 199 | * It is legal to call this function for any page, mlocked or not. |
194 | * If called for a page that is still mapped by mlocked vmas, all we do | 200 | * If called for a page that is still mapped by mlocked vmas, all we do |
195 | * is revert to lazy LRU behaviour -- semantics are not broken. | 201 | * is revert to lazy LRU behaviour -- semantics are not broken. |
196 | */ | 202 | */ |
197 | extern void __clear_page_mlock(struct page *page); | 203 | extern void __clear_page_mlock(struct page *page); |
198 | static inline void clear_page_mlock(struct page *page) | 204 | static inline void clear_page_mlock(struct page *page) |
199 | { | 205 | { |
200 | if (unlikely(TestClearPageMlocked(page))) | 206 | if (unlikely(TestClearPageMlocked(page))) |
201 | __clear_page_mlock(page); | 207 | __clear_page_mlock(page); |
202 | } | 208 | } |
203 | 209 | ||
204 | /* | 210 | /* |
205 | * mlock_migrate_page - called only from migrate_page_copy() to | 211 | * mlock_migrate_page - called only from migrate_page_copy() to |
206 | * migrate the Mlocked page flag; update statistics. | 212 | * migrate the Mlocked page flag; update statistics. |
207 | */ | 213 | */ |
208 | static inline void mlock_migrate_page(struct page *newpage, struct page *page) | 214 | static inline void mlock_migrate_page(struct page *newpage, struct page *page) |
209 | { | 215 | { |
210 | if (TestClearPageMlocked(page)) { | 216 | if (TestClearPageMlocked(page)) { |
211 | unsigned long flags; | 217 | unsigned long flags; |
212 | 218 | ||
213 | local_irq_save(flags); | 219 | local_irq_save(flags); |
214 | __dec_zone_page_state(page, NR_MLOCK); | 220 | __dec_zone_page_state(page, NR_MLOCK); |
215 | SetPageMlocked(newpage); | 221 | SetPageMlocked(newpage); |
216 | __inc_zone_page_state(newpage, NR_MLOCK); | 222 | __inc_zone_page_state(newpage, NR_MLOCK); |
217 | local_irq_restore(flags); | 223 | local_irq_restore(flags); |
218 | } | 224 | } |
219 | } | 225 | } |
220 | 226 | ||
221 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 227 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
222 | extern unsigned long vma_address(struct page *page, | 228 | extern unsigned long vma_address(struct page *page, |
223 | struct vm_area_struct *vma); | 229 | struct vm_area_struct *vma); |
224 | #endif | 230 | #endif |
225 | #else /* !CONFIG_MMU */ | 231 | #else /* !CONFIG_MMU */ |
226 | static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) | 232 | static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) |
227 | { | 233 | { |
228 | return 0; | 234 | return 0; |
229 | } | 235 | } |
230 | static inline void clear_page_mlock(struct page *page) { } | 236 | static inline void clear_page_mlock(struct page *page) { } |
231 | static inline void mlock_vma_page(struct page *page) { } | 237 | static inline void mlock_vma_page(struct page *page) { } |
232 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 238 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
233 | 239 | ||
234 | #endif /* !CONFIG_MMU */ | 240 | #endif /* !CONFIG_MMU */ |
235 | 241 | ||
236 | /* | 242 | /* |
237 | * Return the mem_map entry representing the 'offset' subpage within | 243 | * Return the mem_map entry representing the 'offset' subpage within |
238 | * the maximally aligned gigantic page 'base'. Handle any discontiguity | 244 | * the maximally aligned gigantic page 'base'. Handle any discontiguity |
239 | * in the mem_map at MAX_ORDER_NR_PAGES boundaries. | 245 | * in the mem_map at MAX_ORDER_NR_PAGES boundaries. |
240 | */ | 246 | */ |
241 | static inline struct page *mem_map_offset(struct page *base, int offset) | 247 | static inline struct page *mem_map_offset(struct page *base, int offset) |
242 | { | 248 | { |
243 | if (unlikely(offset >= MAX_ORDER_NR_PAGES)) | 249 | if (unlikely(offset >= MAX_ORDER_NR_PAGES)) |
244 | return pfn_to_page(page_to_pfn(base) + offset); | 250 | return pfn_to_page(page_to_pfn(base) + offset); |
245 | return base + offset; | 251 | return base + offset; |
246 | } | 252 | } |
247 | 253 | ||
248 | /* | 254 | /* |
249 | * Iterator over all subpages within the maximally aligned gigantic | 255 | * Iterator over all subpages within the maximally aligned gigantic |
250 | * page 'base'. Handle any discontiguity in the mem_map. | 256 | * page 'base'. Handle any discontiguity in the mem_map. |
251 | */ | 257 | */ |
252 | static inline struct page *mem_map_next(struct page *iter, | 258 | static inline struct page *mem_map_next(struct page *iter, |
253 | struct page *base, int offset) | 259 | struct page *base, int offset) |
254 | { | 260 | { |
255 | if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { | 261 | if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { |
256 | unsigned long pfn = page_to_pfn(base) + offset; | 262 | unsigned long pfn = page_to_pfn(base) + offset; |
257 | if (!pfn_valid(pfn)) | 263 | if (!pfn_valid(pfn)) |
258 | return NULL; | 264 | return NULL; |
259 | return pfn_to_page(pfn); | 265 | return pfn_to_page(pfn); |
260 | } | 266 | } |
261 | return iter + 1; | 267 | return iter + 1; |
262 | } | 268 | } |
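Both helpers above rely on the same boundary check: within one max-order block, plain pointer arithmetic over the mem_map is safe, but once `offset` crosses a MAX_ORDER_NR_PAGES boundary the pfn must be re-translated. A small compile-and-run sketch of that arithmetic (the constants mirror the MAX_ORDER default; the offsets are arbitrary):

```c
/*
 * Userspace sketch of the MAX_ORDER_NR_PAGES boundary check used by
 * mem_map_offset()/mem_map_next(): crossing a max-order boundary means
 * the mem_map may be discontiguous, so the page must be looked up via
 * the pfn again instead of advancing the pointer.
 */
#include <stdio.h>

#define MAX_ORDER		11
#define MAX_ORDER_NR_PAGES	(1UL << (MAX_ORDER - 1))	/* 1024 */

int main(void)
{
	for (unsigned long offset = 1020; offset < 1030; offset++) {
		int crosses = (offset & (MAX_ORDER_NR_PAGES - 1)) == 0;
		printf("offset %4lu: %s\n", offset,
		       crosses ? "re-translate via pfn_to_page()"
			       : "iter + 1 is safe");
	}
	return 0;
}
```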
263 | 269 | ||
264 | /* | 270 | /* |
265 | * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, | 271 | * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, |
266 | * so all functions starting at paging_init should be marked __init | 272 | * so all functions starting at paging_init should be marked __init |
267 | * in those cases. SPARSEMEM, however, allows for memory hotplug, | 273 | * in those cases. SPARSEMEM, however, allows for memory hotplug, |
268 | * and alloc_bootmem_node is not used. | 274 | * and alloc_bootmem_node is not used. |
269 | */ | 275 | */ |
270 | #ifdef CONFIG_SPARSEMEM | 276 | #ifdef CONFIG_SPARSEMEM |
271 | #define __paginginit __meminit | 277 | #define __paginginit __meminit |
272 | #else | 278 | #else |
273 | #define __paginginit __init | 279 | #define __paginginit __init |
274 | #endif | 280 | #endif |
275 | 281 | ||
276 | /* Memory initialisation debug and verification */ | 282 | /* Memory initialisation debug and verification */ |
277 | enum mminit_level { | 283 | enum mminit_level { |
278 | MMINIT_WARNING, | 284 | MMINIT_WARNING, |
279 | MMINIT_VERIFY, | 285 | MMINIT_VERIFY, |
280 | MMINIT_TRACE | 286 | MMINIT_TRACE |
281 | }; | 287 | }; |
282 | 288 | ||
283 | #ifdef CONFIG_DEBUG_MEMORY_INIT | 289 | #ifdef CONFIG_DEBUG_MEMORY_INIT |
284 | 290 | ||
285 | extern int mminit_loglevel; | 291 | extern int mminit_loglevel; |
286 | 292 | ||
287 | #define mminit_dprintk(level, prefix, fmt, arg...) \ | 293 | #define mminit_dprintk(level, prefix, fmt, arg...) \ |
288 | do { \ | 294 | do { \ |
289 | if (level < mminit_loglevel) { \ | 295 | if (level < mminit_loglevel) { \ |
290 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ | 296 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ |
291 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ | 297 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ |
292 | } \ | 298 | } \ |
293 | } while (0) | 299 | } while (0) |
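The macro only emits output when `level < mminit_loglevel`. Here is a userspace sketch of that same gating, purely for illustration; the loglevel value and messages are made up and the kernel printk plumbing is replaced by stderr:

```c
/* Userspace sketch of the mminit_dprintk() level filter. */
#include <stdio.h>

enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE };

static int mminit_loglevel = 1;	/* assumption: only levels below this print */

#define mminit_dprintk_sketch(level, prefix, fmt, ...)			\
	do {								\
		if ((level) < mminit_loglevel)				\
			fprintf(stderr, "mminit::" prefix " " fmt,	\
				##__VA_ARGS__);				\
	} while (0)

int main(void)
{
	mminit_dprintk_sketch(MMINIT_WARNING, "zonelist", "node %d ok\n", 0);
	mminit_dprintk_sketch(MMINIT_TRACE, "zonelist", "suppressed\n");
	return 0;
}
```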
294 | 300 | ||
295 | extern void mminit_verify_pageflags_layout(void); | 301 | extern void mminit_verify_pageflags_layout(void); |
296 | extern void mminit_verify_page_links(struct page *page, | 302 | extern void mminit_verify_page_links(struct page *page, |
297 | enum zone_type zone, unsigned long nid, unsigned long pfn); | 303 | enum zone_type zone, unsigned long nid, unsigned long pfn); |
298 | extern void mminit_verify_zonelist(void); | 304 | extern void mminit_verify_zonelist(void); |
299 | 305 | ||
300 | #else | 306 | #else |
301 | 307 | ||
302 | static inline void mminit_dprintk(enum mminit_level level, | 308 | static inline void mminit_dprintk(enum mminit_level level, |
303 | const char *prefix, const char *fmt, ...) | 309 | const char *prefix, const char *fmt, ...) |
304 | { | 310 | { |
305 | } | 311 | } |
306 | 312 | ||
307 | static inline void mminit_verify_pageflags_layout(void) | 313 | static inline void mminit_verify_pageflags_layout(void) |
308 | { | 314 | { |
309 | } | 315 | } |
310 | 316 | ||
311 | static inline void mminit_verify_page_links(struct page *page, | 317 | static inline void mminit_verify_page_links(struct page *page, |
312 | enum zone_type zone, unsigned long nid, unsigned long pfn) | 318 | enum zone_type zone, unsigned long nid, unsigned long pfn) |
313 | { | 319 | { |
314 | } | 320 | } |
315 | 321 | ||
316 | static inline void mminit_verify_zonelist(void) | 322 | static inline void mminit_verify_zonelist(void) |
317 | { | 323 | { |
318 | } | 324 | } |
319 | #endif /* CONFIG_DEBUG_MEMORY_INIT */ | 325 | #endif /* CONFIG_DEBUG_MEMORY_INIT */ |
320 | 326 | ||
321 | /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ | 327 | /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ |
322 | #if defined(CONFIG_SPARSEMEM) | 328 | #if defined(CONFIG_SPARSEMEM) |
323 | extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, | 329 | extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, |
324 | unsigned long *end_pfn); | 330 | unsigned long *end_pfn); |
325 | #else | 331 | #else |
326 | static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | 332 | static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, |
327 | unsigned long *end_pfn) | 333 | unsigned long *end_pfn) |
328 | { | 334 | { |
329 | } | 335 | } |
330 | #endif /* CONFIG_SPARSEMEM */ | 336 | #endif /* CONFIG_SPARSEMEM */ |
331 | 337 | ||
332 | #define ZONE_RECLAIM_NOSCAN -2 | 338 | #define ZONE_RECLAIM_NOSCAN -2 |
333 | #define ZONE_RECLAIM_FULL -1 | 339 | #define ZONE_RECLAIM_FULL -1 |
334 | #define ZONE_RECLAIM_SOME 0 | 340 | #define ZONE_RECLAIM_SOME 0 |
335 | #define ZONE_RECLAIM_SUCCESS 1 | 341 | #define ZONE_RECLAIM_SUCCESS 1 |
336 | #endif | 342 | #endif |
337 | 343 | ||
338 | extern int hwpoison_filter(struct page *p); | 344 | extern int hwpoison_filter(struct page *p); |
339 | 345 | ||
340 | extern u32 hwpoison_filter_dev_major; | 346 | extern u32 hwpoison_filter_dev_major; |
341 | extern u32 hwpoison_filter_dev_minor; | 347 | extern u32 hwpoison_filter_dev_minor; |
342 | extern u64 hwpoison_filter_flags_mask; | 348 | extern u64 hwpoison_filter_flags_mask; |
343 | extern u64 hwpoison_filter_flags_value; | 349 | extern u64 hwpoison_filter_flags_value; |
344 | extern u64 hwpoison_filter_memcg; | 350 | extern u64 hwpoison_filter_memcg; |
345 | extern u32 hwpoison_filter_enable; | 351 | extern u32 hwpoison_filter_enable; |
346 | 352 | ||
347 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | 353 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, |
348 | unsigned long, unsigned long, | 354 | unsigned long, unsigned long, |
349 | unsigned long, unsigned long); | 355 | unsigned long, unsigned long); |
350 | 356 |
mm/page_alloc.c
1 | /* | 1 | /* |
2 | * linux/mm/page_alloc.c | 2 | * linux/mm/page_alloc.c |
3 | * | 3 | * |
4 | * Manages the free list, the system allocates free pages here. | 4 | * Manages the free list, the system allocates free pages here. |
5 | * Note that kmalloc() lives in slab.c | 5 | * Note that kmalloc() lives in slab.c |
6 | * | 6 | * |
7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
8 | * Swap reorganised 29.12.95, Stephen Tweedie | 8 | * Swap reorganised 29.12.95, Stephen Tweedie |
9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | 9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 | 10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | 11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 | 12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 | 13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 |
14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/stddef.h> | 17 | #include <linux/stddef.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/interrupt.h> | 20 | #include <linux/interrupt.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> |
23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
24 | #include <linux/memblock.h> | 24 | #include <linux/memblock.h> |
25 | #include <linux/compiler.h> | 25 | #include <linux/compiler.h> |
26 | #include <linux/kernel.h> | 26 | #include <linux/kernel.h> |
27 | #include <linux/kmemcheck.h> | 27 | #include <linux/kmemcheck.h> |
28 | #include <linux/module.h> | 28 | #include <linux/module.h> |
29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
30 | #include <linux/pagevec.h> | 30 | #include <linux/pagevec.h> |
31 | #include <linux/blkdev.h> | 31 | #include <linux/blkdev.h> |
32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
33 | #include <linux/ratelimit.h> | 33 | #include <linux/ratelimit.h> |
34 | #include <linux/oom.h> | 34 | #include <linux/oom.h> |
35 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
36 | #include <linux/topology.h> | 36 | #include <linux/topology.h> |
37 | #include <linux/sysctl.h> | 37 | #include <linux/sysctl.h> |
38 | #include <linux/cpu.h> | 38 | #include <linux/cpu.h> |
39 | #include <linux/cpuset.h> | 39 | #include <linux/cpuset.h> |
40 | #include <linux/memory_hotplug.h> | 40 | #include <linux/memory_hotplug.h> |
41 | #include <linux/nodemask.h> | 41 | #include <linux/nodemask.h> |
42 | #include <linux/vmalloc.h> | 42 | #include <linux/vmalloc.h> |
43 | #include <linux/vmstat.h> | 43 | #include <linux/vmstat.h> |
44 | #include <linux/mempolicy.h> | 44 | #include <linux/mempolicy.h> |
45 | #include <linux/stop_machine.h> | 45 | #include <linux/stop_machine.h> |
46 | #include <linux/sort.h> | 46 | #include <linux/sort.h> |
47 | #include <linux/pfn.h> | 47 | #include <linux/pfn.h> |
48 | #include <linux/backing-dev.h> | 48 | #include <linux/backing-dev.h> |
49 | #include <linux/fault-inject.h> | 49 | #include <linux/fault-inject.h> |
50 | #include <linux/page-isolation.h> | 50 | #include <linux/page-isolation.h> |
51 | #include <linux/page_cgroup.h> | 51 | #include <linux/page_cgroup.h> |
52 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/memory.h> | 54 | #include <linux/memory.h> |
55 | #include <linux/compaction.h> | 55 | #include <linux/compaction.h> |
56 | #include <trace/events/kmem.h> | 56 | #include <trace/events/kmem.h> |
57 | #include <linux/ftrace_event.h> | 57 | #include <linux/ftrace_event.h> |
58 | #include <linux/memcontrol.h> | 58 | #include <linux/memcontrol.h> |
59 | #include <linux/prefetch.h> | 59 | #include <linux/prefetch.h> |
60 | #include <linux/migrate.h> | 60 | #include <linux/migrate.h> |
61 | #include <linux/page-debug-flags.h> | 61 | #include <linux/page-debug-flags.h> |
62 | 62 | ||
63 | #include <asm/tlbflush.h> | 63 | #include <asm/tlbflush.h> |
64 | #include <asm/div64.h> | 64 | #include <asm/div64.h> |
65 | #include "internal.h" | 65 | #include "internal.h" |
66 | 66 | ||
67 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID | 67 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID |
68 | DEFINE_PER_CPU(int, numa_node); | 68 | DEFINE_PER_CPU(int, numa_node); |
69 | EXPORT_PER_CPU_SYMBOL(numa_node); | 69 | EXPORT_PER_CPU_SYMBOL(numa_node); |
70 | #endif | 70 | #endif |
71 | 71 | ||
72 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 72 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
73 | /* | 73 | /* |
74 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. | 74 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. |
75 | * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. | 75 | * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. |
76 | * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() | 76 | * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() |
77 | * defined in <linux/topology.h>. | 77 | * defined in <linux/topology.h>. |
78 | */ | 78 | */ |
79 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ | 79 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ |
80 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); | 80 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); |
81 | #endif | 81 | #endif |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * Array of node states. | 84 | * Array of node states. |
85 | */ | 85 | */ |
86 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | 86 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
87 | [N_POSSIBLE] = NODE_MASK_ALL, | 87 | [N_POSSIBLE] = NODE_MASK_ALL, |
88 | [N_ONLINE] = { { [0] = 1UL } }, | 88 | [N_ONLINE] = { { [0] = 1UL } }, |
89 | #ifndef CONFIG_NUMA | 89 | #ifndef CONFIG_NUMA |
90 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | 90 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, |
91 | #ifdef CONFIG_HIGHMEM | 91 | #ifdef CONFIG_HIGHMEM |
92 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | 92 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
93 | #endif | 93 | #endif |
94 | [N_CPU] = { { [0] = 1UL } }, | 94 | [N_CPU] = { { [0] = 1UL } }, |
95 | #endif /* NUMA */ | 95 | #endif /* NUMA */ |
96 | }; | 96 | }; |
97 | EXPORT_SYMBOL(node_states); | 97 | EXPORT_SYMBOL(node_states); |
98 | 98 | ||
99 | unsigned long totalram_pages __read_mostly; | 99 | unsigned long totalram_pages __read_mostly; |
100 | unsigned long totalreserve_pages __read_mostly; | 100 | unsigned long totalreserve_pages __read_mostly; |
101 | /* | 101 | /* |
102 | * When calculating the number of globally allowed dirty pages, there | 102 | * When calculating the number of globally allowed dirty pages, there |
103 | * is a certain number of per-zone reserves that should not be | 103 | * is a certain number of per-zone reserves that should not be |
104 | * considered dirtyable memory. This is the sum of those reserves | 104 | * considered dirtyable memory. This is the sum of those reserves |
105 | * over all existing zones that contribute dirtyable memory. | 105 | * over all existing zones that contribute dirtyable memory. |
106 | */ | 106 | */ |
107 | unsigned long dirty_balance_reserve __read_mostly; | 107 | unsigned long dirty_balance_reserve __read_mostly; |
108 | 108 | ||
109 | int percpu_pagelist_fraction; | 109 | int percpu_pagelist_fraction; |
110 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 110 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
111 | 111 | ||
112 | #ifdef CONFIG_PM_SLEEP | 112 | #ifdef CONFIG_PM_SLEEP |
113 | /* | 113 | /* |
114 | * The following functions are used by the suspend/hibernate code to temporarily | 114 | * The following functions are used by the suspend/hibernate code to temporarily |
115 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations | 115 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations |
116 | * while devices are suspended. To avoid races with the suspend/hibernate code, | 116 | * while devices are suspended. To avoid races with the suspend/hibernate code, |
117 | * they should always be called with pm_mutex held (gfp_allowed_mask also should | 117 | * they should always be called with pm_mutex held (gfp_allowed_mask also should |
118 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | 118 | * only be modified with pm_mutex held, unless the suspend/hibernate code is |
119 | * guaranteed not to run in parallel with that modification). | 119 | * guaranteed not to run in parallel with that modification). |
120 | */ | 120 | */ |
121 | 121 | ||
122 | static gfp_t saved_gfp_mask; | 122 | static gfp_t saved_gfp_mask; |
123 | 123 | ||
124 | void pm_restore_gfp_mask(void) | 124 | void pm_restore_gfp_mask(void) |
125 | { | 125 | { |
126 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 126 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
127 | if (saved_gfp_mask) { | 127 | if (saved_gfp_mask) { |
128 | gfp_allowed_mask = saved_gfp_mask; | 128 | gfp_allowed_mask = saved_gfp_mask; |
129 | saved_gfp_mask = 0; | 129 | saved_gfp_mask = 0; |
130 | } | 130 | } |
131 | } | 131 | } |
132 | 132 | ||
133 | void pm_restrict_gfp_mask(void) | 133 | void pm_restrict_gfp_mask(void) |
134 | { | 134 | { |
135 | WARN_ON(!mutex_is_locked(&pm_mutex)); | 135 | WARN_ON(!mutex_is_locked(&pm_mutex)); |
136 | WARN_ON(saved_gfp_mask); | 136 | WARN_ON(saved_gfp_mask); |
137 | saved_gfp_mask = gfp_allowed_mask; | 137 | saved_gfp_mask = gfp_allowed_mask; |
138 | gfp_allowed_mask &= ~GFP_IOFS; | 138 | gfp_allowed_mask &= ~GFP_IOFS; |
139 | } | 139 | } |
140 | 140 | ||
141 | bool pm_suspended_storage(void) | 141 | bool pm_suspended_storage(void) |
142 | { | 142 | { |
143 | if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) | 143 | if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) |
144 | return false; | 144 | return false; |
145 | return true; | 145 | return true; |
146 | } | 146 | } |
147 | #endif /* CONFIG_PM_SLEEP */ | 147 | #endif /* CONFIG_PM_SLEEP */ |
148 | 148 | ||
149 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 149 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
150 | int pageblock_order __read_mostly; | 150 | int pageblock_order __read_mostly; |
151 | #endif | 151 | #endif |
152 | 152 | ||
153 | static void __free_pages_ok(struct page *page, unsigned int order); | 153 | static void __free_pages_ok(struct page *page, unsigned int order); |
154 | 154 | ||
155 | /* | 155 | /* |
156 | * results with 256, 32 in the lowmem_reserve sysctl: | 156 | * results with 256, 32 in the lowmem_reserve sysctl: |
157 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) | 157 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) |
158 | * 1G machine -> (16M dma, 784M normal, 224M high) | 158 | * 1G machine -> (16M dma, 784M normal, 224M high) |
159 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 159 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
160 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 160 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
161 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA | 161 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
162 | * | 162 | * |
163 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 163 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
164 | * don't need any ZONE_NORMAL reservation | 164 | * don't need any ZONE_NORMAL reservation |
165 | */ | 165 | */ |
166 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | 166 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { |
167 | #ifdef CONFIG_ZONE_DMA | 167 | #ifdef CONFIG_ZONE_DMA |
168 | 256, | 168 | 256, |
169 | #endif | 169 | #endif |
170 | #ifdef CONFIG_ZONE_DMA32 | 170 | #ifdef CONFIG_ZONE_DMA32 |
171 | 256, | 171 | 256, |
172 | #endif | 172 | #endif |
173 | #ifdef CONFIG_HIGHMEM | 173 | #ifdef CONFIG_HIGHMEM |
174 | 32, | 174 | 32, |
175 | #endif | 175 | #endif |
176 | 32, | 176 | 32, |
177 | }; | 177 | }; |
178 | 178 | ||
179 | EXPORT_SYMBOL(totalram_pages); | 179 | EXPORT_SYMBOL(totalram_pages); |
180 | 180 | ||
181 | static char * const zone_names[MAX_NR_ZONES] = { | 181 | static char * const zone_names[MAX_NR_ZONES] = { |
182 | #ifdef CONFIG_ZONE_DMA | 182 | #ifdef CONFIG_ZONE_DMA |
183 | "DMA", | 183 | "DMA", |
184 | #endif | 184 | #endif |
185 | #ifdef CONFIG_ZONE_DMA32 | 185 | #ifdef CONFIG_ZONE_DMA32 |
186 | "DMA32", | 186 | "DMA32", |
187 | #endif | 187 | #endif |
188 | "Normal", | 188 | "Normal", |
189 | #ifdef CONFIG_HIGHMEM | 189 | #ifdef CONFIG_HIGHMEM |
190 | "HighMem", | 190 | "HighMem", |
191 | #endif | 191 | #endif |
192 | "Movable", | 192 | "Movable", |
193 | }; | 193 | }; |
194 | 194 | ||
195 | int min_free_kbytes = 1024; | 195 | int min_free_kbytes = 1024; |
196 | 196 | ||
197 | static unsigned long __meminitdata nr_kernel_pages; | 197 | static unsigned long __meminitdata nr_kernel_pages; |
198 | static unsigned long __meminitdata nr_all_pages; | 198 | static unsigned long __meminitdata nr_all_pages; |
199 | static unsigned long __meminitdata dma_reserve; | 199 | static unsigned long __meminitdata dma_reserve; |
200 | 200 | ||
201 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 201 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
202 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 202 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
203 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 203 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
204 | static unsigned long __initdata required_kernelcore; | 204 | static unsigned long __initdata required_kernelcore; |
205 | static unsigned long __initdata required_movablecore; | 205 | static unsigned long __initdata required_movablecore; |
206 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 206 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
207 | 207 | ||
208 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 208 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
209 | int movable_zone; | 209 | int movable_zone; |
210 | EXPORT_SYMBOL(movable_zone); | 210 | EXPORT_SYMBOL(movable_zone); |
211 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 211 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
212 | 212 | ||
213 | #if MAX_NUMNODES > 1 | 213 | #if MAX_NUMNODES > 1 |
214 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 214 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
215 | int nr_online_nodes __read_mostly = 1; | 215 | int nr_online_nodes __read_mostly = 1; |
216 | EXPORT_SYMBOL(nr_node_ids); | 216 | EXPORT_SYMBOL(nr_node_ids); |
217 | EXPORT_SYMBOL(nr_online_nodes); | 217 | EXPORT_SYMBOL(nr_online_nodes); |
218 | #endif | 218 | #endif |
219 | 219 | ||
220 | int page_group_by_mobility_disabled __read_mostly; | 220 | int page_group_by_mobility_disabled __read_mostly; |
221 | 221 | ||
222 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 222 | static void set_pageblock_migratetype(struct page *page, int migratetype) |
223 | { | 223 | { |
224 | 224 | ||
225 | if (unlikely(page_group_by_mobility_disabled)) | 225 | if (unlikely(page_group_by_mobility_disabled)) |
226 | migratetype = MIGRATE_UNMOVABLE; | 226 | migratetype = MIGRATE_UNMOVABLE; |
227 | 227 | ||
228 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 228 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
229 | PB_migrate, PB_migrate_end); | 229 | PB_migrate, PB_migrate_end); |
230 | } | 230 | } |
231 | 231 | ||
232 | bool oom_killer_disabled __read_mostly; | 232 | bool oom_killer_disabled __read_mostly; |
233 | 233 | ||
234 | #ifdef CONFIG_DEBUG_VM | 234 | #ifdef CONFIG_DEBUG_VM |
235 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 235 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
236 | { | 236 | { |
237 | int ret = 0; | 237 | int ret = 0; |
238 | unsigned seq; | 238 | unsigned seq; |
239 | unsigned long pfn = page_to_pfn(page); | 239 | unsigned long pfn = page_to_pfn(page); |
240 | 240 | ||
241 | do { | 241 | do { |
242 | seq = zone_span_seqbegin(zone); | 242 | seq = zone_span_seqbegin(zone); |
243 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | 243 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) |
244 | ret = 1; | 244 | ret = 1; |
245 | else if (pfn < zone->zone_start_pfn) | 245 | else if (pfn < zone->zone_start_pfn) |
246 | ret = 1; | 246 | ret = 1; |
247 | } while (zone_span_seqretry(zone, seq)); | 247 | } while (zone_span_seqretry(zone, seq)); |
248 | 248 | ||
249 | return ret; | 249 | return ret; |
250 | } | 250 | } |
251 | 251 | ||
252 | static int page_is_consistent(struct zone *zone, struct page *page) | 252 | static int page_is_consistent(struct zone *zone, struct page *page) |
253 | { | 253 | { |
254 | if (!pfn_valid_within(page_to_pfn(page))) | 254 | if (!pfn_valid_within(page_to_pfn(page))) |
255 | return 0; | 255 | return 0; |
256 | if (zone != page_zone(page)) | 256 | if (zone != page_zone(page)) |
257 | return 0; | 257 | return 0; |
258 | 258 | ||
259 | return 1; | 259 | return 1; |
260 | } | 260 | } |
261 | /* | 261 | /* |
262 | * Temporary debugging check for pages not lying within a given zone. | 262 | * Temporary debugging check for pages not lying within a given zone. |
263 | */ | 263 | */ |
264 | static int bad_range(struct zone *zone, struct page *page) | 264 | static int bad_range(struct zone *zone, struct page *page) |
265 | { | 265 | { |
266 | if (page_outside_zone_boundaries(zone, page)) | 266 | if (page_outside_zone_boundaries(zone, page)) |
267 | return 1; | 267 | return 1; |
268 | if (!page_is_consistent(zone, page)) | 268 | if (!page_is_consistent(zone, page)) |
269 | return 1; | 269 | return 1; |
270 | 270 | ||
271 | return 0; | 271 | return 0; |
272 | } | 272 | } |
273 | #else | 273 | #else |
274 | static inline int bad_range(struct zone *zone, struct page *page) | 274 | static inline int bad_range(struct zone *zone, struct page *page) |
275 | { | 275 | { |
276 | return 0; | 276 | return 0; |
277 | } | 277 | } |
278 | #endif | 278 | #endif |
279 | 279 | ||
280 | static void bad_page(struct page *page) | 280 | static void bad_page(struct page *page) |
281 | { | 281 | { |
282 | static unsigned long resume; | 282 | static unsigned long resume; |
283 | static unsigned long nr_shown; | 283 | static unsigned long nr_shown; |
284 | static unsigned long nr_unshown; | 284 | static unsigned long nr_unshown; |
285 | 285 | ||
286 | /* Don't complain about poisoned pages */ | 286 | /* Don't complain about poisoned pages */ |
287 | if (PageHWPoison(page)) { | 287 | if (PageHWPoison(page)) { |
288 | reset_page_mapcount(page); /* remove PageBuddy */ | 288 | reset_page_mapcount(page); /* remove PageBuddy */ |
289 | return; | 289 | return; |
290 | } | 290 | } |
291 | 291 | ||
292 | /* | 292 | /* |
293 | * Allow a burst of 60 reports, then keep quiet for that minute; | 293 | * Allow a burst of 60 reports, then keep quiet for that minute; |
294 | * or allow a steady drip of one report per second. | 294 | * or allow a steady drip of one report per second. |
295 | */ | 295 | */ |
296 | if (nr_shown == 60) { | 296 | if (nr_shown == 60) { |
297 | if (time_before(jiffies, resume)) { | 297 | if (time_before(jiffies, resume)) { |
298 | nr_unshown++; | 298 | nr_unshown++; |
299 | goto out; | 299 | goto out; |
300 | } | 300 | } |
301 | if (nr_unshown) { | 301 | if (nr_unshown) { |
302 | printk(KERN_ALERT | 302 | printk(KERN_ALERT |
303 | "BUG: Bad page state: %lu messages suppressed\n", | 303 | "BUG: Bad page state: %lu messages suppressed\n", |
304 | nr_unshown); | 304 | nr_unshown); |
305 | nr_unshown = 0; | 305 | nr_unshown = 0; |
306 | } | 306 | } |
307 | nr_shown = 0; | 307 | nr_shown = 0; |
308 | } | 308 | } |
309 | if (nr_shown++ == 0) | 309 | if (nr_shown++ == 0) |
310 | resume = jiffies + 60 * HZ; | 310 | resume = jiffies + 60 * HZ; |
311 | 311 | ||
312 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 312 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", |
313 | current->comm, page_to_pfn(page)); | 313 | current->comm, page_to_pfn(page)); |
314 | dump_page(page); | 314 | dump_page(page); |
315 | 315 | ||
316 | print_modules(); | 316 | print_modules(); |
317 | dump_stack(); | 317 | dump_stack(); |
318 | out: | 318 | out: |
319 | /* Leave bad fields for debug, except PageBuddy could make trouble */ | 319 | /* Leave bad fields for debug, except PageBuddy could make trouble */ |
320 | reset_page_mapcount(page); /* remove PageBuddy */ | 320 | reset_page_mapcount(page); /* remove PageBuddy */ |
321 | add_taint(TAINT_BAD_PAGE); | 321 | add_taint(TAINT_BAD_PAGE); |
322 | } | 322 | } |
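The rate limiting in bad_page() allows a burst of 60 reports and then stays quiet until a minute has passed since the first one, counting what it suppressed. A userspace model of that burst-then-drip pattern, with time(NULL) standing in for jiffies/time_before():

```c
/*
 * Sketch of the burst-then-drip rate limiting used by bad_page():
 * up to 60 messages print back to back, then output is suppressed
 * until a full minute has elapsed since message #1.
 */
#include <stdio.h>
#include <time.h>

static int try_report(const char *msg)
{
	static time_t resume;
	static unsigned long nr_shown, nr_unshown;

	if (nr_shown == 60) {
		if (time(NULL) < resume) {
			nr_unshown++;
			return 0;		/* suppressed */
		}
		if (nr_unshown) {
			printf("%lu messages suppressed\n", nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = time(NULL) + 60;	/* one-minute window */
	printf("%s\n", msg);
	return 1;
}

int main(void)
{
	for (int i = 0; i < 70; i++)
		try_report("bad page report");
	return 0;
}
```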
323 | 323 | ||
324 | /* | 324 | /* |
325 | * Higher-order pages are called "compound pages". They are structured thusly: | 325 | * Higher-order pages are called "compound pages". They are structured thusly: |
326 | * | 326 | * |
327 | * The first PAGE_SIZE page is called the "head page". | 327 | * The first PAGE_SIZE page is called the "head page". |
328 | * | 328 | * |
329 | * The remaining PAGE_SIZE pages are called "tail pages". | 329 | * The remaining PAGE_SIZE pages are called "tail pages". |
330 | * | 330 | * |
331 | * All pages have PG_compound set. All tail pages have their ->first_page | 331 | * All pages have PG_compound set. All tail pages have their ->first_page |
332 | * pointing at the head page. | 332 | * pointing at the head page. |
333 | * | 333 | * |
334 | * The first tail page's ->lru.next holds the address of the compound page's | 334 | * The first tail page's ->lru.next holds the address of the compound page's |
335 | * put_page() function. Its ->lru.prev holds the order of allocation. | 335 | * put_page() function. Its ->lru.prev holds the order of allocation. |
336 | * This usage means that zero-order pages may not be compound. | 336 | * This usage means that zero-order pages may not be compound. |
337 | */ | 337 | */ |
338 | 338 | ||
339 | static void free_compound_page(struct page *page) | 339 | static void free_compound_page(struct page *page) |
340 | { | 340 | { |
341 | __free_pages_ok(page, compound_order(page)); | 341 | __free_pages_ok(page, compound_order(page)); |
342 | } | 342 | } |
343 | 343 | ||
344 | void prep_compound_page(struct page *page, unsigned long order) | 344 | void prep_compound_page(struct page *page, unsigned long order) |
345 | { | 345 | { |
346 | int i; | 346 | int i; |
347 | int nr_pages = 1 << order; | 347 | int nr_pages = 1 << order; |
348 | 348 | ||
349 | set_compound_page_dtor(page, free_compound_page); | 349 | set_compound_page_dtor(page, free_compound_page); |
350 | set_compound_order(page, order); | 350 | set_compound_order(page, order); |
351 | __SetPageHead(page); | 351 | __SetPageHead(page); |
352 | for (i = 1; i < nr_pages; i++) { | 352 | for (i = 1; i < nr_pages; i++) { |
353 | struct page *p = page + i; | 353 | struct page *p = page + i; |
354 | __SetPageTail(p); | 354 | __SetPageTail(p); |
355 | set_page_count(p, 0); | 355 | set_page_count(p, 0); |
356 | p->first_page = page; | 356 | p->first_page = page; |
357 | } | 357 | } |
358 | } | 358 | } |
359 | 359 | ||
360 | /* update __split_huge_page_refcount if you change this function */ | 360 | /* update __split_huge_page_refcount if you change this function */ |
361 | static int destroy_compound_page(struct page *page, unsigned long order) | 361 | static int destroy_compound_page(struct page *page, unsigned long order) |
362 | { | 362 | { |
363 | int i; | 363 | int i; |
364 | int nr_pages = 1 << order; | 364 | int nr_pages = 1 << order; |
365 | int bad = 0; | 365 | int bad = 0; |
366 | 366 | ||
367 | if (unlikely(compound_order(page) != order) || | 367 | if (unlikely(compound_order(page) != order) || |
368 | unlikely(!PageHead(page))) { | 368 | unlikely(!PageHead(page))) { |
369 | bad_page(page); | 369 | bad_page(page); |
370 | bad++; | 370 | bad++; |
371 | } | 371 | } |
372 | 372 | ||
373 | __ClearPageHead(page); | 373 | __ClearPageHead(page); |
374 | 374 | ||
375 | for (i = 1; i < nr_pages; i++) { | 375 | for (i = 1; i < nr_pages; i++) { |
376 | struct page *p = page + i; | 376 | struct page *p = page + i; |
377 | 377 | ||
378 | if (unlikely(!PageTail(p) || (p->first_page != page))) { | 378 | if (unlikely(!PageTail(p) || (p->first_page != page))) { |
379 | bad_page(page); | 379 | bad_page(page); |
380 | bad++; | 380 | bad++; |
381 | } | 381 | } |
382 | __ClearPageTail(p); | 382 | __ClearPageTail(p); |
383 | } | 383 | } |
384 | 384 | ||
385 | return bad; | 385 | return bad; |
386 | } | 386 | } |
387 | 387 | ||
388 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 388 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) |
389 | { | 389 | { |
390 | int i; | 390 | int i; |
391 | 391 | ||
392 | /* | 392 | /* |
393 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | 393 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO |
394 | * and __GFP_HIGHMEM from hard or soft interrupt context. | 394 | * and __GFP_HIGHMEM from hard or soft interrupt context. |
395 | */ | 395 | */ |
396 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | 396 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); |
397 | for (i = 0; i < (1 << order); i++) | 397 | for (i = 0; i < (1 << order); i++) |
398 | clear_highpage(page + i); | 398 | clear_highpage(page + i); |
399 | } | 399 | } |
400 | 400 | ||
401 | #ifdef CONFIG_DEBUG_PAGEALLOC | 401 | #ifdef CONFIG_DEBUG_PAGEALLOC |
402 | unsigned int _debug_guardpage_minorder; | 402 | unsigned int _debug_guardpage_minorder; |
403 | 403 | ||
404 | static int __init debug_guardpage_minorder_setup(char *buf) | 404 | static int __init debug_guardpage_minorder_setup(char *buf) |
405 | { | 405 | { |
406 | unsigned long res; | 406 | unsigned long res; |
407 | 407 | ||
408 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { | 408 | if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { |
409 | printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); | 409 | printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); |
410 | return 0; | 410 | return 0; |
411 | } | 411 | } |
412 | _debug_guardpage_minorder = res; | 412 | _debug_guardpage_minorder = res; |
413 | printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); | 413 | printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); |
414 | return 0; | 414 | return 0; |
415 | } | 415 | } |
416 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); | 416 | __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); |
417 | 417 | ||
418 | static inline void set_page_guard_flag(struct page *page) | 418 | static inline void set_page_guard_flag(struct page *page) |
419 | { | 419 | { |
420 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 420 | __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); |
421 | } | 421 | } |
422 | 422 | ||
423 | static inline void clear_page_guard_flag(struct page *page) | 423 | static inline void clear_page_guard_flag(struct page *page) |
424 | { | 424 | { |
425 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); | 425 | __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); |
426 | } | 426 | } |
427 | #else | 427 | #else |
428 | static inline void set_page_guard_flag(struct page *page) { } | 428 | static inline void set_page_guard_flag(struct page *page) { } |
429 | static inline void clear_page_guard_flag(struct page *page) { } | 429 | static inline void clear_page_guard_flag(struct page *page) { } |
430 | #endif | 430 | #endif |
431 | 431 | ||
432 | static inline void set_page_order(struct page *page, int order) | 432 | static inline void set_page_order(struct page *page, int order) |
433 | { | 433 | { |
434 | set_page_private(page, order); | 434 | set_page_private(page, order); |
435 | __SetPageBuddy(page); | 435 | __SetPageBuddy(page); |
436 | } | 436 | } |
437 | 437 | ||
438 | static inline void rmv_page_order(struct page *page) | 438 | static inline void rmv_page_order(struct page *page) |
439 | { | 439 | { |
440 | __ClearPageBuddy(page); | 440 | __ClearPageBuddy(page); |
441 | set_page_private(page, 0); | 441 | set_page_private(page, 0); |
442 | } | 442 | } |
443 | 443 | ||
444 | /* | 444 | /* |
445 | * Locate the struct page for both the matching buddy in our | 445 | * Locate the struct page for both the matching buddy in our |
446 | * pair (buddy1) and the combined O(n+1) page they form (page). | 446 | * pair (buddy1) and the combined O(n+1) page they form (page). |
447 | * | 447 | * |
448 | * 1) Any buddy B1 will have an order O twin B2 which satisfies | 448 | * 1) Any buddy B1 will have an order O twin B2 which satisfies |
449 | * the following equation: | 449 | * the following equation: |
450 | * B2 = B1 ^ (1 << O) | 450 | * B2 = B1 ^ (1 << O) |
451 | * For example, if the starting buddy (buddy2) is #8 its order | 451 | * For example, if the starting buddy (buddy2) is #8 its order |
452 | * 1 buddy is #10: | 452 | * 1 buddy is #10: |
453 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 | 453 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 |
454 | * | 454 | * |
455 | * 2) Any buddy B will have an order O+1 parent P which | 455 | * 2) Any buddy B will have an order O+1 parent P which |
456 | * satisfies the following equation: | 456 | * satisfies the following equation: |
457 | * P = B & ~(1 << O) | 457 | * P = B & ~(1 << O) |
458 | * | 458 | * |
459 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 459 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
460 | */ | 460 | */ |
461 | static inline unsigned long | 461 | static inline unsigned long |
462 | __find_buddy_index(unsigned long page_idx, unsigned int order) | 462 | __find_buddy_index(unsigned long page_idx, unsigned int order) |
463 | { | 463 | { |
464 | return page_idx ^ (1 << order); | 464 | return page_idx ^ (1 << order); |
465 | } | 465 | } |
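To make the arithmetic in the comment concrete, this minimal program evaluates both identities for the #8/#10 example used above; it is pure arithmetic with nothing kernel-specific:

```c
/*
 * B2 = B1 ^ (1 << O) finds the order-O buddy, and buddy_idx & page_idx
 * gives the index of the combined order-(O+1) block.
 */
#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 8;
	unsigned int order = 1;

	unsigned long buddy_idx = page_idx ^ (1UL << order);	/* 10 */
	unsigned long combined  = buddy_idx & page_idx;		/*  8 */

	printf("order-%u buddy of %lu is %lu, combined block starts at %lu\n",
	       order, page_idx, buddy_idx, combined);
	return 0;
}
```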
466 | 466 | ||
467 | /* | 467 | /* |
468 | * This function checks whether a page is free && is the buddy | 468 | * This function checks whether a page is free && is the buddy |
469 | * we can do coalesce a page and its buddy if | 469 | * we can do coalesce a page and its buddy if |
470 | * (a) the buddy is not in a hole && | 470 | * (a) the buddy is not in a hole && |
471 | * (b) the buddy is in the buddy system && | 471 | * (b) the buddy is in the buddy system && |
472 | * (c) a page and its buddy have the same order && | 472 | * (c) a page and its buddy have the same order && |
473 | * (d) a page and its buddy are in the same zone. | 473 | * (d) a page and its buddy are in the same zone. |
474 | * | 474 | * |
475 | * For recording whether a page is in the buddy system, we set ->_mapcount -2. | 475 | * For recording whether a page is in the buddy system, we set ->_mapcount -2. |
476 | * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. | 476 | * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. |
477 | * | 477 | * |
478 | * For recording page's order, we use page_private(page). | 478 | * For recording page's order, we use page_private(page). |
479 | */ | 479 | */ |
480 | static inline int page_is_buddy(struct page *page, struct page *buddy, | 480 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
481 | int order) | 481 | int order) |
482 | { | 482 | { |
483 | if (!pfn_valid_within(page_to_pfn(buddy))) | 483 | if (!pfn_valid_within(page_to_pfn(buddy))) |
484 | return 0; | 484 | return 0; |
485 | 485 | ||
486 | if (page_zone_id(page) != page_zone_id(buddy)) | 486 | if (page_zone_id(page) != page_zone_id(buddy)) |
487 | return 0; | 487 | return 0; |
488 | 488 | ||
489 | if (page_is_guard(buddy) && page_order(buddy) == order) { | 489 | if (page_is_guard(buddy) && page_order(buddy) == order) { |
490 | VM_BUG_ON(page_count(buddy) != 0); | 490 | VM_BUG_ON(page_count(buddy) != 0); |
491 | return 1; | 491 | return 1; |
492 | } | 492 | } |
493 | 493 | ||
494 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 494 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
495 | VM_BUG_ON(page_count(buddy) != 0); | 495 | VM_BUG_ON(page_count(buddy) != 0); |
496 | return 1; | 496 | return 1; |
497 | } | 497 | } |
498 | return 0; | 498 | return 0; |
499 | } | 499 | } |
500 | 500 | ||
501 | /* | 501 | /* |
502 | * Freeing function for a buddy system allocator. | 502 | * Freeing function for a buddy system allocator. |
503 | * | 503 | * |
504 | * The concept of a buddy system is to maintain direct-mapped table | 504 | * The concept of a buddy system is to maintain direct-mapped table |
505 | * (containing bit values) for memory blocks of various "orders". | 505 | * (containing bit values) for memory blocks of various "orders". |
506 | * The bottom level table contains the map for the smallest allocatable | 506 | * The bottom level table contains the map for the smallest allocatable |
507 | * units of memory (here, pages), and each level above it describes | 507 | * units of memory (here, pages), and each level above it describes |
508 | * pairs of units from the levels below, hence, "buddies". | 508 | * pairs of units from the levels below, hence, "buddies". |
509 | * At a high level, all that happens here is marking the table entry | 509 | * At a high level, all that happens here is marking the table entry |
510 | * at the bottom level available, and propagating the changes upward | 510 | * at the bottom level available, and propagating the changes upward |
511 | * as necessary, plus some accounting needed to play nicely with other | 511 | * as necessary, plus some accounting needed to play nicely with other |
512 | * parts of the VM system. | 512 | * parts of the VM system. |
513 | * At each level, we keep a list of pages, which are heads of continuous | 513 | * At each level, we keep a list of pages, which are heads of continuous |
514 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's | 514 | * free pages of length of (1 << order) and marked with _mapcount -2. Page's |
515 | * order is recorded in page_private(page) field. | 515 | * order is recorded in page_private(page) field. |
516 | * So when we are allocating or freeing one, we can derive the state of the | 516 | * So when we are allocating or freeing one, we can derive the state of the |
517 | * other. That is, if we allocate a small block, and both were | 517 | * other. That is, if we allocate a small block, and both were |
518 | * free, the remainder of the region must be split into blocks. | 518 | * free, the remainder of the region must be split into blocks. |
519 | * If a block is freed, and its buddy is also free, then this | 519 | * If a block is freed, and its buddy is also free, then this |
520 | * triggers coalescing into a block of larger size. | 520 | * triggers coalescing into a block of larger size. |
521 | * | 521 | * |
522 | * -- wli | 522 | * -- wli |
523 | */ | 523 | */ |
524 | 524 | ||
525 | static inline void __free_one_page(struct page *page, | 525 | static inline void __free_one_page(struct page *page, |
526 | struct zone *zone, unsigned int order, | 526 | struct zone *zone, unsigned int order, |
527 | int migratetype) | 527 | int migratetype) |
528 | { | 528 | { |
529 | unsigned long page_idx; | 529 | unsigned long page_idx; |
530 | unsigned long combined_idx; | 530 | unsigned long combined_idx; |
531 | unsigned long uninitialized_var(buddy_idx); | 531 | unsigned long uninitialized_var(buddy_idx); |
532 | struct page *buddy; | 532 | struct page *buddy; |
533 | 533 | ||
534 | if (unlikely(PageCompound(page))) | 534 | if (unlikely(PageCompound(page))) |
535 | if (unlikely(destroy_compound_page(page, order))) | 535 | if (unlikely(destroy_compound_page(page, order))) |
536 | return; | 536 | return; |
537 | 537 | ||
538 | VM_BUG_ON(migratetype == -1); | 538 | VM_BUG_ON(migratetype == -1); |
539 | 539 | ||
540 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 540 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
541 | 541 | ||
542 | VM_BUG_ON(page_idx & ((1 << order) - 1)); | 542 | VM_BUG_ON(page_idx & ((1 << order) - 1)); |
543 | VM_BUG_ON(bad_range(zone, page)); | 543 | VM_BUG_ON(bad_range(zone, page)); |
544 | 544 | ||
545 | while (order < MAX_ORDER-1) { | 545 | while (order < MAX_ORDER-1) { |
546 | buddy_idx = __find_buddy_index(page_idx, order); | 546 | buddy_idx = __find_buddy_index(page_idx, order); |
547 | buddy = page + (buddy_idx - page_idx); | 547 | buddy = page + (buddy_idx - page_idx); |
548 | if (!page_is_buddy(page, buddy, order)) | 548 | if (!page_is_buddy(page, buddy, order)) |
549 | break; | 549 | break; |
550 | /* | 550 | /* |
551 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, | 551 | * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
552 | * merge with it and move up one order. | 552 | * merge with it and move up one order. |
553 | */ | 553 | */ |
554 | if (page_is_guard(buddy)) { | 554 | if (page_is_guard(buddy)) { |
555 | clear_page_guard_flag(buddy); | 555 | clear_page_guard_flag(buddy); |
556 | set_page_private(page, 0); | 556 | set_page_private(page, 0); |
557 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 557 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
558 | } else { | 558 | } else { |
559 | list_del(&buddy->lru); | 559 | list_del(&buddy->lru); |
560 | zone->free_area[order].nr_free--; | 560 | zone->free_area[order].nr_free--; |
561 | rmv_page_order(buddy); | 561 | rmv_page_order(buddy); |
562 | } | 562 | } |
563 | combined_idx = buddy_idx & page_idx; | 563 | combined_idx = buddy_idx & page_idx; |
564 | page = page + (combined_idx - page_idx); | 564 | page = page + (combined_idx - page_idx); |
565 | page_idx = combined_idx; | 565 | page_idx = combined_idx; |
566 | order++; | 566 | order++; |
567 | } | 567 | } |
568 | set_page_order(page, order); | 568 | set_page_order(page, order); |
569 | 569 | ||
570 | /* | 570 | /* |
571 | * If this is not the largest possible page, check if the buddy | 571 | * If this is not the largest possible page, check if the buddy |
572 | * of the next-highest order is free. If it is, it's possible | 572 | * of the next-highest order is free. If it is, it's possible |
573 | * that pages are being freed that will coalesce soon. In case | 573 | * that pages are being freed that will coalesce soon. In case |
574 | * that is happening, add the free page to the tail of the list | 574 | * that is happening, add the free page to the tail of the list |
575 | * so it's less likely to be used soon and more likely to be merged | 575 | * so it's less likely to be used soon and more likely to be merged |
576 | * as a higher order page | 576 | * as a higher order page |
577 | */ | 577 | */ |
578 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { | 578 | if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { |
579 | struct page *higher_page, *higher_buddy; | 579 | struct page *higher_page, *higher_buddy; |
580 | combined_idx = buddy_idx & page_idx; | 580 | combined_idx = buddy_idx & page_idx; |
581 | higher_page = page + (combined_idx - page_idx); | 581 | higher_page = page + (combined_idx - page_idx); |
582 | buddy_idx = __find_buddy_index(combined_idx, order + 1); | 582 | buddy_idx = __find_buddy_index(combined_idx, order + 1); |
583 | higher_buddy = page + (buddy_idx - combined_idx); | 583 | higher_buddy = page + (buddy_idx - combined_idx); |
584 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { | 584 | if (page_is_buddy(higher_page, higher_buddy, order + 1)) { |
585 | list_add_tail(&page->lru, | 585 | list_add_tail(&page->lru, |
586 | &zone->free_area[order].free_list[migratetype]); | 586 | &zone->free_area[order].free_list[migratetype]); |
587 | goto out; | 587 | goto out; |
588 | } | 588 | } |
589 | } | 589 | } |
590 | 590 | ||
591 | list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); | 591 | list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); |
592 | out: | 592 | out: |
593 | zone->free_area[order].nr_free++; | 593 | zone->free_area[order].nr_free++; |
594 | } | 594 | } |
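As a rough illustration of the merge loop above, the following toy walk-through climbs orders while the computed buddy is free. The bitmap stands in for the real free lists and the PageBuddy()/page_order() checks, so treat it as a sketch only:

```c
/*
 * Toy simulation of the __free_one_page() merge cascade: starting from
 * a freed order-0 page whose buddies happen to be free, the page index
 * and order climb until a buddy is busy.
 */
#include <stdio.h>
#include <stdbool.h>

#define NPAGES 16
static bool page_free[NPAGES] = {
	[1] = true, [2] = true, [3] = true,	/* pretend these are free */
};

int main(void)
{
	unsigned long page_idx = 0;	/* page being freed */
	unsigned int order = 0;

	while (order < 3) {
		unsigned long buddy_idx = page_idx ^ (1UL << order);
		if (!page_free[buddy_idx])
			break;			/* buddy busy: stop merging */
		page_free[buddy_idx] = false;	/* take buddy off its "list" */
		page_idx &= buddy_idx;		/* combined block index */
		order++;
		printf("merged up to order %u at index %lu\n", order, page_idx);
	}
	return 0;
}
```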
595 | 595 | ||
596 | /* | 596 | /* |
597 | * free_page_mlock() -- clean up attempts to free an mlocked() page. | 597 | * free_page_mlock() -- clean up attempts to free an mlocked() page. |
598 | * Page should not be on lru, so no need to fix that up. | 598 | * Page should not be on lru, so no need to fix that up. |
599 | * free_pages_check() will verify... | 599 | * free_pages_check() will verify... |
600 | */ | 600 | */ |
601 | static inline void free_page_mlock(struct page *page) | 601 | static inline void free_page_mlock(struct page *page) |
602 | { | 602 | { |
603 | __dec_zone_page_state(page, NR_MLOCK); | 603 | __dec_zone_page_state(page, NR_MLOCK); |
604 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | 604 | __count_vm_event(UNEVICTABLE_MLOCKFREED); |
605 | } | 605 | } |
606 | 606 | ||
607 | static inline int free_pages_check(struct page *page) | 607 | static inline int free_pages_check(struct page *page) |
608 | { | 608 | { |
609 | if (unlikely(page_mapcount(page) | | 609 | if (unlikely(page_mapcount(page) | |
610 | (page->mapping != NULL) | | 610 | (page->mapping != NULL) | |
611 | (atomic_read(&page->_count) != 0) | | 611 | (atomic_read(&page->_count) != 0) | |
612 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | | 612 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE) | |
613 | (mem_cgroup_bad_page_check(page)))) { | 613 | (mem_cgroup_bad_page_check(page)))) { |
614 | bad_page(page); | 614 | bad_page(page); |
615 | return 1; | 615 | return 1; |
616 | } | 616 | } |
617 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 617 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
618 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 618 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
619 | return 0; | 619 | return 0; |
620 | } | 620 | } |
621 | 621 | ||
622 | /* | 622 | /* |
623 | * Frees a number of pages from the PCP lists | 623 | * Frees a number of pages from the PCP lists |
624 | * Assumes all pages on list are in same zone, and of same order. | 624 | * Assumes all pages on list are in same zone, and of same order. |
625 | * count is the number of pages to free. | 625 | * count is the number of pages to free. |
626 | * | 626 | * |
627 | * If the zone was previously in an "all pages pinned" state then look to | 627 | * If the zone was previously in an "all pages pinned" state then look to |
628 | * see if this freeing clears that state. | 628 | * see if this freeing clears that state. |
629 | * | 629 | * |
630 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 630 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
631 | * pinned" detection logic. | 631 | * pinned" detection logic. |
632 | */ | 632 | */ |
633 | static void free_pcppages_bulk(struct zone *zone, int count, | 633 | static void free_pcppages_bulk(struct zone *zone, int count, |
634 | struct per_cpu_pages *pcp) | 634 | struct per_cpu_pages *pcp) |
635 | { | 635 | { |
636 | int migratetype = 0; | 636 | int migratetype = 0; |
637 | int batch_free = 0; | 637 | int batch_free = 0; |
638 | int to_free = count; | 638 | int to_free = count; |
639 | 639 | ||
640 | spin_lock(&zone->lock); | 640 | spin_lock(&zone->lock); |
641 | zone->all_unreclaimable = 0; | 641 | zone->all_unreclaimable = 0; |
642 | zone->pages_scanned = 0; | 642 | zone->pages_scanned = 0; |
643 | 643 | ||
644 | while (to_free) { | 644 | while (to_free) { |
645 | struct page *page; | 645 | struct page *page; |
646 | struct list_head *list; | 646 | struct list_head *list; |
647 | 647 | ||
648 | /* | 648 | /* |
649 | * Remove pages from lists in a round-robin fashion. A | 649 | * Remove pages from lists in a round-robin fashion. A |
650 | * batch_free count is maintained that is incremented when an | 650 | * batch_free count is maintained that is incremented when an |
651 | * empty list is encountered. This is so more pages are freed | 651 | * empty list is encountered. This is so more pages are freed |
652 | * off fuller lists instead of spinning excessively around empty | 652 | * off fuller lists instead of spinning excessively around empty |
653 | * lists | 653 | * lists |
654 | */ | 654 | */ |
655 | do { | 655 | do { |
656 | batch_free++; | 656 | batch_free++; |
657 | if (++migratetype == MIGRATE_PCPTYPES) | 657 | if (++migratetype == MIGRATE_PCPTYPES) |
658 | migratetype = 0; | 658 | migratetype = 0; |
659 | list = &pcp->lists[migratetype]; | 659 | list = &pcp->lists[migratetype]; |
660 | } while (list_empty(list)); | 660 | } while (list_empty(list)); |
661 | 661 | ||
662 | /* This is the only non-empty list. Free them all. */ | 662 | /* This is the only non-empty list. Free them all. */ |
663 | if (batch_free == MIGRATE_PCPTYPES) | 663 | if (batch_free == MIGRATE_PCPTYPES) |
664 | batch_free = to_free; | 664 | batch_free = to_free; |
665 | 665 | ||
666 | do { | 666 | do { |
667 | page = list_entry(list->prev, struct page, lru); | 667 | page = list_entry(list->prev, struct page, lru); |
668 | /* must delete as __free_one_page list manipulates */ | 668 | /* must delete as __free_one_page list manipulates */ |
669 | list_del(&page->lru); | 669 | list_del(&page->lru); |
670 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | 670 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
671 | __free_one_page(page, zone, 0, page_private(page)); | 671 | __free_one_page(page, zone, 0, page_private(page)); |
672 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | 672 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); |
673 | } while (--to_free && --batch_free && !list_empty(list)); | 673 | } while (--to_free && --batch_free && !list_empty(list)); |
674 | } | 674 | } |
675 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 675 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
676 | spin_unlock(&zone->lock); | 676 | spin_unlock(&zone->lock); |
677 | } | 677 | } |
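
The round-robin draining above is easier to follow in isolation. Below is a small userspace model of the selection logic (illustration only; NR_LISTS stands in for MIGRATE_PCPTYPES and the list sizes are made up): empty lists are skipped, and for every empty list skipped one extra page is taken from the next non-empty list, so fuller lists drain faster.

#include <stdio.h>

#define NR_LISTS 3	/* stands in for MIGRATE_PCPTYPES */

int main(void)
{
	/* invented per-cpu list sizes; as in the kernel, their sum must cover to_free */
	int list_len[NR_LISTS] = { 0, 5, 2 };
	int to_free = 6;
	int migratetype = 0, batch_free = 0;

	while (to_free) {
		/* skip empty lists, counting how many were skipped */
		do {
			batch_free++;
			if (++migratetype == NR_LISTS)
				migratetype = 0;
		} while (list_len[migratetype] == 0);

		/* every other list is empty: take everything from this one */
		if (batch_free == NR_LISTS)
			batch_free = to_free;

		do {
			list_len[migratetype]--;
			printf("freed one page from list %d\n", migratetype);
		} while (--to_free && --batch_free && list_len[migratetype]);
	}
	return 0;
}
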
678 | 678 | ||
679 | static void free_one_page(struct zone *zone, struct page *page, int order, | 679 | static void free_one_page(struct zone *zone, struct page *page, int order, |
680 | int migratetype) | 680 | int migratetype) |
681 | { | 681 | { |
682 | spin_lock(&zone->lock); | 682 | spin_lock(&zone->lock); |
683 | zone->all_unreclaimable = 0; | 683 | zone->all_unreclaimable = 0; |
684 | zone->pages_scanned = 0; | 684 | zone->pages_scanned = 0; |
685 | 685 | ||
686 | __free_one_page(page, zone, order, migratetype); | 686 | __free_one_page(page, zone, order, migratetype); |
687 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 687 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
688 | spin_unlock(&zone->lock); | 688 | spin_unlock(&zone->lock); |
689 | } | 689 | } |
690 | 690 | ||
691 | static bool free_pages_prepare(struct page *page, unsigned int order) | 691 | static bool free_pages_prepare(struct page *page, unsigned int order) |
692 | { | 692 | { |
693 | int i; | 693 | int i; |
694 | int bad = 0; | 694 | int bad = 0; |
695 | 695 | ||
696 | trace_mm_page_free(page, order); | 696 | trace_mm_page_free(page, order); |
697 | kmemcheck_free_shadow(page, order); | 697 | kmemcheck_free_shadow(page, order); |
698 | 698 | ||
699 | if (PageAnon(page)) | 699 | if (PageAnon(page)) |
700 | page->mapping = NULL; | 700 | page->mapping = NULL; |
701 | for (i = 0; i < (1 << order); i++) | 701 | for (i = 0; i < (1 << order); i++) |
702 | bad += free_pages_check(page + i); | 702 | bad += free_pages_check(page + i); |
703 | if (bad) | 703 | if (bad) |
704 | return false; | 704 | return false; |
705 | 705 | ||
706 | if (!PageHighMem(page)) { | 706 | if (!PageHighMem(page)) { |
707 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | 707 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); |
708 | debug_check_no_obj_freed(page_address(page), | 708 | debug_check_no_obj_freed(page_address(page), |
709 | PAGE_SIZE << order); | 709 | PAGE_SIZE << order); |
710 | } | 710 | } |
711 | arch_free_page(page, order); | 711 | arch_free_page(page, order); |
712 | kernel_map_pages(page, 1 << order, 0); | 712 | kernel_map_pages(page, 1 << order, 0); |
713 | 713 | ||
714 | return true; | 714 | return true; |
715 | } | 715 | } |
716 | 716 | ||
717 | static void __free_pages_ok(struct page *page, unsigned int order) | 717 | static void __free_pages_ok(struct page *page, unsigned int order) |
718 | { | 718 | { |
719 | unsigned long flags; | 719 | unsigned long flags; |
720 | int wasMlocked = __TestClearPageMlocked(page); | 720 | int wasMlocked = __TestClearPageMlocked(page); |
721 | 721 | ||
722 | if (!free_pages_prepare(page, order)) | 722 | if (!free_pages_prepare(page, order)) |
723 | return; | 723 | return; |
724 | 724 | ||
725 | local_irq_save(flags); | 725 | local_irq_save(flags); |
726 | if (unlikely(wasMlocked)) | 726 | if (unlikely(wasMlocked)) |
727 | free_page_mlock(page); | 727 | free_page_mlock(page); |
728 | __count_vm_events(PGFREE, 1 << order); | 728 | __count_vm_events(PGFREE, 1 << order); |
729 | free_one_page(page_zone(page), page, order, | 729 | free_one_page(page_zone(page), page, order, |
730 | get_pageblock_migratetype(page)); | 730 | get_pageblock_migratetype(page)); |
731 | local_irq_restore(flags); | 731 | local_irq_restore(flags); |
732 | } | 732 | } |
733 | 733 | ||
734 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 734 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
735 | { | 735 | { |
736 | unsigned int nr_pages = 1 << order; | 736 | unsigned int nr_pages = 1 << order; |
737 | unsigned int loop; | 737 | unsigned int loop; |
738 | 738 | ||
739 | prefetchw(page); | 739 | prefetchw(page); |
740 | for (loop = 0; loop < nr_pages; loop++) { | 740 | for (loop = 0; loop < nr_pages; loop++) { |
741 | struct page *p = &page[loop]; | 741 | struct page *p = &page[loop]; |
742 | 742 | ||
743 | if (loop + 1 < nr_pages) | 743 | if (loop + 1 < nr_pages) |
744 | prefetchw(p + 1); | 744 | prefetchw(p + 1); |
745 | __ClearPageReserved(p); | 745 | __ClearPageReserved(p); |
746 | set_page_count(p, 0); | 746 | set_page_count(p, 0); |
747 | } | 747 | } |
748 | 748 | ||
749 | set_page_refcounted(page); | 749 | set_page_refcounted(page); |
750 | __free_pages(page, order); | 750 | __free_pages(page, order); |
751 | } | 751 | } |
752 | 752 | ||
753 | #ifdef CONFIG_CMA | 753 | #ifdef CONFIG_CMA |
754 | /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ | 754 | /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ |
755 | void __init init_cma_reserved_pageblock(struct page *page) | 755 | void __init init_cma_reserved_pageblock(struct page *page) |
756 | { | 756 | { |
757 | unsigned i = pageblock_nr_pages; | 757 | unsigned i = pageblock_nr_pages; |
758 | struct page *p = page; | 758 | struct page *p = page; |
759 | 759 | ||
760 | do { | 760 | do { |
761 | __ClearPageReserved(p); | 761 | __ClearPageReserved(p); |
762 | set_page_count(p, 0); | 762 | set_page_count(p, 0); |
763 | } while (++p, --i); | 763 | } while (++p, --i); |
764 | 764 | ||
765 | set_page_refcounted(page); | 765 | set_page_refcounted(page); |
766 | set_pageblock_migratetype(page, MIGRATE_CMA); | 766 | set_pageblock_migratetype(page, MIGRATE_CMA); |
767 | __free_pages(page, pageblock_order); | 767 | __free_pages(page, pageblock_order); |
768 | totalram_pages += pageblock_nr_pages; | 768 | totalram_pages += pageblock_nr_pages; |
769 | } | 769 | } |
770 | #endif | 770 | #endif |
771 | 771 | ||
772 | /* | 772 | /* |
773 | * The order of subdivision here is critical for the IO subsystem. | 773 | * The order of subdivision here is critical for the IO subsystem. |
774 | * Please do not alter this order without good reasons and regression | 774 | * Please do not alter this order without good reasons and regression |
775 | * testing. Specifically, as large blocks of memory are subdivided, | 775 | * testing. Specifically, as large blocks of memory are subdivided, |
776 | * the order in which smaller blocks are delivered depends on the order | 776 | * the order in which smaller blocks are delivered depends on the order |
777 | * they're subdivided in this function. This is the primary factor | 777 | * they're subdivided in this function. This is the primary factor |
778 | * influencing the order in which pages are delivered to the IO | 778 | * influencing the order in which pages are delivered to the IO |
779 | * subsystem according to empirical testing, and this is also justified | 779 | * subsystem according to empirical testing, and this is also justified |
780 | * by considering the behavior of a buddy system containing a single | 780 | * by considering the behavior of a buddy system containing a single |
781 | * large block of memory acted on by a series of small allocations. | 781 | * large block of memory acted on by a series of small allocations. |
782 | * This behavior is a critical factor in sglist merging's success. | 782 | * This behavior is a critical factor in sglist merging's success. |
783 | * | 783 | * |
784 | * -- wli | 784 | * -- wli |
785 | */ | 785 | */ |
786 | static inline void expand(struct zone *zone, struct page *page, | 786 | static inline void expand(struct zone *zone, struct page *page, |
787 | int low, int high, struct free_area *area, | 787 | int low, int high, struct free_area *area, |
788 | int migratetype) | 788 | int migratetype) |
789 | { | 789 | { |
790 | unsigned long size = 1 << high; | 790 | unsigned long size = 1 << high; |
791 | 791 | ||
792 | while (high > low) { | 792 | while (high > low) { |
793 | area--; | 793 | area--; |
794 | high--; | 794 | high--; |
795 | size >>= 1; | 795 | size >>= 1; |
796 | VM_BUG_ON(bad_range(zone, &page[size])); | 796 | VM_BUG_ON(bad_range(zone, &page[size])); |
797 | 797 | ||
798 | #ifdef CONFIG_DEBUG_PAGEALLOC | 798 | #ifdef CONFIG_DEBUG_PAGEALLOC |
799 | if (high < debug_guardpage_minorder()) { | 799 | if (high < debug_guardpage_minorder()) { |
800 | /* | 800 | /* |
801 | * Mark as guard pages (or page) so that they can be merged | 801 | * Mark as guard pages (or page) so that they can be merged |
802 | * back into the allocator when their buddy is freed. | 802 | * back into the allocator when their buddy is freed. |
803 | * Corresponding page table entries will not be touched; | 803 | * Corresponding page table entries will not be touched; |
804 | * the pages simply stay not present in the virtual address space. | 804 | * the pages simply stay not present in the virtual address space. |
805 | */ | 805 | */ |
806 | INIT_LIST_HEAD(&page[size].lru); | 806 | INIT_LIST_HEAD(&page[size].lru); |
807 | set_page_guard_flag(&page[size]); | 807 | set_page_guard_flag(&page[size]); |
808 | set_page_private(&page[size], high); | 808 | set_page_private(&page[size], high); |
809 | /* Guard pages are not available for any usage */ | 809 | /* Guard pages are not available for any usage */ |
810 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); | 810 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); |
811 | continue; | 811 | continue; |
812 | } | 812 | } |
813 | #endif | 813 | #endif |
814 | list_add(&page[size].lru, &area->free_list[migratetype]); | 814 | list_add(&page[size].lru, &area->free_list[migratetype]); |
815 | area->nr_free++; | 815 | area->nr_free++; |
816 | set_page_order(&page[size], high); | 816 | set_page_order(&page[size], high); |
817 | } | 817 | } |
818 | } | 818 | } |
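
expand() repeatedly halves the block it took from a higher-order free list and returns the unused upper halves, one per order, until it is left with a block of the requested size. A standalone sketch of that split follows, with the CONFIG_DEBUG_PAGEALLOC guard-page path omitted and an arbitrary starting pfn.

#include <stdio.h>

/*
 * Illustration only: mirror the split performed by expand() when an
 * order-'high' block is used to satisfy an order-'low' request.
 */
static void expand_demo(unsigned long base_pfn, int low, int high)
{
	unsigned long size = 1UL << high;

	while (high > low) {
		high--;
		size >>= 1;
		printf("return buddy: pfn %lu, order %d (%lu pages)\n",
		       base_pfn + size, high, size);
	}
	printf("hand out:     pfn %lu, order %d (%lu pages)\n",
	       base_pfn, low, 1UL << low);
}

int main(void)
{
	/* e.g. an order-0 request served from an order-3 block at pfn 4096 */
	expand_demo(4096, 0, 3);
	return 0;
}
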
819 | 819 | ||
820 | /* | 820 | /* |
821 | * This page is about to be returned from the page allocator | 821 | * This page is about to be returned from the page allocator |
822 | */ | 822 | */ |
823 | static inline int check_new_page(struct page *page) | 823 | static inline int check_new_page(struct page *page) |
824 | { | 824 | { |
825 | if (unlikely(page_mapcount(page) | | 825 | if (unlikely(page_mapcount(page) | |
826 | (page->mapping != NULL) | | 826 | (page->mapping != NULL) | |
827 | (atomic_read(&page->_count) != 0) | | 827 | (atomic_read(&page->_count) != 0) | |
828 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | | 828 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | |
829 | (mem_cgroup_bad_page_check(page)))) { | 829 | (mem_cgroup_bad_page_check(page)))) { |
830 | bad_page(page); | 830 | bad_page(page); |
831 | return 1; | 831 | return 1; |
832 | } | 832 | } |
833 | return 0; | 833 | return 0; |
834 | } | 834 | } |
835 | 835 | ||
836 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 836 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) |
837 | { | 837 | { |
838 | int i; | 838 | int i; |
839 | 839 | ||
840 | for (i = 0; i < (1 << order); i++) { | 840 | for (i = 0; i < (1 << order); i++) { |
841 | struct page *p = page + i; | 841 | struct page *p = page + i; |
842 | if (unlikely(check_new_page(p))) | 842 | if (unlikely(check_new_page(p))) |
843 | return 1; | 843 | return 1; |
844 | } | 844 | } |
845 | 845 | ||
846 | set_page_private(page, 0); | 846 | set_page_private(page, 0); |
847 | set_page_refcounted(page); | 847 | set_page_refcounted(page); |
848 | 848 | ||
849 | arch_alloc_page(page, order); | 849 | arch_alloc_page(page, order); |
850 | kernel_map_pages(page, 1 << order, 1); | 850 | kernel_map_pages(page, 1 << order, 1); |
851 | 851 | ||
852 | if (gfp_flags & __GFP_ZERO) | 852 | if (gfp_flags & __GFP_ZERO) |
853 | prep_zero_page(page, order, gfp_flags); | 853 | prep_zero_page(page, order, gfp_flags); |
854 | 854 | ||
855 | if (order && (gfp_flags & __GFP_COMP)) | 855 | if (order && (gfp_flags & __GFP_COMP)) |
856 | prep_compound_page(page, order); | 856 | prep_compound_page(page, order); |
857 | 857 | ||
858 | return 0; | 858 | return 0; |
859 | } | 859 | } |
860 | 860 | ||
861 | /* | 861 | /* |
862 | * Go through the free lists for the given migratetype and remove | 862 | * Go through the free lists for the given migratetype and remove |
863 | * the smallest available page from the freelists | 863 | * the smallest available page from the freelists |
864 | */ | 864 | */ |
865 | static inline | 865 | static inline |
866 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 866 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
867 | int migratetype) | 867 | int migratetype) |
868 | { | 868 | { |
869 | unsigned int current_order; | 869 | unsigned int current_order; |
870 | struct free_area * area; | 870 | struct free_area * area; |
871 | struct page *page; | 871 | struct page *page; |
872 | 872 | ||
873 | /* Find a page of the appropriate size in the preferred list */ | 873 | /* Find a page of the appropriate size in the preferred list */ |
874 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 874 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
875 | area = &(zone->free_area[current_order]); | 875 | area = &(zone->free_area[current_order]); |
876 | if (list_empty(&area->free_list[migratetype])) | 876 | if (list_empty(&area->free_list[migratetype])) |
877 | continue; | 877 | continue; |
878 | 878 | ||
879 | page = list_entry(area->free_list[migratetype].next, | 879 | page = list_entry(area->free_list[migratetype].next, |
880 | struct page, lru); | 880 | struct page, lru); |
881 | list_del(&page->lru); | 881 | list_del(&page->lru); |
882 | rmv_page_order(page); | 882 | rmv_page_order(page); |
883 | area->nr_free--; | 883 | area->nr_free--; |
884 | expand(zone, page, order, current_order, area, migratetype); | 884 | expand(zone, page, order, current_order, area, migratetype); |
885 | return page; | 885 | return page; |
886 | } | 886 | } |
887 | 887 | ||
888 | return NULL; | 888 | return NULL; |
889 | } | 889 | } |
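
The search itself is only a linear scan of zone->free_area[] from the requested order upward, stopping at the first order whose list for the wanted migrate type is populated; everything above the requested size is then given back by expand(). A toy version with invented per-order counts:

#include <stdio.h>

#define MAX_ORDER 11

int main(void)
{
	/* invented nr_free per order for one migrate type */
	int nr_free[MAX_ORDER] = { 0, 0, 0, 4, 0, 1, 0, 0, 0, 0, 2 };
	int order = 2;		/* requested order */
	int current_order;

	for (current_order = order; current_order < MAX_ORDER; current_order++) {
		if (nr_free[current_order] == 0)
			continue;
		printf("order-%d request served from an order-%d block; "
		       "expand() returns the surplus %d pages\n",
		       order, current_order,
		       (1 << current_order) - (1 << order));
		return 0;
	}
	printf("no block large enough: fall back or fail\n");
	return 0;
}
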
890 | 890 | ||
891 | 891 | ||
892 | /* | 892 | /* |
893 | * This array describes the order in which free lists are fallen back on | 893 | * This array describes the order in which free lists are fallen back on |
894 | * when the free lists for the desired migrate type are depleted | 894 | * when the free lists for the desired migrate type are depleted |
895 | */ | 895 | */ |
896 | static int fallbacks[MIGRATE_TYPES][4] = { | 896 | static int fallbacks[MIGRATE_TYPES][4] = { |
897 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 897 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
898 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 898 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
899 | #ifdef CONFIG_CMA | 899 | #ifdef CONFIG_CMA |
900 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 900 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
901 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ | 901 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ |
902 | #else | 902 | #else |
903 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | 903 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, |
904 | #endif | 904 | #endif |
905 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | 905 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ |
906 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | 906 | [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ |
907 | }; | 907 | }; |
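
__rmqueue_fallback() below walks one row of this table left to right and stops either at a migrate type whose free lists still hold pages or at MIGRATE_RESERVE, which it leaves for the caller to handle. A simplified, CMA-less userspace sketch of that walk (the enum values and list states here are invented for illustration):

#include <stdio.h>

/* simplified, CMA-less migrate types; values are illustrative only */
enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, NR_TYPES };

static const char *name[NR_TYPES] = {
	"UNMOVABLE", "RECLAIMABLE", "MOVABLE", "RESERVE"
};

static const int fallback[NR_TYPES][3] = {
	[UNMOVABLE]   = { RECLAIMABLE, MOVABLE,   RESERVE },
	[RECLAIMABLE] = { UNMOVABLE,   MOVABLE,   RESERVE },
	[MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
	[RESERVE]     = { RESERVE },	/* never walked */
};

int main(void)
{
	int list_has_pages[NR_TYPES] = { 0, 0, 1, 1 };	/* invented state */
	int start = UNMOVABLE;
	int i;

	for (i = 0; ; i++) {
		int mt = fallback[start][i];

		if (mt == RESERVE) {
			/* terminator: the caller handles MIGRATE_RESERVE */
			printf("fall back to the reserve handling\n");
			break;
		}
		if (!list_has_pages[mt])
			continue;
		printf("%s request falls back to the %s free list\n",
		       name[start], name[mt]);
		break;
	}
	return 0;
}
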
908 | 908 | ||
909 | /* | 909 | /* |
910 | * Move the free pages in a range to the free lists of the requested type. | 910 | * Move the free pages in a range to the free lists of the requested type. |
911 | * Note that start_page and end_page are not aligned on a pageblock | 911 | * Note that start_page and end_page are not aligned on a pageblock |
912 | * boundary. If alignment is required, use move_freepages_block() | 912 | * boundary. If alignment is required, use move_freepages_block() |
913 | */ | 913 | */ |
914 | static int move_freepages(struct zone *zone, | 914 | static int move_freepages(struct zone *zone, |
915 | struct page *start_page, struct page *end_page, | 915 | struct page *start_page, struct page *end_page, |
916 | int migratetype) | 916 | int migratetype) |
917 | { | 917 | { |
918 | struct page *page; | 918 | struct page *page; |
919 | unsigned long order; | 919 | unsigned long order; |
920 | int pages_moved = 0; | 920 | int pages_moved = 0; |
921 | 921 | ||
922 | #ifndef CONFIG_HOLES_IN_ZONE | 922 | #ifndef CONFIG_HOLES_IN_ZONE |
923 | /* | 923 | /* |
924 | * page_zone is not safe to call in this context when | 924 | * page_zone is not safe to call in this context when |
925 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant | 925 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant |
926 | * anyway as we check zone boundaries in move_freepages_block(). | 926 | * anyway as we check zone boundaries in move_freepages_block(). |
927 | * Remove at a later date when no bug reports exist related to | 927 | * Remove at a later date when no bug reports exist related to |
928 | * grouping pages by mobility | 928 | * grouping pages by mobility |
929 | */ | 929 | */ |
930 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | 930 | BUG_ON(page_zone(start_page) != page_zone(end_page)); |
931 | #endif | 931 | #endif |
932 | 932 | ||
933 | for (page = start_page; page <= end_page;) { | 933 | for (page = start_page; page <= end_page;) { |
934 | /* Make sure we are not inadvertently changing nodes */ | 934 | /* Make sure we are not inadvertently changing nodes */ |
935 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); | 935 | VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); |
936 | 936 | ||
937 | if (!pfn_valid_within(page_to_pfn(page))) { | 937 | if (!pfn_valid_within(page_to_pfn(page))) { |
938 | page++; | 938 | page++; |
939 | continue; | 939 | continue; |
940 | } | 940 | } |
941 | 941 | ||
942 | if (!PageBuddy(page)) { | 942 | if (!PageBuddy(page)) { |
943 | page++; | 943 | page++; |
944 | continue; | 944 | continue; |
945 | } | 945 | } |
946 | 946 | ||
947 | order = page_order(page); | 947 | order = page_order(page); |
948 | list_move(&page->lru, | 948 | list_move(&page->lru, |
949 | &zone->free_area[order].free_list[migratetype]); | 949 | &zone->free_area[order].free_list[migratetype]); |
950 | page += 1 << order; | 950 | page += 1 << order; |
951 | pages_moved += 1 << order; | 951 | pages_moved += 1 << order; |
952 | } | 952 | } |
953 | 953 | ||
954 | return pages_moved; | 954 | return pages_moved; |
955 | } | 955 | } |
956 | 956 | ||
957 | static int move_freepages_block(struct zone *zone, struct page *page, | 957 | static int move_freepages_block(struct zone *zone, struct page *page, |
958 | int migratetype) | 958 | int migratetype) |
959 | { | 959 | { |
960 | unsigned long start_pfn, end_pfn; | 960 | unsigned long start_pfn, end_pfn; |
961 | struct page *start_page, *end_page; | 961 | struct page *start_page, *end_page; |
962 | 962 | ||
963 | start_pfn = page_to_pfn(page); | 963 | start_pfn = page_to_pfn(page); |
964 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | 964 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); |
965 | start_page = pfn_to_page(start_pfn); | 965 | start_page = pfn_to_page(start_pfn); |
966 | end_page = start_page + pageblock_nr_pages - 1; | 966 | end_page = start_page + pageblock_nr_pages - 1; |
967 | end_pfn = start_pfn + pageblock_nr_pages - 1; | 967 | end_pfn = start_pfn + pageblock_nr_pages - 1; |
968 | 968 | ||
969 | /* Do not cross zone boundaries */ | 969 | /* Do not cross zone boundaries */ |
970 | if (start_pfn < zone->zone_start_pfn) | 970 | if (start_pfn < zone->zone_start_pfn) |
971 | start_page = page; | 971 | start_page = page; |
972 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | 972 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) |
973 | return 0; | 973 | return 0; |
974 | 974 | ||
975 | return move_freepages(zone, start_page, end_page, migratetype); | 975 | return move_freepages(zone, start_page, end_page, migratetype); |
976 | } | 976 | } |
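
move_freepages_block() rounds the page's pfn down to the start of its pageblock with a mask and rejects anything that would spill past the zone. The mask arithmetic on its own, with a pageblock size chosen arbitrarily for the example:

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES (1UL << 10)	/* example size; must be a power of two */

int main(void)
{
	unsigned long pfn = 123456;	/* arbitrary page frame number */
	unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
	unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

	/* 123456 = 120 * 1024 + 576, so its block is [122880, 123903] */
	printf("pfn %lu lives in pageblock [%lu, %lu]\n",
	       pfn, start_pfn, end_pfn);
	return 0;
}
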
977 | 977 | ||
978 | static void change_pageblock_range(struct page *pageblock_page, | 978 | static void change_pageblock_range(struct page *pageblock_page, |
979 | int start_order, int migratetype) | 979 | int start_order, int migratetype) |
980 | { | 980 | { |
981 | int nr_pageblocks = 1 << (start_order - pageblock_order); | 981 | int nr_pageblocks = 1 << (start_order - pageblock_order); |
982 | 982 | ||
983 | while (nr_pageblocks--) { | 983 | while (nr_pageblocks--) { |
984 | set_pageblock_migratetype(pageblock_page, migratetype); | 984 | set_pageblock_migratetype(pageblock_page, migratetype); |
985 | pageblock_page += pageblock_nr_pages; | 985 | pageblock_page += pageblock_nr_pages; |
986 | } | 986 | } |
987 | } | 987 | } |
988 | 988 | ||
989 | /* Remove an element from the buddy allocator from the fallback list */ | 989 | /* Remove an element from the buddy allocator from the fallback list */ |
990 | static inline struct page * | 990 | static inline struct page * |
991 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) | 991 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
992 | { | 992 | { |
993 | struct free_area * area; | 993 | struct free_area * area; |
994 | int current_order; | 994 | int current_order; |
995 | struct page *page; | 995 | struct page *page; |
996 | int migratetype, i; | 996 | int migratetype, i; |
997 | 997 | ||
998 | /* Find the largest possible block of pages in the other list */ | 998 | /* Find the largest possible block of pages in the other list */ |
999 | for (current_order = MAX_ORDER-1; current_order >= order; | 999 | for (current_order = MAX_ORDER-1; current_order >= order; |
1000 | --current_order) { | 1000 | --current_order) { |
1001 | for (i = 0;; i++) { | 1001 | for (i = 0;; i++) { |
1002 | migratetype = fallbacks[start_migratetype][i]; | 1002 | migratetype = fallbacks[start_migratetype][i]; |
1003 | 1003 | ||
1004 | /* MIGRATE_RESERVE handled later if necessary */ | 1004 | /* MIGRATE_RESERVE handled later if necessary */ |
1005 | if (migratetype == MIGRATE_RESERVE) | 1005 | if (migratetype == MIGRATE_RESERVE) |
1006 | break; | 1006 | break; |
1007 | 1007 | ||
1008 | area = &(zone->free_area[current_order]); | 1008 | area = &(zone->free_area[current_order]); |
1009 | if (list_empty(&area->free_list[migratetype])) | 1009 | if (list_empty(&area->free_list[migratetype])) |
1010 | continue; | 1010 | continue; |
1011 | 1011 | ||
1012 | page = list_entry(area->free_list[migratetype].next, | 1012 | page = list_entry(area->free_list[migratetype].next, |
1013 | struct page, lru); | 1013 | struct page, lru); |
1014 | area->nr_free--; | 1014 | area->nr_free--; |
1015 | 1015 | ||
1016 | /* | 1016 | /* |
1017 | * If breaking a large block of pages, move all free | 1017 | * If breaking a large block of pages, move all free |
1018 | * pages to the preferred allocation list. If falling | 1018 | * pages to the preferred allocation list. If falling |
1019 | * back for a reclaimable kernel allocation, be more | 1019 | * back for a reclaimable kernel allocation, be more |
1020 | * aggressive about taking ownership of free pages | 1020 | * aggressive about taking ownership of free pages |
1021 | * | 1021 | * |
1022 | * On the other hand, never change migration | 1022 | * On the other hand, never change migration |
1023 | * type of MIGRATE_CMA pageblocks nor move CMA | 1023 | * type of MIGRATE_CMA pageblocks nor move CMA |
1024 | * pages on different free lists. We don't | 1024 | * pages on different free lists. We don't |
1025 | * want unmovable pages to be allocated from | 1025 | * want unmovable pages to be allocated from |
1026 | * MIGRATE_CMA areas. | 1026 | * MIGRATE_CMA areas. |
1027 | */ | 1027 | */ |
1028 | if (!is_migrate_cma(migratetype) && | 1028 | if (!is_migrate_cma(migratetype) && |
1029 | (unlikely(current_order >= pageblock_order / 2) || | 1029 | (unlikely(current_order >= pageblock_order / 2) || |
1030 | start_migratetype == MIGRATE_RECLAIMABLE || | 1030 | start_migratetype == MIGRATE_RECLAIMABLE || |
1031 | page_group_by_mobility_disabled)) { | 1031 | page_group_by_mobility_disabled)) { |
1032 | int pages; | 1032 | int pages; |
1033 | pages = move_freepages_block(zone, page, | 1033 | pages = move_freepages_block(zone, page, |
1034 | start_migratetype); | 1034 | start_migratetype); |
1035 | 1035 | ||
1036 | /* Claim the whole block if over half of it is free */ | 1036 | /* Claim the whole block if over half of it is free */ |
1037 | if (pages >= (1 << (pageblock_order-1)) || | 1037 | if (pages >= (1 << (pageblock_order-1)) || |
1038 | page_group_by_mobility_disabled) | 1038 | page_group_by_mobility_disabled) |
1039 | set_pageblock_migratetype(page, | 1039 | set_pageblock_migratetype(page, |
1040 | start_migratetype); | 1040 | start_migratetype); |
1041 | 1041 | ||
1042 | migratetype = start_migratetype; | 1042 | migratetype = start_migratetype; |
1043 | } | 1043 | } |
1044 | 1044 | ||
1045 | /* Remove the page from the freelists */ | 1045 | /* Remove the page from the freelists */ |
1046 | list_del(&page->lru); | 1046 | list_del(&page->lru); |
1047 | rmv_page_order(page); | 1047 | rmv_page_order(page); |
1048 | 1048 | ||
1049 | /* Take ownership for orders >= pageblock_order */ | 1049 | /* Take ownership for orders >= pageblock_order */ |
1050 | if (current_order >= pageblock_order && | 1050 | if (current_order >= pageblock_order && |
1051 | !is_migrate_cma(migratetype)) | 1051 | !is_migrate_cma(migratetype)) |
1052 | change_pageblock_range(page, current_order, | 1052 | change_pageblock_range(page, current_order, |
1053 | start_migratetype); | 1053 | start_migratetype); |
1054 | 1054 | ||
1055 | expand(zone, page, order, current_order, area, | 1055 | expand(zone, page, order, current_order, area, |
1056 | is_migrate_cma(migratetype) | 1056 | is_migrate_cma(migratetype) |
1057 | ? migratetype : start_migratetype); | 1057 | ? migratetype : start_migratetype); |
1058 | 1058 | ||
1059 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1059 | trace_mm_page_alloc_extfrag(page, order, current_order, |
1060 | start_migratetype, migratetype); | 1060 | start_migratetype, migratetype); |
1061 | 1061 | ||
1062 | return page; | 1062 | return page; |
1063 | } | 1063 | } |
1064 | } | 1064 | } |
1065 | 1065 | ||
1066 | return NULL; | 1066 | return NULL; |
1067 | } | 1067 | } |
1068 | 1068 | ||
1069 | /* | 1069 | /* |
1070 | * Do the hard work of removing an element from the buddy allocator. | 1070 | * Do the hard work of removing an element from the buddy allocator. |
1071 | * Call me with the zone->lock already held. | 1071 | * Call me with the zone->lock already held. |
1072 | */ | 1072 | */ |
1073 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | 1073 | static struct page *__rmqueue(struct zone *zone, unsigned int order, |
1074 | int migratetype) | 1074 | int migratetype) |
1075 | { | 1075 | { |
1076 | struct page *page; | 1076 | struct page *page; |
1077 | 1077 | ||
1078 | retry_reserve: | 1078 | retry_reserve: |
1079 | page = __rmqueue_smallest(zone, order, migratetype); | 1079 | page = __rmqueue_smallest(zone, order, migratetype); |
1080 | 1080 | ||
1081 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { | 1081 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
1082 | page = __rmqueue_fallback(zone, order, migratetype); | 1082 | page = __rmqueue_fallback(zone, order, migratetype); |
1083 | 1083 | ||
1084 | /* | 1084 | /* |
1085 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | 1085 | * Use MIGRATE_RESERVE rather than fail an allocation. goto |
1086 | * is used because __rmqueue_smallest is an inline function | 1086 | * is used because __rmqueue_smallest is an inline function |
1087 | * and we want just one call site | 1087 | * and we want just one call site |
1088 | */ | 1088 | */ |
1089 | if (!page) { | 1089 | if (!page) { |
1090 | migratetype = MIGRATE_RESERVE; | 1090 | migratetype = MIGRATE_RESERVE; |
1091 | goto retry_reserve; | 1091 | goto retry_reserve; |
1092 | } | 1092 | } |
1093 | } | 1093 | } |
1094 | 1094 | ||
1095 | trace_mm_page_alloc_zone_locked(page, order, migratetype); | 1095 | trace_mm_page_alloc_zone_locked(page, order, migratetype); |
1096 | return page; | 1096 | return page; |
1097 | } | 1097 | } |
1098 | 1098 | ||
1099 | /* | 1099 | /* |
1100 | * Obtain a specified number of elements from the buddy allocator, all under | 1100 | * Obtain a specified number of elements from the buddy allocator, all under |
1101 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 1101 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
1102 | * Returns the number of new pages which were placed at *list. | 1102 | * Returns the number of new pages which were placed at *list. |
1103 | */ | 1103 | */ |
1104 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 1104 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
1105 | unsigned long count, struct list_head *list, | 1105 | unsigned long count, struct list_head *list, |
1106 | int migratetype, int cold) | 1106 | int migratetype, int cold) |
1107 | { | 1107 | { |
1108 | int mt = migratetype, i; | 1108 | int mt = migratetype, i; |
1109 | 1109 | ||
1110 | spin_lock(&zone->lock); | 1110 | spin_lock(&zone->lock); |
1111 | for (i = 0; i < count; ++i) { | 1111 | for (i = 0; i < count; ++i) { |
1112 | struct page *page = __rmqueue(zone, order, migratetype); | 1112 | struct page *page = __rmqueue(zone, order, migratetype); |
1113 | if (unlikely(page == NULL)) | 1113 | if (unlikely(page == NULL)) |
1114 | break; | 1114 | break; |
1115 | 1115 | ||
1116 | /* | 1116 | /* |
1117 | * Split buddy pages returned by expand() are received here | 1117 | * Split buddy pages returned by expand() are received here |
1118 | * in physical page order. The page is added to the caller's | 1118 | * in physical page order. The page is added to the caller's |
1119 | * list and the list head then moves forward. From the caller's | 1119 | * list and the list head then moves forward. From the caller's |
1120 | * perspective, the linked list is ordered by page number in | 1120 | * perspective, the linked list is ordered by page number in |
1121 | * some conditions. This is useful for IO devices that can | 1121 | * some conditions. This is useful for IO devices that can |
1122 | * merge IO requests if the physical pages are ordered | 1122 | * merge IO requests if the physical pages are ordered |
1123 | * properly. | 1123 | * properly. |
1124 | */ | 1124 | */ |
1125 | if (likely(cold == 0)) | 1125 | if (likely(cold == 0)) |
1126 | list_add(&page->lru, list); | 1126 | list_add(&page->lru, list); |
1127 | else | 1127 | else |
1128 | list_add_tail(&page->lru, list); | 1128 | list_add_tail(&page->lru, list); |
1129 | if (IS_ENABLED(CONFIG_CMA)) { | 1129 | if (IS_ENABLED(CONFIG_CMA)) { |
1130 | mt = get_pageblock_migratetype(page); | 1130 | mt = get_pageblock_migratetype(page); |
1131 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) | 1131 | if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE) |
1132 | mt = migratetype; | 1132 | mt = migratetype; |
1133 | } | 1133 | } |
1134 | set_page_private(page, mt); | 1134 | set_page_private(page, mt); |
1135 | list = &page->lru; | 1135 | list = &page->lru; |
1136 | } | 1136 | } |
1137 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | 1137 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
1138 | spin_unlock(&zone->lock); | 1138 | spin_unlock(&zone->lock); |
1139 | return i; | 1139 | return i; |
1140 | } | 1140 | } |
1141 | 1141 | ||
1142 | #ifdef CONFIG_NUMA | 1142 | #ifdef CONFIG_NUMA |
1143 | /* | 1143 | /* |
1144 | * Called from the vmstat counter updater to drain pagesets of this | 1144 | * Called from the vmstat counter updater to drain pagesets of this |
1145 | * currently executing processor on remote nodes after they have | 1145 | * currently executing processor on remote nodes after they have |
1146 | * expired. | 1146 | * expired. |
1147 | * | 1147 | * |
1148 | * Note that this function must be called with the thread pinned to | 1148 | * Note that this function must be called with the thread pinned to |
1149 | * a single processor. | 1149 | * a single processor. |
1150 | */ | 1150 | */ |
1151 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | 1151 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
1152 | { | 1152 | { |
1153 | unsigned long flags; | 1153 | unsigned long flags; |
1154 | int to_drain; | 1154 | int to_drain; |
1155 | 1155 | ||
1156 | local_irq_save(flags); | 1156 | local_irq_save(flags); |
1157 | if (pcp->count >= pcp->batch) | 1157 | if (pcp->count >= pcp->batch) |
1158 | to_drain = pcp->batch; | 1158 | to_drain = pcp->batch; |
1159 | else | 1159 | else |
1160 | to_drain = pcp->count; | 1160 | to_drain = pcp->count; |
1161 | if (to_drain > 0) { | 1161 | if (to_drain > 0) { |
1162 | free_pcppages_bulk(zone, to_drain, pcp); | 1162 | free_pcppages_bulk(zone, to_drain, pcp); |
1163 | pcp->count -= to_drain; | 1163 | pcp->count -= to_drain; |
1164 | } | 1164 | } |
1165 | local_irq_restore(flags); | 1165 | local_irq_restore(flags); |
1166 | } | 1166 | } |
1167 | #endif | 1167 | #endif |
1168 | 1168 | ||
1169 | /* | 1169 | /* |
1170 | * Drain pages of the indicated processor. | 1170 | * Drain pages of the indicated processor. |
1171 | * | 1171 | * |
1172 | * The processor must either be the current processor and the | 1172 | * The processor must either be the current processor and the |
1173 | * thread pinned to the current processor or a processor that | 1173 | * thread pinned to the current processor or a processor that |
1174 | * is not online. | 1174 | * is not online. |
1175 | */ | 1175 | */ |
1176 | static void drain_pages(unsigned int cpu) | 1176 | static void drain_pages(unsigned int cpu) |
1177 | { | 1177 | { |
1178 | unsigned long flags; | 1178 | unsigned long flags; |
1179 | struct zone *zone; | 1179 | struct zone *zone; |
1180 | 1180 | ||
1181 | for_each_populated_zone(zone) { | 1181 | for_each_populated_zone(zone) { |
1182 | struct per_cpu_pageset *pset; | 1182 | struct per_cpu_pageset *pset; |
1183 | struct per_cpu_pages *pcp; | 1183 | struct per_cpu_pages *pcp; |
1184 | 1184 | ||
1185 | local_irq_save(flags); | 1185 | local_irq_save(flags); |
1186 | pset = per_cpu_ptr(zone->pageset, cpu); | 1186 | pset = per_cpu_ptr(zone->pageset, cpu); |
1187 | 1187 | ||
1188 | pcp = &pset->pcp; | 1188 | pcp = &pset->pcp; |
1189 | if (pcp->count) { | 1189 | if (pcp->count) { |
1190 | free_pcppages_bulk(zone, pcp->count, pcp); | 1190 | free_pcppages_bulk(zone, pcp->count, pcp); |
1191 | pcp->count = 0; | 1191 | pcp->count = 0; |
1192 | } | 1192 | } |
1193 | local_irq_restore(flags); | 1193 | local_irq_restore(flags); |
1194 | } | 1194 | } |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | /* | 1197 | /* |
1198 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 1198 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
1199 | */ | 1199 | */ |
1200 | void drain_local_pages(void *arg) | 1200 | void drain_local_pages(void *arg) |
1201 | { | 1201 | { |
1202 | drain_pages(smp_processor_id()); | 1202 | drain_pages(smp_processor_id()); |
1203 | } | 1203 | } |
1204 | 1204 | ||
1205 | /* | 1205 | /* |
1206 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. | 1206 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator. |
1207 | * | 1207 | * |
1208 | * Note that this code is protected against sending an IPI to an offline | 1208 | * Note that this code is protected against sending an IPI to an offline |
1209 | * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: | 1209 | * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: |
1210 | * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but | 1210 | * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but |
1211 | * nothing keeps CPUs from showing up after we populated the cpumask and | 1211 | * nothing keeps CPUs from showing up after we populated the cpumask and |
1212 | * before the call to on_each_cpu_mask(). | 1212 | * before the call to on_each_cpu_mask(). |
1213 | */ | 1213 | */ |
1214 | void drain_all_pages(void) | 1214 | void drain_all_pages(void) |
1215 | { | 1215 | { |
1216 | int cpu; | 1216 | int cpu; |
1217 | struct per_cpu_pageset *pcp; | 1217 | struct per_cpu_pageset *pcp; |
1218 | struct zone *zone; | 1218 | struct zone *zone; |
1219 | 1219 | ||
1220 | /* | 1220 | /* |
1221 | * Allocate in the BSS so we won't require allocation in | 1221 | * Allocate in the BSS so we won't require allocation in |
1222 | * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y | 1222 | * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y |
1223 | */ | 1223 | */ |
1224 | static cpumask_t cpus_with_pcps; | 1224 | static cpumask_t cpus_with_pcps; |
1225 | 1225 | ||
1226 | /* | 1226 | /* |
1227 | * We don't care about racing with CPU hotplug events, | 1227 | * We don't care about racing with CPU hotplug events, |
1228 | * as the offline notification will cause the notified | 1228 | * as the offline notification will cause the notified |
1229 | * cpu to drain that CPU's pcps and on_each_cpu_mask() | 1229 | * cpu to drain that CPU's pcps and on_each_cpu_mask() |
1230 | * disables preemption as part of its processing | 1230 | * disables preemption as part of its processing |
1231 | */ | 1231 | */ |
1232 | for_each_online_cpu(cpu) { | 1232 | for_each_online_cpu(cpu) { |
1233 | bool has_pcps = false; | 1233 | bool has_pcps = false; |
1234 | for_each_populated_zone(zone) { | 1234 | for_each_populated_zone(zone) { |
1235 | pcp = per_cpu_ptr(zone->pageset, cpu); | 1235 | pcp = per_cpu_ptr(zone->pageset, cpu); |
1236 | if (pcp->pcp.count) { | 1236 | if (pcp->pcp.count) { |
1237 | has_pcps = true; | 1237 | has_pcps = true; |
1238 | break; | 1238 | break; |
1239 | } | 1239 | } |
1240 | } | 1240 | } |
1241 | if (has_pcps) | 1241 | if (has_pcps) |
1242 | cpumask_set_cpu(cpu, &cpus_with_pcps); | 1242 | cpumask_set_cpu(cpu, &cpus_with_pcps); |
1243 | else | 1243 | else |
1244 | cpumask_clear_cpu(cpu, &cpus_with_pcps); | 1244 | cpumask_clear_cpu(cpu, &cpus_with_pcps); |
1245 | } | 1245 | } |
1246 | on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); | 1246 | on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); |
1247 | } | 1247 | } |
1248 | 1248 | ||
1249 | #ifdef CONFIG_HIBERNATION | 1249 | #ifdef CONFIG_HIBERNATION |
1250 | 1250 | ||
1251 | void mark_free_pages(struct zone *zone) | 1251 | void mark_free_pages(struct zone *zone) |
1252 | { | 1252 | { |
1253 | unsigned long pfn, max_zone_pfn; | 1253 | unsigned long pfn, max_zone_pfn; |
1254 | unsigned long flags; | 1254 | unsigned long flags; |
1255 | int order, t; | 1255 | int order, t; |
1256 | struct list_head *curr; | 1256 | struct list_head *curr; |
1257 | 1257 | ||
1258 | if (!zone->spanned_pages) | 1258 | if (!zone->spanned_pages) |
1259 | return; | 1259 | return; |
1260 | 1260 | ||
1261 | spin_lock_irqsave(&zone->lock, flags); | 1261 | spin_lock_irqsave(&zone->lock, flags); |
1262 | 1262 | ||
1263 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1263 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
1264 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1264 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
1265 | if (pfn_valid(pfn)) { | 1265 | if (pfn_valid(pfn)) { |
1266 | struct page *page = pfn_to_page(pfn); | 1266 | struct page *page = pfn_to_page(pfn); |
1267 | 1267 | ||
1268 | if (!swsusp_page_is_forbidden(page)) | 1268 | if (!swsusp_page_is_forbidden(page)) |
1269 | swsusp_unset_page_free(page); | 1269 | swsusp_unset_page_free(page); |
1270 | } | 1270 | } |
1271 | 1271 | ||
1272 | for_each_migratetype_order(order, t) { | 1272 | for_each_migratetype_order(order, t) { |
1273 | list_for_each(curr, &zone->free_area[order].free_list[t]) { | 1273 | list_for_each(curr, &zone->free_area[order].free_list[t]) { |
1274 | unsigned long i; | 1274 | unsigned long i; |
1275 | 1275 | ||
1276 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 1276 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
1277 | for (i = 0; i < (1UL << order); i++) | 1277 | for (i = 0; i < (1UL << order); i++) |
1278 | swsusp_set_page_free(pfn_to_page(pfn + i)); | 1278 | swsusp_set_page_free(pfn_to_page(pfn + i)); |
1279 | } | 1279 | } |
1280 | } | 1280 | } |
1281 | spin_unlock_irqrestore(&zone->lock, flags); | 1281 | spin_unlock_irqrestore(&zone->lock, flags); |
1282 | } | 1282 | } |
1283 | #endif /* CONFIG_PM */ | 1283 | #endif /* CONFIG_PM */ |
1284 | 1284 | ||
1285 | /* | 1285 | /* |
1286 | * Free a 0-order page | 1286 | * Free a 0-order page |
1287 | * cold == 1 ? free a cold page : free a hot page | 1287 | * cold == 1 ? free a cold page : free a hot page |
1288 | */ | 1288 | */ |
1289 | void free_hot_cold_page(struct page *page, int cold) | 1289 | void free_hot_cold_page(struct page *page, int cold) |
1290 | { | 1290 | { |
1291 | struct zone *zone = page_zone(page); | 1291 | struct zone *zone = page_zone(page); |
1292 | struct per_cpu_pages *pcp; | 1292 | struct per_cpu_pages *pcp; |
1293 | unsigned long flags; | 1293 | unsigned long flags; |
1294 | int migratetype; | 1294 | int migratetype; |
1295 | int wasMlocked = __TestClearPageMlocked(page); | 1295 | int wasMlocked = __TestClearPageMlocked(page); |
1296 | 1296 | ||
1297 | if (!free_pages_prepare(page, 0)) | 1297 | if (!free_pages_prepare(page, 0)) |
1298 | return; | 1298 | return; |
1299 | 1299 | ||
1300 | migratetype = get_pageblock_migratetype(page); | 1300 | migratetype = get_pageblock_migratetype(page); |
1301 | set_page_private(page, migratetype); | 1301 | set_page_private(page, migratetype); |
1302 | local_irq_save(flags); | 1302 | local_irq_save(flags); |
1303 | if (unlikely(wasMlocked)) | 1303 | if (unlikely(wasMlocked)) |
1304 | free_page_mlock(page); | 1304 | free_page_mlock(page); |
1305 | __count_vm_event(PGFREE); | 1305 | __count_vm_event(PGFREE); |
1306 | 1306 | ||
1307 | /* | 1307 | /* |
1308 | * We only track unmovable, reclaimable and movable on pcp lists. | 1308 | * We only track unmovable, reclaimable and movable on pcp lists. |
1309 | * Free ISOLATE pages back to the allocator because they are being | 1309 | * Free ISOLATE pages back to the allocator because they are being |
1310 | * offlined but treat RESERVE as movable pages so we can get those | 1310 | * offlined but treat RESERVE as movable pages so we can get those |
1311 | * areas back if necessary. Otherwise, we may have to free | 1311 | * areas back if necessary. Otherwise, we may have to free |
1312 | * excessively into the page allocator | 1312 | * excessively into the page allocator |
1313 | */ | 1313 | */ |
1314 | if (migratetype >= MIGRATE_PCPTYPES) { | 1314 | if (migratetype >= MIGRATE_PCPTYPES) { |
1315 | if (unlikely(migratetype == MIGRATE_ISOLATE)) { | 1315 | if (unlikely(migratetype == MIGRATE_ISOLATE)) { |
1316 | free_one_page(zone, page, 0, migratetype); | 1316 | free_one_page(zone, page, 0, migratetype); |
1317 | goto out; | 1317 | goto out; |
1318 | } | 1318 | } |
1319 | migratetype = MIGRATE_MOVABLE; | 1319 | migratetype = MIGRATE_MOVABLE; |
1320 | } | 1320 | } |
1321 | 1321 | ||
1322 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 1322 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
1323 | if (cold) | 1323 | if (cold) |
1324 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1324 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
1325 | else | 1325 | else |
1326 | list_add(&page->lru, &pcp->lists[migratetype]); | 1326 | list_add(&page->lru, &pcp->lists[migratetype]); |
1327 | pcp->count++; | 1327 | pcp->count++; |
1328 | if (pcp->count >= pcp->high) { | 1328 | if (pcp->count >= pcp->high) { |
1329 | free_pcppages_bulk(zone, pcp->batch, pcp); | 1329 | free_pcppages_bulk(zone, pcp->batch, pcp); |
1330 | pcp->count -= pcp->batch; | 1330 | pcp->count -= pcp->batch; |
1331 | } | 1331 | } |
1332 | 1332 | ||
1333 | out: | 1333 | out: |
1334 | local_irq_restore(flags); | 1334 | local_irq_restore(flags); |
1335 | } | 1335 | } |
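
The check at the end of free_hot_cold_page() is what keeps the per-cpu list bounded: once pcp->count reaches pcp->high, one pcp->batch worth of pages is handed back to the buddy lists. A toy model of that hysteresis with invented thresholds:

#include <stdio.h>

int main(void)
{
	int high = 6, batch = 2;	/* invented pcp->high / pcp->batch */
	int count = 0;			/* pcp->count */
	int i;

	for (i = 0; i < 10; i++) {	/* free ten order-0 pages */
		count++;		/* page parked on the per-cpu list */
		if (count >= high) {
			count -= batch;	/* one batch returned to the buddy lists */
			printf("free #%2d: drained %d pages, %d left on pcp\n",
			       i + 1, batch, count);
		}
	}
	printf("final pcp count: %d\n", count);
	return 0;
}
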
1336 | 1336 | ||
1337 | /* | 1337 | /* |
1338 | * Free a list of 0-order pages | 1338 | * Free a list of 0-order pages |
1339 | */ | 1339 | */ |
1340 | void free_hot_cold_page_list(struct list_head *list, int cold) | 1340 | void free_hot_cold_page_list(struct list_head *list, int cold) |
1341 | { | 1341 | { |
1342 | struct page *page, *next; | 1342 | struct page *page, *next; |
1343 | 1343 | ||
1344 | list_for_each_entry_safe(page, next, list, lru) { | 1344 | list_for_each_entry_safe(page, next, list, lru) { |
1345 | trace_mm_page_free_batched(page, cold); | 1345 | trace_mm_page_free_batched(page, cold); |
1346 | free_hot_cold_page(page, cold); | 1346 | free_hot_cold_page(page, cold); |
1347 | } | 1347 | } |
1348 | } | 1348 | } |
1349 | 1349 | ||
1350 | /* | 1350 | /* |
1351 | * split_page takes a non-compound higher-order page, and splits it into | 1351 | * split_page takes a non-compound higher-order page, and splits it into |
1352 | * n (1<<order) sub-pages: page[0..n] | 1352 | * n (1<<order) sub-pages: page[0..n] |
1353 | * Each sub-page must be freed individually. | 1353 | * Each sub-page must be freed individually. |
1354 | * | 1354 | * |
1355 | * Note: this is probably too low level an operation for use in drivers. | 1355 | * Note: this is probably too low level an operation for use in drivers. |
1356 | * Please consult with lkml before using this in your driver. | 1356 | * Please consult with lkml before using this in your driver. |
1357 | */ | 1357 | */ |
1358 | void split_page(struct page *page, unsigned int order) | 1358 | void split_page(struct page *page, unsigned int order) |
1359 | { | 1359 | { |
1360 | int i; | 1360 | int i; |
1361 | 1361 | ||
1362 | VM_BUG_ON(PageCompound(page)); | 1362 | VM_BUG_ON(PageCompound(page)); |
1363 | VM_BUG_ON(!page_count(page)); | 1363 | VM_BUG_ON(!page_count(page)); |
1364 | 1364 | ||
1365 | #ifdef CONFIG_KMEMCHECK | 1365 | #ifdef CONFIG_KMEMCHECK |
1366 | /* | 1366 | /* |
1367 | * Split shadow pages too, because free(page[0]) would | 1367 | * Split shadow pages too, because free(page[0]) would |
1368 | * otherwise free the whole shadow. | 1368 | * otherwise free the whole shadow. |
1369 | */ | 1369 | */ |
1370 | if (kmemcheck_page_is_tracked(page)) | 1370 | if (kmemcheck_page_is_tracked(page)) |
1371 | split_page(virt_to_page(page[0].shadow), order); | 1371 | split_page(virt_to_page(page[0].shadow), order); |
1372 | #endif | 1372 | #endif |
1373 | 1373 | ||
1374 | for (i = 1; i < (1 << order); i++) | 1374 | for (i = 1; i < (1 << order); i++) |
1375 | set_page_refcounted(page + i); | 1375 | set_page_refcounted(page + i); |
1376 | } | 1376 | } |
1377 | 1377 | ||
1378 | /* | 1378 | /* |
1379 | * Similar to split_page except the page is already free. As this is only | 1379 | * Similar to split_page except the page is already free. As this is only |
1380 | * being used for migration, the migratetype of the block also changes. | 1380 | * being used for migration, the migratetype of the block also changes. |
1381 | * As this is called with interrupts disabled, the caller is responsible | 1381 | * As this is called with interrupts disabled, the caller is responsible |
1382 | * for calling arch_alloc_page() and kernel_map_pages() after interrupts | 1382 | * for calling arch_alloc_page() and kernel_map_pages() after interrupts |
1383 | * are enabled. | 1383 | * are enabled. |
1384 | * | 1384 | * |
1385 | * Note: this is probably too low level an operation for use in drivers. | 1385 | * Note: this is probably too low level an operation for use in drivers. |
1386 | * Please consult with lkml before using this in your driver. | 1386 | * Please consult with lkml before using this in your driver. |
1387 | */ | 1387 | */ |
1388 | int split_free_page(struct page *page) | 1388 | int split_free_page(struct page *page) |
1389 | { | 1389 | { |
1390 | unsigned int order; | 1390 | unsigned int order; |
1391 | unsigned long watermark; | 1391 | unsigned long watermark; |
1392 | struct zone *zone; | 1392 | struct zone *zone; |
1393 | 1393 | ||
1394 | BUG_ON(!PageBuddy(page)); | 1394 | BUG_ON(!PageBuddy(page)); |
1395 | 1395 | ||
1396 | zone = page_zone(page); | 1396 | zone = page_zone(page); |
1397 | order = page_order(page); | 1397 | order = page_order(page); |
1398 | 1398 | ||
1399 | /* Obey watermarks as if the page was being allocated */ | 1399 | /* Obey watermarks as if the page was being allocated */ |
1400 | watermark = low_wmark_pages(zone) + (1 << order); | 1400 | watermark = low_wmark_pages(zone) + (1 << order); |
1401 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1401 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
1402 | return 0; | 1402 | return 0; |
1403 | 1403 | ||
1404 | /* Remove page from free list */ | 1404 | /* Remove page from free list */ |
1405 | list_del(&page->lru); | 1405 | list_del(&page->lru); |
1406 | zone->free_area[order].nr_free--; | 1406 | zone->free_area[order].nr_free--; |
1407 | rmv_page_order(page); | 1407 | rmv_page_order(page); |
1408 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); | 1408 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); |
1409 | 1409 | ||
1410 | /* Split into individual pages */ | 1410 | /* Split into individual pages */ |
1411 | set_page_refcounted(page); | 1411 | set_page_refcounted(page); |
1412 | split_page(page, order); | 1412 | split_page(page, order); |
1413 | 1413 | ||
1414 | if (order >= pageblock_order - 1) { | 1414 | if (order >= pageblock_order - 1) { |
1415 | struct page *endpage = page + (1 << order) - 1; | 1415 | struct page *endpage = page + (1 << order) - 1; |
1416 | for (; page < endpage; page += pageblock_nr_pages) { | 1416 | for (; page < endpage; page += pageblock_nr_pages) { |
1417 | int mt = get_pageblock_migratetype(page); | 1417 | int mt = get_pageblock_migratetype(page); |
1418 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) | 1418 | if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt)) |
1419 | set_pageblock_migratetype(page, | 1419 | set_pageblock_migratetype(page, |
1420 | MIGRATE_MOVABLE); | 1420 | MIGRATE_MOVABLE); |
1421 | } | 1421 | } |
1422 | } | 1422 | } |
1423 | 1423 | ||
1424 | return 1 << order; | 1424 | return 1 << order; |
1425 | } | 1425 | } |
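
The watermark test in split_free_page() amounts to requiring that the zone stay above its low watermark even after 1 << order pages are removed; the real zone_watermark_ok() performs a more detailed per-order check, omitted here. A sketch of the basic arithmetic with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned long low_wmark = 1000;		/* invented low_wmark_pages(zone) */
	unsigned long free_pages = 1100;	/* invented zone free page count */
	unsigned int order = 5;			/* block being split: 32 pages */
	unsigned long watermark = low_wmark + (1UL << order);

	if (free_pages >= watermark)
		printf("split allowed: %lu free >= %lu required\n",
		       free_pages, watermark);
	else
		printf("split refused: %lu free < %lu required\n",
		       free_pages, watermark);
	return 0;
}
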
1426 | 1426 | ||
1427 | /* | 1427 | /* |
1428 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 1428 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But |
1429 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1429 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
1430 | * or two. | 1430 | * or two. |
1431 | */ | 1431 | */ |
1432 | static inline | 1432 | static inline |
1433 | struct page *buffered_rmqueue(struct zone *preferred_zone, | 1433 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
1434 | struct zone *zone, int order, gfp_t gfp_flags, | 1434 | struct zone *zone, int order, gfp_t gfp_flags, |
1435 | int migratetype) | 1435 | int migratetype) |
1436 | { | 1436 | { |
1437 | unsigned long flags; | 1437 | unsigned long flags; |
1438 | struct page *page; | 1438 | struct page *page; |
1439 | int cold = !!(gfp_flags & __GFP_COLD); | 1439 | int cold = !!(gfp_flags & __GFP_COLD); |
1440 | 1440 | ||
1441 | again: | 1441 | again: |
1442 | if (likely(order == 0)) { | 1442 | if (likely(order == 0)) { |
1443 | struct per_cpu_pages *pcp; | 1443 | struct per_cpu_pages *pcp; |
1444 | struct list_head *list; | 1444 | struct list_head *list; |
1445 | 1445 | ||
1446 | local_irq_save(flags); | 1446 | local_irq_save(flags); |
1447 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | 1447 | pcp = &this_cpu_ptr(zone->pageset)->pcp; |
1448 | list = &pcp->lists[migratetype]; | 1448 | list = &pcp->lists[migratetype]; |
1449 | if (list_empty(list)) { | 1449 | if (list_empty(list)) { |
1450 | pcp->count += rmqueue_bulk(zone, 0, | 1450 | pcp->count += rmqueue_bulk(zone, 0, |
1451 | pcp->batch, list, | 1451 | pcp->batch, list, |
1452 | migratetype, cold); | 1452 | migratetype, cold); |
1453 | if (unlikely(list_empty(list))) | 1453 | if (unlikely(list_empty(list))) |
1454 | goto failed; | 1454 | goto failed; |
1455 | } | 1455 | } |
1456 | 1456 | ||
1457 | if (cold) | 1457 | if (cold) |
1458 | page = list_entry(list->prev, struct page, lru); | 1458 | page = list_entry(list->prev, struct page, lru); |
1459 | else | 1459 | else |
1460 | page = list_entry(list->next, struct page, lru); | 1460 | page = list_entry(list->next, struct page, lru); |
1461 | 1461 | ||
1462 | list_del(&page->lru); | 1462 | list_del(&page->lru); |
1463 | pcp->count--; | 1463 | pcp->count--; |
1464 | } else { | 1464 | } else { |
1465 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | 1465 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { |
1466 | /* | 1466 | /* |
1467 | * __GFP_NOFAIL is not to be used in new code. | 1467 | * __GFP_NOFAIL is not to be used in new code. |
1468 | * | 1468 | * |
1469 | * All __GFP_NOFAIL callers should be fixed so that they | 1469 | * All __GFP_NOFAIL callers should be fixed so that they |
1470 | * properly detect and handle allocation failures. | 1470 | * properly detect and handle allocation failures. |
1471 | * | 1471 | * |
1472 | * We most definitely don't want callers attempting to | 1472 | * We most definitely don't want callers attempting to |
1473 | * allocate greater than order-1 page units with | 1473 | * allocate greater than order-1 page units with |
1474 | * __GFP_NOFAIL. | 1474 | * __GFP_NOFAIL. |
1475 | */ | 1475 | */ |
1476 | WARN_ON_ONCE(order > 1); | 1476 | WARN_ON_ONCE(order > 1); |
1477 | } | 1477 | } |
1478 | spin_lock_irqsave(&zone->lock, flags); | 1478 | spin_lock_irqsave(&zone->lock, flags); |
1479 | page = __rmqueue(zone, order, migratetype); | 1479 | page = __rmqueue(zone, order, migratetype); |
1480 | spin_unlock(&zone->lock); | 1480 | spin_unlock(&zone->lock); |
1481 | if (!page) | 1481 | if (!page) |
1482 | goto failed; | 1482 | goto failed; |
1483 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | 1483 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); |
1484 | } | 1484 | } |
1485 | 1485 | ||
1486 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1486 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1487 | zone_statistics(preferred_zone, zone, gfp_flags); | 1487 | zone_statistics(preferred_zone, zone, gfp_flags); |
1488 | local_irq_restore(flags); | 1488 | local_irq_restore(flags); |
1489 | 1489 | ||
1490 | VM_BUG_ON(bad_range(zone, page)); | 1490 | VM_BUG_ON(bad_range(zone, page)); |
1491 | if (prep_new_page(page, order, gfp_flags)) | 1491 | if (prep_new_page(page, order, gfp_flags)) |
1492 | goto again; | 1492 | goto again; |
1493 | return page; | 1493 | return page; |
1494 | 1494 | ||
1495 | failed: | 1495 | failed: |
1496 | local_irq_restore(flags); | 1496 | local_irq_restore(flags); |
1497 | return NULL; | 1497 | return NULL; |
1498 | } | 1498 | } |
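As a loose illustration of the order-0 fast path in buffered_rmqueue() above, the standalone userspace sketch below uses a plain array in place of the per-cpu list (toy_rmqueue, BATCH, global_pool and local_list are all hypothetical names, and locking, migratetypes and statistics are left out). When the local list runs dry it is refilled in one batch; hot requests take the most recently added entry and cold requests take the oldest.

#include <stdio.h>

#define BATCH 4

static int global_pool = 100;           /* pages still sitting in the shared "zone" */
static int local_list[BATCH];           /* stand-in for one per-cpu list */
static int local_count;

static int toy_rmqueue(int cold)
{
        int page, i;

        if (local_count == 0) {
                /* like rmqueue_bulk(): refill a whole batch in one go */
                for (i = 0; i < BATCH && global_pool > 0; i++)
                        local_list[local_count++] = global_pool--;
                if (local_count == 0)
                        return -1;      /* nothing left anywhere: the "failed" path */
        }
        if (cold) {
                /* oldest entry plays the role of the list tail */
                page = local_list[0];
                for (i = 1; i < local_count; i++)
                        local_list[i - 1] = local_list[i];
        } else {
                /* newest entry plays the role of the list head */
                page = local_list[local_count - 1];
        }
        local_count--;
        return page;
}

int main(void)
{
        int hot = toy_rmqueue(0);
        int cold = toy_rmqueue(1);

        printf("hot page %d, cold page %d\n", hot, cold);
        return 0;
}

The batch refill serves the same purpose as the rmqueue_bulk() call above: one visit to the shared pool is amortised over pcp->batch fast-path allocations.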
1499 | 1499 | ||
1500 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ | 1500 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ |
1501 | #define ALLOC_WMARK_MIN WMARK_MIN | 1501 | #define ALLOC_WMARK_MIN WMARK_MIN |
1502 | #define ALLOC_WMARK_LOW WMARK_LOW | 1502 | #define ALLOC_WMARK_LOW WMARK_LOW |
1503 | #define ALLOC_WMARK_HIGH WMARK_HIGH | 1503 | #define ALLOC_WMARK_HIGH WMARK_HIGH |
1504 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | 1504 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ |
1505 | 1505 | ||
1506 | /* Mask to get the watermark bits */ | 1506 | /* Mask to get the watermark bits */ |
1507 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | 1507 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) |
1508 | 1508 | ||
1509 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 1509 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
1510 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 1510 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
1511 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 1511 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
1512 | 1512 | ||
1513 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 1513 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
1514 | 1514 | ||
1515 | static struct { | 1515 | static struct { |
1516 | struct fault_attr attr; | 1516 | struct fault_attr attr; |
1517 | 1517 | ||
1518 | u32 ignore_gfp_highmem; | 1518 | u32 ignore_gfp_highmem; |
1519 | u32 ignore_gfp_wait; | 1519 | u32 ignore_gfp_wait; |
1520 | u32 min_order; | 1520 | u32 min_order; |
1521 | } fail_page_alloc = { | 1521 | } fail_page_alloc = { |
1522 | .attr = FAULT_ATTR_INITIALIZER, | 1522 | .attr = FAULT_ATTR_INITIALIZER, |
1523 | .ignore_gfp_wait = 1, | 1523 | .ignore_gfp_wait = 1, |
1524 | .ignore_gfp_highmem = 1, | 1524 | .ignore_gfp_highmem = 1, |
1525 | .min_order = 1, | 1525 | .min_order = 1, |
1526 | }; | 1526 | }; |
1527 | 1527 | ||
1528 | static int __init setup_fail_page_alloc(char *str) | 1528 | static int __init setup_fail_page_alloc(char *str) |
1529 | { | 1529 | { |
1530 | return setup_fault_attr(&fail_page_alloc.attr, str); | 1530 | return setup_fault_attr(&fail_page_alloc.attr, str); |
1531 | } | 1531 | } |
1532 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 1532 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
1533 | 1533 | ||
1534 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1534 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1535 | { | 1535 | { |
1536 | if (order < fail_page_alloc.min_order) | 1536 | if (order < fail_page_alloc.min_order) |
1537 | return false; | 1537 | return false; |
1538 | if (gfp_mask & __GFP_NOFAIL) | 1538 | if (gfp_mask & __GFP_NOFAIL) |
1539 | return false; | 1539 | return false; |
1540 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 1540 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
1541 | return false; | 1541 | return false; |
1542 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 1542 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
1543 | return false; | 1543 | return false; |
1544 | 1544 | ||
1545 | return should_fail(&fail_page_alloc.attr, 1 << order); | 1545 | return should_fail(&fail_page_alloc.attr, 1 << order); |
1546 | } | 1546 | } |
1547 | 1547 | ||
1548 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 1548 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
1549 | 1549 | ||
1550 | static int __init fail_page_alloc_debugfs(void) | 1550 | static int __init fail_page_alloc_debugfs(void) |
1551 | { | 1551 | { |
1552 | umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 1552 | umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
1553 | struct dentry *dir; | 1553 | struct dentry *dir; |
1554 | 1554 | ||
1555 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, | 1555 | dir = fault_create_debugfs_attr("fail_page_alloc", NULL, |
1556 | &fail_page_alloc.attr); | 1556 | &fail_page_alloc.attr); |
1557 | if (IS_ERR(dir)) | 1557 | if (IS_ERR(dir)) |
1558 | return PTR_ERR(dir); | 1558 | return PTR_ERR(dir); |
1559 | 1559 | ||
1560 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, | 1560 | if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, |
1561 | &fail_page_alloc.ignore_gfp_wait)) | 1561 | &fail_page_alloc.ignore_gfp_wait)) |
1562 | goto fail; | 1562 | goto fail; |
1563 | if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, | 1563 | if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
1564 | &fail_page_alloc.ignore_gfp_highmem)) | 1564 | &fail_page_alloc.ignore_gfp_highmem)) |
1565 | goto fail; | 1565 | goto fail; |
1566 | if (!debugfs_create_u32("min-order", mode, dir, | 1566 | if (!debugfs_create_u32("min-order", mode, dir, |
1567 | &fail_page_alloc.min_order)) | 1567 | &fail_page_alloc.min_order)) |
1568 | goto fail; | 1568 | goto fail; |
1569 | 1569 | ||
1570 | return 0; | 1570 | return 0; |
1571 | fail: | 1571 | fail: |
1572 | debugfs_remove_recursive(dir); | 1572 | debugfs_remove_recursive(dir); |
1573 | 1573 | ||
1574 | return -ENOMEM; | 1574 | return -ENOMEM; |
1575 | } | 1575 | } |
1576 | 1576 | ||
1577 | late_initcall(fail_page_alloc_debugfs); | 1577 | late_initcall(fail_page_alloc_debugfs); |
1578 | 1578 | ||
1579 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | 1579 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
1580 | 1580 | ||
1581 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 1581 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1582 | 1582 | ||
1583 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1583 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1584 | { | 1584 | { |
1585 | return false; | 1585 | return false; |
1586 | } | 1586 | } |
1587 | 1587 | ||
1588 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1588 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
1589 | 1589 | ||
1590 | /* | 1590 | /* |
1591 | * Return true if free pages are above 'mark'. This takes into account the order | 1591 | * Return true if free pages are above 'mark'. This takes into account the order |
1592 | * of the allocation. | 1592 | * of the allocation. |
1593 | */ | 1593 | */ |
1594 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1594 | static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1595 | int classzone_idx, int alloc_flags, long free_pages) | 1595 | int classzone_idx, int alloc_flags, long free_pages) |
1596 | { | 1596 | { |
1597 | /* free_pages may go negative - that's OK */ | 1597 | /* free_pages may go negative - that's OK */ |
1598 | long min = mark; | 1598 | long min = mark; |
1599 | int o; | 1599 | int o; |
1600 | 1600 | ||
1601 | free_pages -= (1 << order) - 1; | 1601 | free_pages -= (1 << order) - 1; |
1602 | if (alloc_flags & ALLOC_HIGH) | 1602 | if (alloc_flags & ALLOC_HIGH) |
1603 | min -= min / 2; | 1603 | min -= min / 2; |
1604 | if (alloc_flags & ALLOC_HARDER) | 1604 | if (alloc_flags & ALLOC_HARDER) |
1605 | min -= min / 4; | 1605 | min -= min / 4; |
1606 | 1606 | ||
1607 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1607 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
1608 | return false; | 1608 | return false; |
1609 | for (o = 0; o < order; o++) { | 1609 | for (o = 0; o < order; o++) { |
1610 | /* At the next order, this order's pages become unavailable */ | 1610 | /* At the next order, this order's pages become unavailable */ |
1611 | free_pages -= z->free_area[o].nr_free << o; | 1611 | free_pages -= z->free_area[o].nr_free << o; |
1612 | 1612 | ||
1613 | /* Require fewer higher order pages to be free */ | 1613 | /* Require fewer higher order pages to be free */ |
1614 | min >>= 1; | 1614 | min >>= 1; |
1615 | 1615 | ||
1616 | if (free_pages <= min) | 1616 | if (free_pages <= min) |
1617 | return false; | 1617 | return false; |
1618 | } | 1618 | } |
1619 | return true; | 1619 | return true; |
1620 | } | 1620 | } |
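To see what the per-order loop in __zone_watermark_ok() does with concrete numbers, here is a minimal userspace sketch (zone_stub and toy_watermark_ok are hypothetical, the values are made up, and the ALLOC_HIGH/ALLOC_HARDER reductions and classzone indexing are omitted). Free blocks of every order below the request are subtracted from the budget and the required minimum is halved at each step.

#include <stdio.h>
#include <stdbool.h>

#define TOY_MAX_ORDER 11

struct zone_stub {
        long nr_free[TOY_MAX_ORDER];    /* free blocks of each order */
        long lowmem_reserve;            /* reserve kept back for this classzone */
};

static bool toy_watermark_ok(const struct zone_stub *z, int order,
                             long mark, long free_pages)
{
        long min = mark;
        int o;

        free_pages -= (1 << order) - 1;
        if (free_pages <= min + z->lowmem_reserve)
                return false;
        for (o = 0; o < order; o++) {
                /* blocks of this order cannot serve the higher-order request */
                free_pages -= z->nr_free[o] << o;
                /* demand fewer free pages as the order grows */
                min >>= 1;
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        struct zone_stub z = {
                .nr_free = { 600, 100, 10 },    /* order-0, order-1, order-2 blocks */
                .lowmem_reserve = 32,
        };
        long free_pages = 600 * 1 + 100 * 2 + 10 * 4;   /* 840 pages in total */

        printf("order-2 request ok: %d\n",
               toy_watermark_ok(&z, 2, 128, free_pages));
        return 0;
}

Only 37 of the 840 free pages survive the discounting, which is still above the halved minimum of 32, so the order-2 request passes.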
1621 | 1621 | ||
1622 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1622 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1623 | int classzone_idx, int alloc_flags) | 1623 | int classzone_idx, int alloc_flags) |
1624 | { | 1624 | { |
1625 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1625 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1626 | zone_page_state(z, NR_FREE_PAGES)); | 1626 | zone_page_state(z, NR_FREE_PAGES)); |
1627 | } | 1627 | } |
1628 | 1628 | ||
1629 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | 1629 | bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, |
1630 | int classzone_idx, int alloc_flags) | 1630 | int classzone_idx, int alloc_flags) |
1631 | { | 1631 | { |
1632 | long free_pages = zone_page_state(z, NR_FREE_PAGES); | 1632 | long free_pages = zone_page_state(z, NR_FREE_PAGES); |
1633 | 1633 | ||
1634 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1634 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1635 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1635 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1636 | 1636 | ||
1637 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1637 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1638 | free_pages); | 1638 | free_pages); |
1639 | } | 1639 | } |
1640 | 1640 | ||
1641 | #ifdef CONFIG_NUMA | 1641 | #ifdef CONFIG_NUMA |
1642 | /* | 1642 | /* |
1643 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to | 1643 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to |
1644 | * skip over zones that are not allowed by the cpuset, or that have | 1644 | * skip over zones that are not allowed by the cpuset, or that have |
1645 | * been recently (in last second) found to be nearly full. See further | 1645 | * been recently (in last second) found to be nearly full. See further |
1646 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | 1646 | * comments in mmzone.h. Reduces cache footprint of zonelist scans |
1647 | * that have to skip over a lot of full or unallowed zones. | 1647 | * that have to skip over a lot of full or unallowed zones. |
1648 | * | 1648 | * |
1649 | * If the zonelist cache is present in the passed in zonelist, then | 1649 | * If the zonelist cache is present in the passed in zonelist, then |
1650 | * returns a pointer to the allowed node mask (either the current | 1650 | * returns a pointer to the allowed node mask (either the current |
1651 | * task's mems_allowed, or node_states[N_HIGH_MEMORY].) | 1651 | * task's mems_allowed, or node_states[N_HIGH_MEMORY].) |
1652 | * | 1652 | * |
1653 | * If the zonelist cache is not available for this zonelist, does | 1653 | * If the zonelist cache is not available for this zonelist, does |
1654 | * nothing and returns NULL. | 1654 | * nothing and returns NULL. |
1655 | * | 1655 | * |
1656 | * If the fullzones BITMAP in the zonelist cache is stale (more than | 1656 | * If the fullzones BITMAP in the zonelist cache is stale (more than |
1657 | * a second since last zap'd) then we zap it out (clear its bits.) | 1657 | * a second since last zap'd) then we zap it out (clear its bits.) |
1658 | * | 1658 | * |
1659 | * We hold off even calling zlc_setup, until after we've checked the | 1659 | * We hold off even calling zlc_setup, until after we've checked the |
1660 | * first zone in the zonelist, on the theory that most allocations will | 1660 | * first zone in the zonelist, on the theory that most allocations will |
1661 | * be satisfied from that first zone, so best to examine that zone as | 1661 | * be satisfied from that first zone, so best to examine that zone as |
1662 | * quickly as we can. | 1662 | * quickly as we can. |
1663 | */ | 1663 | */ |
1664 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1664 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
1665 | { | 1665 | { |
1666 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1666 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1667 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | 1667 | nodemask_t *allowednodes; /* zonelist_cache approximation */ |
1668 | 1668 | ||
1669 | zlc = zonelist->zlcache_ptr; | 1669 | zlc = zonelist->zlcache_ptr; |
1670 | if (!zlc) | 1670 | if (!zlc) |
1671 | return NULL; | 1671 | return NULL; |
1672 | 1672 | ||
1673 | if (time_after(jiffies, zlc->last_full_zap + HZ)) { | 1673 | if (time_after(jiffies, zlc->last_full_zap + HZ)) { |
1674 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1674 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1675 | zlc->last_full_zap = jiffies; | 1675 | zlc->last_full_zap = jiffies; |
1676 | } | 1676 | } |
1677 | 1677 | ||
1678 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1678 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1679 | &cpuset_current_mems_allowed : | 1679 | &cpuset_current_mems_allowed : |
1680 | &node_states[N_HIGH_MEMORY]; | 1680 | &node_states[N_HIGH_MEMORY]; |
1681 | return allowednodes; | 1681 | return allowednodes; |
1682 | } | 1682 | } |
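The staleness handling described in the comment above amounts to wiping the cached bitmap once it is more than a second old. A small userspace sketch of that pattern (zlc_stub and zlc_refresh are hypothetical, with time() standing in for jiffies and a one second interval standing in for HZ):

#include <stdio.h>
#include <string.h>
#include <time.h>

#define NZONES 8

struct zlc_stub {
        time_t last_full_zap;
        unsigned char fullzones[NZONES];        /* 1 = recently found to be full */
};

static void zlc_refresh(struct zlc_stub *zlc)
{
        time_t now = time(NULL);

        /* plays the role of time_after(jiffies, zlc->last_full_zap + HZ) */
        if (now > zlc->last_full_zap + 1) {
                memset(zlc->fullzones, 0, sizeof(zlc->fullzones));
                zlc->last_full_zap = now;
        }
}

int main(void)
{
        struct zlc_stub zlc = { .last_full_zap = 0 };

        zlc.fullzones[3] = 1;   /* zone 3 was marked full on an earlier scan */
        zlc_refresh(&zlc);      /* the mark is well over a second old: wiped */
        printf("zone 3 still skipped: %d\n", zlc.fullzones[3]);
        return 0;
}

Because the bitmap is only a hint, throwing it away wholesale once a second is cheaper than tracking exactly when each zone stopped being full.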
1683 | 1683 | ||
1684 | /* | 1684 | /* |
1685 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | 1685 | * Given 'z' scanning a zonelist, run a couple of quick checks to see |
1686 | * if it is worth looking at further for free memory: | 1686 | * if it is worth looking at further for free memory: |
1687 | * 1) Check that the zone isn't thought to be full (doesn't have its | 1687 | * 1) Check that the zone isn't thought to be full (doesn't have its |
1688 | * bit set in the zonelist_cache fullzones BITMAP). | 1688 | * bit set in the zonelist_cache fullzones BITMAP). |
1689 | * 2) Check that the zone's node (obtained from the zonelist_cache | 1689 | * 2) Check that the zone's node (obtained from the zonelist_cache |
1690 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | 1690 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. |
1691 | * Return true (non-zero) if zone is worth looking at further, or | 1691 | * Return true (non-zero) if zone is worth looking at further, or |
1692 | * else return false (zero) if it is not. | 1692 | * else return false (zero) if it is not. |
1693 | * | 1693 | * |
1694 | * This check -ignores- the distinction between various watermarks, | 1694 | * This check -ignores- the distinction between various watermarks, |
1695 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | 1695 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is |
1696 | * found to be full for any variation of these watermarks, it will | 1696 | * found to be full for any variation of these watermarks, it will |
1697 | * be considered full for up to one second by all requests, unless | 1697 | * be considered full for up to one second by all requests, unless |
1698 | * we are so low on memory on all allowed nodes that we are forced | 1698 | * we are so low on memory on all allowed nodes that we are forced |
1699 | * into the second scan of the zonelist. | 1699 | * into the second scan of the zonelist. |
1700 | * | 1700 | * |
1701 | * In the second scan we ignore this zonelist cache and exactly | 1701 | * In the second scan we ignore this zonelist cache and exactly |
1702 | * apply the watermarks to all zones, even if it is slower to do so. | 1702 | * apply the watermarks to all zones, even if it is slower to do so. |
1703 | * We are low on memory in the second scan, and should leave no stone | 1703 | * We are low on memory in the second scan, and should leave no stone |
1704 | * unturned looking for a free page. | 1704 | * unturned looking for a free page. |
1705 | */ | 1705 | */ |
1706 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | 1706 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, |
1707 | nodemask_t *allowednodes) | 1707 | nodemask_t *allowednodes) |
1708 | { | 1708 | { |
1709 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1709 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1710 | int i; /* index of *z in zonelist zones */ | 1710 | int i; /* index of *z in zonelist zones */ |
1711 | int n; /* node that zone *z is on */ | 1711 | int n; /* node that zone *z is on */ |
1712 | 1712 | ||
1713 | zlc = zonelist->zlcache_ptr; | 1713 | zlc = zonelist->zlcache_ptr; |
1714 | if (!zlc) | 1714 | if (!zlc) |
1715 | return 1; | 1715 | return 1; |
1716 | 1716 | ||
1717 | i = z - zonelist->_zonerefs; | 1717 | i = z - zonelist->_zonerefs; |
1718 | n = zlc->z_to_n[i]; | 1718 | n = zlc->z_to_n[i]; |
1719 | 1719 | ||
1720 | /* This zone is worth trying if it is allowed but not full */ | 1720 | /* This zone is worth trying if it is allowed but not full */ |
1721 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | 1721 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); |
1722 | } | 1722 | } |
1723 | 1723 | ||
1724 | /* | 1724 | /* |
1725 | * Given 'z' scanning a zonelist, set the corresponding bit in | 1725 | * Given 'z' scanning a zonelist, set the corresponding bit in |
1726 | * zlc->fullzones, so that subsequent attempts to allocate a page | 1726 | * zlc->fullzones, so that subsequent attempts to allocate a page |
1727 | * from that zone don't waste time re-examining it. | 1727 | * from that zone don't waste time re-examining it. |
1728 | */ | 1728 | */ |
1729 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1729 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1730 | { | 1730 | { |
1731 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1731 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1732 | int i; /* index of *z in zonelist zones */ | 1732 | int i; /* index of *z in zonelist zones */ |
1733 | 1733 | ||
1734 | zlc = zonelist->zlcache_ptr; | 1734 | zlc = zonelist->zlcache_ptr; |
1735 | if (!zlc) | 1735 | if (!zlc) |
1736 | return; | 1736 | return; |
1737 | 1737 | ||
1738 | i = z - zonelist->_zonerefs; | 1738 | i = z - zonelist->_zonerefs; |
1739 | 1739 | ||
1740 | set_bit(i, zlc->fullzones); | 1740 | set_bit(i, zlc->fullzones); |
1741 | } | 1741 | } |
1742 | 1742 | ||
1743 | /* | 1743 | /* |
1744 | * clear all zones full, called after direct reclaim makes progress so that | 1744 | * clear all zones full, called after direct reclaim makes progress so that |
1745 | * a zone that was recently full is not skipped over for up to a second | 1745 | * a zone that was recently full is not skipped over for up to a second |
1746 | */ | 1746 | */ |
1747 | static void zlc_clear_zones_full(struct zonelist *zonelist) | 1747 | static void zlc_clear_zones_full(struct zonelist *zonelist) |
1748 | { | 1748 | { |
1749 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1749 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1750 | 1750 | ||
1751 | zlc = zonelist->zlcache_ptr; | 1751 | zlc = zonelist->zlcache_ptr; |
1752 | if (!zlc) | 1752 | if (!zlc) |
1753 | return; | 1753 | return; |
1754 | 1754 | ||
1755 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1755 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1756 | } | 1756 | } |
1757 | 1757 | ||
1758 | #else /* CONFIG_NUMA */ | 1758 | #else /* CONFIG_NUMA */ |
1759 | 1759 | ||
1760 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1760 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
1761 | { | 1761 | { |
1762 | return NULL; | 1762 | return NULL; |
1763 | } | 1763 | } |
1764 | 1764 | ||
1765 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, | 1765 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, |
1766 | nodemask_t *allowednodes) | 1766 | nodemask_t *allowednodes) |
1767 | { | 1767 | { |
1768 | return 1; | 1768 | return 1; |
1769 | } | 1769 | } |
1770 | 1770 | ||
1771 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | 1771 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) |
1772 | { | 1772 | { |
1773 | } | 1773 | } |
1774 | 1774 | ||
1775 | static void zlc_clear_zones_full(struct zonelist *zonelist) | 1775 | static void zlc_clear_zones_full(struct zonelist *zonelist) |
1776 | { | 1776 | { |
1777 | } | 1777 | } |
1778 | #endif /* CONFIG_NUMA */ | 1778 | #endif /* CONFIG_NUMA */ |
1779 | 1779 | ||
1780 | /* | 1780 | /* |
1781 | * get_page_from_freelist goes through the zonelist trying to allocate | 1781 | * get_page_from_freelist goes through the zonelist trying to allocate |
1782 | * a page. | 1782 | * a page. |
1783 | */ | 1783 | */ |
1784 | static struct page * | 1784 | static struct page * |
1785 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1785 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
1786 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, | 1786 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
1787 | struct zone *preferred_zone, int migratetype) | 1787 | struct zone *preferred_zone, int migratetype) |
1788 | { | 1788 | { |
1789 | struct zoneref *z; | 1789 | struct zoneref *z; |
1790 | struct page *page = NULL; | 1790 | struct page *page = NULL; |
1791 | int classzone_idx; | 1791 | int classzone_idx; |
1792 | struct zone *zone; | 1792 | struct zone *zone; |
1793 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1793 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1794 | int zlc_active = 0; /* set if using zonelist_cache */ | 1794 | int zlc_active = 0; /* set if using zonelist_cache */ |
1795 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1795 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1796 | 1796 | ||
1797 | classzone_idx = zone_idx(preferred_zone); | 1797 | classzone_idx = zone_idx(preferred_zone); |
1798 | zonelist_scan: | 1798 | zonelist_scan: |
1799 | /* | 1799 | /* |
1800 | * Scan zonelist, looking for a zone with enough free. | 1800 | * Scan zonelist, looking for a zone with enough free. |
1801 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1801 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1802 | */ | 1802 | */ |
1803 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 1803 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
1804 | high_zoneidx, nodemask) { | 1804 | high_zoneidx, nodemask) { |
1805 | if (NUMA_BUILD && zlc_active && | 1805 | if (NUMA_BUILD && zlc_active && |
1806 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1806 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1807 | continue; | 1807 | continue; |
1808 | if ((alloc_flags & ALLOC_CPUSET) && | 1808 | if ((alloc_flags & ALLOC_CPUSET) && |
1809 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1809 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1810 | continue; | 1810 | continue; |
1811 | /* | 1811 | /* |
1812 | * When allocating a page cache page for writing, we | 1812 | * When allocating a page cache page for writing, we |
1813 | * want to get it from a zone that is within its dirty | 1813 | * want to get it from a zone that is within its dirty |
1814 | * limit, such that no single zone holds more than its | 1814 | * limit, such that no single zone holds more than its |
1815 | * proportional share of globally allowed dirty pages. | 1815 | * proportional share of globally allowed dirty pages. |
1816 | * The dirty limits take into account the zone's | 1816 | * The dirty limits take into account the zone's |
1817 | * lowmem reserves and high watermark so that kswapd | 1817 | * lowmem reserves and high watermark so that kswapd |
1818 | * should be able to balance it without having to | 1818 | * should be able to balance it without having to |
1819 | * write pages from its LRU list. | 1819 | * write pages from its LRU list. |
1820 | * | 1820 | * |
1821 | * This may look like it could increase pressure on | 1821 | * This may look like it could increase pressure on |
1822 | * lower zones by failing allocations in higher zones | 1822 | * lower zones by failing allocations in higher zones |
1823 | * before they are full. But the pages that do spill | 1823 | * before they are full. But the pages that do spill |
1824 | * over are limited as the lower zones are protected | 1824 | * over are limited as the lower zones are protected |
1825 | * by this very same mechanism. It should not become | 1825 | * by this very same mechanism. It should not become |
1826 | * a practical burden to them. | 1826 | * a practical burden to them. |
1827 | * | 1827 | * |
1828 | * XXX: For now, allow allocations to potentially | 1828 | * XXX: For now, allow allocations to potentially |
1829 | * exceed the per-zone dirty limit in the slowpath | 1829 | * exceed the per-zone dirty limit in the slowpath |
1830 | * (ALLOC_WMARK_LOW unset) before going into reclaim, | 1830 | * (ALLOC_WMARK_LOW unset) before going into reclaim, |
1831 | * which is important when on a NUMA setup the allowed | 1831 | * which is important when on a NUMA setup the allowed |
1832 | * zones are together not big enough to reach the | 1832 | * zones are together not big enough to reach the |
1833 | * global limit. The proper fix for these situations | 1833 | * global limit. The proper fix for these situations |
1834 | * will require awareness of zones in the | 1834 | * will require awareness of zones in the |
1835 | * dirty-throttling and the flusher threads. | 1835 | * dirty-throttling and the flusher threads. |
1836 | */ | 1836 | */ |
1837 | if ((alloc_flags & ALLOC_WMARK_LOW) && | 1837 | if ((alloc_flags & ALLOC_WMARK_LOW) && |
1838 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) | 1838 | (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) |
1839 | goto this_zone_full; | 1839 | goto this_zone_full; |
1840 | 1840 | ||
1841 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | 1841 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); |
1842 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1842 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
1843 | unsigned long mark; | 1843 | unsigned long mark; |
1844 | int ret; | 1844 | int ret; |
1845 | 1845 | ||
1846 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; | 1846 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
1847 | if (zone_watermark_ok(zone, order, mark, | 1847 | if (zone_watermark_ok(zone, order, mark, |
1848 | classzone_idx, alloc_flags)) | 1848 | classzone_idx, alloc_flags)) |
1849 | goto try_this_zone; | 1849 | goto try_this_zone; |
1850 | 1850 | ||
1851 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { | 1851 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { |
1852 | /* | 1852 | /* |
1853 | * we do zlc_setup if there are multiple nodes | 1853 | * we do zlc_setup if there are multiple nodes |
1854 | * and before considering the first zone allowed | 1854 | * and before considering the first zone allowed |
1855 | * by the cpuset. | 1855 | * by the cpuset. |
1856 | */ | 1856 | */ |
1857 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1857 | allowednodes = zlc_setup(zonelist, alloc_flags); |
1858 | zlc_active = 1; | 1858 | zlc_active = 1; |
1859 | did_zlc_setup = 1; | 1859 | did_zlc_setup = 1; |
1860 | } | 1860 | } |
1861 | 1861 | ||
1862 | if (zone_reclaim_mode == 0) | 1862 | if (zone_reclaim_mode == 0) |
1863 | goto this_zone_full; | 1863 | goto this_zone_full; |
1864 | 1864 | ||
1865 | /* | 1865 | /* |
1866 | * As we may have just activated ZLC, check if the first | 1866 | * As we may have just activated ZLC, check if the first |
1867 | * eligible zone has failed zone_reclaim recently. | 1867 | * eligible zone has failed zone_reclaim recently. |
1868 | */ | 1868 | */ |
1869 | if (NUMA_BUILD && zlc_active && | 1869 | if (NUMA_BUILD && zlc_active && |
1870 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1870 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1871 | continue; | 1871 | continue; |
1872 | 1872 | ||
1873 | ret = zone_reclaim(zone, gfp_mask, order); | 1873 | ret = zone_reclaim(zone, gfp_mask, order); |
1874 | switch (ret) { | 1874 | switch (ret) { |
1875 | case ZONE_RECLAIM_NOSCAN: | 1875 | case ZONE_RECLAIM_NOSCAN: |
1876 | /* did not scan */ | 1876 | /* did not scan */ |
1877 | continue; | 1877 | continue; |
1878 | case ZONE_RECLAIM_FULL: | 1878 | case ZONE_RECLAIM_FULL: |
1879 | /* scanned but unreclaimable */ | 1879 | /* scanned but unreclaimable */ |
1880 | continue; | 1880 | continue; |
1881 | default: | 1881 | default: |
1882 | /* did we reclaim enough */ | 1882 | /* did we reclaim enough */ |
1883 | if (!zone_watermark_ok(zone, order, mark, | 1883 | if (!zone_watermark_ok(zone, order, mark, |
1884 | classzone_idx, alloc_flags)) | 1884 | classzone_idx, alloc_flags)) |
1885 | goto this_zone_full; | 1885 | goto this_zone_full; |
1886 | } | 1886 | } |
1887 | } | 1887 | } |
1888 | 1888 | ||
1889 | try_this_zone: | 1889 | try_this_zone: |
1890 | page = buffered_rmqueue(preferred_zone, zone, order, | 1890 | page = buffered_rmqueue(preferred_zone, zone, order, |
1891 | gfp_mask, migratetype); | 1891 | gfp_mask, migratetype); |
1892 | if (page) | 1892 | if (page) |
1893 | break; | 1893 | break; |
1894 | this_zone_full: | 1894 | this_zone_full: |
1895 | if (NUMA_BUILD) | 1895 | if (NUMA_BUILD) |
1896 | zlc_mark_zone_full(zonelist, z); | 1896 | zlc_mark_zone_full(zonelist, z); |
1897 | } | 1897 | } |
1898 | 1898 | ||
1899 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1899 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
1900 | /* Disable zlc cache for second zonelist scan */ | 1900 | /* Disable zlc cache for second zonelist scan */ |
1901 | zlc_active = 0; | 1901 | zlc_active = 0; |
1902 | goto zonelist_scan; | 1902 | goto zonelist_scan; |
1903 | } | 1903 | } |
1904 | return page; | 1904 | return page; |
1905 | } | 1905 | } |
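Stripped of watermarks, cpusets, dirty limits and zone_reclaim, the control flow of get_page_from_freelist() is a two-pass scan. The toy below keeps only that skeleton (zone_try, toy_alloc and zone_full_cache are hypothetical): the first pass skips zones the cache claims are full and records fresh failures, and if nothing is found the scan repeats with the cache disabled.

#include <stdio.h>
#include <stdbool.h>

#define NZONES 4

/* 1 = the cache thinks this zone is full; seeded from earlier scans */
static bool zone_full_cache[NZONES] = { true, true, false, true };

/* pretend only zone 1 can actually satisfy the request right now */
static bool zone_try(int z)
{
        return z == 1;
}

static int toy_alloc(void)
{
        bool zlc_active = true;
        int z;

zonelist_scan:
        for (z = 0; z < NZONES; z++) {
                if (zlc_active && zone_full_cache[z])
                        continue;               /* cheap skip of a recently full zone */
                if (zone_try(z))
                        return z;
                zone_full_cache[z] = true;      /* remember the failure for later scans */
        }
        if (zlc_active) {
                /* nothing found: rescan every zone, ignoring the cache */
                zlc_active = false;
                goto zonelist_scan;
        }
        return -1;
}

int main(void)
{
        printf("allocated from zone %d\n", toy_alloc());
        return 0;
}

The first pass fails because the stale cache hides zone 1; the second pass ignores the cache and finds it. That is why the real code only disables zlc for the retry instead of never consulting it.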
1906 | 1906 | ||
1907 | /* | 1907 | /* |
1908 | * Large machines with many possible nodes should not always dump per-node | 1908 | * Large machines with many possible nodes should not always dump per-node |
1909 | * meminfo in irq context. | 1909 | * meminfo in irq context. |
1910 | */ | 1910 | */ |
1911 | static inline bool should_suppress_show_mem(void) | 1911 | static inline bool should_suppress_show_mem(void) |
1912 | { | 1912 | { |
1913 | bool ret = false; | 1913 | bool ret = false; |
1914 | 1914 | ||
1915 | #if NODES_SHIFT > 8 | 1915 | #if NODES_SHIFT > 8 |
1916 | ret = in_interrupt(); | 1916 | ret = in_interrupt(); |
1917 | #endif | 1917 | #endif |
1918 | return ret; | 1918 | return ret; |
1919 | } | 1919 | } |
1920 | 1920 | ||
1921 | static DEFINE_RATELIMIT_STATE(nopage_rs, | 1921 | static DEFINE_RATELIMIT_STATE(nopage_rs, |
1922 | DEFAULT_RATELIMIT_INTERVAL, | 1922 | DEFAULT_RATELIMIT_INTERVAL, |
1923 | DEFAULT_RATELIMIT_BURST); | 1923 | DEFAULT_RATELIMIT_BURST); |
1924 | 1924 | ||
1925 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) | 1925 | void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) |
1926 | { | 1926 | { |
1927 | unsigned int filter = SHOW_MEM_FILTER_NODES; | 1927 | unsigned int filter = SHOW_MEM_FILTER_NODES; |
1928 | 1928 | ||
1929 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || | 1929 | if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || |
1930 | debug_guardpage_minorder() > 0) | 1930 | debug_guardpage_minorder() > 0) |
1931 | return; | 1931 | return; |
1932 | 1932 | ||
1933 | /* | 1933 | /* |
1934 | * This documents exceptions given to allocations in certain | 1934 | * This documents exceptions given to allocations in certain |
1935 | * contexts that are allowed to allocate outside current's set | 1935 | * contexts that are allowed to allocate outside current's set |
1936 | * of allowed nodes. | 1936 | * of allowed nodes. |
1937 | */ | 1937 | */ |
1938 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | 1938 | if (!(gfp_mask & __GFP_NOMEMALLOC)) |
1939 | if (test_thread_flag(TIF_MEMDIE) || | 1939 | if (test_thread_flag(TIF_MEMDIE) || |
1940 | (current->flags & (PF_MEMALLOC | PF_EXITING))) | 1940 | (current->flags & (PF_MEMALLOC | PF_EXITING))) |
1941 | filter &= ~SHOW_MEM_FILTER_NODES; | 1941 | filter &= ~SHOW_MEM_FILTER_NODES; |
1942 | if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) | 1942 | if (in_interrupt() || !(gfp_mask & __GFP_WAIT)) |
1943 | filter &= ~SHOW_MEM_FILTER_NODES; | 1943 | filter &= ~SHOW_MEM_FILTER_NODES; |
1944 | 1944 | ||
1945 | if (fmt) { | 1945 | if (fmt) { |
1946 | struct va_format vaf; | 1946 | struct va_format vaf; |
1947 | va_list args; | 1947 | va_list args; |
1948 | 1948 | ||
1949 | va_start(args, fmt); | 1949 | va_start(args, fmt); |
1950 | 1950 | ||
1951 | vaf.fmt = fmt; | 1951 | vaf.fmt = fmt; |
1952 | vaf.va = &args; | 1952 | vaf.va = &args; |
1953 | 1953 | ||
1954 | pr_warn("%pV", &vaf); | 1954 | pr_warn("%pV", &vaf); |
1955 | 1955 | ||
1956 | va_end(args); | 1956 | va_end(args); |
1957 | } | 1957 | } |
1958 | 1958 | ||
1959 | pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", | 1959 | pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", |
1960 | current->comm, order, gfp_mask); | 1960 | current->comm, order, gfp_mask); |
1961 | 1961 | ||
1962 | dump_stack(); | 1962 | dump_stack(); |
1963 | if (!should_suppress_show_mem()) | 1963 | if (!should_suppress_show_mem()) |
1964 | show_mem(filter); | 1964 | show_mem(filter); |
1965 | } | 1965 | } |
1966 | 1966 | ||
1967 | static inline int | 1967 | static inline int |
1968 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | 1968 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, |
1969 | unsigned long did_some_progress, | 1969 | unsigned long did_some_progress, |
1970 | unsigned long pages_reclaimed) | 1970 | unsigned long pages_reclaimed) |
1971 | { | 1971 | { |
1972 | /* Do not loop if specifically requested */ | 1972 | /* Do not loop if specifically requested */ |
1973 | if (gfp_mask & __GFP_NORETRY) | 1973 | if (gfp_mask & __GFP_NORETRY) |
1974 | return 0; | 1974 | return 0; |
1975 | 1975 | ||
1976 | /* Always retry if specifically requested */ | 1976 | /* Always retry if specifically requested */ |
1977 | if (gfp_mask & __GFP_NOFAIL) | 1977 | if (gfp_mask & __GFP_NOFAIL) |
1978 | return 1; | 1978 | return 1; |
1979 | 1979 | ||
1980 | /* | 1980 | /* |
1981 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim | 1981 | * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim |
1982 | * making forward progress without invoking OOM. Suspend also disables | 1982 | * making forward progress without invoking OOM. Suspend also disables |
1983 | * storage devices so kswapd will not help. Bail if we are suspending. | 1983 | * storage devices so kswapd will not help. Bail if we are suspending. |
1984 | */ | 1984 | */ |
1985 | if (!did_some_progress && pm_suspended_storage()) | 1985 | if (!did_some_progress && pm_suspended_storage()) |
1986 | return 0; | 1986 | return 0; |
1987 | 1987 | ||
1988 | /* | 1988 | /* |
1989 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | 1989 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER |
1990 | * means __GFP_NOFAIL, but that may not be true in other | 1990 | * means __GFP_NOFAIL, but that may not be true in other |
1991 | * implementations. | 1991 | * implementations. |
1992 | */ | 1992 | */ |
1993 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | 1993 | if (order <= PAGE_ALLOC_COSTLY_ORDER) |
1994 | return 1; | 1994 | return 1; |
1995 | 1995 | ||
1996 | /* | 1996 | /* |
1997 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | 1997 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is |
1998 | * specified, then we retry until we no longer reclaim any pages | 1998 | * specified, then we retry until we no longer reclaim any pages |
1999 | * (above), or we've reclaimed an order of pages at least as | 1999 | * (above), or we've reclaimed an order of pages at least as |
2000 | * large as the allocation's order. In both cases, if the | 2000 | * large as the allocation's order. In both cases, if the |
2001 | * allocation still fails, we stop retrying. | 2001 | * allocation still fails, we stop retrying. |
2002 | */ | 2002 | */ |
2003 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | 2003 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) |
2004 | return 1; | 2004 | return 1; |
2005 | 2005 | ||
2006 | return 0; | 2006 | return 0; |
2007 | } | 2007 | } |
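The retry policy above is consumed by a loop further down the slowpath that accumulates pages_reclaimed across direct-reclaim rounds. The compact sketch below (toy_should_retry is hypothetical, and the __GFP_NORETRY, __GFP_NOFAIL and suspend cases are omitted) shows how a costly order-4 __GFP_REPEAT allocation eventually gives up once an order's worth of pages has been reclaimed.

#include <stdio.h>

#define TOY_COSTLY_ORDER 3

/* simplified retry predicate: NORETRY/NOFAIL/suspend handling left out */
static int toy_should_retry(int order, int gfp_repeat, unsigned long pages_reclaimed)
{
        if (order <= TOY_COSTLY_ORDER)
                return 1;       /* small orders are retried indefinitely */
        if (gfp_repeat && pages_reclaimed < (1UL << order))
                return 1;       /* __GFP_REPEAT: retry until an order's worth is reclaimed */
        return 0;
}

int main(void)
{
        unsigned long pages_reclaimed = 0;
        int order = 4, rounds = 0;

        /* pretend each direct-reclaim round frees 8 pages */
        while (toy_should_retry(order, 1, pages_reclaimed)) {
                pages_reclaimed += 8;
                rounds++;
        }
        printf("stopped after %d rounds, %lu pages reclaimed\n",
               rounds, pages_reclaimed);
        return 0;
}

With 8 pages reclaimed per round the loop stops after two rounds, once the 16 pages corresponding to the order-4 request have been recovered.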
2008 | 2008 | ||
2009 | static inline struct page * | 2009 | static inline struct page * |
2010 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | 2010 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
2011 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2011 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2012 | nodemask_t *nodemask, struct zone *preferred_zone, | 2012 | nodemask_t *nodemask, struct zone *preferred_zone, |
2013 | int migratetype) | 2013 | int migratetype) |
2014 | { | 2014 | { |
2015 | struct page *page; | 2015 | struct page *page; |
2016 | 2016 | ||
2017 | /* Acquire the OOM killer lock for the zones in zonelist */ | 2017 | /* Acquire the OOM killer lock for the zones in zonelist */ |
2018 | if (!try_set_zonelist_oom(zonelist, gfp_mask)) { | 2018 | if (!try_set_zonelist_oom(zonelist, gfp_mask)) { |
2019 | schedule_timeout_uninterruptible(1); | 2019 | schedule_timeout_uninterruptible(1); |
2020 | return NULL; | 2020 | return NULL; |
2021 | } | 2021 | } |
2022 | 2022 | ||
2023 | /* | 2023 | /* |
2024 | * Go through the zonelist yet one more time, keep very high watermark | 2024 | * Go through the zonelist yet one more time, keep very high watermark |
2025 | * here, this is only to catch a parallel oom killing, we must fail if | 2025 | * here, this is only to catch a parallel oom killing, we must fail if |
2026 | * we're still under heavy pressure. | 2026 | * we're still under heavy pressure. |
2027 | */ | 2027 | */ |
2028 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | 2028 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, |
2029 | order, zonelist, high_zoneidx, | 2029 | order, zonelist, high_zoneidx, |
2030 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | 2030 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, |
2031 | preferred_zone, migratetype); | 2031 | preferred_zone, migratetype); |
2032 | if (page) | 2032 | if (page) |
2033 | goto out; | 2033 | goto out; |
2034 | 2034 | ||
2035 | if (!(gfp_mask & __GFP_NOFAIL)) { | 2035 | if (!(gfp_mask & __GFP_NOFAIL)) { |
2036 | /* The OOM killer will not help higher order allocs */ | 2036 | /* The OOM killer will not help higher order allocs */ |
2037 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2037 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
2038 | goto out; | 2038 | goto out; |
2039 | /* The OOM killer does not needlessly kill tasks for lowmem */ | 2039 | /* The OOM killer does not needlessly kill tasks for lowmem */ |
2040 | if (high_zoneidx < ZONE_NORMAL) | 2040 | if (high_zoneidx < ZONE_NORMAL) |
2041 | goto out; | 2041 | goto out; |
2042 | /* | 2042 | /* |
2043 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | 2043 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. |
2044 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | 2044 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. |
2045 | * The caller should handle page allocation failure by itself if | 2045 | * The caller should handle page allocation failure by itself if |
2046 | * it specifies __GFP_THISNODE. | 2046 | * it specifies __GFP_THISNODE. |
2047 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. | 2047 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. |
2048 | */ | 2048 | */ |
2049 | if (gfp_mask & __GFP_THISNODE) | 2049 | if (gfp_mask & __GFP_THISNODE) |
2050 | goto out; | 2050 | goto out; |
2051 | } | 2051 | } |
2052 | /* Exhausted what can be done so it's blamo time */ | 2052 | /* Exhausted what can be done so it's blamo time */ |
2053 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); | 2053 | out_of_memory(zonelist, gfp_mask, order, nodemask, false); |
2054 | 2054 | ||
2055 | out: | 2055 | out: |
2056 | clear_zonelist_oom(zonelist, gfp_mask); | 2056 | clear_zonelist_oom(zonelist, gfp_mask); |
2057 | return page; | 2057 | return page; |
2058 | } | 2058 | } |
2059 | 2059 | ||
2060 | #ifdef CONFIG_COMPACTION | 2060 | #ifdef CONFIG_COMPACTION |
2061 | /* Try memory compaction for high-order allocations before reclaim */ | 2061 | /* Try memory compaction for high-order allocations before reclaim */ |
2062 | static struct page * | 2062 | static struct page * |
2063 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2063 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2064 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2064 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2065 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2065 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2066 | int migratetype, bool sync_migration, | 2066 | int migratetype, bool sync_migration, |
2067 | bool *deferred_compaction, | 2067 | bool *deferred_compaction, |
2068 | unsigned long *did_some_progress) | 2068 | unsigned long *did_some_progress) |
2069 | { | 2069 | { |
2070 | struct page *page; | 2070 | struct page *page; |
2071 | 2071 | ||
2072 | if (!order) | 2072 | if (!order) |
2073 | return NULL; | 2073 | return NULL; |
2074 | 2074 | ||
2075 | if (compaction_deferred(preferred_zone, order)) { | 2075 | if (compaction_deferred(preferred_zone, order)) { |
2076 | *deferred_compaction = true; | 2076 | *deferred_compaction = true; |
2077 | return NULL; | 2077 | return NULL; |
2078 | } | 2078 | } |
2079 | 2079 | ||
2080 | current->flags |= PF_MEMALLOC; | 2080 | current->flags |= PF_MEMALLOC; |
2081 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2081 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2082 | nodemask, sync_migration); | 2082 | nodemask, sync_migration); |
2083 | current->flags &= ~PF_MEMALLOC; | 2083 | current->flags &= ~PF_MEMALLOC; |
2084 | if (*did_some_progress != COMPACT_SKIPPED) { | 2084 | if (*did_some_progress != COMPACT_SKIPPED) { |
2085 | 2085 | ||
2086 | /* Page migration frees to the PCP lists but we want merging */ | 2086 | /* Page migration frees to the PCP lists but we want merging */ |
2087 | drain_pages(get_cpu()); | 2087 | drain_pages(get_cpu()); |
2088 | put_cpu(); | 2088 | put_cpu(); |
2089 | 2089 | ||
2090 | page = get_page_from_freelist(gfp_mask, nodemask, | 2090 | page = get_page_from_freelist(gfp_mask, nodemask, |
2091 | order, zonelist, high_zoneidx, | 2091 | order, zonelist, high_zoneidx, |
2092 | alloc_flags, preferred_zone, | 2092 | alloc_flags, preferred_zone, |
2093 | migratetype); | 2093 | migratetype); |
2094 | if (page) { | 2094 | if (page) { |
2095 | preferred_zone->compact_considered = 0; | 2095 | preferred_zone->compact_considered = 0; |
2096 | preferred_zone->compact_defer_shift = 0; | 2096 | preferred_zone->compact_defer_shift = 0; |
2097 | if (order >= preferred_zone->compact_order_failed) | 2097 | if (order >= preferred_zone->compact_order_failed) |
2098 | preferred_zone->compact_order_failed = order + 1; | 2098 | preferred_zone->compact_order_failed = order + 1; |
2099 | count_vm_event(COMPACTSUCCESS); | 2099 | count_vm_event(COMPACTSUCCESS); |
2100 | return page; | 2100 | return page; |
2101 | } | 2101 | } |
2102 | 2102 | ||
2103 | /* | 2103 | /* |
2104 | * It's bad if compaction run occurs and fails. | 2104 | * It's bad if compaction run occurs and fails. |
2105 | * The most likely reason is that pages exist, | 2105 | * The most likely reason is that pages exist, |
2106 | * but not enough to satisfy watermarks. | 2106 | * but not enough to satisfy watermarks. |
2107 | */ | 2107 | */ |
2108 | count_vm_event(COMPACTFAIL); | 2108 | count_vm_event(COMPACTFAIL); |
2109 | 2109 | ||
2110 | /* | 2110 | /* |
2111 | * As async compaction considers a subset of pageblocks, only | 2111 | * As async compaction considers a subset of pageblocks, only |
2112 | * defer if the failure was a sync compaction failure. | 2112 | * defer if the failure was a sync compaction failure. |
2113 | */ | 2113 | */ |
2114 | if (sync_migration) | 2114 | if (sync_migration) |
2115 | defer_compaction(preferred_zone, order); | 2115 | defer_compaction(preferred_zone, order); |
2116 | 2116 | ||
2117 | cond_resched(); | 2117 | cond_resched(); |
2118 | } | 2118 | } |
2119 | 2119 | ||
2120 | return NULL; | 2120 | return NULL; |
2121 | } | 2121 | } |
2122 | #else | 2122 | #else |
2123 | static inline struct page * | 2123 | static inline struct page * |
2124 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2124 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2125 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2125 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2126 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2126 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2127 | int migratetype, bool sync_migration, | 2127 | int migratetype, bool sync_migration, |
2128 | bool *deferred_compaction, | 2128 | bool *deferred_compaction, |
2129 | unsigned long *did_some_progress) | 2129 | unsigned long *did_some_progress) |
2130 | { | 2130 | { |
2131 | return NULL; | 2131 | return NULL; |
2132 | } | 2132 | } |
2133 | #endif /* CONFIG_COMPACTION */ | 2133 | #endif /* CONFIG_COMPACTION */ |
2134 | 2134 | ||
2135 | /* Perform direct synchronous page reclaim */ | 2135 | /* Perform direct synchronous page reclaim */ |
2136 | static int | 2136 | static int |
2137 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, | 2137 | __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, |
2138 | nodemask_t *nodemask) | 2138 | nodemask_t *nodemask) |
2139 | { | 2139 | { |
2140 | struct reclaim_state reclaim_state; | 2140 | struct reclaim_state reclaim_state; |
2141 | int progress; | 2141 | int progress; |
2142 | 2142 | ||
2143 | cond_resched(); | 2143 | cond_resched(); |
2144 | 2144 | ||
2145 | /* We now go into synchronous reclaim */ | 2145 | /* We now go into synchronous reclaim */ |
2146 | cpuset_memory_pressure_bump(); | 2146 | cpuset_memory_pressure_bump(); |
2147 | current->flags |= PF_MEMALLOC; | 2147 | current->flags |= PF_MEMALLOC; |
2148 | lockdep_set_current_reclaim_state(gfp_mask); | 2148 | lockdep_set_current_reclaim_state(gfp_mask); |
2149 | reclaim_state.reclaimed_slab = 0; | 2149 | reclaim_state.reclaimed_slab = 0; |
2150 | current->reclaim_state = &reclaim_state; | 2150 | current->reclaim_state = &reclaim_state; |
2151 | 2151 | ||
2152 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | 2152 | progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); |
2153 | 2153 | ||
2154 | current->reclaim_state = NULL; | 2154 | current->reclaim_state = NULL; |
2155 | lockdep_clear_current_reclaim_state(); | 2155 | lockdep_clear_current_reclaim_state(); |
2156 | current->flags &= ~PF_MEMALLOC; | 2156 | current->flags &= ~PF_MEMALLOC; |
2157 | 2157 | ||
2158 | cond_resched(); | 2158 | cond_resched(); |
2159 | 2159 | ||
2160 | return progress; | 2160 | return progress; |
2161 | } | 2161 | } |
2162 | 2162 | ||
2163 | /* The really slow allocator path where we enter direct reclaim */ | 2163 | /* The really slow allocator path where we enter direct reclaim */ |
2164 | static inline struct page * | 2164 | static inline struct page * |
2165 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | 2165 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, |
2166 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2166 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2167 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2167 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2168 | int migratetype, unsigned long *did_some_progress) | 2168 | int migratetype, unsigned long *did_some_progress) |
2169 | { | 2169 | { |
2170 | struct page *page = NULL; | 2170 | struct page *page = NULL; |
2171 | bool drained = false; | 2171 | bool drained = false; |
2172 | 2172 | ||
2173 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 2173 | *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, |
2174 | nodemask); | 2174 | nodemask); |
2175 | if (unlikely(!(*did_some_progress))) | 2175 | if (unlikely(!(*did_some_progress))) |
2176 | return NULL; | 2176 | return NULL; |
2177 | 2177 | ||
2178 | /* After successful reclaim, reconsider all zones for allocation */ | 2178 | /* After successful reclaim, reconsider all zones for allocation */ |
2179 | if (NUMA_BUILD) | 2179 | if (NUMA_BUILD) |
2180 | zlc_clear_zones_full(zonelist); | 2180 | zlc_clear_zones_full(zonelist); |
2181 | 2181 | ||
2182 | retry: | 2182 | retry: |
2183 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2183 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2184 | zonelist, high_zoneidx, | 2184 | zonelist, high_zoneidx, |
2185 | alloc_flags, preferred_zone, | 2185 | alloc_flags, preferred_zone, |
2186 | migratetype); | 2186 | migratetype); |
2187 | 2187 | ||
2188 | /* | 2188 | /* |
2189 | * If an allocation failed after direct reclaim, it could be because | 2189 | * If an allocation failed after direct reclaim, it could be because |
2190 | * pages are pinned on the per-cpu lists. Drain them and try again. | 2190 | * pages are pinned on the per-cpu lists. Drain them and try again. |
2191 | */ | 2191 | */ |
2192 | if (!page && !drained) { | 2192 | if (!page && !drained) { |
2193 | drain_all_pages(); | 2193 | drain_all_pages(); |
2194 | drained = true; | 2194 | drained = true; |
2195 | goto retry; | 2195 | goto retry; |
2196 | } | 2196 | } |
2197 | 2197 | ||
2198 | return page; | 2198 | return page; |
2199 | } | 2199 | } |
2200 | 2200 | ||
2201 | /* | 2201 | /* |
2202 | * This is called in the allocator slow-path if the allocation request is of | 2202 | * This is called in the allocator slow-path if the allocation request is of |
2203 | * sufficient urgency to ignore watermarks and take other desperate measures | 2203 | * sufficient urgency to ignore watermarks and take other desperate measures |
2204 | */ | 2204 | */ |
2205 | static inline struct page * | 2205 | static inline struct page * |
2206 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | 2206 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
2207 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2207 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2208 | nodemask_t *nodemask, struct zone *preferred_zone, | 2208 | nodemask_t *nodemask, struct zone *preferred_zone, |
2209 | int migratetype) | 2209 | int migratetype) |
2210 | { | 2210 | { |
2211 | struct page *page; | 2211 | struct page *page; |
2212 | 2212 | ||
2213 | do { | 2213 | do { |
2214 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2214 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2215 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | 2215 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, |
2216 | preferred_zone, migratetype); | 2216 | preferred_zone, migratetype); |
2217 | 2217 | ||
2218 | if (!page && gfp_mask & __GFP_NOFAIL) | 2218 | if (!page && gfp_mask & __GFP_NOFAIL) |
2219 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2219 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2220 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | 2220 | } while (!page && (gfp_mask & __GFP_NOFAIL)); |
2221 | 2221 | ||
2222 | return page; | 2222 | return page; |
2223 | } | 2223 | } |
2224 | 2224 | ||
2225 | static inline | 2225 | static inline |
2226 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | 2226 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, |
2227 | enum zone_type high_zoneidx, | 2227 | enum zone_type high_zoneidx, |
2228 | enum zone_type classzone_idx) | 2228 | enum zone_type classzone_idx) |
2229 | { | 2229 | { |
2230 | struct zoneref *z; | 2230 | struct zoneref *z; |
2231 | struct zone *zone; | 2231 | struct zone *zone; |
2232 | 2232 | ||
2233 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 2233 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
2234 | wakeup_kswapd(zone, order, classzone_idx); | 2234 | wakeup_kswapd(zone, order, classzone_idx); |
2235 | } | 2235 | } |
2236 | 2236 | ||
2237 | static inline int | 2237 | static inline int |
2238 | gfp_to_alloc_flags(gfp_t gfp_mask) | 2238 | gfp_to_alloc_flags(gfp_t gfp_mask) |
2239 | { | 2239 | { |
2240 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | 2240 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; |
2241 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2241 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2242 | 2242 | ||
2243 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ | 2243 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
2244 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); | 2244 | BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); |
2245 | 2245 | ||
2246 | /* | 2246 | /* |
2247 | * The caller may dip into page reserves a bit more if the caller | 2247 | * The caller may dip into page reserves a bit more if the caller |
2248 | * cannot run direct reclaim, or if the caller has realtime scheduling | 2248 | * cannot run direct reclaim, or if the caller has realtime scheduling |
2249 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | 2249 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
2250 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | 2250 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). |
2251 | */ | 2251 | */ |
2252 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); | 2252 | alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); |
2253 | 2253 | ||
2254 | if (!wait) { | 2254 | if (!wait) { |
2255 | /* | 2255 | /* |
2256 | * Not worth trying to allocate harder for | 2256 | * Not worth trying to allocate harder for |
2257 | * __GFP_NOMEMALLOC even if it can't schedule. | 2257 | * __GFP_NOMEMALLOC even if it can't schedule. |
2258 | */ | 2258 | */ |
2259 | if (!(gfp_mask & __GFP_NOMEMALLOC)) | 2259 | if (!(gfp_mask & __GFP_NOMEMALLOC)) |
2260 | alloc_flags |= ALLOC_HARDER; | 2260 | alloc_flags |= ALLOC_HARDER; |
2261 | /* | 2261 | /* |
2262 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 2262 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
2263 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 2263 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
2264 | */ | 2264 | */ |
2265 | alloc_flags &= ~ALLOC_CPUSET; | 2265 | alloc_flags &= ~ALLOC_CPUSET; |
2266 | } else if (unlikely(rt_task(current)) && !in_interrupt()) | 2266 | } else if (unlikely(rt_task(current)) && !in_interrupt()) |
2267 | alloc_flags |= ALLOC_HARDER; | 2267 | alloc_flags |= ALLOC_HARDER; |
2268 | 2268 | ||
2269 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 2269 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
2270 | if (!in_interrupt() && | 2270 | if (!in_interrupt() && |
2271 | ((current->flags & PF_MEMALLOC) || | 2271 | ((current->flags & PF_MEMALLOC) || |
2272 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2272 | unlikely(test_thread_flag(TIF_MEMDIE)))) |
2273 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2273 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2274 | } | 2274 | } |
2275 | 2275 | ||
2276 | return alloc_flags; | 2276 | return alloc_flags; |
2277 | } | 2277 | } |
2278 | 2278 | ||
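As a quick illustration (assuming GFP_ATOMIC in this era is still plain __GFP_HIGH and GFP_KERNEL is __GFP_WAIT | __GFP_IO | __GFP_FS, and ignoring the PF_MEMALLOC / TIF_MEMDIE special cases), gfp_to_alloc_flags() works out to roughly:

/*
 * GFP_ATOMIC  (__GFP_HIGH, no __GFP_WAIT):
 *	ALLOC_WMARK_MIN | ALLOC_HIGH | ALLOC_HARDER, with ALLOC_CPUSET cleared
 * GFP_KERNEL  (__GFP_WAIT | __GFP_IO | __GFP_FS):
 *	ALLOC_WMARK_MIN | ALLOC_CPUSET, plus ALLOC_HARDER only for rt tasks
 */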
2279 | static inline struct page * | 2279 | static inline struct page * |
2280 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2280 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2281 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2281 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2282 | nodemask_t *nodemask, struct zone *preferred_zone, | 2282 | nodemask_t *nodemask, struct zone *preferred_zone, |
2283 | int migratetype) | 2283 | int migratetype) |
2284 | { | 2284 | { |
2285 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 2285 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
2286 | struct page *page = NULL; | 2286 | struct page *page = NULL; |
2287 | int alloc_flags; | 2287 | int alloc_flags; |
2288 | unsigned long pages_reclaimed = 0; | 2288 | unsigned long pages_reclaimed = 0; |
2289 | unsigned long did_some_progress; | 2289 | unsigned long did_some_progress; |
2290 | bool sync_migration = false; | 2290 | bool sync_migration = false; |
2291 | bool deferred_compaction = false; | 2291 | bool deferred_compaction = false; |
2292 | 2292 | ||
2293 | /* | 2293 | /* |
2294 | * In the slowpath, we sanity check order to avoid ever trying to | 2294 | * In the slowpath, we sanity check order to avoid ever trying to |
2295 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may | 2295 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may |
2296 | * be using allocators in order of preference for an area that is | 2296 | * be using allocators in order of preference for an area that is |
2297 | * too large. | 2297 | * too large. |
2298 | */ | 2298 | */ |
2299 | if (order >= MAX_ORDER) { | 2299 | if (order >= MAX_ORDER) { |
2300 | WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); | 2300 | WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); |
2301 | return NULL; | 2301 | return NULL; |
2302 | } | 2302 | } |
2303 | 2303 | ||
2304 | /* | 2304 | /* |
2305 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 2305 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
2306 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | 2306 | * __GFP_NOWARN set) should not cause reclaim since the subsystem |
2307 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | 2307 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim |
2308 | * using a larger set of nodes after it has established that the | 2308 | * using a larger set of nodes after it has established that the |
2309 | * allowed per node queues are empty and that nodes are | 2309 | * allowed per node queues are empty and that nodes are |
2310 | * over allocated. | 2310 | * over allocated. |
2311 | */ | 2311 | */ |
2312 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 2312 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
2313 | goto nopage; | 2313 | goto nopage; |
2314 | 2314 | ||
2315 | restart: | 2315 | restart: |
2316 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2316 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2317 | wake_all_kswapd(order, zonelist, high_zoneidx, | 2317 | wake_all_kswapd(order, zonelist, high_zoneidx, |
2318 | zone_idx(preferred_zone)); | 2318 | zone_idx(preferred_zone)); |
2319 | 2319 | ||
2320 | /* | 2320 | /* |
2321 | * OK, we're below the kswapd watermark and have kicked background | 2321 | * OK, we're below the kswapd watermark and have kicked background |
2322 | * reclaim. Now things get more complex, so set up alloc_flags according | 2322 | * reclaim. Now things get more complex, so set up alloc_flags according |
2323 | * to how we want to proceed. | 2323 | * to how we want to proceed. |
2324 | */ | 2324 | */ |
2325 | alloc_flags = gfp_to_alloc_flags(gfp_mask); | 2325 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
2326 | 2326 | ||
2327 | /* | 2327 | /* |
2328 | * Find the true preferred zone if the allocation is unconstrained by | 2328 | * Find the true preferred zone if the allocation is unconstrained by |
2329 | * cpusets. | 2329 | * cpusets. |
2330 | */ | 2330 | */ |
2331 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) | 2331 | if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) |
2332 | first_zones_zonelist(zonelist, high_zoneidx, NULL, | 2332 | first_zones_zonelist(zonelist, high_zoneidx, NULL, |
2333 | &preferred_zone); | 2333 | &preferred_zone); |
2334 | 2334 | ||
2335 | rebalance: | 2335 | rebalance: |
2336 | /* This is the last chance, in general, before the goto nopage. */ | 2336 | /* This is the last chance, in general, before the goto nopage. */ |
2337 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 2337 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
2338 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, | 2338 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
2339 | preferred_zone, migratetype); | 2339 | preferred_zone, migratetype); |
2340 | if (page) | 2340 | if (page) |
2341 | goto got_pg; | 2341 | goto got_pg; |
2342 | 2342 | ||
2343 | /* Allocate without watermarks if the context allows */ | 2343 | /* Allocate without watermarks if the context allows */ |
2344 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2344 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2345 | page = __alloc_pages_high_priority(gfp_mask, order, | 2345 | page = __alloc_pages_high_priority(gfp_mask, order, |
2346 | zonelist, high_zoneidx, nodemask, | 2346 | zonelist, high_zoneidx, nodemask, |
2347 | preferred_zone, migratetype); | 2347 | preferred_zone, migratetype); |
2348 | if (page) | 2348 | if (page) |
2349 | goto got_pg; | 2349 | goto got_pg; |
2350 | } | 2350 | } |
2351 | 2351 | ||
2352 | /* Atomic allocations - we can't balance anything */ | 2352 | /* Atomic allocations - we can't balance anything */ |
2353 | if (!wait) | 2353 | if (!wait) |
2354 | goto nopage; | 2354 | goto nopage; |
2355 | 2355 | ||
2356 | /* Avoid recursion of direct reclaim */ | 2356 | /* Avoid recursion of direct reclaim */ |
2357 | if (current->flags & PF_MEMALLOC) | 2357 | if (current->flags & PF_MEMALLOC) |
2358 | goto nopage; | 2358 | goto nopage; |
2359 | 2359 | ||
2360 | /* Avoid allocations with no watermarks from looping endlessly */ | 2360 | /* Avoid allocations with no watermarks from looping endlessly */ |
2361 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) | 2361 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
2362 | goto nopage; | 2362 | goto nopage; |
2363 | 2363 | ||
2364 | /* | 2364 | /* |
2365 | * Try direct compaction. The first pass is asynchronous. Subsequent | 2365 | * Try direct compaction. The first pass is asynchronous. Subsequent |
2366 | * attempts after direct reclaim are synchronous | 2366 | * attempts after direct reclaim are synchronous |
2367 | */ | 2367 | */ |
2368 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2368 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2369 | zonelist, high_zoneidx, | 2369 | zonelist, high_zoneidx, |
2370 | nodemask, | 2370 | nodemask, |
2371 | alloc_flags, preferred_zone, | 2371 | alloc_flags, preferred_zone, |
2372 | migratetype, sync_migration, | 2372 | migratetype, sync_migration, |
2373 | &deferred_compaction, | 2373 | &deferred_compaction, |
2374 | &did_some_progress); | 2374 | &did_some_progress); |
2375 | if (page) | 2375 | if (page) |
2376 | goto got_pg; | 2376 | goto got_pg; |
2377 | sync_migration = true; | 2377 | sync_migration = true; |
2378 | 2378 | ||
2379 | /* | 2379 | /* |
2380 | * If compaction is deferred for high-order allocations, it is because | 2380 | * If compaction is deferred for high-order allocations, it is because |
2381 | * sync compaction recently failed. If this is the case and the caller | 2381 | * sync compaction recently failed. If this is the case and the caller |
2382 | * has requested the system not be heavily disrupted, fail the | 2382 | * has requested the system not be heavily disrupted, fail the |
2383 | * allocation now instead of entering direct reclaim | 2383 | * allocation now instead of entering direct reclaim |
2384 | */ | 2384 | */ |
2385 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) | 2385 | if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) |
2386 | goto nopage; | 2386 | goto nopage; |
2387 | 2387 | ||
2388 | /* Try direct reclaim and then allocating */ | 2388 | /* Try direct reclaim and then allocating */ |
2389 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | 2389 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
2390 | zonelist, high_zoneidx, | 2390 | zonelist, high_zoneidx, |
2391 | nodemask, | 2391 | nodemask, |
2392 | alloc_flags, preferred_zone, | 2392 | alloc_flags, preferred_zone, |
2393 | migratetype, &did_some_progress); | 2393 | migratetype, &did_some_progress); |
2394 | if (page) | 2394 | if (page) |
2395 | goto got_pg; | 2395 | goto got_pg; |
2396 | 2396 | ||
2397 | /* | 2397 | /* |
2398 | * If we failed to make any progress reclaiming, then we are | 2398 | * If we failed to make any progress reclaiming, then we are |
2399 | * running out of options and have to consider going OOM | 2399 | * running out of options and have to consider going OOM |
2400 | */ | 2400 | */ |
2401 | if (!did_some_progress) { | 2401 | if (!did_some_progress) { |
2402 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 2402 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
2403 | if (oom_killer_disabled) | 2403 | if (oom_killer_disabled) |
2404 | goto nopage; | 2404 | goto nopage; |
2405 | /* Coredumps can quickly deplete all memory reserves */ | 2405 | /* Coredumps can quickly deplete all memory reserves */ |
2406 | if ((current->flags & PF_DUMPCORE) && | 2406 | if ((current->flags & PF_DUMPCORE) && |
2407 | !(gfp_mask & __GFP_NOFAIL)) | 2407 | !(gfp_mask & __GFP_NOFAIL)) |
2408 | goto nopage; | 2408 | goto nopage; |
2409 | page = __alloc_pages_may_oom(gfp_mask, order, | 2409 | page = __alloc_pages_may_oom(gfp_mask, order, |
2410 | zonelist, high_zoneidx, | 2410 | zonelist, high_zoneidx, |
2411 | nodemask, preferred_zone, | 2411 | nodemask, preferred_zone, |
2412 | migratetype); | 2412 | migratetype); |
2413 | if (page) | 2413 | if (page) |
2414 | goto got_pg; | 2414 | goto got_pg; |
2415 | 2415 | ||
2416 | if (!(gfp_mask & __GFP_NOFAIL)) { | 2416 | if (!(gfp_mask & __GFP_NOFAIL)) { |
2417 | /* | 2417 | /* |
2418 | * The oom killer is not called for high-order | 2418 | * The oom killer is not called for high-order |
2419 | * allocations that may fail, so if no progress | 2419 | * allocations that may fail, so if no progress |
2420 | * is being made, there are no other options and | 2420 | * is being made, there are no other options and |
2421 | * retrying is unlikely to help. | 2421 | * retrying is unlikely to help. |
2422 | */ | 2422 | */ |
2423 | if (order > PAGE_ALLOC_COSTLY_ORDER) | 2423 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
2424 | goto nopage; | 2424 | goto nopage; |
2425 | /* | 2425 | /* |
2426 | * The oom killer is not called for lowmem | 2426 | * The oom killer is not called for lowmem |
2427 | * allocations to prevent needlessly killing | 2427 | * allocations to prevent needlessly killing |
2428 | * innocent tasks. | 2428 | * innocent tasks. |
2429 | */ | 2429 | */ |
2430 | if (high_zoneidx < ZONE_NORMAL) | 2430 | if (high_zoneidx < ZONE_NORMAL) |
2431 | goto nopage; | 2431 | goto nopage; |
2432 | } | 2432 | } |
2433 | 2433 | ||
2434 | goto restart; | 2434 | goto restart; |
2435 | } | 2435 | } |
2436 | } | 2436 | } |
2437 | 2437 | ||
2438 | /* Check if we should retry the allocation */ | 2438 | /* Check if we should retry the allocation */ |
2439 | pages_reclaimed += did_some_progress; | 2439 | pages_reclaimed += did_some_progress; |
2440 | if (should_alloc_retry(gfp_mask, order, did_some_progress, | 2440 | if (should_alloc_retry(gfp_mask, order, did_some_progress, |
2441 | pages_reclaimed)) { | 2441 | pages_reclaimed)) { |
2442 | /* Wait for some write requests to complete then retry */ | 2442 | /* Wait for some write requests to complete then retry */ |
2443 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); | 2443 | wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); |
2444 | goto rebalance; | 2444 | goto rebalance; |
2445 | } else { | 2445 | } else { |
2446 | /* | 2446 | /* |
2447 | * High-order allocations do not necessarily loop after | 2447 | * High-order allocations do not necessarily loop after |
2448 | * direct reclaim and reclaim/compaction depends on compaction | 2448 | * direct reclaim and reclaim/compaction depends on compaction |
2449 | * being called after reclaim so call directly if necessary | 2449 | * being called after reclaim so call directly if necessary |
2450 | */ | 2450 | */ |
2451 | page = __alloc_pages_direct_compact(gfp_mask, order, | 2451 | page = __alloc_pages_direct_compact(gfp_mask, order, |
2452 | zonelist, high_zoneidx, | 2452 | zonelist, high_zoneidx, |
2453 | nodemask, | 2453 | nodemask, |
2454 | alloc_flags, preferred_zone, | 2454 | alloc_flags, preferred_zone, |
2455 | migratetype, sync_migration, | 2455 | migratetype, sync_migration, |
2456 | &deferred_compaction, | 2456 | &deferred_compaction, |
2457 | &did_some_progress); | 2457 | &did_some_progress); |
2458 | if (page) | 2458 | if (page) |
2459 | goto got_pg; | 2459 | goto got_pg; |
2460 | } | 2460 | } |
2461 | 2461 | ||
2462 | nopage: | 2462 | nopage: |
2463 | warn_alloc_failed(gfp_mask, order, NULL); | 2463 | warn_alloc_failed(gfp_mask, order, NULL); |
2464 | return page; | 2464 | return page; |
2465 | got_pg: | 2465 | got_pg: |
2466 | if (kmemcheck_enabled) | 2466 | if (kmemcheck_enabled) |
2467 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | 2467 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); |
2468 | return page; | 2468 | return page; |
2469 | 2469 | ||
2470 | } | 2470 | } |
2471 | 2471 | ||
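A rough sketch of the slowpath ordering implemented above, for orientation:

/*
 * __alloc_pages_slowpath(), approximately:
 *	wake kswapd -> retry the freelists with recomputed alloc_flags
 *	-> no-watermark attempt if ALLOC_NO_WATERMARKS is set
 *	-> async compaction -> direct reclaim -> possibly OOM kill + restart
 *	-> should_alloc_retry() ? wait_iff_congested(); goto rebalance
 *	                        : one last, synchronous compaction pass
 */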
2472 | /* | 2472 | /* |
2473 | * This is the 'heart' of the zoned buddy allocator. | 2473 | * This is the 'heart' of the zoned buddy allocator. |
2474 | */ | 2474 | */ |
2475 | struct page * | 2475 | struct page * |
2476 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | 2476 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, |
2477 | struct zonelist *zonelist, nodemask_t *nodemask) | 2477 | struct zonelist *zonelist, nodemask_t *nodemask) |
2478 | { | 2478 | { |
2479 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 2479 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
2480 | struct zone *preferred_zone; | 2480 | struct zone *preferred_zone; |
2481 | struct page *page = NULL; | 2481 | struct page *page = NULL; |
2482 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2482 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2483 | unsigned int cpuset_mems_cookie; | 2483 | unsigned int cpuset_mems_cookie; |
2484 | 2484 | ||
2485 | gfp_mask &= gfp_allowed_mask; | 2485 | gfp_mask &= gfp_allowed_mask; |
2486 | 2486 | ||
2487 | lockdep_trace_alloc(gfp_mask); | 2487 | lockdep_trace_alloc(gfp_mask); |
2488 | 2488 | ||
2489 | might_sleep_if(gfp_mask & __GFP_WAIT); | 2489 | might_sleep_if(gfp_mask & __GFP_WAIT); |
2490 | 2490 | ||
2491 | if (should_fail_alloc_page(gfp_mask, order)) | 2491 | if (should_fail_alloc_page(gfp_mask, order)) |
2492 | return NULL; | 2492 | return NULL; |
2493 | 2493 | ||
2494 | /* | 2494 | /* |
2495 | * Check the zones suitable for the gfp_mask contain at least one | 2495 | * Check the zones suitable for the gfp_mask contain at least one |
2496 | * valid zone. It's possible to have an empty zonelist as a result | 2496 | * valid zone. It's possible to have an empty zonelist as a result |
2497 | * of GFP_THISNODE and a memoryless node | 2497 | * of GFP_THISNODE and a memoryless node |
2498 | */ | 2498 | */ |
2499 | if (unlikely(!zonelist->_zonerefs->zone)) | 2499 | if (unlikely(!zonelist->_zonerefs->zone)) |
2500 | return NULL; | 2500 | return NULL; |
2501 | 2501 | ||
2502 | retry_cpuset: | 2502 | retry_cpuset: |
2503 | cpuset_mems_cookie = get_mems_allowed(); | 2503 | cpuset_mems_cookie = get_mems_allowed(); |
2504 | 2504 | ||
2505 | /* The preferred zone is used for statistics later */ | 2505 | /* The preferred zone is used for statistics later */ |
2506 | first_zones_zonelist(zonelist, high_zoneidx, | 2506 | first_zones_zonelist(zonelist, high_zoneidx, |
2507 | nodemask ? : &cpuset_current_mems_allowed, | 2507 | nodemask ? : &cpuset_current_mems_allowed, |
2508 | &preferred_zone); | 2508 | &preferred_zone); |
2509 | if (!preferred_zone) | 2509 | if (!preferred_zone) |
2510 | goto out; | 2510 | goto out; |
2511 | 2511 | ||
2512 | /* First allocation attempt */ | 2512 | /* First allocation attempt */ |
2513 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2513 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2514 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | 2514 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, |
2515 | preferred_zone, migratetype); | 2515 | preferred_zone, migratetype); |
2516 | if (unlikely(!page)) | 2516 | if (unlikely(!page)) |
2517 | page = __alloc_pages_slowpath(gfp_mask, order, | 2517 | page = __alloc_pages_slowpath(gfp_mask, order, |
2518 | zonelist, high_zoneidx, nodemask, | 2518 | zonelist, high_zoneidx, nodemask, |
2519 | preferred_zone, migratetype); | 2519 | preferred_zone, migratetype); |
2520 | 2520 | ||
2521 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2521 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2522 | 2522 | ||
2523 | out: | 2523 | out: |
2524 | /* | 2524 | /* |
2525 | * When updating a task's mems_allowed, it is possible to race with | 2525 | * When updating a task's mems_allowed, it is possible to race with |
2526 | * parallel threads in such a way that an allocation can fail while | 2526 | * parallel threads in such a way that an allocation can fail while |
2527 | * the mask is being updated. If a page allocation is about to fail, | 2527 | * the mask is being updated. If a page allocation is about to fail, |
2528 | * check if the cpuset changed during allocation and if so, retry. | 2528 | * check if the cpuset changed during allocation and if so, retry. |
2529 | */ | 2529 | */ |
2530 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2530 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
2531 | goto retry_cpuset; | 2531 | goto retry_cpuset; |
2532 | 2532 | ||
2533 | return page; | 2533 | return page; |
2534 | } | 2534 | } |
2535 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2535 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
2536 | 2536 | ||
2537 | /* | 2537 | /* |
2538 | * Common helper functions. | 2538 | * Common helper functions. |
2539 | */ | 2539 | */ |
2540 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | 2540 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) |
2541 | { | 2541 | { |
2542 | struct page *page; | 2542 | struct page *page; |
2543 | 2543 | ||
2544 | /* | 2544 | /* |
2545 | * __get_free_pages() returns a 32-bit address, which cannot represent | 2545 | * __get_free_pages() returns a 32-bit address, which cannot represent |
2546 | * a highmem page | 2546 | * a highmem page |
2547 | */ | 2547 | */ |
2548 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | 2548 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); |
2549 | 2549 | ||
2550 | page = alloc_pages(gfp_mask, order); | 2550 | page = alloc_pages(gfp_mask, order); |
2551 | if (!page) | 2551 | if (!page) |
2552 | return 0; | 2552 | return 0; |
2553 | return (unsigned long) page_address(page); | 2553 | return (unsigned long) page_address(page); |
2554 | } | 2554 | } |
2555 | EXPORT_SYMBOL(__get_free_pages); | 2555 | EXPORT_SYMBOL(__get_free_pages); |
2556 | 2556 | ||
2557 | unsigned long get_zeroed_page(gfp_t gfp_mask) | 2557 | unsigned long get_zeroed_page(gfp_t gfp_mask) |
2558 | { | 2558 | { |
2559 | return __get_free_pages(gfp_mask | __GFP_ZERO, 0); | 2559 | return __get_free_pages(gfp_mask | __GFP_ZERO, 0); |
2560 | } | 2560 | } |
2561 | EXPORT_SYMBOL(get_zeroed_page); | 2561 | EXPORT_SYMBOL(get_zeroed_page); |
2562 | 2562 | ||
2563 | void __free_pages(struct page *page, unsigned int order) | 2563 | void __free_pages(struct page *page, unsigned int order) |
2564 | { | 2564 | { |
2565 | if (put_page_testzero(page)) { | 2565 | if (put_page_testzero(page)) { |
2566 | if (order == 0) | 2566 | if (order == 0) |
2567 | free_hot_cold_page(page, 0); | 2567 | free_hot_cold_page(page, 0); |
2568 | else | 2568 | else |
2569 | __free_pages_ok(page, order); | 2569 | __free_pages_ok(page, order); |
2570 | } | 2570 | } |
2571 | } | 2571 | } |
2572 | 2572 | ||
2573 | EXPORT_SYMBOL(__free_pages); | 2573 | EXPORT_SYMBOL(__free_pages); |
2574 | 2574 | ||
2575 | void free_pages(unsigned long addr, unsigned int order) | 2575 | void free_pages(unsigned long addr, unsigned int order) |
2576 | { | 2576 | { |
2577 | if (addr != 0) { | 2577 | if (addr != 0) { |
2578 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 2578 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
2579 | __free_pages(virt_to_page((void *)addr), order); | 2579 | __free_pages(virt_to_page((void *)addr), order); |
2580 | } | 2580 | } |
2581 | } | 2581 | } |
2582 | 2582 | ||
2583 | EXPORT_SYMBOL(free_pages); | 2583 | EXPORT_SYMBOL(free_pages); |
2584 | 2584 | ||
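A minimal, hypothetical caller of the helpers above (illustration only; example_init is an invented name):

#include <linux/errno.h>
#include <linux/gfp.h>

static int example_init(void)
{
	/* order 1: two contiguous pages, returned as a kernel virtual address */
	unsigned long buf = __get_free_pages(GFP_KERNEL, 1);

	if (!buf)
		return -ENOMEM;
	/* ... use the buffer ... */
	free_pages(buf, 1);	/* free with the same order it was allocated with */
	return 0;
}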
2585 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) | 2585 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) |
2586 | { | 2586 | { |
2587 | if (addr) { | 2587 | if (addr) { |
2588 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | 2588 | unsigned long alloc_end = addr + (PAGE_SIZE << order); |
2589 | unsigned long used = addr + PAGE_ALIGN(size); | 2589 | unsigned long used = addr + PAGE_ALIGN(size); |
2590 | 2590 | ||
2591 | split_page(virt_to_page((void *)addr), order); | 2591 | split_page(virt_to_page((void *)addr), order); |
2592 | while (used < alloc_end) { | 2592 | while (used < alloc_end) { |
2593 | free_page(used); | 2593 | free_page(used); |
2594 | used += PAGE_SIZE; | 2594 | used += PAGE_SIZE; |
2595 | } | 2595 | } |
2596 | } | 2596 | } |
2597 | return (void *)addr; | 2597 | return (void *)addr; |
2598 | } | 2598 | } |
2599 | 2599 | ||
2600 | /** | 2600 | /** |
2601 | * alloc_pages_exact - allocate an exact number of physically-contiguous pages. | 2601 | * alloc_pages_exact - allocate an exact number of physically-contiguous pages. |
2602 | * @size: the number of bytes to allocate | 2602 | * @size: the number of bytes to allocate |
2603 | * @gfp_mask: GFP flags for the allocation | 2603 | * @gfp_mask: GFP flags for the allocation |
2604 | * | 2604 | * |
2605 | * This function is similar to alloc_pages(), except that it allocates the | 2605 | * This function is similar to alloc_pages(), except that it allocates the |
2606 | * minimum number of pages to satisfy the request. alloc_pages() can only | 2606 | * minimum number of pages to satisfy the request. alloc_pages() can only |
2607 | * allocate memory in power-of-two pages. | 2607 | * allocate memory in power-of-two pages. |
2608 | * | 2608 | * |
2609 | * This function is also limited by MAX_ORDER. | 2609 | * This function is also limited by MAX_ORDER. |
2610 | * | 2610 | * |
2611 | * Memory allocated by this function must be released by free_pages_exact(). | 2611 | * Memory allocated by this function must be released by free_pages_exact(). |
2612 | */ | 2612 | */ |
2613 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | 2613 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) |
2614 | { | 2614 | { |
2615 | unsigned int order = get_order(size); | 2615 | unsigned int order = get_order(size); |
2616 | unsigned long addr; | 2616 | unsigned long addr; |
2617 | 2617 | ||
2618 | addr = __get_free_pages(gfp_mask, order); | 2618 | addr = __get_free_pages(gfp_mask, order); |
2619 | return make_alloc_exact(addr, order, size); | 2619 | return make_alloc_exact(addr, order, size); |
2620 | } | 2620 | } |
2621 | EXPORT_SYMBOL(alloc_pages_exact); | 2621 | EXPORT_SYMBOL(alloc_pages_exact); |
2622 | 2622 | ||
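As a worked example of the arithmetic in make_alloc_exact(): a request for 3 * PAGE_SIZE bytes gives order = get_order(3 * PAGE_SIZE) = 2, so __get_free_pages() hands back four pages; split_page() turns the order-2 block into four order-0 pages and the loop frees the fourth, leaving exactly the three pages the caller asked for.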
2623 | /** | 2623 | /** |
2624 | * alloc_pages_exact_nid - allocate an exact number of physically-contiguous | 2624 | * alloc_pages_exact_nid - allocate an exact number of physically-contiguous |
2625 | * pages on a node. | 2625 | * pages on a node. |
2626 | * @nid: the preferred node ID where memory should be allocated | 2626 | * @nid: the preferred node ID where memory should be allocated |
2627 | * @size: the number of bytes to allocate | 2627 | * @size: the number of bytes to allocate |
2628 | * @gfp_mask: GFP flags for the allocation | 2628 | * @gfp_mask: GFP flags for the allocation |
2629 | * | 2629 | * |
2630 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling | 2630 | * Like alloc_pages_exact(), but try to allocate on node nid first before falling |
2631 | * back. | 2631 | * back. |
2632 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, | 2632 | * Note this is not alloc_pages_exact_node() which allocates on a specific node, |
2633 | * but is not exact. | 2633 | * but is not exact. |
2634 | */ | 2634 | */ |
2635 | void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) | 2635 | void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) |
2636 | { | 2636 | { |
2637 | unsigned order = get_order(size); | 2637 | unsigned order = get_order(size); |
2638 | struct page *p = alloc_pages_node(nid, gfp_mask, order); | 2638 | struct page *p = alloc_pages_node(nid, gfp_mask, order); |
2639 | if (!p) | 2639 | if (!p) |
2640 | return NULL; | 2640 | return NULL; |
2641 | return make_alloc_exact((unsigned long)page_address(p), order, size); | 2641 | return make_alloc_exact((unsigned long)page_address(p), order, size); |
2642 | } | 2642 | } |
2643 | EXPORT_SYMBOL(alloc_pages_exact_nid); | 2643 | EXPORT_SYMBOL(alloc_pages_exact_nid); |
2644 | 2644 | ||
2645 | /** | 2645 | /** |
2646 | * free_pages_exact - release memory allocated via alloc_pages_exact() | 2646 | * free_pages_exact - release memory allocated via alloc_pages_exact() |
2647 | * @virt: the value returned by alloc_pages_exact. | 2647 | * @virt: the value returned by alloc_pages_exact. |
2648 | * @size: size of allocation, same value as passed to alloc_pages_exact(). | 2648 | * @size: size of allocation, same value as passed to alloc_pages_exact(). |
2649 | * | 2649 | * |
2650 | * Release the memory allocated by a previous call to alloc_pages_exact. | 2650 | * Release the memory allocated by a previous call to alloc_pages_exact. |
2651 | */ | 2651 | */ |
2652 | void free_pages_exact(void *virt, size_t size) | 2652 | void free_pages_exact(void *virt, size_t size) |
2653 | { | 2653 | { |
2654 | unsigned long addr = (unsigned long)virt; | 2654 | unsigned long addr = (unsigned long)virt; |
2655 | unsigned long end = addr + PAGE_ALIGN(size); | 2655 | unsigned long end = addr + PAGE_ALIGN(size); |
2656 | 2656 | ||
2657 | while (addr < end) { | 2657 | while (addr < end) { |
2658 | free_page(addr); | 2658 | free_page(addr); |
2659 | addr += PAGE_SIZE; | 2659 | addr += PAGE_SIZE; |
2660 | } | 2660 | } |
2661 | } | 2661 | } |
2662 | EXPORT_SYMBOL(free_pages_exact); | 2662 | EXPORT_SYMBOL(free_pages_exact); |
2663 | 2663 | ||
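A hypothetical pairing of the two exact-size helpers (sketch only; the example_* names are invented):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/types.h>

static void *example_buf;

static int example_buf_alloc(size_t bytes)
{
	example_buf = alloc_pages_exact(bytes, GFP_KERNEL | __GFP_ZERO);
	return example_buf ? 0 : -ENOMEM;
}

static void example_buf_free(size_t bytes)
{
	/* must pass the same size that was given to alloc_pages_exact() */
	free_pages_exact(example_buf, bytes);
}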
2664 | static unsigned int nr_free_zone_pages(int offset) | 2664 | static unsigned int nr_free_zone_pages(int offset) |
2665 | { | 2665 | { |
2666 | struct zoneref *z; | 2666 | struct zoneref *z; |
2667 | struct zone *zone; | 2667 | struct zone *zone; |
2668 | 2668 | ||
2669 | /* Just pick one node, since fallback list is circular */ | 2669 | /* Just pick one node, since fallback list is circular */ |
2670 | unsigned int sum = 0; | 2670 | unsigned int sum = 0; |
2671 | 2671 | ||
2672 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); | 2672 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
2673 | 2673 | ||
2674 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2674 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
2675 | unsigned long size = zone->present_pages; | 2675 | unsigned long size = zone->present_pages; |
2676 | unsigned long high = high_wmark_pages(zone); | 2676 | unsigned long high = high_wmark_pages(zone); |
2677 | if (size > high) | 2677 | if (size > high) |
2678 | sum += size - high; | 2678 | sum += size - high; |
2679 | } | 2679 | } |
2680 | 2680 | ||
2681 | return sum; | 2681 | return sum; |
2682 | } | 2682 | } |
2683 | 2683 | ||
2684 | /* | 2684 | /* |
2685 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | 2685 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL |
2686 | */ | 2686 | */ |
2687 | unsigned int nr_free_buffer_pages(void) | 2687 | unsigned int nr_free_buffer_pages(void) |
2688 | { | 2688 | { |
2689 | return nr_free_zone_pages(gfp_zone(GFP_USER)); | 2689 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
2690 | } | 2690 | } |
2691 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); | 2691 | EXPORT_SYMBOL_GPL(nr_free_buffer_pages); |
2692 | 2692 | ||
2693 | /* | 2693 | /* |
2694 | * Amount of free RAM allocatable within all zones | 2694 | * Amount of free RAM allocatable within all zones |
2695 | */ | 2695 | */ |
2696 | unsigned int nr_free_pagecache_pages(void) | 2696 | unsigned int nr_free_pagecache_pages(void) |
2697 | { | 2697 | { |
2698 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); | 2698 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE)); |
2699 | } | 2699 | } |
2700 | 2700 | ||
2701 | static inline void show_node(struct zone *zone) | 2701 | static inline void show_node(struct zone *zone) |
2702 | { | 2702 | { |
2703 | if (NUMA_BUILD) | 2703 | if (NUMA_BUILD) |
2704 | printk("Node %d ", zone_to_nid(zone)); | 2704 | printk("Node %d ", zone_to_nid(zone)); |
2705 | } | 2705 | } |
2706 | 2706 | ||
2707 | void si_meminfo(struct sysinfo *val) | 2707 | void si_meminfo(struct sysinfo *val) |
2708 | { | 2708 | { |
2709 | val->totalram = totalram_pages; | 2709 | val->totalram = totalram_pages; |
2710 | val->sharedram = 0; | 2710 | val->sharedram = 0; |
2711 | val->freeram = global_page_state(NR_FREE_PAGES); | 2711 | val->freeram = global_page_state(NR_FREE_PAGES); |
2712 | val->bufferram = nr_blockdev_pages(); | 2712 | val->bufferram = nr_blockdev_pages(); |
2713 | val->totalhigh = totalhigh_pages; | 2713 | val->totalhigh = totalhigh_pages; |
2714 | val->freehigh = nr_free_highpages(); | 2714 | val->freehigh = nr_free_highpages(); |
2715 | val->mem_unit = PAGE_SIZE; | 2715 | val->mem_unit = PAGE_SIZE; |
2716 | } | 2716 | } |
2717 | 2717 | ||
2718 | EXPORT_SYMBOL(si_meminfo); | 2718 | EXPORT_SYMBOL(si_meminfo); |
2719 | 2719 | ||
2720 | #ifdef CONFIG_NUMA | 2720 | #ifdef CONFIG_NUMA |
2721 | void si_meminfo_node(struct sysinfo *val, int nid) | 2721 | void si_meminfo_node(struct sysinfo *val, int nid) |
2722 | { | 2722 | { |
2723 | pg_data_t *pgdat = NODE_DATA(nid); | 2723 | pg_data_t *pgdat = NODE_DATA(nid); |
2724 | 2724 | ||
2725 | val->totalram = pgdat->node_present_pages; | 2725 | val->totalram = pgdat->node_present_pages; |
2726 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 2726 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
2727 | #ifdef CONFIG_HIGHMEM | 2727 | #ifdef CONFIG_HIGHMEM |
2728 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 2728 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; |
2729 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], | 2729 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], |
2730 | NR_FREE_PAGES); | 2730 | NR_FREE_PAGES); |
2731 | #else | 2731 | #else |
2732 | val->totalhigh = 0; | 2732 | val->totalhigh = 0; |
2733 | val->freehigh = 0; | 2733 | val->freehigh = 0; |
2734 | #endif | 2734 | #endif |
2735 | val->mem_unit = PAGE_SIZE; | 2735 | val->mem_unit = PAGE_SIZE; |
2736 | } | 2736 | } |
2737 | #endif | 2737 | #endif |
2738 | 2738 | ||
2739 | /* | 2739 | /* |
2740 | * Determine whether the node should be displayed or not, depending on whether | 2740 | * Determine whether the node should be displayed or not, depending on whether |
2741 | * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). | 2741 | * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). |
2742 | */ | 2742 | */ |
2743 | bool skip_free_areas_node(unsigned int flags, int nid) | 2743 | bool skip_free_areas_node(unsigned int flags, int nid) |
2744 | { | 2744 | { |
2745 | bool ret = false; | 2745 | bool ret = false; |
2746 | unsigned int cpuset_mems_cookie; | 2746 | unsigned int cpuset_mems_cookie; |
2747 | 2747 | ||
2748 | if (!(flags & SHOW_MEM_FILTER_NODES)) | 2748 | if (!(flags & SHOW_MEM_FILTER_NODES)) |
2749 | goto out; | 2749 | goto out; |
2750 | 2750 | ||
2751 | do { | 2751 | do { |
2752 | cpuset_mems_cookie = get_mems_allowed(); | 2752 | cpuset_mems_cookie = get_mems_allowed(); |
2753 | ret = !node_isset(nid, cpuset_current_mems_allowed); | 2753 | ret = !node_isset(nid, cpuset_current_mems_allowed); |
2754 | } while (!put_mems_allowed(cpuset_mems_cookie)); | 2754 | } while (!put_mems_allowed(cpuset_mems_cookie)); |
2755 | out: | 2755 | out: |
2756 | return ret; | 2756 | return ret; |
2757 | } | 2757 | } |
2758 | 2758 | ||
2759 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 2759 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
2760 | 2760 | ||
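K() simply rescales page counts to kilobytes: with 4 KiB pages, PAGE_SHIFT - 10 = 2, so K(x) == x * 4 and, for instance, K(256) prints as 1024kB.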
2761 | /* | 2761 | /* |
2762 | * Show free area list (used inside shift_scroll-lock stuff) | 2762 | * Show free area list (used inside shift_scroll-lock stuff) |
2763 | * We also calculate the percentage fragmentation. We do this by counting the | 2763 | * We also calculate the percentage fragmentation. We do this by counting the |
2764 | * memory on each free list with the exception of the first item on the list. | 2764 | * memory on each free list with the exception of the first item on the list. |
2765 | * Suppresses nodes that are not allowed by current's cpuset if | 2765 | * Suppresses nodes that are not allowed by current's cpuset if |
2766 | * SHOW_MEM_FILTER_NODES is passed. | 2766 | * SHOW_MEM_FILTER_NODES is passed. |
2767 | */ | 2767 | */ |
2768 | void show_free_areas(unsigned int filter) | 2768 | void show_free_areas(unsigned int filter) |
2769 | { | 2769 | { |
2770 | int cpu; | 2770 | int cpu; |
2771 | struct zone *zone; | 2771 | struct zone *zone; |
2772 | 2772 | ||
2773 | for_each_populated_zone(zone) { | 2773 | for_each_populated_zone(zone) { |
2774 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 2774 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2775 | continue; | 2775 | continue; |
2776 | show_node(zone); | 2776 | show_node(zone); |
2777 | printk("%s per-cpu:\n", zone->name); | 2777 | printk("%s per-cpu:\n", zone->name); |
2778 | 2778 | ||
2779 | for_each_online_cpu(cpu) { | 2779 | for_each_online_cpu(cpu) { |
2780 | struct per_cpu_pageset *pageset; | 2780 | struct per_cpu_pageset *pageset; |
2781 | 2781 | ||
2782 | pageset = per_cpu_ptr(zone->pageset, cpu); | 2782 | pageset = per_cpu_ptr(zone->pageset, cpu); |
2783 | 2783 | ||
2784 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2784 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
2785 | cpu, pageset->pcp.high, | 2785 | cpu, pageset->pcp.high, |
2786 | pageset->pcp.batch, pageset->pcp.count); | 2786 | pageset->pcp.batch, pageset->pcp.count); |
2787 | } | 2787 | } |
2788 | } | 2788 | } |
2789 | 2789 | ||
2790 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" | 2790 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
2791 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" | 2791 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
2792 | " unevictable:%lu" | 2792 | " unevictable:%lu" |
2793 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2793 | " dirty:%lu writeback:%lu unstable:%lu\n" |
2794 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | 2794 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
2795 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | 2795 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", |
2796 | global_page_state(NR_ACTIVE_ANON), | 2796 | global_page_state(NR_ACTIVE_ANON), |
2797 | global_page_state(NR_INACTIVE_ANON), | 2797 | global_page_state(NR_INACTIVE_ANON), |
2798 | global_page_state(NR_ISOLATED_ANON), | 2798 | global_page_state(NR_ISOLATED_ANON), |
2799 | global_page_state(NR_ACTIVE_FILE), | 2799 | global_page_state(NR_ACTIVE_FILE), |
2800 | global_page_state(NR_INACTIVE_FILE), | 2800 | global_page_state(NR_INACTIVE_FILE), |
2801 | global_page_state(NR_ISOLATED_FILE), | 2801 | global_page_state(NR_ISOLATED_FILE), |
2802 | global_page_state(NR_UNEVICTABLE), | 2802 | global_page_state(NR_UNEVICTABLE), |
2803 | global_page_state(NR_FILE_DIRTY), | 2803 | global_page_state(NR_FILE_DIRTY), |
2804 | global_page_state(NR_WRITEBACK), | 2804 | global_page_state(NR_WRITEBACK), |
2805 | global_page_state(NR_UNSTABLE_NFS), | 2805 | global_page_state(NR_UNSTABLE_NFS), |
2806 | global_page_state(NR_FREE_PAGES), | 2806 | global_page_state(NR_FREE_PAGES), |
2807 | global_page_state(NR_SLAB_RECLAIMABLE), | 2807 | global_page_state(NR_SLAB_RECLAIMABLE), |
2808 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 2808 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
2809 | global_page_state(NR_FILE_MAPPED), | 2809 | global_page_state(NR_FILE_MAPPED), |
2810 | global_page_state(NR_SHMEM), | 2810 | global_page_state(NR_SHMEM), |
2811 | global_page_state(NR_PAGETABLE), | 2811 | global_page_state(NR_PAGETABLE), |
2812 | global_page_state(NR_BOUNCE)); | 2812 | global_page_state(NR_BOUNCE)); |
2813 | 2813 | ||
2814 | for_each_populated_zone(zone) { | 2814 | for_each_populated_zone(zone) { |
2815 | int i; | 2815 | int i; |
2816 | 2816 | ||
2817 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 2817 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2818 | continue; | 2818 | continue; |
2819 | show_node(zone); | 2819 | show_node(zone); |
2820 | printk("%s" | 2820 | printk("%s" |
2821 | " free:%lukB" | 2821 | " free:%lukB" |
2822 | " min:%lukB" | 2822 | " min:%lukB" |
2823 | " low:%lukB" | 2823 | " low:%lukB" |
2824 | " high:%lukB" | 2824 | " high:%lukB" |
2825 | " active_anon:%lukB" | 2825 | " active_anon:%lukB" |
2826 | " inactive_anon:%lukB" | 2826 | " inactive_anon:%lukB" |
2827 | " active_file:%lukB" | 2827 | " active_file:%lukB" |
2828 | " inactive_file:%lukB" | 2828 | " inactive_file:%lukB" |
2829 | " unevictable:%lukB" | 2829 | " unevictable:%lukB" |
2830 | " isolated(anon):%lukB" | 2830 | " isolated(anon):%lukB" |
2831 | " isolated(file):%lukB" | 2831 | " isolated(file):%lukB" |
2832 | " present:%lukB" | 2832 | " present:%lukB" |
2833 | " mlocked:%lukB" | 2833 | " mlocked:%lukB" |
2834 | " dirty:%lukB" | 2834 | " dirty:%lukB" |
2835 | " writeback:%lukB" | 2835 | " writeback:%lukB" |
2836 | " mapped:%lukB" | 2836 | " mapped:%lukB" |
2837 | " shmem:%lukB" | 2837 | " shmem:%lukB" |
2838 | " slab_reclaimable:%lukB" | 2838 | " slab_reclaimable:%lukB" |
2839 | " slab_unreclaimable:%lukB" | 2839 | " slab_unreclaimable:%lukB" |
2840 | " kernel_stack:%lukB" | 2840 | " kernel_stack:%lukB" |
2841 | " pagetables:%lukB" | 2841 | " pagetables:%lukB" |
2842 | " unstable:%lukB" | 2842 | " unstable:%lukB" |
2843 | " bounce:%lukB" | 2843 | " bounce:%lukB" |
2844 | " writeback_tmp:%lukB" | 2844 | " writeback_tmp:%lukB" |
2845 | " pages_scanned:%lu" | 2845 | " pages_scanned:%lu" |
2846 | " all_unreclaimable? %s" | 2846 | " all_unreclaimable? %s" |
2847 | "\n", | 2847 | "\n", |
2848 | zone->name, | 2848 | zone->name, |
2849 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2849 | K(zone_page_state(zone, NR_FREE_PAGES)), |
2850 | K(min_wmark_pages(zone)), | 2850 | K(min_wmark_pages(zone)), |
2851 | K(low_wmark_pages(zone)), | 2851 | K(low_wmark_pages(zone)), |
2852 | K(high_wmark_pages(zone)), | 2852 | K(high_wmark_pages(zone)), |
2853 | K(zone_page_state(zone, NR_ACTIVE_ANON)), | 2853 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
2854 | K(zone_page_state(zone, NR_INACTIVE_ANON)), | 2854 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
2855 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 2855 | K(zone_page_state(zone, NR_ACTIVE_FILE)), |
2856 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 2856 | K(zone_page_state(zone, NR_INACTIVE_FILE)), |
2857 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 2857 | K(zone_page_state(zone, NR_UNEVICTABLE)), |
2858 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | 2858 | K(zone_page_state(zone, NR_ISOLATED_ANON)), |
2859 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | 2859 | K(zone_page_state(zone, NR_ISOLATED_FILE)), |
2860 | K(zone->present_pages), | 2860 | K(zone->present_pages), |
2861 | K(zone_page_state(zone, NR_MLOCK)), | 2861 | K(zone_page_state(zone, NR_MLOCK)), |
2862 | K(zone_page_state(zone, NR_FILE_DIRTY)), | 2862 | K(zone_page_state(zone, NR_FILE_DIRTY)), |
2863 | K(zone_page_state(zone, NR_WRITEBACK)), | 2863 | K(zone_page_state(zone, NR_WRITEBACK)), |
2864 | K(zone_page_state(zone, NR_FILE_MAPPED)), | 2864 | K(zone_page_state(zone, NR_FILE_MAPPED)), |
2865 | K(zone_page_state(zone, NR_SHMEM)), | 2865 | K(zone_page_state(zone, NR_SHMEM)), |
2866 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), | 2866 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), |
2867 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), | 2867 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), |
2868 | zone_page_state(zone, NR_KERNEL_STACK) * | 2868 | zone_page_state(zone, NR_KERNEL_STACK) * |
2869 | THREAD_SIZE / 1024, | 2869 | THREAD_SIZE / 1024, |
2870 | K(zone_page_state(zone, NR_PAGETABLE)), | 2870 | K(zone_page_state(zone, NR_PAGETABLE)), |
2871 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | 2871 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), |
2872 | K(zone_page_state(zone, NR_BOUNCE)), | 2872 | K(zone_page_state(zone, NR_BOUNCE)), |
2873 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2873 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2874 | zone->pages_scanned, | 2874 | zone->pages_scanned, |
2875 | (zone->all_unreclaimable ? "yes" : "no") | 2875 | (zone->all_unreclaimable ? "yes" : "no") |
2876 | ); | 2876 | ); |
2877 | printk("lowmem_reserve[]:"); | 2877 | printk("lowmem_reserve[]:"); |
2878 | for (i = 0; i < MAX_NR_ZONES; i++) | 2878 | for (i = 0; i < MAX_NR_ZONES; i++) |
2879 | printk(" %lu", zone->lowmem_reserve[i]); | 2879 | printk(" %lu", zone->lowmem_reserve[i]); |
2880 | printk("\n"); | 2880 | printk("\n"); |
2881 | } | 2881 | } |
2882 | 2882 | ||
2883 | for_each_populated_zone(zone) { | 2883 | for_each_populated_zone(zone) { |
2884 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 2884 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
2885 | 2885 | ||
2886 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 2886 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
2887 | continue; | 2887 | continue; |
2888 | show_node(zone); | 2888 | show_node(zone); |
2889 | printk("%s: ", zone->name); | 2889 | printk("%s: ", zone->name); |
2890 | 2890 | ||
2891 | spin_lock_irqsave(&zone->lock, flags); | 2891 | spin_lock_irqsave(&zone->lock, flags); |
2892 | for (order = 0; order < MAX_ORDER; order++) { | 2892 | for (order = 0; order < MAX_ORDER; order++) { |
2893 | nr[order] = zone->free_area[order].nr_free; | 2893 | nr[order] = zone->free_area[order].nr_free; |
2894 | total += nr[order] << order; | 2894 | total += nr[order] << order; |
2895 | } | 2895 | } |
2896 | spin_unlock_irqrestore(&zone->lock, flags); | 2896 | spin_unlock_irqrestore(&zone->lock, flags); |
2897 | for (order = 0; order < MAX_ORDER; order++) | 2897 | for (order = 0; order < MAX_ORDER; order++) |
2898 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | 2898 | printk("%lu*%lukB ", nr[order], K(1UL) << order); |
2899 | printk("= %lukB\n", K(total)); | 2899 | printk("= %lukB\n", K(total)); |
2900 | } | 2900 | } |
2901 | 2901 | ||
2902 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); | 2902 | printk("%ld total pagecache pages\n", global_page_state(NR_FILE_PAGES)); |
2903 | 2903 | ||
2904 | show_swap_cache_info(); | 2904 | show_swap_cache_info(); |
2905 | } | 2905 | } |
2906 | 2906 | ||
2907 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) | 2907 | static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) |
2908 | { | 2908 | { |
2909 | zoneref->zone = zone; | 2909 | zoneref->zone = zone; |
2910 | zoneref->zone_idx = zone_idx(zone); | 2910 | zoneref->zone_idx = zone_idx(zone); |
2911 | } | 2911 | } |
2912 | 2912 | ||
2913 | /* | 2913 | /* |
2914 | * Builds allocation fallback zone lists. | 2914 | * Builds allocation fallback zone lists. |
2915 | * | 2915 | * |
2916 | * Add all populated zones of a node to the zonelist. | 2916 | * Add all populated zones of a node to the zonelist. |
2917 | */ | 2917 | */ |
2918 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, | 2918 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, |
2919 | int nr_zones, enum zone_type zone_type) | 2919 | int nr_zones, enum zone_type zone_type) |
2920 | { | 2920 | { |
2921 | struct zone *zone; | 2921 | struct zone *zone; |
2922 | 2922 | ||
2923 | BUG_ON(zone_type >= MAX_NR_ZONES); | 2923 | BUG_ON(zone_type >= MAX_NR_ZONES); |
2924 | zone_type++; | 2924 | zone_type++; |
2925 | 2925 | ||
2926 | do { | 2926 | do { |
2927 | zone_type--; | 2927 | zone_type--; |
2928 | zone = pgdat->node_zones + zone_type; | 2928 | zone = pgdat->node_zones + zone_type; |
2929 | if (populated_zone(zone)) { | 2929 | if (populated_zone(zone)) { |
2930 | zoneref_set_zone(zone, | 2930 | zoneref_set_zone(zone, |
2931 | &zonelist->_zonerefs[nr_zones++]); | 2931 | &zonelist->_zonerefs[nr_zones++]); |
2932 | check_highest_zone(zone_type); | 2932 | check_highest_zone(zone_type); |
2933 | } | 2933 | } |
2934 | 2934 | ||
2935 | } while (zone_type); | 2935 | } while (zone_type); |
2936 | return nr_zones; | 2936 | return nr_zones; |
2937 | } | 2937 | } |
2938 | 2938 | ||
2939 | 2939 | ||
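To make the reversed walk in build_zonelists_node() concrete: called with zone_type = MAX_NR_ZONES - 1 on a node whose DMA, Normal and HighMem zones are all populated, the loop appends zonerefs in the order HighMem, Normal, DMA, so allocations fall back from the highest usable zone down toward ZONE_DMA.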
2940 | /* | 2940 | /* |
2941 | * zonelist_order: | 2941 | * zonelist_order: |
2942 | * 0 = automatic detection of better ordering. | 2942 | * 0 = automatic detection of better ordering. |
2943 | * 1 = order by ([node] distance, -zonetype) | 2943 | * 1 = order by ([node] distance, -zonetype) |
2944 | * 2 = order by (-zonetype, [node] distance) | 2944 | * 2 = order by (-zonetype, [node] distance) |
2945 | * | 2945 | * |
2946 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create | 2946 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create |
2947 | * the same zonelist. So only NUMA can configure this param. | 2947 | * the same zonelist. So only NUMA can configure this param. |
2948 | */ | 2948 | */ |
2949 | #define ZONELIST_ORDER_DEFAULT 0 | 2949 | #define ZONELIST_ORDER_DEFAULT 0 |
2950 | #define ZONELIST_ORDER_NODE 1 | 2950 | #define ZONELIST_ORDER_NODE 1 |
2951 | #define ZONELIST_ORDER_ZONE 2 | 2951 | #define ZONELIST_ORDER_ZONE 2 |
2952 | 2952 | ||
2953 | /* zonelist order in the kernel. | 2953 | /* zonelist order in the kernel. |
2954 | * set_zonelist_order() will set this to NODE or ZONE. | 2954 | * set_zonelist_order() will set this to NODE or ZONE. |
2955 | */ | 2955 | */ |
2956 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; | 2956 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; |
2957 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; | 2957 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; |
2958 | 2958 | ||
2959 | 2959 | ||
2960 | #ifdef CONFIG_NUMA | 2960 | #ifdef CONFIG_NUMA |
2961 | /* The value the user specified, possibly changed by config */ | 2961 | /* The value the user specified, possibly changed by config */ |
2962 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; | 2962 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; |
2963 | /* string for sysctl */ | 2963 | /* string for sysctl */ |
2964 | #define NUMA_ZONELIST_ORDER_LEN 16 | 2964 | #define NUMA_ZONELIST_ORDER_LEN 16 |
2965 | char numa_zonelist_order[16] = "default"; | 2965 | char numa_zonelist_order[16] = "default"; |
2966 | 2966 | ||
2967 | /* | 2967 | /* |
2968 | * interface for configuring zonelist ordering. | 2968 | * interface for configuring zonelist ordering. |
2969 | * command line option "numa_zonelist_order" | 2969 | * command line option "numa_zonelist_order" |
2970 | * = "[dD]efault - default, automatic configuration. | 2970 | * = "[dD]efault - default, automatic configuration. |
2971 | * = "[nN]ode - order by node locality, then by zone within node | 2971 | * = "[nN]ode - order by node locality, then by zone within node |
2972 | * = "[zZ]one - order by zone, then by locality within zone | 2972 | * = "[zZ]one - order by zone, then by locality within zone |
2973 | */ | 2973 | */ |
2974 | 2974 | ||
2975 | static int __parse_numa_zonelist_order(char *s) | 2975 | static int __parse_numa_zonelist_order(char *s) |
2976 | { | 2976 | { |
2977 | if (*s == 'd' || *s == 'D') { | 2977 | if (*s == 'd' || *s == 'D') { |
2978 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; | 2978 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; |
2979 | } else if (*s == 'n' || *s == 'N') { | 2979 | } else if (*s == 'n' || *s == 'N') { |
2980 | user_zonelist_order = ZONELIST_ORDER_NODE; | 2980 | user_zonelist_order = ZONELIST_ORDER_NODE; |
2981 | } else if (*s == 'z' || *s == 'Z') { | 2981 | } else if (*s == 'z' || *s == 'Z') { |
2982 | user_zonelist_order = ZONELIST_ORDER_ZONE; | 2982 | user_zonelist_order = ZONELIST_ORDER_ZONE; |
2983 | } else { | 2983 | } else { |
2984 | printk(KERN_WARNING | 2984 | printk(KERN_WARNING |
2985 | "Ignoring invalid numa_zonelist_order value: " | 2985 | "Ignoring invalid numa_zonelist_order value: " |
2986 | "%s\n", s); | 2986 | "%s\n", s); |
2987 | return -EINVAL; | 2987 | return -EINVAL; |
2988 | } | 2988 | } |
2989 | return 0; | 2989 | return 0; |
2990 | } | 2990 | } |
2991 | 2991 | ||
2992 | static __init int setup_numa_zonelist_order(char *s) | 2992 | static __init int setup_numa_zonelist_order(char *s) |
2993 | { | 2993 | { |
2994 | int ret; | 2994 | int ret; |
2995 | 2995 | ||
2996 | if (!s) | 2996 | if (!s) |
2997 | return 0; | 2997 | return 0; |
2998 | 2998 | ||
2999 | ret = __parse_numa_zonelist_order(s); | 2999 | ret = __parse_numa_zonelist_order(s); |
3000 | if (ret == 0) | 3000 | if (ret == 0) |
3001 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); | 3001 | strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); |
3002 | 3002 | ||
3003 | return ret; | 3003 | return ret; |
3004 | } | 3004 | } |
3005 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | 3005 | early_param("numa_zonelist_order", setup_numa_zonelist_order); |
3006 | 3006 | ||
3007 | /* | 3007 | /* |
3008 | * sysctl handler for numa_zonelist_order | 3008 | * sysctl handler for numa_zonelist_order |
3009 | */ | 3009 | */ |
3010 | int numa_zonelist_order_handler(ctl_table *table, int write, | 3010 | int numa_zonelist_order_handler(ctl_table *table, int write, |
3011 | void __user *buffer, size_t *length, | 3011 | void __user *buffer, size_t *length, |
3012 | loff_t *ppos) | 3012 | loff_t *ppos) |
3013 | { | 3013 | { |
3014 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 3014 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
3015 | int ret; | 3015 | int ret; |
3016 | static DEFINE_MUTEX(zl_order_mutex); | 3016 | static DEFINE_MUTEX(zl_order_mutex); |
3017 | 3017 | ||
3018 | mutex_lock(&zl_order_mutex); | 3018 | mutex_lock(&zl_order_mutex); |
3019 | if (write) | 3019 | if (write) |
3020 | strcpy(saved_string, (char*)table->data); | 3020 | strcpy(saved_string, (char*)table->data); |
3021 | ret = proc_dostring(table, write, buffer, length, ppos); | 3021 | ret = proc_dostring(table, write, buffer, length, ppos); |
3022 | if (ret) | 3022 | if (ret) |
3023 | goto out; | 3023 | goto out; |
3024 | if (write) { | 3024 | if (write) { |
3025 | int oldval = user_zonelist_order; | 3025 | int oldval = user_zonelist_order; |
3026 | if (__parse_numa_zonelist_order((char*)table->data)) { | 3026 | if (__parse_numa_zonelist_order((char*)table->data)) { |
3027 | /* | 3027 | /* |
3028 | * bogus value. restore saved string | 3028 | * bogus value. restore saved string |
3029 | */ | 3029 | */ |
3030 | strncpy((char*)table->data, saved_string, | 3030 | strncpy((char*)table->data, saved_string, |
3031 | NUMA_ZONELIST_ORDER_LEN); | 3031 | NUMA_ZONELIST_ORDER_LEN); |
3032 | user_zonelist_order = oldval; | 3032 | user_zonelist_order = oldval; |
3033 | } else if (oldval != user_zonelist_order) { | 3033 | } else if (oldval != user_zonelist_order) { |
3034 | mutex_lock(&zonelists_mutex); | 3034 | mutex_lock(&zonelists_mutex); |
3035 | build_all_zonelists(NULL); | 3035 | build_all_zonelists(NULL); |
3036 | mutex_unlock(&zonelists_mutex); | 3036 | mutex_unlock(&zonelists_mutex); |
3037 | } | 3037 | } |
3038 | } | 3038 | } |
3039 | out: | 3039 | out: |
3040 | mutex_unlock(&zl_order_mutex); | 3040 | mutex_unlock(&zl_order_mutex); |
3041 | return ret; | 3041 | return ret; |
3042 | } | 3042 | } |
3043 | 3043 | ||
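Taken together with the early_param() above, the knob is presumably driven either at boot (numa_zonelist_order=zone on the kernel command line) or at runtime through the sysctl this handler backs (e.g. writing "Node" to /proc/sys/vm/numa_zonelist_order); the handler only rebuilds the zonelists when the parsed value actually differs from the current one.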
3044 | 3044 | ||
3045 | #define MAX_NODE_LOAD (nr_online_nodes) | 3045 | #define MAX_NODE_LOAD (nr_online_nodes) |
3046 | static int node_load[MAX_NUMNODES]; | 3046 | static int node_load[MAX_NUMNODES]; |
3047 | 3047 | ||
3048 | /** | 3048 | /** |
3049 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 3049 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
3050 | * @node: node whose fallback list we're appending | 3050 | * @node: node whose fallback list we're appending |
3051 | * @used_node_mask: nodemask_t of already used nodes | 3051 | * @used_node_mask: nodemask_t of already used nodes |
3052 | * | 3052 | * |
3053 | * We use a number of factors to determine which is the next node that should | 3053 | * We use a number of factors to determine which is the next node that should |
3054 | * appear on a given node's fallback list. The node should not have appeared | 3054 | * appear on a given node's fallback list. The node should not have appeared |
3055 | * already in @node's fallback list, and it should be the next closest node | 3055 | * already in @node's fallback list, and it should be the next closest node |
3056 | * according to the distance array (which contains arbitrary distance values | 3056 | * according to the distance array (which contains arbitrary distance values |
3057 | * from each node to each node in the system), and should also prefer nodes | 3057 | * from each node to each node in the system), and should also prefer nodes |
3058 | * with no CPUs, since presumably they'll have very little allocation pressure | 3058 | * with no CPUs, since presumably they'll have very little allocation pressure |
3059 | * on them otherwise. | 3059 | * on them otherwise. |
3060 | * It returns -1 if no node is found. | 3060 | * It returns -1 if no node is found. |
3061 | */ | 3061 | */ |
3062 | static int find_next_best_node(int node, nodemask_t *used_node_mask) | 3062 | static int find_next_best_node(int node, nodemask_t *used_node_mask) |
3063 | { | 3063 | { |
3064 | int n, val; | 3064 | int n, val; |
3065 | int min_val = INT_MAX; | 3065 | int min_val = INT_MAX; |
3066 | int best_node = -1; | 3066 | int best_node = -1; |
3067 | const struct cpumask *tmp = cpumask_of_node(0); | 3067 | const struct cpumask *tmp = cpumask_of_node(0); |
3068 | 3068 | ||
3069 | /* Use the local node if we haven't already */ | 3069 | /* Use the local node if we haven't already */ |
3070 | if (!node_isset(node, *used_node_mask)) { | 3070 | if (!node_isset(node, *used_node_mask)) { |
3071 | node_set(node, *used_node_mask); | 3071 | node_set(node, *used_node_mask); |
3072 | return node; | 3072 | return node; |
3073 | } | 3073 | } |
3074 | 3074 | ||
3075 | for_each_node_state(n, N_HIGH_MEMORY) { | 3075 | for_each_node_state(n, N_HIGH_MEMORY) { |
3076 | 3076 | ||
3077 | /* Don't want a node to appear more than once */ | 3077 | /* Don't want a node to appear more than once */ |
3078 | if (node_isset(n, *used_node_mask)) | 3078 | if (node_isset(n, *used_node_mask)) |
3079 | continue; | 3079 | continue; |
3080 | 3080 | ||
3081 | /* Use the distance array to find the distance */ | 3081 | /* Use the distance array to find the distance */ |
3082 | val = node_distance(node, n); | 3082 | val = node_distance(node, n); |
3083 | 3083 | ||
3084 | /* Penalize nodes under us ("prefer the next node") */ | 3084 | /* Penalize nodes under us ("prefer the next node") */ |
3085 | val += (n < node); | 3085 | val += (n < node); |
3086 | 3086 | ||
3087 | /* Give preference to headless and unused nodes */ | 3087 | /* Give preference to headless and unused nodes */ |
3088 | tmp = cpumask_of_node(n); | 3088 | tmp = cpumask_of_node(n); |
3089 | if (!cpumask_empty(tmp)) | 3089 | if (!cpumask_empty(tmp)) |
3090 | val += PENALTY_FOR_NODE_WITH_CPUS; | 3090 | val += PENALTY_FOR_NODE_WITH_CPUS; |
3091 | 3091 | ||
3092 | /* Slight preference for less loaded node */ | 3092 | /* Slight preference for less loaded node */ |
3093 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); | 3093 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); |
3094 | val += node_load[n]; | 3094 | val += node_load[n]; |
3095 | 3095 | ||
3096 | if (val < min_val) { | 3096 | if (val < min_val) { |
3097 | min_val = val; | 3097 | min_val = val; |
3098 | best_node = n; | 3098 | best_node = n; |
3099 | } | 3099 | } |
3100 | } | 3100 | } |
3101 | 3101 | ||
3102 | if (best_node >= 0) | 3102 | if (best_node >= 0) |
3103 | node_set(best_node, *used_node_mask); | 3103 | node_set(best_node, *used_node_mask); |
3104 | 3104 | ||
3105 | return best_node; | 3105 | return best_node; |
3106 | } | 3106 | } |
3107 | 3107 | ||
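To make the scoring above easier to follow, here is a minimal user-space sketch of the same selection loop. The distance table, CPU map and all ex_* names are made-up illustrative values, not kernel data, and the "use the local node first" shortcut is modelled by pre-marking node 0 as used.

/*
 * Hedged sketch: same scoring as find_next_best_node() above, with a
 * hypothetical 4-node distance table and CPU map (not kernel data).
 */
#include <limits.h>
#include <stdio.h>

#define EX_NR_NODES			4
#define EX_MAX_NODE_LOAD		EX_NR_NODES
#define EX_PENALTY_FOR_NODE_WITH_CPUS	1

static const int ex_distance[EX_NR_NODES][EX_NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};
static const int ex_has_cpus[EX_NR_NODES] = { 1, 1, 0, 0 };
static int ex_node_load[EX_NR_NODES];

static int ex_find_next_best_node(int node, unsigned int *used_mask)
{
	int n, val, min_val = INT_MAX, best_node = -1;

	for (n = 0; n < EX_NR_NODES; n++) {
		if (*used_mask & (1u << n))	/* already in the fallback list */
			continue;
		val = ex_distance[node][n];
		val += (n < node);		/* prefer the next node */
		if (ex_has_cpus[n])		/* prefer headless nodes */
			val += EX_PENALTY_FOR_NODE_WITH_CPUS;
		val *= (EX_MAX_NODE_LOAD * EX_NR_NODES);
		val += ex_node_load[n];		/* prefer less loaded nodes */
		if (val < min_val) {
			min_val = val;
			best_node = n;
		}
	}
	if (best_node >= 0)
		*used_mask |= 1u << best_node;
	return best_node;
}

int main(void)
{
	unsigned int used = 1u << 0;	/* local node 0 is always placed first */
	int n;

	/* Prints 2, 1, 3: the headless node 2 wins over node 1 with CPUs. */
	while ((n = ex_find_next_best_node(0, &used)) >= 0)
		printf("next fallback node: %d\n", n);
	return 0;
}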
3108 | 3108 | ||
3109 | /* | 3109 | /* |
3110 | * Build zonelists ordered by node and zones within node. | 3110 | * Build zonelists ordered by node and zones within node. |
3111 | * This results in maximum locality--normal zone overflows into local | 3111 | * This results in maximum locality--normal zone overflows into local |
3112 | * DMA zone, if any--but risks exhausting DMA zone. | 3112 | * DMA zone, if any--but risks exhausting DMA zone. |
3113 | */ | 3113 | */ |
3114 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | 3114 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) |
3115 | { | 3115 | { |
3116 | int j; | 3116 | int j; |
3117 | struct zonelist *zonelist; | 3117 | struct zonelist *zonelist; |
3118 | 3118 | ||
3119 | zonelist = &pgdat->node_zonelists[0]; | 3119 | zonelist = &pgdat->node_zonelists[0]; |
3120 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) | 3120 | for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) |
3121 | ; | 3121 | ; |
3122 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3122 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
3123 | MAX_NR_ZONES - 1); | 3123 | MAX_NR_ZONES - 1); |
3124 | zonelist->_zonerefs[j].zone = NULL; | 3124 | zonelist->_zonerefs[j].zone = NULL; |
3125 | zonelist->_zonerefs[j].zone_idx = 0; | 3125 | zonelist->_zonerefs[j].zone_idx = 0; |
3126 | } | 3126 | } |
3127 | 3127 | ||
3128 | /* | 3128 | /* |
3129 | * Build gfp_thisnode zonelists | 3129 | * Build gfp_thisnode zonelists |
3130 | */ | 3130 | */ |
3131 | static void build_thisnode_zonelists(pg_data_t *pgdat) | 3131 | static void build_thisnode_zonelists(pg_data_t *pgdat) |
3132 | { | 3132 | { |
3133 | int j; | 3133 | int j; |
3134 | struct zonelist *zonelist; | 3134 | struct zonelist *zonelist; |
3135 | 3135 | ||
3136 | zonelist = &pgdat->node_zonelists[1]; | 3136 | zonelist = &pgdat->node_zonelists[1]; |
3137 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | 3137 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); |
3138 | zonelist->_zonerefs[j].zone = NULL; | 3138 | zonelist->_zonerefs[j].zone = NULL; |
3139 | zonelist->_zonerefs[j].zone_idx = 0; | 3139 | zonelist->_zonerefs[j].zone_idx = 0; |
3140 | } | 3140 | } |
3141 | 3141 | ||
3142 | /* | 3142 | /* |
3143 | * Build zonelists ordered by zone and nodes within zones. | 3143 | * Build zonelists ordered by zone and nodes within zones. |
3144 | * This results in conserving DMA zone[s] until all Normal memory is | 3144 | * This results in conserving DMA zone[s] until all Normal memory is |
3145 | * exhausted, but results in overflowing to remote node while memory | 3145 | * exhausted, but results in overflowing to remote node while memory |
3146 | * may still exist in local DMA zone. | 3146 | * may still exist in local DMA zone. |
3147 | */ | 3147 | */ |
3148 | static int node_order[MAX_NUMNODES]; | 3148 | static int node_order[MAX_NUMNODES]; |
3149 | 3149 | ||
3150 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | 3150 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) |
3151 | { | 3151 | { |
3152 | int pos, j, node; | 3152 | int pos, j, node; |
3153 | int zone_type; /* needs to be signed */ | 3153 | int zone_type; /* needs to be signed */ |
3154 | struct zone *z; | 3154 | struct zone *z; |
3155 | struct zonelist *zonelist; | 3155 | struct zonelist *zonelist; |
3156 | 3156 | ||
3157 | zonelist = &pgdat->node_zonelists[0]; | 3157 | zonelist = &pgdat->node_zonelists[0]; |
3158 | pos = 0; | 3158 | pos = 0; |
3159 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { | 3159 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { |
3160 | for (j = 0; j < nr_nodes; j++) { | 3160 | for (j = 0; j < nr_nodes; j++) { |
3161 | node = node_order[j]; | 3161 | node = node_order[j]; |
3162 | z = &NODE_DATA(node)->node_zones[zone_type]; | 3162 | z = &NODE_DATA(node)->node_zones[zone_type]; |
3163 | if (populated_zone(z)) { | 3163 | if (populated_zone(z)) { |
3164 | zoneref_set_zone(z, | 3164 | zoneref_set_zone(z, |
3165 | &zonelist->_zonerefs[pos++]); | 3165 | &zonelist->_zonerefs[pos++]); |
3166 | check_highest_zone(zone_type); | 3166 | check_highest_zone(zone_type); |
3167 | } | 3167 | } |
3168 | } | 3168 | } |
3169 | } | 3169 | } |
3170 | zonelist->_zonerefs[pos].zone = NULL; | 3170 | zonelist->_zonerefs[pos].zone = NULL; |
3171 | zonelist->_zonerefs[pos].zone_idx = 0; | 3171 | zonelist->_zonerefs[pos].zone_idx = 0; |
3172 | } | 3172 | } |
3173 | 3173 | ||
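As a concrete (and purely hypothetical) illustration of the two orderings for a two-node machine where each node has a DMA32 and a Normal zone: node order produces node0/Normal, node0/DMA32, node1/Normal, node1/DMA32, so an overflowing allocation falls back to the local DMA32 zone before going off-node; zone order produces node0/Normal, node1/Normal, node0/DMA32, node1/DMA32, so both Normal zones are exhausted before any DMA32 page is handed out.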
3174 | static int default_zonelist_order(void) | 3174 | static int default_zonelist_order(void) |
3175 | { | 3175 | { |
3176 | int nid, zone_type; | 3176 | int nid, zone_type; |
3177 | unsigned long low_kmem_size,total_size; | 3177 | unsigned long low_kmem_size,total_size; |
3178 | struct zone *z; | 3178 | struct zone *z; |
3179 | int average_size; | 3179 | int average_size; |
3180 | /* | 3180 | /* |
3181 | * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. | 3181 | * ZONE_DMA and ZONE_DMA32 can be very small areas in the system. |
3182 | * If they are really small and used heavily, the system can fall | 3182 | * If they are really small and used heavily, the system can fall |
3183 | * into OOM very easily. | 3183 | * into OOM very easily. |
3184 | * This function detects ZONE_DMA/DMA32 size and configures zone order. | 3184 | * This function detects ZONE_DMA/DMA32 size and configures zone order. |
3185 | */ | 3185 | */ |
3186 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | 3186 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ |
3187 | low_kmem_size = 0; | 3187 | low_kmem_size = 0; |
3188 | total_size = 0; | 3188 | total_size = 0; |
3189 | for_each_online_node(nid) { | 3189 | for_each_online_node(nid) { |
3190 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | 3190 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { |
3191 | z = &NODE_DATA(nid)->node_zones[zone_type]; | 3191 | z = &NODE_DATA(nid)->node_zones[zone_type]; |
3192 | if (populated_zone(z)) { | 3192 | if (populated_zone(z)) { |
3193 | if (zone_type < ZONE_NORMAL) | 3193 | if (zone_type < ZONE_NORMAL) |
3194 | low_kmem_size += z->present_pages; | 3194 | low_kmem_size += z->present_pages; |
3195 | total_size += z->present_pages; | 3195 | total_size += z->present_pages; |
3196 | } else if (zone_type == ZONE_NORMAL) { | 3196 | } else if (zone_type == ZONE_NORMAL) { |
3197 | /* | 3197 | /* |
3198 | * If any node has only lowmem, then node order | 3198 | * If any node has only lowmem, then node order |
3199 | * is preferred to allow kernel allocations | 3199 | * is preferred to allow kernel allocations |
3200 | * locally; otherwise, they can easily infringe | 3200 | * locally; otherwise, they can easily infringe |
3201 | * on other nodes when there is an abundance of | 3201 | * on other nodes when there is an abundance of |
3202 | * lowmem available to allocate from. | 3202 | * lowmem available to allocate from. |
3203 | */ | 3203 | */ |
3204 | return ZONELIST_ORDER_NODE; | 3204 | return ZONELIST_ORDER_NODE; |
3205 | } | 3205 | } |
3206 | } | 3206 | } |
3207 | } | 3207 | } |
3208 | if (!low_kmem_size || /* there is no DMA area. */ | 3208 | if (!low_kmem_size || /* there is no DMA area. */ |
3209 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ | 3209 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ |
3210 | return ZONELIST_ORDER_NODE; | 3210 | return ZONELIST_ORDER_NODE; |
3211 | /* | 3211 | /* |
3212 | * look into each node's config. | 3212 | * look into each node's config. |
3213 | * If there is a node whose DMA/DMA32 memory is very big area on | 3213 | * If there is a node whose DMA/DMA32 memory is very big area on |
3214 | * local memory, NODE_ORDER may be suitable. | 3214 | * local memory, NODE_ORDER may be suitable. |
3215 | */ | 3215 | */ |
3216 | average_size = total_size / | 3216 | average_size = total_size / |
3217 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | 3217 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); |
3218 | for_each_online_node(nid) { | 3218 | for_each_online_node(nid) { |
3219 | low_kmem_size = 0; | 3219 | low_kmem_size = 0; |
3220 | total_size = 0; | 3220 | total_size = 0; |
3221 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | 3221 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { |
3222 | z = &NODE_DATA(nid)->node_zones[zone_type]; | 3222 | z = &NODE_DATA(nid)->node_zones[zone_type]; |
3223 | if (populated_zone(z)) { | 3223 | if (populated_zone(z)) { |
3224 | if (zone_type < ZONE_NORMAL) | 3224 | if (zone_type < ZONE_NORMAL) |
3225 | low_kmem_size += z->present_pages; | 3225 | low_kmem_size += z->present_pages; |
3226 | total_size += z->present_pages; | 3226 | total_size += z->present_pages; |
3227 | } | 3227 | } |
3228 | } | 3228 | } |
3229 | if (low_kmem_size && | 3229 | if (low_kmem_size && |
3230 | total_size > average_size && /* ignore small node */ | 3230 | total_size > average_size && /* ignore small node */ |
3231 | low_kmem_size > total_size * 70/100) | 3231 | low_kmem_size > total_size * 70/100) |
3232 | return ZONELIST_ORDER_NODE; | 3232 | return ZONELIST_ORDER_NODE; |
3233 | } | 3233 | } |
3234 | return ZONELIST_ORDER_ZONE; | 3234 | return ZONELIST_ORDER_ZONE; |
3235 | } | 3235 | } |
3236 | 3236 | ||
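The thresholds above are easiest to see with numbers. The sketch below flattens the system-wide and per-node checks into one helper purely for illustration; the zone sizes (in 4 KB pages) are assumed example values, not taken from any real machine.

/* Hedged sketch of the zonelist-order heuristic above, with made-up sizes. */
#include <stdio.h>

static const char *ex_zonelist_order(unsigned long low_kmem, unsigned long total)
{
	if (!low_kmem || low_kmem > total / 2)	/* no DMA area, or DMA is huge */
		return "NODE";
	if (low_kmem > total * 70 / 100)	/* node is mostly lowmem */
		return "NODE";
	return "ZONE";				/* protect the small DMA zone(s) */
}

int main(void)
{
	/* 512 MB of DMA32 lowmem out of 3.5 GB -> prints ZONE. */
	printf("%s\n", ex_zonelist_order(131072, 917504));
	/* 3 GB of DMA32 lowmem out of 3.5 GB -> prints NODE. */
	printf("%s\n", ex_zonelist_order(786432, 917504));
	return 0;
}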
3237 | static void set_zonelist_order(void) | 3237 | static void set_zonelist_order(void) |
3238 | { | 3238 | { |
3239 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) | 3239 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) |
3240 | current_zonelist_order = default_zonelist_order(); | 3240 | current_zonelist_order = default_zonelist_order(); |
3241 | else | 3241 | else |
3242 | current_zonelist_order = user_zonelist_order; | 3242 | current_zonelist_order = user_zonelist_order; |
3243 | } | 3243 | } |
3244 | 3244 | ||
3245 | static void build_zonelists(pg_data_t *pgdat) | 3245 | static void build_zonelists(pg_data_t *pgdat) |
3246 | { | 3246 | { |
3247 | int j, node, load; | 3247 | int j, node, load; |
3248 | enum zone_type i; | 3248 | enum zone_type i; |
3249 | nodemask_t used_mask; | 3249 | nodemask_t used_mask; |
3250 | int local_node, prev_node; | 3250 | int local_node, prev_node; |
3251 | struct zonelist *zonelist; | 3251 | struct zonelist *zonelist; |
3252 | int order = current_zonelist_order; | 3252 | int order = current_zonelist_order; |
3253 | 3253 | ||
3254 | /* initialize zonelists */ | 3254 | /* initialize zonelists */ |
3255 | for (i = 0; i < MAX_ZONELISTS; i++) { | 3255 | for (i = 0; i < MAX_ZONELISTS; i++) { |
3256 | zonelist = pgdat->node_zonelists + i; | 3256 | zonelist = pgdat->node_zonelists + i; |
3257 | zonelist->_zonerefs[0].zone = NULL; | 3257 | zonelist->_zonerefs[0].zone = NULL; |
3258 | zonelist->_zonerefs[0].zone_idx = 0; | 3258 | zonelist->_zonerefs[0].zone_idx = 0; |
3259 | } | 3259 | } |
3260 | 3260 | ||
3261 | /* NUMA-aware ordering of nodes */ | 3261 | /* NUMA-aware ordering of nodes */ |
3262 | local_node = pgdat->node_id; | 3262 | local_node = pgdat->node_id; |
3263 | load = nr_online_nodes; | 3263 | load = nr_online_nodes; |
3264 | prev_node = local_node; | 3264 | prev_node = local_node; |
3265 | nodes_clear(used_mask); | 3265 | nodes_clear(used_mask); |
3266 | 3266 | ||
3267 | memset(node_order, 0, sizeof(node_order)); | 3267 | memset(node_order, 0, sizeof(node_order)); |
3268 | j = 0; | 3268 | j = 0; |
3269 | 3269 | ||
3270 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 3270 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
3271 | int distance = node_distance(local_node, node); | 3271 | int distance = node_distance(local_node, node); |
3272 | 3272 | ||
3273 | /* | 3273 | /* |
3274 | * If another node is sufficiently far away then it is better | 3274 | * If another node is sufficiently far away then it is better |
3275 | * to reclaim pages in a zone before going off node. | 3275 | * to reclaim pages in a zone before going off node. |
3276 | */ | 3276 | */ |
3277 | if (distance > RECLAIM_DISTANCE) | 3277 | if (distance > RECLAIM_DISTANCE) |
3278 | zone_reclaim_mode = 1; | 3278 | zone_reclaim_mode = 1; |
3279 | 3279 | ||
3280 | /* | 3280 | /* |
3281 | * We don't want to pressure a particular node. | 3281 | * We don't want to pressure a particular node. |
3282 | * So we add a penalty to the first node in the same | 3282 | * So we add a penalty to the first node in the same |
3283 | * distance group to make it round-robin. | 3283 | * distance group to make it round-robin. |
3284 | */ | 3284 | */ |
3285 | if (distance != node_distance(local_node, prev_node)) | 3285 | if (distance != node_distance(local_node, prev_node)) |
3286 | node_load[node] = load; | 3286 | node_load[node] = load; |
3287 | 3287 | ||
3288 | prev_node = node; | 3288 | prev_node = node; |
3289 | load--; | 3289 | load--; |
3290 | if (order == ZONELIST_ORDER_NODE) | 3290 | if (order == ZONELIST_ORDER_NODE) |
3291 | build_zonelists_in_node_order(pgdat, node); | 3291 | build_zonelists_in_node_order(pgdat, node); |
3292 | else | 3292 | else |
3293 | node_order[j++] = node; /* remember order */ | 3293 | node_order[j++] = node; /* remember order */ |
3294 | } | 3294 | } |
3295 | 3295 | ||
3296 | if (order == ZONELIST_ORDER_ZONE) { | 3296 | if (order == ZONELIST_ORDER_ZONE) { |
3297 | /* calculate node order -- i.e., DMA last! */ | 3297 | /* calculate node order -- i.e., DMA last! */ |
3298 | build_zonelists_in_zone_order(pgdat, j); | 3298 | build_zonelists_in_zone_order(pgdat, j); |
3299 | } | 3299 | } |
3300 | 3300 | ||
3301 | build_thisnode_zonelists(pgdat); | 3301 | build_thisnode_zonelists(pgdat); |
3302 | } | 3302 | } |
3303 | 3303 | ||
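Roughly, the penalty above works like this (hypothetical four-node box with all remote distances equal): load starts at nr_online_nodes and counts down as fallback nodes are added, and only the first node of each new distance group gets node_load[node] = load. Because find_next_best_node() adds node_load[n] to its score, the other nodes are later steered away from the node picked first here when they build their own zonelists, so equally distant nodes are consumed round-robin instead of every node falling back to the same neighbour first.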
3304 | /* Construct the zonelist performance cache - see further mmzone.h */ | 3304 | /* Construct the zonelist performance cache - see further mmzone.h */ |
3305 | static void build_zonelist_cache(pg_data_t *pgdat) | 3305 | static void build_zonelist_cache(pg_data_t *pgdat) |
3306 | { | 3306 | { |
3307 | struct zonelist *zonelist; | 3307 | struct zonelist *zonelist; |
3308 | struct zonelist_cache *zlc; | 3308 | struct zonelist_cache *zlc; |
3309 | struct zoneref *z; | 3309 | struct zoneref *z; |
3310 | 3310 | ||
3311 | zonelist = &pgdat->node_zonelists[0]; | 3311 | zonelist = &pgdat->node_zonelists[0]; |
3312 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | 3312 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; |
3313 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 3313 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
3314 | for (z = zonelist->_zonerefs; z->zone; z++) | 3314 | for (z = zonelist->_zonerefs; z->zone; z++) |
3315 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); | 3315 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); |
3316 | } | 3316 | } |
3317 | 3317 | ||
3318 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 3318 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
3319 | /* | 3319 | /* |
3320 | * Return node id of node used for "local" allocations. | 3320 | * Return node id of node used for "local" allocations. |
3321 | * I.e., first node id of first zone in arg node's generic zonelist. | 3321 | * I.e., first node id of first zone in arg node's generic zonelist. |
3322 | * Used for initializing percpu 'numa_mem', which is used primarily | 3322 | * Used for initializing percpu 'numa_mem', which is used primarily |
3323 | * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. | 3323 | * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. |
3324 | */ | 3324 | */ |
3325 | int local_memory_node(int node) | 3325 | int local_memory_node(int node) |
3326 | { | 3326 | { |
3327 | struct zone *zone; | 3327 | struct zone *zone; |
3328 | 3328 | ||
3329 | (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), | 3329 | (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), |
3330 | gfp_zone(GFP_KERNEL), | 3330 | gfp_zone(GFP_KERNEL), |
3331 | NULL, | 3331 | NULL, |
3332 | &zone); | 3332 | &zone); |
3333 | return zone->node; | 3333 | return zone->node; |
3334 | } | 3334 | } |
3335 | #endif | 3335 | #endif |
3336 | 3336 | ||
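For example, on a hypothetical memoryless node whose GFP_KERNEL zonelist starts with a zone owned by its nearest memory-bearing neighbour, local_memory_node() returns that neighbour's id, and that is what the CPUs of the memoryless node get as their numa_mem in the boot code further below.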
3337 | #else /* CONFIG_NUMA */ | 3337 | #else /* CONFIG_NUMA */ |
3338 | 3338 | ||
3339 | static void set_zonelist_order(void) | 3339 | static void set_zonelist_order(void) |
3340 | { | 3340 | { |
3341 | current_zonelist_order = ZONELIST_ORDER_ZONE; | 3341 | current_zonelist_order = ZONELIST_ORDER_ZONE; |
3342 | } | 3342 | } |
3343 | 3343 | ||
3344 | static void build_zonelists(pg_data_t *pgdat) | 3344 | static void build_zonelists(pg_data_t *pgdat) |
3345 | { | 3345 | { |
3346 | int node, local_node; | 3346 | int node, local_node; |
3347 | enum zone_type j; | 3347 | enum zone_type j; |
3348 | struct zonelist *zonelist; | 3348 | struct zonelist *zonelist; |
3349 | 3349 | ||
3350 | local_node = pgdat->node_id; | 3350 | local_node = pgdat->node_id; |
3351 | 3351 | ||
3352 | zonelist = &pgdat->node_zonelists[0]; | 3352 | zonelist = &pgdat->node_zonelists[0]; |
3353 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | 3353 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); |
3354 | 3354 | ||
3355 | /* | 3355 | /* |
3356 | * Now we build the zonelist so that it contains the zones | 3356 | * Now we build the zonelist so that it contains the zones |
3357 | * of all the other nodes. | 3357 | * of all the other nodes. |
3358 | * We don't want to pressure a particular node, so when | 3358 | * We don't want to pressure a particular node, so when |
3359 | * building the zones for node N, we make sure that the | 3359 | * building the zones for node N, we make sure that the |
3360 | * zones coming right after the local ones are those from | 3360 | * zones coming right after the local ones are those from |
3361 | * node N+1 (modulo N) | 3361 | * node N+1 (modulo N) |
3362 | */ | 3362 | */ |
3363 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 3363 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
3364 | if (!node_online(node)) | 3364 | if (!node_online(node)) |
3365 | continue; | 3365 | continue; |
3366 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3366 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
3367 | MAX_NR_ZONES - 1); | 3367 | MAX_NR_ZONES - 1); |
3368 | } | 3368 | } |
3369 | for (node = 0; node < local_node; node++) { | 3369 | for (node = 0; node < local_node; node++) { |
3370 | if (!node_online(node)) | 3370 | if (!node_online(node)) |
3371 | continue; | 3371 | continue; |
3372 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | 3372 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
3373 | MAX_NR_ZONES - 1); | 3373 | MAX_NR_ZONES - 1); |
3374 | } | 3374 | } |
3375 | 3375 | ||
3376 | zonelist->_zonerefs[j].zone = NULL; | 3376 | zonelist->_zonerefs[j].zone = NULL; |
3377 | zonelist->_zonerefs[j].zone_idx = 0; | 3377 | zonelist->_zonerefs[j].zone_idx = 0; |
3378 | } | 3378 | } |
3379 | 3379 | ||
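For instance, on a hypothetical non-NUMA discontiguous setup with four online nodes and local_node = 2, the two loops above append the zones of node 3 first, then nodes 0 and 1, giving the "node N+1 (modulo)" rotation the comment describes: 2, 3, 0, 1.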
3380 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | 3380 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ |
3381 | static void build_zonelist_cache(pg_data_t *pgdat) | 3381 | static void build_zonelist_cache(pg_data_t *pgdat) |
3382 | { | 3382 | { |
3383 | pgdat->node_zonelists[0].zlcache_ptr = NULL; | 3383 | pgdat->node_zonelists[0].zlcache_ptr = NULL; |
3384 | } | 3384 | } |
3385 | 3385 | ||
3386 | #endif /* CONFIG_NUMA */ | 3386 | #endif /* CONFIG_NUMA */ |
3387 | 3387 | ||
3388 | /* | 3388 | /* |
3389 | * Boot pageset table. One per cpu which is going to be used for all | 3389 | * Boot pageset table. One per cpu which is going to be used for all |
3390 | * zones and all nodes. The parameters will be set in such a way | 3390 | * zones and all nodes. The parameters will be set in such a way |
3391 | * that an item put on a list will immediately be handed over to | 3391 | * that an item put on a list will immediately be handed over to |
3392 | * the buddy list. This is safe since pageset manipulation is done | 3392 | * the buddy list. This is safe since pageset manipulation is done |
3393 | * with interrupts disabled. | 3393 | * with interrupts disabled. |
3394 | * | 3394 | * |
3395 | * The boot_pagesets must be kept even after bootup is complete for | 3395 | * The boot_pagesets must be kept even after bootup is complete for |
3396 | * unused processors and/or zones. They do play a role for bootstrapping | 3396 | * unused processors and/or zones. They do play a role for bootstrapping |
3397 | * hotplugged processors. | 3397 | * hotplugged processors. |
3398 | * | 3398 | * |
3399 | * zoneinfo_show() and maybe other functions do | 3399 | * zoneinfo_show() and maybe other functions do |
3400 | * not check if the processor is online before following the pageset pointer. | 3400 | * not check if the processor is online before following the pageset pointer. |
3401 | * Other parts of the kernel may not check if the zone is available. | 3401 | * Other parts of the kernel may not check if the zone is available. |
3402 | */ | 3402 | */ |
3403 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | 3403 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); |
3404 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | 3404 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); |
3405 | static void setup_zone_pageset(struct zone *zone); | 3405 | static void setup_zone_pageset(struct zone *zone); |
3406 | 3406 | ||
3407 | /* | 3407 | /* |
3408 | * Global mutex to protect against size modification of zonelists | 3408 | * Global mutex to protect against size modification of zonelists |
3409 | * as well as to serialize pageset setup for the new populated zone. | 3409 | * as well as to serialize pageset setup for the new populated zone. |
3410 | */ | 3410 | */ |
3411 | DEFINE_MUTEX(zonelists_mutex); | 3411 | DEFINE_MUTEX(zonelists_mutex); |
3412 | 3412 | ||
3413 | /* the return value is an int just for stop_machine() */ | 3413 | /* the return value is an int just for stop_machine() */ |
3414 | static __init_refok int __build_all_zonelists(void *data) | 3414 | static __init_refok int __build_all_zonelists(void *data) |
3415 | { | 3415 | { |
3416 | int nid; | 3416 | int nid; |
3417 | int cpu; | 3417 | int cpu; |
3418 | 3418 | ||
3419 | #ifdef CONFIG_NUMA | 3419 | #ifdef CONFIG_NUMA |
3420 | memset(node_load, 0, sizeof(node_load)); | 3420 | memset(node_load, 0, sizeof(node_load)); |
3421 | #endif | 3421 | #endif |
3422 | for_each_online_node(nid) { | 3422 | for_each_online_node(nid) { |
3423 | pg_data_t *pgdat = NODE_DATA(nid); | 3423 | pg_data_t *pgdat = NODE_DATA(nid); |
3424 | 3424 | ||
3425 | build_zonelists(pgdat); | 3425 | build_zonelists(pgdat); |
3426 | build_zonelist_cache(pgdat); | 3426 | build_zonelist_cache(pgdat); |
3427 | } | 3427 | } |
3428 | 3428 | ||
3429 | /* | 3429 | /* |
3430 | * Initialize the boot_pagesets that are going to be used | 3430 | * Initialize the boot_pagesets that are going to be used |
3431 | * for bootstrapping processors. The real pagesets for | 3431 | * for bootstrapping processors. The real pagesets for |
3432 | * each zone will be allocated later when the per cpu | 3432 | * each zone will be allocated later when the per cpu |
3433 | * allocator is available. | 3433 | * allocator is available. |
3434 | * | 3434 | * |
3435 | * boot_pagesets are used also for bootstrapping offline | 3435 | * boot_pagesets are used also for bootstrapping offline |
3436 | * cpus if the system is already booted because the pagesets | 3436 | * cpus if the system is already booted because the pagesets |
3437 | * are needed to initialize allocators on a specific cpu too. | 3437 | * are needed to initialize allocators on a specific cpu too. |
3438 | * F.e. the percpu allocator needs the page allocator which | 3438 | * F.e. the percpu allocator needs the page allocator which |
3439 | * needs the percpu allocator in order to allocate its pagesets | 3439 | * needs the percpu allocator in order to allocate its pagesets |
3440 | * (a chicken-egg dilemma). | 3440 | * (a chicken-egg dilemma). |
3441 | */ | 3441 | */ |
3442 | for_each_possible_cpu(cpu) { | 3442 | for_each_possible_cpu(cpu) { |
3443 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | 3443 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); |
3444 | 3444 | ||
3445 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | 3445 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
3446 | /* | 3446 | /* |
3447 | * We now know the "local memory node" for each node-- | 3447 | * We now know the "local memory node" for each node-- |
3448 | * i.e., the node of the first zone in the generic zonelist. | 3448 | * i.e., the node of the first zone in the generic zonelist. |
3449 | * Set up numa_mem percpu variable for on-line cpus. During | 3449 | * Set up numa_mem percpu variable for on-line cpus. During |
3450 | * boot, only the boot cpu should be on-line; we'll init the | 3450 | * boot, only the boot cpu should be on-line; we'll init the |
3451 | * secondary cpus' numa_mem as they come on-line. During | 3451 | * secondary cpus' numa_mem as they come on-line. During |
3452 | * node/memory hotplug, we'll fixup all on-line cpus. | 3452 | * node/memory hotplug, we'll fixup all on-line cpus. |
3453 | */ | 3453 | */ |
3454 | if (cpu_online(cpu)) | 3454 | if (cpu_online(cpu)) |
3455 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); | 3455 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); |
3456 | #endif | 3456 | #endif |
3457 | } | 3457 | } |
3458 | 3458 | ||
3459 | return 0; | 3459 | return 0; |
3460 | } | 3460 | } |
3461 | 3461 | ||
3462 | /* | 3462 | /* |
3463 | * Called with zonelists_mutex held always | 3463 | * Called with zonelists_mutex held always |
3464 | * unless system_state == SYSTEM_BOOTING. | 3464 | * unless system_state == SYSTEM_BOOTING. |
3465 | */ | 3465 | */ |
3466 | void __ref build_all_zonelists(void *data) | 3466 | void __ref build_all_zonelists(void *data) |
3467 | { | 3467 | { |
3468 | set_zonelist_order(); | 3468 | set_zonelist_order(); |
3469 | 3469 | ||
3470 | if (system_state == SYSTEM_BOOTING) { | 3470 | if (system_state == SYSTEM_BOOTING) { |
3471 | __build_all_zonelists(NULL); | 3471 | __build_all_zonelists(NULL); |
3472 | mminit_verify_zonelist(); | 3472 | mminit_verify_zonelist(); |
3473 | cpuset_init_current_mems_allowed(); | 3473 | cpuset_init_current_mems_allowed(); |
3474 | } else { | 3474 | } else { |
3475 | /* we have to stop all cpus to guarantee there is no user | 3475 | /* we have to stop all cpus to guarantee there is no user |
3476 | of zonelist */ | 3476 | of zonelist */ |
3477 | #ifdef CONFIG_MEMORY_HOTPLUG | 3477 | #ifdef CONFIG_MEMORY_HOTPLUG |
3478 | if (data) | 3478 | if (data) |
3479 | setup_zone_pageset((struct zone *)data); | 3479 | setup_zone_pageset((struct zone *)data); |
3480 | #endif | 3480 | #endif |
3481 | stop_machine(__build_all_zonelists, NULL, NULL); | 3481 | stop_machine(__build_all_zonelists, NULL, NULL); |
3482 | /* cpuset refresh routine should be here */ | 3482 | /* cpuset refresh routine should be here */ |
3483 | } | 3483 | } |
3484 | vm_total_pages = nr_free_pagecache_pages(); | 3484 | vm_total_pages = nr_free_pagecache_pages(); |
3485 | /* | 3485 | /* |
3486 | * Disable grouping by mobility if the number of pages in the | 3486 | * Disable grouping by mobility if the number of pages in the |
3487 | * system is too low to allow the mechanism to work. It would be | 3487 | * system is too low to allow the mechanism to work. It would be |
3488 | * more accurate, but expensive to check per-zone. This check is | 3488 | * more accurate, but expensive to check per-zone. This check is |
3489 | * made on memory-hotadd so a system can start with mobility | 3489 | * made on memory-hotadd so a system can start with mobility |
3490 | * disabled and enable it later | 3490 | * disabled and enable it later |
3491 | */ | 3491 | */ |
3492 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) | 3492 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) |
3493 | page_group_by_mobility_disabled = 1; | 3493 | page_group_by_mobility_disabled = 1; |
3494 | else | 3494 | else |
3495 | page_group_by_mobility_disabled = 0; | 3495 | page_group_by_mobility_disabled = 0; |
3496 | 3496 | ||
3497 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 3497 | printk("Built %i zonelists in %s order, mobility grouping %s. " |
3498 | "Total pages: %ld\n", | 3498 | "Total pages: %ld\n", |
3499 | nr_online_nodes, | 3499 | nr_online_nodes, |
3500 | zonelist_order_name[current_zonelist_order], | 3500 | zonelist_order_name[current_zonelist_order], |
3501 | page_group_by_mobility_disabled ? "off" : "on", | 3501 | page_group_by_mobility_disabled ? "off" : "on", |
3502 | vm_total_pages); | 3502 | vm_total_pages); |
3503 | #ifdef CONFIG_NUMA | 3503 | #ifdef CONFIG_NUMA |
3504 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 3504 | printk("Policy zone: %s\n", zone_names[policy_zone]); |
3505 | #endif | 3505 | #endif |
3506 | } | 3506 | } |
3507 | 3507 | ||
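As a rough worked example of the mobility check above (assuming 4 KB pages, pageblock_order = 9 so pageblock_nr_pages = 512, and MIGRATE_TYPES = 5 on a non-CMA build): the threshold is 512 * 5 = 2560 pages, about 10 MB, so only very small systems boot with page_group_by_mobility_disabled set.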
3508 | /* | 3508 | /* |
3509 | * Helper functions to size the waitqueue hash table. | 3509 | * Helper functions to size the waitqueue hash table. |
3510 | * Essentially these want to choose hash table sizes sufficiently | 3510 | * Essentially these want to choose hash table sizes sufficiently |
3511 | * large so that collisions trying to wait on pages are rare. | 3511 | * large so that collisions trying to wait on pages are rare. |
3512 | * But in fact, the number of active page waitqueues on typical | 3512 | * But in fact, the number of active page waitqueues on typical |
3513 | * systems is ridiculously low, less than 200. So this is even | 3513 | * systems is ridiculously low, less than 200. So this is even |
3514 | * conservative, even though it seems large. | 3514 | * conservative, even though it seems large. |
3515 | * | 3515 | * |
3516 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to | 3516 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to |
3517 | * waitqueues, i.e. the size of the waitq table given the number of pages. | 3517 | * waitqueues, i.e. the size of the waitq table given the number of pages. |
3518 | */ | 3518 | */ |
3519 | #define PAGES_PER_WAITQUEUE 256 | 3519 | #define PAGES_PER_WAITQUEUE 256 |
3520 | 3520 | ||
3521 | #ifndef CONFIG_MEMORY_HOTPLUG | 3521 | #ifndef CONFIG_MEMORY_HOTPLUG |
3522 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | 3522 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) |
3523 | { | 3523 | { |
3524 | unsigned long size = 1; | 3524 | unsigned long size = 1; |
3525 | 3525 | ||
3526 | pages /= PAGES_PER_WAITQUEUE; | 3526 | pages /= PAGES_PER_WAITQUEUE; |
3527 | 3527 | ||
3528 | while (size < pages) | 3528 | while (size < pages) |
3529 | size <<= 1; | 3529 | size <<= 1; |
3530 | 3530 | ||
3531 | /* | 3531 | /* |
3532 | * Once we have dozens or even hundreds of threads sleeping | 3532 | * Once we have dozens or even hundreds of threads sleeping |
3533 | * on IO we've got bigger problems than wait queue collision. | 3533 | * on IO we've got bigger problems than wait queue collision. |
3534 | * Limit the size of the wait table to a reasonable size. | 3534 | * Limit the size of the wait table to a reasonable size. |
3535 | */ | 3535 | */ |
3536 | size = min(size, 4096UL); | 3536 | size = min(size, 4096UL); |
3537 | 3537 | ||
3538 | return max(size, 4UL); | 3538 | return max(size, 4UL); |
3539 | } | 3539 | } |
3540 | #else | 3540 | #else |
3541 | /* | 3541 | /* |
3542 | * A zone's size might be changed by hot-add, so it is not possible to determine | 3542 | * A zone's size might be changed by hot-add, so it is not possible to determine |
3543 | * a suitable size for its wait_table. So we use the maximum size now. | 3543 | * a suitable size for its wait_table. So we use the maximum size now. |
3544 | * | 3544 | * |
3545 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: | 3545 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: |
3546 | * | 3546 | * |
3547 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. | 3547 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. |
3548 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. | 3548 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. |
3549 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. | 3549 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. |
3550 | * | 3550 | * |
3551 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages | 3551 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages |
3552 | * or more by the traditional way. (See above). It equals: | 3552 | * or more by the traditional way. (See above). It equals: |
3553 | * | 3553 | * |
3554 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. | 3554 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. |
3555 | * ia64(16K page size) : = ( 8G + 4M)byte. | 3555 | * ia64(16K page size) : = ( 8G + 4M)byte. |
3556 | * powerpc (64K page size) : = (32G +16M)byte. | 3556 | * powerpc (64K page size) : = (32G +16M)byte. |
3557 | */ | 3557 | */ |
3558 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | 3558 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) |
3559 | { | 3559 | { |
3560 | return 4096UL; | 3560 | return 4096UL; |
3561 | } | 3561 | } |
3562 | #endif | 3562 | #endif |
3563 | 3563 | ||
3564 | /* | 3564 | /* |
3565 | * This is an integer logarithm so that shifts can be used later | 3565 | * This is an integer logarithm so that shifts can be used later |
3566 | * to extract the more random high bits from the multiplicative | 3566 | * to extract the more random high bits from the multiplicative |
3567 | * hash function before the remainder is taken. | 3567 | * hash function before the remainder is taken. |
3568 | */ | 3568 | */ |
3569 | static inline unsigned long wait_table_bits(unsigned long size) | 3569 | static inline unsigned long wait_table_bits(unsigned long size) |
3570 | { | 3570 | { |
3571 | return ffz(~size); | 3571 | return ffz(~size); |
3572 | } | 3572 | } |
3573 | 3573 | ||
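The sizing and the ffz(~size) logarithm above can be re-computed in user space. The sketch below assumes an illustrative 1 GB zone of 4 KB pages and replaces ffz(~size) with an explicit loop, which is equivalent for the power-of-two sizes produced here.

/* Hedged, user-space re-computation of the waitqueue-table sizing above. */
#include <stdio.h>

#define EX_PAGES_PER_WAITQUEUE 256

static unsigned long ex_wait_table_hash_nr_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= EX_PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;			/* round up to a power of two */
	if (size > 4096UL)
		size = 4096UL;			/* cap the table size */
	return size < 4UL ? 4UL : size;		/* but keep at least 4 entries */
}

static unsigned long ex_wait_table_bits(unsigned long size)
{
	unsigned long bits = 0;

	while (size > 1) {			/* integer log2 of a power of two */
		size >>= 1;
		bits++;
	}
	return bits;
}

int main(void)
{
	unsigned long pages = 262144;		/* assumed 1 GB zone, 4 KB pages */
	unsigned long entries = ex_wait_table_hash_nr_entries(pages);

	/* Prints: 1024 entries, 10 bits */
	printf("%lu entries, %lu bits\n", entries, ex_wait_table_bits(entries));
	return 0;
}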
3574 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 3574 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
3575 | 3575 | ||
3576 | /* | 3576 | /* |
3577 | * Check if a pageblock contains reserved pages | 3577 | * Check if a pageblock contains reserved pages |
3578 | */ | 3578 | */ |
3579 | static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) | 3579 | static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) |
3580 | { | 3580 | { |
3581 | unsigned long pfn; | 3581 | unsigned long pfn; |
3582 | 3582 | ||
3583 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 3583 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
3584 | if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) | 3584 | if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) |
3585 | return 1; | 3585 | return 1; |
3586 | } | 3586 | } |
3587 | return 0; | 3587 | return 0; |
3588 | } | 3588 | } |
3589 | 3589 | ||
3590 | /* | 3590 | /* |
3591 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 3591 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
3592 | * of blocks reserved is based on min_wmark_pages(zone). The memory within | 3592 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
3593 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes | 3593 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
3594 | * higher will lead to a bigger reserve which will get freed as contiguous | 3594 | * higher will lead to a bigger reserve which will get freed as contiguous |
3595 | * blocks as reclaim kicks in | 3595 | * blocks as reclaim kicks in |
3596 | */ | 3596 | */ |
3597 | static void setup_zone_migrate_reserve(struct zone *zone) | 3597 | static void setup_zone_migrate_reserve(struct zone *zone) |
3598 | { | 3598 | { |
3599 | unsigned long start_pfn, pfn, end_pfn, block_end_pfn; | 3599 | unsigned long start_pfn, pfn, end_pfn, block_end_pfn; |
3600 | struct page *page; | 3600 | struct page *page; |
3601 | unsigned long block_migratetype; | 3601 | unsigned long block_migratetype; |
3602 | int reserve; | 3602 | int reserve; |
3603 | 3603 | ||
3604 | /* | 3604 | /* |
3605 | * Get the start pfn, end pfn and the number of blocks to reserve | 3605 | * Get the start pfn, end pfn and the number of blocks to reserve |
3606 | * We have to be careful to be aligned to pageblock_nr_pages to | 3606 | * We have to be careful to be aligned to pageblock_nr_pages to |
3607 | * make sure that we always check pfn_valid for the first page in | 3607 | * make sure that we always check pfn_valid for the first page in |
3608 | * the block. | 3608 | * the block. |
3609 | */ | 3609 | */ |
3610 | start_pfn = zone->zone_start_pfn; | 3610 | start_pfn = zone->zone_start_pfn; |
3611 | end_pfn = start_pfn + zone->spanned_pages; | 3611 | end_pfn = start_pfn + zone->spanned_pages; |
3612 | start_pfn = roundup(start_pfn, pageblock_nr_pages); | 3612 | start_pfn = roundup(start_pfn, pageblock_nr_pages); |
3613 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | 3613 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
3614 | pageblock_order; | 3614 | pageblock_order; |
3615 | 3615 | ||
3616 | /* | 3616 | /* |
3617 | * Reserve blocks are generally in place to help high-order atomic | 3617 | * Reserve blocks are generally in place to help high-order atomic |
3618 | * allocations that are short-lived. A min_free_kbytes value that | 3618 | * allocations that are short-lived. A min_free_kbytes value that |
3619 | * would result in more than 2 reserve blocks for atomic allocations | 3619 | * would result in more than 2 reserve blocks for atomic allocations |
3620 | * is assumed to be in place to help anti-fragmentation for the | 3620 | * is assumed to be in place to help anti-fragmentation for the |
3621 | * future allocation of hugepages at runtime. | 3621 | * future allocation of hugepages at runtime. |
3622 | */ | 3622 | */ |
3623 | reserve = min(2, reserve); | 3623 | reserve = min(2, reserve); |
3624 | 3624 | ||
3625 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 3625 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
3626 | if (!pfn_valid(pfn)) | 3626 | if (!pfn_valid(pfn)) |
3627 | continue; | 3627 | continue; |
3628 | page = pfn_to_page(pfn); | 3628 | page = pfn_to_page(pfn); |
3629 | 3629 | ||
3630 | /* Watch out for overlapping nodes */ | 3630 | /* Watch out for overlapping nodes */ |
3631 | if (page_to_nid(page) != zone_to_nid(zone)) | 3631 | if (page_to_nid(page) != zone_to_nid(zone)) |
3632 | continue; | 3632 | continue; |
3633 | 3633 | ||
3634 | block_migratetype = get_pageblock_migratetype(page); | 3634 | block_migratetype = get_pageblock_migratetype(page); |
3635 | 3635 | ||
3636 | /* Only test what is necessary when the reserves are not met */ | 3636 | /* Only test what is necessary when the reserves are not met */ |
3637 | if (reserve > 0) { | 3637 | if (reserve > 0) { |
3638 | /* | 3638 | /* |
3639 | * Blocks with reserved pages will never free, skip | 3639 | * Blocks with reserved pages will never free, skip |
3640 | * them. | 3640 | * them. |
3641 | */ | 3641 | */ |
3642 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | 3642 | block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); |
3643 | if (pageblock_is_reserved(pfn, block_end_pfn)) | 3643 | if (pageblock_is_reserved(pfn, block_end_pfn)) |
3644 | continue; | 3644 | continue; |
3645 | 3645 | ||
3646 | /* If this block is reserved, account for it */ | 3646 | /* If this block is reserved, account for it */ |
3647 | if (block_migratetype == MIGRATE_RESERVE) { | 3647 | if (block_migratetype == MIGRATE_RESERVE) { |
3648 | reserve--; | 3648 | reserve--; |
3649 | continue; | 3649 | continue; |
3650 | } | 3650 | } |
3651 | 3651 | ||
3652 | /* Suitable for reserving if this block is movable */ | 3652 | /* Suitable for reserving if this block is movable */ |
3653 | if (block_migratetype == MIGRATE_MOVABLE) { | 3653 | if (block_migratetype == MIGRATE_MOVABLE) { |
3654 | set_pageblock_migratetype(page, | 3654 | set_pageblock_migratetype(page, |
3655 | MIGRATE_RESERVE); | 3655 | MIGRATE_RESERVE); |
3656 | move_freepages_block(zone, page, | 3656 | move_freepages_block(zone, page, |
3657 | MIGRATE_RESERVE); | 3657 | MIGRATE_RESERVE); |
3658 | reserve--; | 3658 | reserve--; |
3659 | continue; | 3659 | continue; |
3660 | } | 3660 | } |
3661 | } | 3661 | } |
3662 | 3662 | ||
3663 | /* | 3663 | /* |
3664 | * If the reserve is met and this is a previous reserved block, | 3664 | * If the reserve is met and this is a previous reserved block, |
3665 | * take it back | 3665 | * take it back |
3666 | */ | 3666 | */ |
3667 | if (block_migratetype == MIGRATE_RESERVE) { | 3667 | if (block_migratetype == MIGRATE_RESERVE) { |
3668 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 3668 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
3669 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | 3669 | move_freepages_block(zone, page, MIGRATE_MOVABLE); |
3670 | } | 3670 | } |
3671 | } | 3671 | } |
3672 | } | 3672 | } |
3673 | 3673 | ||
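Plugging assumed numbers into the reserve calculation above (pageblock_order = 9, i.e. 512 pages per block with 4 KB pages, and a 1000-page min watermark, both purely illustrative):

/* Hedged arithmetic sketch of the MIGRATE_RESERVE sizing above. */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_order = 9;
	unsigned long pageblock_nr_pages = 1UL << pageblock_order;
	unsigned long min_wmark_pages = 1000;	/* assumed example watermark */
	unsigned long reserve;

	/* roundup() to a whole pageblock, then convert pages -> blocks. */
	reserve = ((min_wmark_pages + pageblock_nr_pages - 1) /
		   pageblock_nr_pages) * pageblock_nr_pages;
	reserve >>= pageblock_order;
	if (reserve > 2)
		reserve = 2;		/* at most 2 blocks for atomic allocations */

	/* Prints: 2 MIGRATE_RESERVE pageblock(s), i.e. 4 MB with 4 KB pages. */
	printf("%lu MIGRATE_RESERVE pageblock(s)\n", reserve);
	return 0;
}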
3674 | /* | 3674 | /* |
3675 | * Initially all pages are reserved - free ones are freed | 3675 | * Initially all pages are reserved - free ones are freed |
3676 | * up by free_all_bootmem() once the early boot process is | 3676 | * up by free_all_bootmem() once the early boot process is |
3677 | * done. Non-atomic initialization, single-pass. | 3677 | * done. Non-atomic initialization, single-pass. |
3678 | */ | 3678 | */ |
3679 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 3679 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
3680 | unsigned long start_pfn, enum memmap_context context) | 3680 | unsigned long start_pfn, enum memmap_context context) |
3681 | { | 3681 | { |
3682 | struct page *page; | 3682 | struct page *page; |
3683 | unsigned long end_pfn = start_pfn + size; | 3683 | unsigned long end_pfn = start_pfn + size; |
3684 | unsigned long pfn; | 3684 | unsigned long pfn; |
3685 | struct zone *z; | 3685 | struct zone *z; |
3686 | 3686 | ||
3687 | if (highest_memmap_pfn < end_pfn - 1) | 3687 | if (highest_memmap_pfn < end_pfn - 1) |
3688 | highest_memmap_pfn = end_pfn - 1; | 3688 | highest_memmap_pfn = end_pfn - 1; |
3689 | 3689 | ||
3690 | z = &NODE_DATA(nid)->node_zones[zone]; | 3690 | z = &NODE_DATA(nid)->node_zones[zone]; |
3691 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 3691 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
3692 | /* | 3692 | /* |
3693 | * There can be holes in boot-time mem_map[]s | 3693 | * There can be holes in boot-time mem_map[]s |
3694 | * handed to this function. They do not | 3694 | * handed to this function. They do not |
3695 | * exist on hotplugged memory. | 3695 | * exist on hotplugged memory. |
3696 | */ | 3696 | */ |
3697 | if (context == MEMMAP_EARLY) { | 3697 | if (context == MEMMAP_EARLY) { |
3698 | if (!early_pfn_valid(pfn)) | 3698 | if (!early_pfn_valid(pfn)) |
3699 | continue; | 3699 | continue; |
3700 | if (!early_pfn_in_nid(pfn, nid)) | 3700 | if (!early_pfn_in_nid(pfn, nid)) |
3701 | continue; | 3701 | continue; |
3702 | } | 3702 | } |
3703 | page = pfn_to_page(pfn); | 3703 | page = pfn_to_page(pfn); |
3704 | set_page_links(page, zone, nid, pfn); | 3704 | set_page_links(page, zone, nid, pfn); |
3705 | mminit_verify_page_links(page, zone, nid, pfn); | 3705 | mminit_verify_page_links(page, zone, nid, pfn); |
3706 | init_page_count(page); | 3706 | init_page_count(page); |
3707 | reset_page_mapcount(page); | 3707 | reset_page_mapcount(page); |
3708 | SetPageReserved(page); | 3708 | SetPageReserved(page); |
3709 | /* | 3709 | /* |
3710 | * Mark the block movable so that blocks are reserved for | 3710 | * Mark the block movable so that blocks are reserved for |
3711 | * movable at startup. This will force kernel allocations | 3711 | * movable at startup. This will force kernel allocations |
3712 | * to reserve their blocks rather than leaking throughout | 3712 | * to reserve their blocks rather than leaking throughout |
3713 | * the address space during boot when many long-lived | 3713 | * the address space during boot when many long-lived |
3714 | * kernel allocations are made. Later some blocks near | 3714 | * kernel allocations are made. Later some blocks near |
3715 | * the start are marked MIGRATE_RESERVE by | 3715 | * the start are marked MIGRATE_RESERVE by |
3716 | * setup_zone_migrate_reserve() | 3716 | * setup_zone_migrate_reserve() |
3717 | * | 3717 | * |
3718 | * bitmap is created for zone's valid pfn range. but memmap | 3718 | * bitmap is created for zone's valid pfn range. but memmap |
3719 | * can be created for invalid pages (for alignment) | 3719 | * can be created for invalid pages (for alignment) |
3720 | * check here not to call set_pageblock_migratetype() against | 3720 | * check here not to call set_pageblock_migratetype() against |
3721 | * pfn out of zone. | 3721 | * pfn out of zone. |
3722 | */ | 3722 | */ |
3723 | if ((z->zone_start_pfn <= pfn) | 3723 | if ((z->zone_start_pfn <= pfn) |
3724 | && (pfn < z->zone_start_pfn + z->spanned_pages) | 3724 | && (pfn < z->zone_start_pfn + z->spanned_pages) |
3725 | && !(pfn & (pageblock_nr_pages - 1))) | 3725 | && !(pfn & (pageblock_nr_pages - 1))) |
3726 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | 3726 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
3727 | 3727 | ||
3728 | INIT_LIST_HEAD(&page->lru); | 3728 | INIT_LIST_HEAD(&page->lru); |
3729 | #ifdef WANT_PAGE_VIRTUAL | 3729 | #ifdef WANT_PAGE_VIRTUAL |
3730 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 3730 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
3731 | if (!is_highmem_idx(zone)) | 3731 | if (!is_highmem_idx(zone)) |
3732 | set_page_address(page, __va(pfn << PAGE_SHIFT)); | 3732 | set_page_address(page, __va(pfn << PAGE_SHIFT)); |
3733 | #endif | 3733 | #endif |
3734 | } | 3734 | } |
3735 | } | 3735 | } |
3736 | 3736 | ||
3737 | static void __meminit zone_init_free_lists(struct zone *zone) | 3737 | static void __meminit zone_init_free_lists(struct zone *zone) |
3738 | { | 3738 | { |
3739 | int order, t; | 3739 | int order, t; |
3740 | for_each_migratetype_order(order, t) { | 3740 | for_each_migratetype_order(order, t) { |
3741 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); | 3741 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
3742 | zone->free_area[order].nr_free = 0; | 3742 | zone->free_area[order].nr_free = 0; |
3743 | } | 3743 | } |
3744 | } | 3744 | } |
3745 | 3745 | ||
3746 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 3746 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
3747 | #define memmap_init(size, nid, zone, start_pfn) \ | 3747 | #define memmap_init(size, nid, zone, start_pfn) \ |
3748 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 3748 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
3749 | #endif | 3749 | #endif |
3750 | 3750 | ||
3751 | static int zone_batchsize(struct zone *zone) | 3751 | static int zone_batchsize(struct zone *zone) |
3752 | { | 3752 | { |
3753 | #ifdef CONFIG_MMU | 3753 | #ifdef CONFIG_MMU |
3754 | int batch; | 3754 | int batch; |
3755 | 3755 | ||
3756 | /* | 3756 | /* |
3757 | * The per-cpu-pages pools are set to around 1000th of the | 3757 | * The per-cpu-pages pools are set to around 1000th of the |
3758 | * size of the zone. But no more than 1/2 of a meg. | 3758 | * size of the zone. But no more than 1/2 of a meg. |
3759 | * | 3759 | * |
3760 | * OK, so we don't know how big the cache is. So guess. | 3760 | * OK, so we don't know how big the cache is. So guess. |
3761 | */ | 3761 | */ |
3762 | batch = zone->present_pages / 1024; | 3762 | batch = zone->present_pages / 1024; |
3763 | if (batch * PAGE_SIZE > 512 * 1024) | 3763 | if (batch * PAGE_SIZE > 512 * 1024) |
3764 | batch = (512 * 1024) / PAGE_SIZE; | 3764 | batch = (512 * 1024) / PAGE_SIZE; |
3765 | batch /= 4; /* We effectively *= 4 below */ | 3765 | batch /= 4; /* We effectively *= 4 below */ |
3766 | if (batch < 1) | 3766 | if (batch < 1) |
3767 | batch = 1; | 3767 | batch = 1; |
3768 | 3768 | ||
3769 | /* | 3769 | /* |
3770 | * Clamp the batch to a 2^n - 1 value. Having a power | 3770 | * Clamp the batch to a 2^n - 1 value. Having a power |
3771 | * of 2 value was found to be more likely to have | 3771 | * of 2 value was found to be more likely to have |
3772 | * suboptimal cache aliasing properties in some cases. | 3772 | * suboptimal cache aliasing properties in some cases. |
3773 | * | 3773 | * |
3774 | * For example if 2 tasks are alternately allocating | 3774 | * For example if 2 tasks are alternately allocating |
3775 | * batches of pages, one task can end up with a lot | 3775 | * batches of pages, one task can end up with a lot |
3776 | * of pages of one half of the possible page colors | 3776 | * of pages of one half of the possible page colors |
3777 | * and the other with pages of the other colors. | 3777 | * and the other with pages of the other colors. |
3778 | */ | 3778 | */ |
3779 | batch = rounddown_pow_of_two(batch + batch/2) - 1; | 3779 | batch = rounddown_pow_of_two(batch + batch/2) - 1; |
3780 | 3780 | ||
3781 | return batch; | 3781 | return batch; |
3782 | 3782 | ||
3783 | #else | 3783 | #else |
3784 | /* The deferral and batching of frees should be suppressed under NOMMU | 3784 | /* The deferral and batching of frees should be suppressed under NOMMU |
3785 | * conditions. | 3785 | * conditions. |
3786 | * | 3786 | * |
3787 | * The problem is that NOMMU needs to be able to allocate large chunks | 3787 | * The problem is that NOMMU needs to be able to allocate large chunks |
3788 | * of contiguous memory as there's no hardware page translation to | 3788 | * of contiguous memory as there's no hardware page translation to |
3789 | * assemble apparent contiguous memory from discontiguous pages. | 3789 | * assemble apparent contiguous memory from discontiguous pages. |
3790 | * | 3790 | * |
3791 | * Queueing large contiguous runs of pages for batching, however, | 3791 | * Queueing large contiguous runs of pages for batching, however, |
3792 | * causes the pages to actually be freed in smaller chunks. As there | 3792 | * causes the pages to actually be freed in smaller chunks. As there |
3793 | * can be a significant delay between the individual batches being | 3793 | * can be a significant delay between the individual batches being |
3794 | * recycled, this leads to the once large chunks of space being | 3794 | * recycled, this leads to the once large chunks of space being |
3795 | * fragmented and becoming unavailable for high-order allocations. | 3795 | * fragmented and becoming unavailable for high-order allocations. |
3796 | */ | 3796 | */ |
3797 | return 0; | 3797 | return 0; |
3798 | #endif | 3798 | #endif |
3799 | } | 3799 | } |
3800 | 3800 | ||
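Worked through for an assumed 1 GB zone of 4 KB pages (262144 present pages), the heuristic above lands on a batch of 31; the sketch below re-does the arithmetic in user space.

/* Hedged re-computation of the per-cpu batch heuristic above. */
#include <stdio.h>

static int ex_rounddown_pow_of_two(int n)
{
	int p = 1;

	while (p * 2 <= n)
		p *= 2;
	return p;
}

int main(void)
{
	long page_size = 4096;
	long present_pages = 262144;		/* assumed 1 GB zone */
	int batch;

	batch = present_pages / 1024;		/* ~1/1000th of the zone: 256 */
	if (batch * page_size > 512 * 1024)	/* cap at 512 KB worth of pages */
		batch = (512 * 1024) / page_size;	/* -> 128 */
	batch /= 4;				/* 32; effectively *= 4 via pcp->high */
	if (batch < 1)
		batch = 1;

	/* Clamp to (2^n - 1) to avoid cache aliasing: 32 + 16 = 48 -> 32 -> 31. */
	batch = ex_rounddown_pow_of_two(batch + batch / 2) - 1;

	/* Prints: batch = 31 */
	printf("batch = %d\n", batch);
	return 0;
}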
3801 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 3801 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
3802 | { | 3802 | { |
3803 | struct per_cpu_pages *pcp; | 3803 | struct per_cpu_pages *pcp; |
3804 | int migratetype; | 3804 | int migratetype; |
3805 | 3805 | ||
3806 | memset(p, 0, sizeof(*p)); | 3806 | memset(p, 0, sizeof(*p)); |
3807 | 3807 | ||
3808 | pcp = &p->pcp; | 3808 | pcp = &p->pcp; |
3809 | pcp->count = 0; | 3809 | pcp->count = 0; |
3810 | pcp->high = 6 * batch; | 3810 | pcp->high = 6 * batch; |
3811 | pcp->batch = max(1UL, 1 * batch); | 3811 | pcp->batch = max(1UL, 1 * batch); |
3812 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) | 3812 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
3813 | INIT_LIST_HEAD(&pcp->lists[migratetype]); | 3813 | INIT_LIST_HEAD(&pcp->lists[migratetype]); |
3814 | } | 3814 | } |
3815 | 3815 | ||
3816 | /* | 3816 | /* |
3817 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | 3817 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist |
3818 | * to the value high for the pageset p. | 3818 | * to the value high for the pageset p. |
3819 | */ | 3819 | */ |
3820 | 3820 | ||
3821 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | 3821 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, |
3822 | unsigned long high) | 3822 | unsigned long high) |
3823 | { | 3823 | { |
3824 | struct per_cpu_pages *pcp; | 3824 | struct per_cpu_pages *pcp; |
3825 | 3825 | ||
3826 | pcp = &p->pcp; | 3826 | pcp = &p->pcp; |
3827 | pcp->high = high; | 3827 | pcp->high = high; |
3828 | pcp->batch = max(1UL, high/4); | 3828 | pcp->batch = max(1UL, high/4); |
3829 | if ((high/4) > (PAGE_SHIFT * 8)) | 3829 | if ((high/4) > (PAGE_SHIFT * 8)) |
3830 | pcp->batch = PAGE_SHIFT * 8; | 3830 | pcp->batch = PAGE_SHIFT * 8; |
3831 | } | 3831 | } |
3832 | 3832 | ||
3833 | static void setup_zone_pageset(struct zone *zone) | 3833 | static void setup_zone_pageset(struct zone *zone) |
3834 | { | 3834 | { |
3835 | int cpu; | 3835 | int cpu; |
3836 | 3836 | ||
3837 | zone->pageset = alloc_percpu(struct per_cpu_pageset); | 3837 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
3838 | 3838 | ||
3839 | for_each_possible_cpu(cpu) { | 3839 | for_each_possible_cpu(cpu) { |
3840 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | 3840 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); |
3841 | 3841 | ||
3842 | setup_pageset(pcp, zone_batchsize(zone)); | 3842 | setup_pageset(pcp, zone_batchsize(zone)); |
3843 | 3843 | ||
3844 | if (percpu_pagelist_fraction) | 3844 | if (percpu_pagelist_fraction) |
3845 | setup_pagelist_highmark(pcp, | 3845 | setup_pagelist_highmark(pcp, |
3846 | (zone->present_pages / | 3846 | (zone->present_pages / |
3847 | percpu_pagelist_fraction)); | 3847 | percpu_pagelist_fraction)); |
3848 | } | 3848 | } |
3849 | } | 3849 | } |
3850 | 3850 | ||
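Continuing the illustrative numbers from the batch sketch above (4 KB pages, PAGE_SHIFT = 12, a 262144-page zone): setup_pageset() gives pcp->high = 6 * 31 = 186 and pcp->batch = 31, while a hypothetical percpu_pagelist_fraction of 8 would make setup_pagelist_highmark() use high = 262144 / 8 = 32768 with batch capped at PAGE_SHIFT * 8 = 96 rather than high/4 = 8192.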
3851 | /* | 3851 | /* |
3852 | * Allocate per cpu pagesets and initialize them. | 3852 | * Allocate per cpu pagesets and initialize them. |
3853 | * Before this call only boot pagesets were available. | 3853 | * Before this call only boot pagesets were available. |
3854 | */ | 3854 | */ |
3855 | void __init setup_per_cpu_pageset(void) | 3855 | void __init setup_per_cpu_pageset(void) |
3856 | { | 3856 | { |
3857 | struct zone *zone; | 3857 | struct zone *zone; |
3858 | 3858 | ||
3859 | for_each_populated_zone(zone) | 3859 | for_each_populated_zone(zone) |
3860 | setup_zone_pageset(zone); | 3860 | setup_zone_pageset(zone); |
3861 | } | 3861 | } |
3862 | 3862 | ||
3863 | static noinline __init_refok | 3863 | static noinline __init_refok |
3864 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3864 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
3865 | { | 3865 | { |
3866 | int i; | 3866 | int i; |
3867 | struct pglist_data *pgdat = zone->zone_pgdat; | 3867 | struct pglist_data *pgdat = zone->zone_pgdat; |
3868 | size_t alloc_size; | 3868 | size_t alloc_size; |
3869 | 3869 | ||
3870 | /* | 3870 | /* |
3871 | * The per-page waitqueue mechanism uses hashed waitqueues | 3871 | * The per-page waitqueue mechanism uses hashed waitqueues |
3872 | * per zone. | 3872 | * per zone. |
3873 | */ | 3873 | */ |
3874 | zone->wait_table_hash_nr_entries = | 3874 | zone->wait_table_hash_nr_entries = |
3875 | wait_table_hash_nr_entries(zone_size_pages); | 3875 | wait_table_hash_nr_entries(zone_size_pages); |
3876 | zone->wait_table_bits = | 3876 | zone->wait_table_bits = |
3877 | wait_table_bits(zone->wait_table_hash_nr_entries); | 3877 | wait_table_bits(zone->wait_table_hash_nr_entries); |
3878 | alloc_size = zone->wait_table_hash_nr_entries | 3878 | alloc_size = zone->wait_table_hash_nr_entries |
3879 | * sizeof(wait_queue_head_t); | 3879 | * sizeof(wait_queue_head_t); |
3880 | 3880 | ||
3881 | if (!slab_is_available()) { | 3881 | if (!slab_is_available()) { |
3882 | zone->wait_table = (wait_queue_head_t *) | 3882 | zone->wait_table = (wait_queue_head_t *) |
3883 | alloc_bootmem_node_nopanic(pgdat, alloc_size); | 3883 | alloc_bootmem_node_nopanic(pgdat, alloc_size); |
3884 | } else { | 3884 | } else { |
3885 | /* | 3885 | /* |
3886 | * This case means that a zone whose size was 0 gets new memory | 3886 | * This case means that a zone whose size was 0 gets new memory |
3887 | * via memory hot-add. | 3887 | * via memory hot-add. |
3888 | * But it may be the case that a new node was hot-added. In | 3888 | * But it may be the case that a new node was hot-added. In |
3889 | * this case vmalloc() will not be able to use this new node's | 3889 | * this case vmalloc() will not be able to use this new node's |
3890 | * memory - this wait_table must be initialized to use this new | 3890 | * memory - this wait_table must be initialized to use this new |
3891 | * node itself as well. | 3891 | * node itself as well. |
3892 | * To use this new node's memory, further consideration will be | 3892 | * To use this new node's memory, further consideration will be |
3893 | * necessary. | 3893 | * necessary. |
3894 | */ | 3894 | */ |
3895 | zone->wait_table = vmalloc(alloc_size); | 3895 | zone->wait_table = vmalloc(alloc_size); |
3896 | } | 3896 | } |
3897 | if (!zone->wait_table) | 3897 | if (!zone->wait_table) |
3898 | return -ENOMEM; | 3898 | return -ENOMEM; |
3899 | 3899 | ||
3900 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) | 3900 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) |
3901 | init_waitqueue_head(zone->wait_table + i); | 3901 | init_waitqueue_head(zone->wait_table + i); |
3902 | 3902 | ||
3903 | return 0; | 3903 | return 0; |
3904 | } | 3904 | } |
3905 | 3905 | ||
3906 | static int __zone_pcp_update(void *data) | 3906 | static int __zone_pcp_update(void *data) |
3907 | { | 3907 | { |
3908 | struct zone *zone = data; | 3908 | struct zone *zone = data; |
3909 | int cpu; | 3909 | int cpu; |
3910 | unsigned long batch = zone_batchsize(zone), flags; | 3910 | unsigned long batch = zone_batchsize(zone), flags; |
3911 | 3911 | ||
3912 | for_each_possible_cpu(cpu) { | 3912 | for_each_possible_cpu(cpu) { |
3913 | struct per_cpu_pageset *pset; | 3913 | struct per_cpu_pageset *pset; |
3914 | struct per_cpu_pages *pcp; | 3914 | struct per_cpu_pages *pcp; |
3915 | 3915 | ||
3916 | pset = per_cpu_ptr(zone->pageset, cpu); | 3916 | pset = per_cpu_ptr(zone->pageset, cpu); |
3917 | pcp = &pset->pcp; | 3917 | pcp = &pset->pcp; |
3918 | 3918 | ||
3919 | local_irq_save(flags); | 3919 | local_irq_save(flags); |
3920 | if (pcp->count > 0) | 3920 | if (pcp->count > 0) |
3921 | free_pcppages_bulk(zone, pcp->count, pcp); | 3921 | free_pcppages_bulk(zone, pcp->count, pcp); |
3922 | setup_pageset(pset, batch); | 3922 | setup_pageset(pset, batch); |
3923 | local_irq_restore(flags); | 3923 | local_irq_restore(flags); |
3924 | } | 3924 | } |
3925 | return 0; | 3925 | return 0; |
3926 | } | 3926 | } |
3927 | 3927 | ||
3928 | void zone_pcp_update(struct zone *zone) | 3928 | void zone_pcp_update(struct zone *zone) |
3929 | { | 3929 | { |
3930 | stop_machine(__zone_pcp_update, zone, NULL); | 3930 | stop_machine(__zone_pcp_update, zone, NULL); |
3931 | } | 3931 | } |
3932 | 3932 | ||
3933 | static __meminit void zone_pcp_init(struct zone *zone) | 3933 | static __meminit void zone_pcp_init(struct zone *zone) |
3934 | { | 3934 | { |
3935 | /* | 3935 | /* |
3936 | * per cpu subsystem is not up at this point. The following code | 3936 | * per cpu subsystem is not up at this point. The following code |
3937 | * relies on the ability of the linker to provide the | 3937 | * relies on the ability of the linker to provide the |
3938 | * offset of a (static) per cpu variable into the per cpu area. | 3938 | * offset of a (static) per cpu variable into the per cpu area. |
3939 | */ | 3939 | */ |
3940 | zone->pageset = &boot_pageset; | 3940 | zone->pageset = &boot_pageset; |
3941 | 3941 | ||
3942 | if (zone->present_pages) | 3942 | if (zone->present_pages) |
3943 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", | 3943 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
3944 | zone->name, zone->present_pages, | 3944 | zone->name, zone->present_pages, |
3945 | zone_batchsize(zone)); | 3945 | zone_batchsize(zone)); |
3946 | } | 3946 | } |
3947 | 3947 | ||
3948 | __meminit int init_currently_empty_zone(struct zone *zone, | 3948 | __meminit int init_currently_empty_zone(struct zone *zone, |
3949 | unsigned long zone_start_pfn, | 3949 | unsigned long zone_start_pfn, |
3950 | unsigned long size, | 3950 | unsigned long size, |
3951 | enum memmap_context context) | 3951 | enum memmap_context context) |
3952 | { | 3952 | { |
3953 | struct pglist_data *pgdat = zone->zone_pgdat; | 3953 | struct pglist_data *pgdat = zone->zone_pgdat; |
3954 | int ret; | 3954 | int ret; |
3955 | ret = zone_wait_table_init(zone, size); | 3955 | ret = zone_wait_table_init(zone, size); |
3956 | if (ret) | 3956 | if (ret) |
3957 | return ret; | 3957 | return ret; |
3958 | pgdat->nr_zones = zone_idx(zone) + 1; | 3958 | pgdat->nr_zones = zone_idx(zone) + 1; |
3959 | 3959 | ||
3960 | zone->zone_start_pfn = zone_start_pfn; | 3960 | zone->zone_start_pfn = zone_start_pfn; |
3961 | 3961 | ||
3962 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | 3962 | mminit_dprintk(MMINIT_TRACE, "memmap_init", |
3963 | "Initialising map node %d zone %lu pfns %lu -> %lu\n", | 3963 | "Initialising map node %d zone %lu pfns %lu -> %lu\n", |
3964 | pgdat->node_id, | 3964 | pgdat->node_id, |
3965 | (unsigned long)zone_idx(zone), | 3965 | (unsigned long)zone_idx(zone), |
3966 | zone_start_pfn, (zone_start_pfn + size)); | 3966 | zone_start_pfn, (zone_start_pfn + size)); |
3967 | 3967 | ||
3968 | zone_init_free_lists(zone); | 3968 | zone_init_free_lists(zone); |
3969 | 3969 | ||
3970 | return 0; | 3970 | return 0; |
3971 | } | 3971 | } |
3972 | 3972 | ||
3973 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 3973 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
3974 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 3974 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
3975 | /* | 3975 | /* |
3976 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | 3976 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
3977 | * Architectures may implement their own version but if add_active_range() | 3977 | * Architectures may implement their own version but if add_active_range() |
3978 | * was used and there are no special requirements, this is a convenient | 3978 | * was used and there are no special requirements, this is a convenient |
3979 | * alternative | 3979 | * alternative |
3980 | */ | 3980 | */ |
3981 | int __meminit __early_pfn_to_nid(unsigned long pfn) | 3981 | int __meminit __early_pfn_to_nid(unsigned long pfn) |
3982 | { | 3982 | { |
3983 | unsigned long start_pfn, end_pfn; | 3983 | unsigned long start_pfn, end_pfn; |
3984 | int i, nid; | 3984 | int i, nid; |
3985 | 3985 | ||
3986 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 3986 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
3987 | if (start_pfn <= pfn && pfn < end_pfn) | 3987 | if (start_pfn <= pfn && pfn < end_pfn) |
3988 | return nid; | 3988 | return nid; |
3989 | /* This is a memory hole */ | 3989 | /* This is a memory hole */ |
3990 | return -1; | 3990 | return -1; |
3991 | } | 3991 | } |
3992 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | 3992 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
3993 | 3993 | ||
3994 | int __meminit early_pfn_to_nid(unsigned long pfn) | 3994 | int __meminit early_pfn_to_nid(unsigned long pfn) |
3995 | { | 3995 | { |
3996 | int nid; | 3996 | int nid; |
3997 | 3997 | ||
3998 | nid = __early_pfn_to_nid(pfn); | 3998 | nid = __early_pfn_to_nid(pfn); |
3999 | if (nid >= 0) | 3999 | if (nid >= 0) |
4000 | return nid; | 4000 | return nid; |
4001 | /* just returns 0 */ | 4001 | /* just returns 0 */ |
4002 | return 0; | 4002 | return 0; |
4003 | } | 4003 | } |
4004 | 4004 | ||
4005 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES | 4005 | #ifdef CONFIG_NODES_SPAN_OTHER_NODES |
4006 | bool __meminit early_pfn_in_nid(unsigned long pfn, int node) | 4006 | bool __meminit early_pfn_in_nid(unsigned long pfn, int node) |
4007 | { | 4007 | { |
4008 | int nid; | 4008 | int nid; |
4009 | 4009 | ||
4010 | nid = __early_pfn_to_nid(pfn); | 4010 | nid = __early_pfn_to_nid(pfn); |
4011 | if (nid >= 0 && nid != node) | 4011 | if (nid >= 0 && nid != node) |
4012 | return false; | 4012 | return false; |
4013 | return true; | 4013 | return true; |
4014 | } | 4014 | } |
4015 | #endif | 4015 | #endif |
4016 | 4016 | ||
4017 | /** | 4017 | /** |
4018 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 4018 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range |
4019 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 4019 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
4020 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 4020 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node |
4021 | * | 4021 | * |
4022 | * If an architecture guarantees that all ranges registered with | 4022 | * If an architecture guarantees that all ranges registered with |
4023 | * add_active_ranges() contain no holes and may be freed, this | 4023 | * add_active_ranges() contain no holes and may be freed, this |
4024 | * function may be used instead of calling free_bootmem() manually. | 4024 | * function may be used instead of calling free_bootmem() manually. |
4025 | */ | 4025 | */ |
4026 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) | 4026 | void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) |
4027 | { | 4027 | { |
4028 | unsigned long start_pfn, end_pfn; | 4028 | unsigned long start_pfn, end_pfn; |
4029 | int i, this_nid; | 4029 | int i, this_nid; |
4030 | 4030 | ||
4031 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { | 4031 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { |
4032 | start_pfn = min(start_pfn, max_low_pfn); | 4032 | start_pfn = min(start_pfn, max_low_pfn); |
4033 | end_pfn = min(end_pfn, max_low_pfn); | 4033 | end_pfn = min(end_pfn, max_low_pfn); |
4034 | 4034 | ||
4035 | if (start_pfn < end_pfn) | 4035 | if (start_pfn < end_pfn) |
4036 | free_bootmem_node(NODE_DATA(this_nid), | 4036 | free_bootmem_node(NODE_DATA(this_nid), |
4037 | PFN_PHYS(start_pfn), | 4037 | PFN_PHYS(start_pfn), |
4038 | (end_pfn - start_pfn) << PAGE_SHIFT); | 4038 | (end_pfn - start_pfn) << PAGE_SHIFT); |
4039 | } | 4039 | } |
4040 | } | 4040 | } |
4041 | 4041 | ||
4042 | /** | 4042 | /** |
4043 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 4043 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
4044 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 4044 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
4045 | * | 4045 | * |
4046 | * If an architecture guarantees that all ranges registered with | 4046 | * If an architecture guarantees that all ranges registered with |
4047 | * add_active_ranges() contain no holes and may be freed, this | 4047 | * add_active_ranges() contain no holes and may be freed, this |
4048 | * function may be used instead of calling memory_present() manually. | 4048 | * function may be used instead of calling memory_present() manually. |
4049 | */ | 4049 | */ |
4050 | void __init sparse_memory_present_with_active_regions(int nid) | 4050 | void __init sparse_memory_present_with_active_regions(int nid) |
4051 | { | 4051 | { |
4052 | unsigned long start_pfn, end_pfn; | 4052 | unsigned long start_pfn, end_pfn; |
4053 | int i, this_nid; | 4053 | int i, this_nid; |
4054 | 4054 | ||
4055 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) | 4055 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) |
4056 | memory_present(this_nid, start_pfn, end_pfn); | 4056 | memory_present(this_nid, start_pfn, end_pfn); |
4057 | } | 4057 | } |
4058 | 4058 | ||
4059 | /** | 4059 | /** |
4060 | * get_pfn_range_for_nid - Return the start and end page frames for a node | 4060 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
4061 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | 4061 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
4062 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | 4062 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. |
4063 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | 4063 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. |
4064 | * | 4064 | * |
4065 | * It returns the start and end page frame of a node based on information | 4065 | * It returns the start and end page frame of a node based on information |
4066 | * provided by an arch calling add_active_range(). If called for a node | 4066 | * provided by an arch calling add_active_range(). If called for a node |
4067 | * with no available memory, a warning is printed and the start and end | 4067 | * with no available memory, a warning is printed and the start and end |
4068 | * PFNs will be 0. | 4068 | * PFNs will be 0. |
4069 | */ | 4069 | */ |
4070 | void __meminit get_pfn_range_for_nid(unsigned int nid, | 4070 | void __meminit get_pfn_range_for_nid(unsigned int nid, |
4071 | unsigned long *start_pfn, unsigned long *end_pfn) | 4071 | unsigned long *start_pfn, unsigned long *end_pfn) |
4072 | { | 4072 | { |
4073 | unsigned long this_start_pfn, this_end_pfn; | 4073 | unsigned long this_start_pfn, this_end_pfn; |
4074 | int i; | 4074 | int i; |
4075 | 4075 | ||
4076 | *start_pfn = -1UL; | 4076 | *start_pfn = -1UL; |
4077 | *end_pfn = 0; | 4077 | *end_pfn = 0; |
4078 | 4078 | ||
4079 | for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { | 4079 | for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { |
4080 | *start_pfn = min(*start_pfn, this_start_pfn); | 4080 | *start_pfn = min(*start_pfn, this_start_pfn); |
4081 | *end_pfn = max(*end_pfn, this_end_pfn); | 4081 | *end_pfn = max(*end_pfn, this_end_pfn); |
4082 | } | 4082 | } |
4083 | 4083 | ||
4084 | if (*start_pfn == -1UL) | 4084 | if (*start_pfn == -1UL) |
4085 | *start_pfn = 0; | 4085 | *start_pfn = 0; |
4086 | } | 4086 | } |
4087 | 4087 | ||
4088 | /* | 4088 | /* |
4089 | * This finds a zone that can be used for ZONE_MOVABLE pages. The | 4089 | * This finds a zone that can be used for ZONE_MOVABLE pages. The |
4090 | * assumption is made that zones within a node are ordered in monotonic | 4090 | * assumption is made that zones within a node are ordered in monotonic |
4091 | * increasing memory addresses so that the "highest" populated zone is used | 4091 | * increasing memory addresses so that the "highest" populated zone is used |
4092 | */ | 4092 | */ |
4093 | static void __init find_usable_zone_for_movable(void) | 4093 | static void __init find_usable_zone_for_movable(void) |
4094 | { | 4094 | { |
4095 | int zone_index; | 4095 | int zone_index; |
4096 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { | 4096 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { |
4097 | if (zone_index == ZONE_MOVABLE) | 4097 | if (zone_index == ZONE_MOVABLE) |
4098 | continue; | 4098 | continue; |
4099 | 4099 | ||
4100 | if (arch_zone_highest_possible_pfn[zone_index] > | 4100 | if (arch_zone_highest_possible_pfn[zone_index] > |
4101 | arch_zone_lowest_possible_pfn[zone_index]) | 4101 | arch_zone_lowest_possible_pfn[zone_index]) |
4102 | break; | 4102 | break; |
4103 | } | 4103 | } |
4104 | 4104 | ||
4105 | VM_BUG_ON(zone_index == -1); | 4105 | VM_BUG_ON(zone_index == -1); |
4106 | movable_zone = zone_index; | 4106 | movable_zone = zone_index; |
4107 | } | 4107 | } |
4108 | 4108 | ||
4109 | /* | 4109 | /* |
4110 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE | 4110 | * The zone ranges provided by the architecture do not include ZONE_MOVABLE |
4111 | * because it is sized independently of architecture. Unlike the other zones, | 4111 | * because it is sized independently of architecture. Unlike the other zones, |
4112 | * the starting point for ZONE_MOVABLE is not fixed. It may be different | 4112 | * the starting point for ZONE_MOVABLE is not fixed. It may be different |
4113 | * in each node depending on the size of each node and how evenly kernelcore | 4113 | * in each node depending on the size of each node and how evenly kernelcore |
4114 | * is distributed. This helper function adjusts the zone ranges | 4114 | * is distributed. This helper function adjusts the zone ranges |
4115 | * provided by the architecture for a given node by using the end of the | 4115 | * provided by the architecture for a given node by using the end of the |
4116 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that | 4116 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that |
4117 | * zones within a node are in order of monotonically increasing memory addresses | 4117 | * zones within a node are in order of monotonically increasing memory addresses |
4118 | */ | 4118 | */ |
4119 | static void __meminit adjust_zone_range_for_zone_movable(int nid, | 4119 | static void __meminit adjust_zone_range_for_zone_movable(int nid, |
4120 | unsigned long zone_type, | 4120 | unsigned long zone_type, |
4121 | unsigned long node_start_pfn, | 4121 | unsigned long node_start_pfn, |
4122 | unsigned long node_end_pfn, | 4122 | unsigned long node_end_pfn, |
4123 | unsigned long *zone_start_pfn, | 4123 | unsigned long *zone_start_pfn, |
4124 | unsigned long *zone_end_pfn) | 4124 | unsigned long *zone_end_pfn) |
4125 | { | 4125 | { |
4126 | /* Only adjust if ZONE_MOVABLE is on this node */ | 4126 | /* Only adjust if ZONE_MOVABLE is on this node */ |
4127 | if (zone_movable_pfn[nid]) { | 4127 | if (zone_movable_pfn[nid]) { |
4128 | /* Size ZONE_MOVABLE */ | 4128 | /* Size ZONE_MOVABLE */ |
4129 | if (zone_type == ZONE_MOVABLE) { | 4129 | if (zone_type == ZONE_MOVABLE) { |
4130 | *zone_start_pfn = zone_movable_pfn[nid]; | 4130 | *zone_start_pfn = zone_movable_pfn[nid]; |
4131 | *zone_end_pfn = min(node_end_pfn, | 4131 | *zone_end_pfn = min(node_end_pfn, |
4132 | arch_zone_highest_possible_pfn[movable_zone]); | 4132 | arch_zone_highest_possible_pfn[movable_zone]); |
4133 | 4133 | ||
4134 | /* Adjust for ZONE_MOVABLE starting within this range */ | 4134 | /* Adjust for ZONE_MOVABLE starting within this range */ |
4135 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && | 4135 | } else if (*zone_start_pfn < zone_movable_pfn[nid] && |
4136 | *zone_end_pfn > zone_movable_pfn[nid]) { | 4136 | *zone_end_pfn > zone_movable_pfn[nid]) { |
4137 | *zone_end_pfn = zone_movable_pfn[nid]; | 4137 | *zone_end_pfn = zone_movable_pfn[nid]; |
4138 | 4138 | ||
4139 | /* Check if this whole range is within ZONE_MOVABLE */ | 4139 | /* Check if this whole range is within ZONE_MOVABLE */ |
4140 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) | 4140 | } else if (*zone_start_pfn >= zone_movable_pfn[nid]) |
4141 | *zone_start_pfn = *zone_end_pfn; | 4141 | *zone_start_pfn = *zone_end_pfn; |
4142 | } | 4142 | } |
4143 | } | 4143 | } |
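As a worked illustration (hypothetical numbers, not part of this patch): suppose a node covers pfns [0x10000, 0x90000) and zone_movable_pfn[nid] == 0x60000. Then adjust_zone_range_for_zone_movable() gives ZONE_MOVABLE itself the range [0x60000, min(0x90000, arch_zone_highest_possible_pfn[movable_zone])); a lower zone whose raw range was [0x10000, 0x80000) is trimmed to end at 0x60000; and a zone whose raw range starts at or above 0x60000 collapses to empty, since *zone_start_pfn is set equal to *zone_end_pfn.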
4144 | 4144 | ||
4145 | /* | 4145 | /* |
4146 | * Return the number of pages a zone spans in a node, including holes | 4146 | * Return the number of pages a zone spans in a node, including holes |
4147 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 4147 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
4148 | */ | 4148 | */ |
4149 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4149 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, |
4150 | unsigned long zone_type, | 4150 | unsigned long zone_type, |
4151 | unsigned long *ignored) | 4151 | unsigned long *ignored) |
4152 | { | 4152 | { |
4153 | unsigned long node_start_pfn, node_end_pfn; | 4153 | unsigned long node_start_pfn, node_end_pfn; |
4154 | unsigned long zone_start_pfn, zone_end_pfn; | 4154 | unsigned long zone_start_pfn, zone_end_pfn; |
4155 | 4155 | ||
4156 | /* Get the start and end of the node and zone */ | 4156 | /* Get the start and end of the node and zone */ |
4157 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | 4157 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); |
4158 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | 4158 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; |
4159 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | 4159 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; |
4160 | adjust_zone_range_for_zone_movable(nid, zone_type, | 4160 | adjust_zone_range_for_zone_movable(nid, zone_type, |
4161 | node_start_pfn, node_end_pfn, | 4161 | node_start_pfn, node_end_pfn, |
4162 | &zone_start_pfn, &zone_end_pfn); | 4162 | &zone_start_pfn, &zone_end_pfn); |
4163 | 4163 | ||
4164 | /* Check that this node has pages within the zone's required range */ | 4164 | /* Check that this node has pages within the zone's required range */ |
4165 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | 4165 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) |
4166 | return 0; | 4166 | return 0; |
4167 | 4167 | ||
4168 | /* Move the zone boundaries inside the node if necessary */ | 4168 | /* Move the zone boundaries inside the node if necessary */ |
4169 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | 4169 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); |
4170 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | 4170 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); |
4171 | 4171 | ||
4172 | /* Return the spanned pages */ | 4172 | /* Return the spanned pages */ |
4173 | return zone_end_pfn - zone_start_pfn; | 4173 | return zone_end_pfn - zone_start_pfn; |
4174 | } | 4174 | } |
4175 | 4175 | ||
4176 | /* | 4176 | /* |
4177 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 4177 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
4178 | * then all holes in the requested range will be accounted for. | 4178 | * then all holes in the requested range will be accounted for. |
4179 | */ | 4179 | */ |
4180 | unsigned long __meminit __absent_pages_in_range(int nid, | 4180 | unsigned long __meminit __absent_pages_in_range(int nid, |
4181 | unsigned long range_start_pfn, | 4181 | unsigned long range_start_pfn, |
4182 | unsigned long range_end_pfn) | 4182 | unsigned long range_end_pfn) |
4183 | { | 4183 | { |
4184 | unsigned long nr_absent = range_end_pfn - range_start_pfn; | 4184 | unsigned long nr_absent = range_end_pfn - range_start_pfn; |
4185 | unsigned long start_pfn, end_pfn; | 4185 | unsigned long start_pfn, end_pfn; |
4186 | int i; | 4186 | int i; |
4187 | 4187 | ||
4188 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { | 4188 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { |
4189 | start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); | 4189 | start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); |
4190 | end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); | 4190 | end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); |
4191 | nr_absent -= end_pfn - start_pfn; | 4191 | nr_absent -= end_pfn - start_pfn; |
4192 | } | 4192 | } |
4193 | return nr_absent; | 4193 | return nr_absent; |
4194 | } | 4194 | } |
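To make the hole accounting concrete (hypothetical numbers, not part of this patch): a zone slice spanning pfns [1000, 9000) with only [1000, 4000) and [6000, 9000) registered as real memory has 2000 absent pages, so present = spanned - absent = 6000, which is exactly the subtraction calculate_node_totalpages() performs below. A standalone sketch of the same arithmetic:

    #include <stdio.h>

    #define clamp(v, lo, hi) ((v) < (lo) ? (lo) : (v) > (hi) ? (hi) : (v))

    int main(void)
    {
        unsigned long range_start = 1000, range_end = 9000;            /* made up */
        unsigned long mem[][2] = { { 1000, 4000 }, { 6000, 9000 } };   /* made up */
        unsigned long nr_absent = range_end - range_start;  /* start from "all hole" */
        unsigned int i;

        for (i = 0; i < 2; i++) {
            unsigned long s = clamp(mem[i][0], range_start, range_end);
            unsigned long e = clamp(mem[i][1], range_start, range_end);
            nr_absent -= e - s;         /* subtract each real piece */
        }
        printf("absent=%lu present=%lu\n", nr_absent,
               (range_end - range_start) - nr_absent);      /* 2000 and 6000 */
        return 0;
    }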
4195 | 4195 | ||
4196 | /** | 4196 | /** |
4197 | * absent_pages_in_range - Return number of page frames in holes within a range | 4197 | * absent_pages_in_range - Return number of page frames in holes within a range |
4198 | * @start_pfn: The start PFN to start searching for holes | 4198 | * @start_pfn: The start PFN to start searching for holes |
4199 | * @end_pfn: The end PFN to stop searching for holes | 4199 | * @end_pfn: The end PFN to stop searching for holes |
4200 | * | 4200 | * |
4201 | * It returns the number of page frames in memory holes within a range. | 4201 | * It returns the number of page frames in memory holes within a range. |
4202 | */ | 4202 | */ |
4203 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | 4203 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, |
4204 | unsigned long end_pfn) | 4204 | unsigned long end_pfn) |
4205 | { | 4205 | { |
4206 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | 4206 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); |
4207 | } | 4207 | } |
4208 | 4208 | ||
4209 | /* Return the number of page frames in holes in a zone on a node */ | 4209 | /* Return the number of page frames in holes in a zone on a node */ |
4210 | static unsigned long __meminit zone_absent_pages_in_node(int nid, | 4210 | static unsigned long __meminit zone_absent_pages_in_node(int nid, |
4211 | unsigned long zone_type, | 4211 | unsigned long zone_type, |
4212 | unsigned long *ignored) | 4212 | unsigned long *ignored) |
4213 | { | 4213 | { |
4214 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; | 4214 | unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; |
4215 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; | 4215 | unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; |
4216 | unsigned long node_start_pfn, node_end_pfn; | 4216 | unsigned long node_start_pfn, node_end_pfn; |
4217 | unsigned long zone_start_pfn, zone_end_pfn; | 4217 | unsigned long zone_start_pfn, zone_end_pfn; |
4218 | 4218 | ||
4219 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | 4219 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); |
4220 | zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); | 4220 | zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); |
4221 | zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); | 4221 | zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); |
4222 | 4222 | ||
4223 | adjust_zone_range_for_zone_movable(nid, zone_type, | 4223 | adjust_zone_range_for_zone_movable(nid, zone_type, |
4224 | node_start_pfn, node_end_pfn, | 4224 | node_start_pfn, node_end_pfn, |
4225 | &zone_start_pfn, &zone_end_pfn); | 4225 | &zone_start_pfn, &zone_end_pfn); |
4226 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 4226 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
4227 | } | 4227 | } |
4228 | 4228 | ||
4229 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4229 | #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4230 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, | 4230 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
4231 | unsigned long zone_type, | 4231 | unsigned long zone_type, |
4232 | unsigned long *zones_size) | 4232 | unsigned long *zones_size) |
4233 | { | 4233 | { |
4234 | return zones_size[zone_type]; | 4234 | return zones_size[zone_type]; |
4235 | } | 4235 | } |
4236 | 4236 | ||
4237 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, | 4237 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, |
4238 | unsigned long zone_type, | 4238 | unsigned long zone_type, |
4239 | unsigned long *zholes_size) | 4239 | unsigned long *zholes_size) |
4240 | { | 4240 | { |
4241 | if (!zholes_size) | 4241 | if (!zholes_size) |
4242 | return 0; | 4242 | return 0; |
4243 | 4243 | ||
4244 | return zholes_size[zone_type]; | 4244 | return zholes_size[zone_type]; |
4245 | } | 4245 | } |
4246 | 4246 | ||
4247 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4247 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4248 | 4248 | ||
4249 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | 4249 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
4250 | unsigned long *zones_size, unsigned long *zholes_size) | 4250 | unsigned long *zones_size, unsigned long *zholes_size) |
4251 | { | 4251 | { |
4252 | unsigned long realtotalpages, totalpages = 0; | 4252 | unsigned long realtotalpages, totalpages = 0; |
4253 | enum zone_type i; | 4253 | enum zone_type i; |
4254 | 4254 | ||
4255 | for (i = 0; i < MAX_NR_ZONES; i++) | 4255 | for (i = 0; i < MAX_NR_ZONES; i++) |
4256 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | 4256 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, |
4257 | zones_size); | 4257 | zones_size); |
4258 | pgdat->node_spanned_pages = totalpages; | 4258 | pgdat->node_spanned_pages = totalpages; |
4259 | 4259 | ||
4260 | realtotalpages = totalpages; | 4260 | realtotalpages = totalpages; |
4261 | for (i = 0; i < MAX_NR_ZONES; i++) | 4261 | for (i = 0; i < MAX_NR_ZONES; i++) |
4262 | realtotalpages -= | 4262 | realtotalpages -= |
4263 | zone_absent_pages_in_node(pgdat->node_id, i, | 4263 | zone_absent_pages_in_node(pgdat->node_id, i, |
4264 | zholes_size); | 4264 | zholes_size); |
4265 | pgdat->node_present_pages = realtotalpages; | 4265 | pgdat->node_present_pages = realtotalpages; |
4266 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | 4266 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, |
4267 | realtotalpages); | 4267 | realtotalpages); |
4268 | } | 4268 | } |
4269 | 4269 | ||
4270 | #ifndef CONFIG_SPARSEMEM | 4270 | #ifndef CONFIG_SPARSEMEM |
4271 | /* | 4271 | /* |
4272 | * Calculate the size of the zone->blockflags rounded to an unsigned long | 4272 | * Calculate the size of the zone->blockflags rounded to an unsigned long |
4273 | * Start by making sure zonesize is a multiple of pageblock_order by rounding | 4273 | * Start by making sure zonesize is a multiple of pageblock_order by rounding |
4274 | * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally | 4274 | * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally |
4275 | * round what is now in bits to nearest long in bits, then return it in | 4275 | * round what is now in bits to nearest long in bits, then return it in |
4276 | * bytes. | 4276 | * bytes. |
4277 | */ | 4277 | */ |
4278 | static unsigned long __init usemap_size(unsigned long zonesize) | 4278 | static unsigned long __init usemap_size(unsigned long zonesize) |
4279 | { | 4279 | { |
4280 | unsigned long usemapsize; | 4280 | unsigned long usemapsize; |
4281 | 4281 | ||
4282 | usemapsize = roundup(zonesize, pageblock_nr_pages); | 4282 | usemapsize = roundup(zonesize, pageblock_nr_pages); |
4283 | usemapsize = usemapsize >> pageblock_order; | 4283 | usemapsize = usemapsize >> pageblock_order; |
4284 | usemapsize *= NR_PAGEBLOCK_BITS; | 4284 | usemapsize *= NR_PAGEBLOCK_BITS; |
4285 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); | 4285 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); |
4286 | 4286 | ||
4287 | return usemapsize / 8; | 4287 | return usemapsize / 8; |
4288 | } | 4288 | } |
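As a rough illustration (not part of this patch), here is the same arithmetic in a standalone userspace sketch. pageblock_order == 9 (512-page pageblocks) and NR_PAGEBLOCK_BITS == 4 are assumed values for the example, picked to resemble a common x86-64 configuration:

    #include <stdio.h>

    #define PAGEBLOCK_ORDER     9                       /* assumed */
    #define PAGEBLOCK_NR_PAGES  (1UL << PAGEBLOCK_ORDER)
    #define NR_PAGEBLOCK_BITS   4                       /* assumed */

    #define roundup(x, y) ((((x) + (y) - 1) / (y)) * (y))

    static unsigned long usemap_size(unsigned long zonesize)
    {
        unsigned long usemapsize;

        usemapsize = roundup(zonesize, PAGEBLOCK_NR_PAGES);
        usemapsize >>= PAGEBLOCK_ORDER;                 /* number of pageblocks */
        usemapsize *= NR_PAGEBLOCK_BITS;                /* flag bits needed */
        usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
        return usemapsize / 8;                          /* bytes */
    }

    int main(void)
    {
        /* a 1 GiB zone of 4 KiB pages: 262144 pages -> 512 pageblocks
         * -> 2048 flag bits -> 256 bytes of pageblock_flags */
        printf("%lu bytes\n", usemap_size(262144));
        return 0;
    }

With those assumed constants, a 1 GiB zone needs only 256 bytes of pageblock_flags.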
4289 | 4289 | ||
4290 | static void __init setup_usemap(struct pglist_data *pgdat, | 4290 | static void __init setup_usemap(struct pglist_data *pgdat, |
4291 | struct zone *zone, unsigned long zonesize) | 4291 | struct zone *zone, unsigned long zonesize) |
4292 | { | 4292 | { |
4293 | unsigned long usemapsize = usemap_size(zonesize); | 4293 | unsigned long usemapsize = usemap_size(zonesize); |
4294 | zone->pageblock_flags = NULL; | 4294 | zone->pageblock_flags = NULL; |
4295 | if (usemapsize) | 4295 | if (usemapsize) |
4296 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, | 4296 | zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat, |
4297 | usemapsize); | 4297 | usemapsize); |
4298 | } | 4298 | } |
4299 | #else | 4299 | #else |
4300 | static inline void setup_usemap(struct pglist_data *pgdat, | 4300 | static inline void setup_usemap(struct pglist_data *pgdat, |
4301 | struct zone *zone, unsigned long zonesize) {} | 4301 | struct zone *zone, unsigned long zonesize) {} |
4302 | #endif /* CONFIG_SPARSEMEM */ | 4302 | #endif /* CONFIG_SPARSEMEM */ |
4303 | 4303 | ||
4304 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4304 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4305 | 4305 | ||
4306 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4306 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4307 | static inline void __init set_pageblock_order(void) | 4307 | static inline void __init set_pageblock_order(void) |
4308 | { | 4308 | { |
4309 | unsigned int order; | 4309 | unsigned int order; |
4310 | 4310 | ||
4311 | /* Check that pageblock_nr_pages has not already been setup */ | 4311 | /* Check that pageblock_nr_pages has not already been setup */ |
4312 | if (pageblock_order) | 4312 | if (pageblock_order) |
4313 | return; | 4313 | return; |
4314 | 4314 | ||
4315 | if (HPAGE_SHIFT > PAGE_SHIFT) | 4315 | if (HPAGE_SHIFT > PAGE_SHIFT) |
4316 | order = HUGETLB_PAGE_ORDER; | 4316 | order = HUGETLB_PAGE_ORDER; |
4317 | else | 4317 | else |
4318 | order = MAX_ORDER - 1; | 4318 | order = MAX_ORDER - 1; |
4319 | 4319 | ||
4320 | /* | 4320 | /* |
4321 | * Assume the largest contiguous order of interest is a huge page. | 4321 | * Assume the largest contiguous order of interest is a huge page. |
4322 | * This value may be variable depending on boot parameters on IA64 and | 4322 | * This value may be variable depending on boot parameters on IA64 and |
4323 | * powerpc. | 4323 | * powerpc. |
4324 | */ | 4324 | */ |
4325 | pageblock_order = order; | 4325 | pageblock_order = order; |
4326 | } | 4326 | } |
4327 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4327 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4328 | 4328 | ||
4329 | /* | 4329 | /* |
4330 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() | 4330 | * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() |
4331 | * is unused as pageblock_order is set at compile-time. See | 4331 | * is unused as pageblock_order is set at compile-time. See |
4332 | * include/linux/pageblock-flags.h for the values of pageblock_order based on | 4332 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4333 | * the kernel config | 4333 | * the kernel config |
4334 | */ | 4334 | */ |
4335 | static inline void set_pageblock_order(void) | 4335 | static inline void set_pageblock_order(void) |
4336 | { | 4336 | { |
4337 | } | 4337 | } |
4338 | 4338 | ||
4339 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4339 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4340 | 4340 | ||
4341 | /* | 4341 | /* |
4342 | * Set up the zone data structures: | 4342 | * Set up the zone data structures: |
4343 | * - mark all pages reserved | 4343 | * - mark all pages reserved |
4344 | * - mark all memory queues empty | 4344 | * - mark all memory queues empty |
4345 | * - clear the memory bitmaps | 4345 | * - clear the memory bitmaps |
4346 | */ | 4346 | */ |
4347 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4347 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4348 | unsigned long *zones_size, unsigned long *zholes_size) | 4348 | unsigned long *zones_size, unsigned long *zholes_size) |
4349 | { | 4349 | { |
4350 | enum zone_type j; | 4350 | enum zone_type j; |
4351 | int nid = pgdat->node_id; | 4351 | int nid = pgdat->node_id; |
4352 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 4352 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
4353 | int ret; | 4353 | int ret; |
4354 | 4354 | ||
4355 | pgdat_resize_init(pgdat); | 4355 | pgdat_resize_init(pgdat); |
4356 | pgdat->nr_zones = 0; | 4356 | pgdat->nr_zones = 0; |
4357 | init_waitqueue_head(&pgdat->kswapd_wait); | 4357 | init_waitqueue_head(&pgdat->kswapd_wait); |
4358 | pgdat->kswapd_max_order = 0; | 4358 | pgdat->kswapd_max_order = 0; |
4359 | pgdat_page_cgroup_init(pgdat); | 4359 | pgdat_page_cgroup_init(pgdat); |
4360 | 4360 | ||
4361 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4361 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4362 | struct zone *zone = pgdat->node_zones + j; | 4362 | struct zone *zone = pgdat->node_zones + j; |
4363 | unsigned long size, realsize, memmap_pages; | 4363 | unsigned long size, realsize, memmap_pages; |
4364 | 4364 | ||
4365 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4365 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4366 | realsize = size - zone_absent_pages_in_node(nid, j, | 4366 | realsize = size - zone_absent_pages_in_node(nid, j, |
4367 | zholes_size); | 4367 | zholes_size); |
4368 | 4368 | ||
4369 | /* | 4369 | /* |
4370 | * Adjust realsize so that it accounts for how much memory | 4370 | * Adjust realsize so that it accounts for how much memory |
4371 | * is used by this zone for memmap. This affects the watermark | 4371 | * is used by this zone for memmap. This affects the watermark |
4372 | * and per-cpu initialisations | 4372 | * and per-cpu initialisations |
4373 | */ | 4373 | */ |
4374 | memmap_pages = | 4374 | memmap_pages = |
4375 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 4375 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
4376 | if (realsize >= memmap_pages) { | 4376 | if (realsize >= memmap_pages) { |
4377 | realsize -= memmap_pages; | 4377 | realsize -= memmap_pages; |
4378 | if (memmap_pages) | 4378 | if (memmap_pages) |
4379 | printk(KERN_DEBUG | 4379 | printk(KERN_DEBUG |
4380 | " %s zone: %lu pages used for memmap\n", | 4380 | " %s zone: %lu pages used for memmap\n", |
4381 | zone_names[j], memmap_pages); | 4381 | zone_names[j], memmap_pages); |
4382 | } else | 4382 | } else |
4383 | printk(KERN_WARNING | 4383 | printk(KERN_WARNING |
4384 | " %s zone: %lu pages exceeds realsize %lu\n", | 4384 | " %s zone: %lu pages exceeds realsize %lu\n", |
4385 | zone_names[j], memmap_pages, realsize); | 4385 | zone_names[j], memmap_pages, realsize); |
4386 | 4386 | ||
4387 | /* Account for reserved pages */ | 4387 | /* Account for reserved pages */ |
4388 | if (j == 0 && realsize > dma_reserve) { | 4388 | if (j == 0 && realsize > dma_reserve) { |
4389 | realsize -= dma_reserve; | 4389 | realsize -= dma_reserve; |
4390 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 4390 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
4391 | zone_names[0], dma_reserve); | 4391 | zone_names[0], dma_reserve); |
4392 | } | 4392 | } |
4393 | 4393 | ||
4394 | if (!is_highmem_idx(j)) | 4394 | if (!is_highmem_idx(j)) |
4395 | nr_kernel_pages += realsize; | 4395 | nr_kernel_pages += realsize; |
4396 | nr_all_pages += realsize; | 4396 | nr_all_pages += realsize; |
4397 | 4397 | ||
4398 | zone->spanned_pages = size; | 4398 | zone->spanned_pages = size; |
4399 | zone->present_pages = realsize; | 4399 | zone->present_pages = realsize; |
4400 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4401 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4402 | zone->spanned_pages; | ||
4403 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4404 | #endif | ||
4400 | #ifdef CONFIG_NUMA | 4405 | #ifdef CONFIG_NUMA |
4401 | zone->node = nid; | 4406 | zone->node = nid; |
4402 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4407 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
4403 | / 100; | 4408 | / 100; |
4404 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 4409 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; |
4405 | #endif | 4410 | #endif |
4406 | zone->name = zone_names[j]; | 4411 | zone->name = zone_names[j]; |
4407 | spin_lock_init(&zone->lock); | 4412 | spin_lock_init(&zone->lock); |
4408 | spin_lock_init(&zone->lru_lock); | 4413 | spin_lock_init(&zone->lru_lock); |
4409 | zone_seqlock_init(zone); | 4414 | zone_seqlock_init(zone); |
4410 | zone->zone_pgdat = pgdat; | 4415 | zone->zone_pgdat = pgdat; |
4411 | 4416 | ||
4412 | zone_pcp_init(zone); | 4417 | zone_pcp_init(zone); |
4413 | lruvec_init(&zone->lruvec, zone); | 4418 | lruvec_init(&zone->lruvec, zone); |
4414 | zap_zone_vm_stats(zone); | 4419 | zap_zone_vm_stats(zone); |
4415 | zone->flags = 0; | 4420 | zone->flags = 0; |
4416 | if (!size) | 4421 | if (!size) |
4417 | continue; | 4422 | continue; |
4418 | 4423 | ||
4419 | set_pageblock_order(); | 4424 | set_pageblock_order(); |
4420 | setup_usemap(pgdat, zone, size); | 4425 | setup_usemap(pgdat, zone, size); |
4421 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 4426 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
4422 | size, MEMMAP_EARLY); | 4427 | size, MEMMAP_EARLY); |
4423 | BUG_ON(ret); | 4428 | BUG_ON(ret); |
4424 | memmap_init(size, nid, j, zone_start_pfn); | 4429 | memmap_init(size, nid, j, zone_start_pfn); |
4425 | zone_start_pfn += size; | 4430 | zone_start_pfn += size; |
4426 | } | 4431 | } |
4427 | } | 4432 | } |
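The hunk this commit adds above (under CONFIG_COMPACTION / CONFIG_CMA) seeds zone->compact_cached_free_pfn at the zone end rounded down to a pageblock boundary, so a later order > 0 compaction can resume the free-page scanner where it previously stopped instead of always rescanning from the end of the zone. A standalone sketch of just that rounding, with made-up numbers and an assumed pageblock_nr_pages of 512 (not part of this patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned long pageblock_nr_pages = 512;     /* assumed: order-9 pageblocks */
        unsigned long zone_start_pfn = 0x100000;    /* made up */
        unsigned long spanned_pages = 0x7fe30;      /* deliberately not aligned */
        unsigned long cached_free_pfn;

        cached_free_pfn = zone_start_pfn + spanned_pages;   /* one past zone end */
        cached_free_pfn &= ~(pageblock_nr_pages - 1);       /* round down to a pageblock */
        printf("compact_cached_free_pfn = %#lx\n", cached_free_pfn);  /* 0x17fe00 */
        return 0;
    }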
4428 | 4433 | ||
4429 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | 4434 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) |
4430 | { | 4435 | { |
4431 | /* Skip empty nodes */ | 4436 | /* Skip empty nodes */ |
4432 | if (!pgdat->node_spanned_pages) | 4437 | if (!pgdat->node_spanned_pages) |
4433 | return; | 4438 | return; |
4434 | 4439 | ||
4435 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 4440 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
4436 | /* ia64 gets its own node_mem_map, before this, without bootmem */ | 4441 | /* ia64 gets its own node_mem_map, before this, without bootmem */ |
4437 | if (!pgdat->node_mem_map) { | 4442 | if (!pgdat->node_mem_map) { |
4438 | unsigned long size, start, end; | 4443 | unsigned long size, start, end; |
4439 | struct page *map; | 4444 | struct page *map; |
4440 | 4445 | ||
4441 | /* | 4446 | /* |
4442 | * The zone's endpoints aren't required to be MAX_ORDER | 4447 | * The zone's endpoints aren't required to be MAX_ORDER |
4443 | * aligned but the node_mem_map endpoints must be in order | 4448 | * aligned but the node_mem_map endpoints must be in order |
4444 | * for the buddy allocator to function correctly. | 4449 | * for the buddy allocator to function correctly. |
4445 | */ | 4450 | */ |
4446 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 4451 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
4447 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 4452 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; |
4448 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 4453 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
4449 | size = (end - start) * sizeof(struct page); | 4454 | size = (end - start) * sizeof(struct page); |
4450 | map = alloc_remap(pgdat->node_id, size); | 4455 | map = alloc_remap(pgdat->node_id, size); |
4451 | if (!map) | 4456 | if (!map) |
4452 | map = alloc_bootmem_node_nopanic(pgdat, size); | 4457 | map = alloc_bootmem_node_nopanic(pgdat, size); |
4453 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 4458 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
4454 | } | 4459 | } |
4455 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4460 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4456 | /* | 4461 | /* |
4457 | * With no DISCONTIG, the global mem_map is just set as node 0's | 4462 | * With no DISCONTIG, the global mem_map is just set as node 0's |
4458 | */ | 4463 | */ |
4459 | if (pgdat == NODE_DATA(0)) { | 4464 | if (pgdat == NODE_DATA(0)) { |
4460 | mem_map = NODE_DATA(0)->node_mem_map; | 4465 | mem_map = NODE_DATA(0)->node_mem_map; |
4461 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4466 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4462 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | 4467 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) |
4463 | mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); | 4468 | mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); |
4464 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4469 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4465 | } | 4470 | } |
4466 | #endif | 4471 | #endif |
4467 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 4472 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
4468 | } | 4473 | } |
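For a concrete (hypothetical) example of the MAX_ORDER alignment above: with the default MAX_ORDER of 11, MAX_ORDER_NR_PAGES is 1024, so a node starting at pfn 0x12345 gets its map allocated from start = 0x12345 & ~(1024 - 1) = 0x12000, and node_mem_map then points 0x345 struct page entries into that allocation, keeping the buddy allocator's alignment assumptions intact.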
4469 | 4474 | ||
4470 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | 4475 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, |
4471 | unsigned long node_start_pfn, unsigned long *zholes_size) | 4476 | unsigned long node_start_pfn, unsigned long *zholes_size) |
4472 | { | 4477 | { |
4473 | pg_data_t *pgdat = NODE_DATA(nid); | 4478 | pg_data_t *pgdat = NODE_DATA(nid); |
4474 | 4479 | ||
4475 | pgdat->node_id = nid; | 4480 | pgdat->node_id = nid; |
4476 | pgdat->node_start_pfn = node_start_pfn; | 4481 | pgdat->node_start_pfn = node_start_pfn; |
4477 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4482 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
4478 | 4483 | ||
4479 | alloc_node_mem_map(pgdat); | 4484 | alloc_node_mem_map(pgdat); |
4480 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 4485 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
4481 | printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", | 4486 | printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", |
4482 | nid, (unsigned long)pgdat, | 4487 | nid, (unsigned long)pgdat, |
4483 | (unsigned long)pgdat->node_mem_map); | 4488 | (unsigned long)pgdat->node_mem_map); |
4484 | #endif | 4489 | #endif |
4485 | 4490 | ||
4486 | free_area_init_core(pgdat, zones_size, zholes_size); | 4491 | free_area_init_core(pgdat, zones_size, zholes_size); |
4487 | } | 4492 | } |
4488 | 4493 | ||
4489 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP | 4494 | #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP |
4490 | 4495 | ||
4491 | #if MAX_NUMNODES > 1 | 4496 | #if MAX_NUMNODES > 1 |
4492 | /* | 4497 | /* |
4493 | * Figure out the number of possible node ids. | 4498 | * Figure out the number of possible node ids. |
4494 | */ | 4499 | */ |
4495 | static void __init setup_nr_node_ids(void) | 4500 | static void __init setup_nr_node_ids(void) |
4496 | { | 4501 | { |
4497 | unsigned int node; | 4502 | unsigned int node; |
4498 | unsigned int highest = 0; | 4503 | unsigned int highest = 0; |
4499 | 4504 | ||
4500 | for_each_node_mask(node, node_possible_map) | 4505 | for_each_node_mask(node, node_possible_map) |
4501 | highest = node; | 4506 | highest = node; |
4502 | nr_node_ids = highest + 1; | 4507 | nr_node_ids = highest + 1; |
4503 | } | 4508 | } |
4504 | #else | 4509 | #else |
4505 | static inline void setup_nr_node_ids(void) | 4510 | static inline void setup_nr_node_ids(void) |
4506 | { | 4511 | { |
4507 | } | 4512 | } |
4508 | #endif | 4513 | #endif |
4509 | 4514 | ||
4510 | /** | 4515 | /** |
4511 | * node_map_pfn_alignment - determine the maximum internode alignment | 4516 | * node_map_pfn_alignment - determine the maximum internode alignment |
4512 | * | 4517 | * |
4513 | * This function should be called after node map is populated and sorted. | 4518 | * This function should be called after node map is populated and sorted. |
4514 | * It calculates the maximum power of two alignment which can distinguish | 4519 | * It calculates the maximum power of two alignment which can distinguish |
4515 | * all the nodes. | 4520 | * all the nodes. |
4516 | * | 4521 | * |
4517 | * For example, if all nodes are 1GiB and aligned to 1GiB, the return value | 4522 | * For example, if all nodes are 1GiB and aligned to 1GiB, the return value |
4518 | * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the | 4523 | * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the |
4519 | * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is | 4524 | * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is |
4520 | * shifted, 1GiB is enough and this function will indicate so. | 4525 | * shifted, 1GiB is enough and this function will indicate so. |
4521 | * | 4526 | * |
4522 | * This is used to test whether pfn -> nid mapping of the chosen memory | 4527 | * This is used to test whether pfn -> nid mapping of the chosen memory |
4523 | * model has fine enough granularity to avoid incorrect mapping for the | 4528 | * model has fine enough granularity to avoid incorrect mapping for the |
4524 | * populated node map. | 4529 | * populated node map. |
4525 | * | 4530 | * |
4526 | * Returns the determined alignment in pfn's. 0 if there is no alignment | 4531 | * Returns the determined alignment in pfn's. 0 if there is no alignment |
4527 | * requirement (single node). | 4532 | * requirement (single node). |
4528 | */ | 4533 | */ |
4529 | unsigned long __init node_map_pfn_alignment(void) | 4534 | unsigned long __init node_map_pfn_alignment(void) |
4530 | { | 4535 | { |
4531 | unsigned long accl_mask = 0, last_end = 0; | 4536 | unsigned long accl_mask = 0, last_end = 0; |
4532 | unsigned long start, end, mask; | 4537 | unsigned long start, end, mask; |
4533 | int last_nid = -1; | 4538 | int last_nid = -1; |
4534 | int i, nid; | 4539 | int i, nid; |
4535 | 4540 | ||
4536 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { | 4541 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { |
4537 | if (!start || last_nid < 0 || last_nid == nid) { | 4542 | if (!start || last_nid < 0 || last_nid == nid) { |
4538 | last_nid = nid; | 4543 | last_nid = nid; |
4539 | last_end = end; | 4544 | last_end = end; |
4540 | continue; | 4545 | continue; |
4541 | } | 4546 | } |
4542 | 4547 | ||
4543 | /* | 4548 | /* |
4544 | * Start with a mask granular enough to pin-point to the | 4549 | * Start with a mask granular enough to pin-point to the |
4545 | * start pfn and tick off bits one-by-one until it becomes | 4550 | * start pfn and tick off bits one-by-one until it becomes |
4546 | * too coarse to separate the current node from the last. | 4551 | * too coarse to separate the current node from the last. |
4547 | */ | 4552 | */ |
4548 | mask = ~((1 << __ffs(start)) - 1); | 4553 | mask = ~((1 << __ffs(start)) - 1); |
4549 | while (mask && last_end <= (start & (mask << 1))) | 4554 | while (mask && last_end <= (start & (mask << 1))) |
4550 | mask <<= 1; | 4555 | mask <<= 1; |
4551 | 4556 | ||
4552 | /* accumulate all internode masks */ | 4557 | /* accumulate all internode masks */ |
4553 | accl_mask |= mask; | 4558 | accl_mask |= mask; |
4554 | } | 4559 | } |
4555 | 4560 | ||
4556 | /* convert mask to number of pages */ | 4561 | /* convert mask to number of pages */ |
4557 | return ~accl_mask + 1; | 4562 | return ~accl_mask + 1; |
4558 | } | 4563 | } |
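To see the internode-alignment loop above in action (not part of this patch), here is a standalone userspace sketch with two hypothetical nodes, each 1 GiB of 4 KiB pages and 1 GiB-aligned, so the documented answer is 1 << (30 - 12) = 0x40000 pfns. __builtin_ctzl() stands in for __ffs():

    #include <stdio.h>

    struct range { unsigned long start, end; int nid; };

    int main(void)
    {
        struct range map[] = {
            { 0x00000, 0x40000, 0 },    /* node 0: pfns [0, 1 GiB) */
            { 0x40000, 0x80000, 1 },    /* node 1: pfns [1 GiB, 2 GiB) */
        };
        unsigned long accl_mask = 0, last_end = 0, mask;
        int last_nid = -1;
        unsigned int i;

        for (i = 0; i < sizeof(map) / sizeof(map[0]); i++) {
            unsigned long start = map[i].start, end = map[i].end;
            int nid = map[i].nid;

            if (!start || last_nid < 0 || last_nid == nid) {
                last_nid = nid;
                last_end = end;
                continue;
            }
            /* same mask walk as node_map_pfn_alignment() above */
            mask = ~((1UL << __builtin_ctzl(start)) - 1);
            while (mask && last_end <= (start & (mask << 1)))
                mask <<= 1;
            accl_mask |= mask;
        }
        printf("alignment = %#lx pfns\n", ~accl_mask + 1);  /* 0x40000 */
        return 0;
    }

Here only one node boundary exists (at pfn 0x40000), the mask never coarsens, and the accumulated mask converts back to an alignment of 0x40000 pfns, matching the 1 GiB example in the comment above.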
4559 | 4564 | ||
4560 | /* Find the lowest pfn for a node */ | 4565 | /* Find the lowest pfn for a node */ |
4561 | static unsigned long __init find_min_pfn_for_node(int nid) | 4566 | static unsigned long __init find_min_pfn_for_node(int nid) |
4562 | { | 4567 | { |
4563 | unsigned long min_pfn = ULONG_MAX; | 4568 | unsigned long min_pfn = ULONG_MAX; |
4564 | unsigned long start_pfn; | 4569 | unsigned long start_pfn; |
4565 | int i; | 4570 | int i; |
4566 | 4571 | ||
4567 | for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) | 4572 | for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) |
4568 | min_pfn = min(min_pfn, start_pfn); | 4573 | min_pfn = min(min_pfn, start_pfn); |
4569 | 4574 | ||
4570 | if (min_pfn == ULONG_MAX) { | 4575 | if (min_pfn == ULONG_MAX) { |
4571 | printk(KERN_WARNING | 4576 | printk(KERN_WARNING |
4572 | "Could not find start_pfn for node %d\n", nid); | 4577 | "Could not find start_pfn for node %d\n", nid); |
4573 | return 0; | 4578 | return 0; |
4574 | } | 4579 | } |
4575 | 4580 | ||
4576 | return min_pfn; | 4581 | return min_pfn; |
4577 | } | 4582 | } |
4578 | 4583 | ||
4579 | /** | 4584 | /** |
4580 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 4585 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
4581 | * | 4586 | * |
4582 | * It returns the minimum PFN based on information provided via | 4587 | * It returns the minimum PFN based on information provided via |
4583 | * add_active_range(). | 4588 | * add_active_range(). |
4584 | */ | 4589 | */ |
4585 | unsigned long __init find_min_pfn_with_active_regions(void) | 4590 | unsigned long __init find_min_pfn_with_active_regions(void) |
4586 | { | 4591 | { |
4587 | return find_min_pfn_for_node(MAX_NUMNODES); | 4592 | return find_min_pfn_for_node(MAX_NUMNODES); |
4588 | } | 4593 | } |
4589 | 4594 | ||
4590 | /* | 4595 | /* |
4591 | * early_calculate_totalpages() | 4596 | * early_calculate_totalpages() |
4592 | * Sum pages in active regions for movable zone. | 4597 | * Sum pages in active regions for movable zone. |
4593 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | 4598 | * Populate N_HIGH_MEMORY for calculating usable_nodes. |
4594 | */ | 4599 | */ |
4595 | static unsigned long __init early_calculate_totalpages(void) | 4600 | static unsigned long __init early_calculate_totalpages(void) |
4596 | { | 4601 | { |
4597 | unsigned long totalpages = 0; | 4602 | unsigned long totalpages = 0; |
4598 | unsigned long start_pfn, end_pfn; | 4603 | unsigned long start_pfn, end_pfn; |
4599 | int i, nid; | 4604 | int i, nid; |
4600 | 4605 | ||
4601 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { | 4606 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { |
4602 | unsigned long pages = end_pfn - start_pfn; | 4607 | unsigned long pages = end_pfn - start_pfn; |
4603 | 4608 | ||
4604 | totalpages += pages; | 4609 | totalpages += pages; |
4605 | if (pages) | 4610 | if (pages) |
4606 | node_set_state(nid, N_HIGH_MEMORY); | 4611 | node_set_state(nid, N_HIGH_MEMORY); |
4607 | } | 4612 | } |
4608 | return totalpages; | 4613 | return totalpages; |
4609 | } | 4614 | } |
4610 | 4615 | ||
4611 | /* | 4616 | /* |
4612 | * Find the PFN the Movable zone begins in each node. Kernel memory | 4617 | * Find the PFN the Movable zone begins in each node. Kernel memory |
4613 | * is spread evenly between nodes as long as the nodes have enough | 4618 | * is spread evenly between nodes as long as the nodes have enough |
4614 | * memory. When they don't, some nodes will have more kernelcore than | 4619 | * memory. When they don't, some nodes will have more kernelcore than |
4615 | * others | 4620 | * others |
4616 | */ | 4621 | */ |
4617 | static void __init find_zone_movable_pfns_for_nodes(void) | 4622 | static void __init find_zone_movable_pfns_for_nodes(void) |
4618 | { | 4623 | { |
4619 | int i, nid; | 4624 | int i, nid; |
4620 | unsigned long usable_startpfn; | 4625 | unsigned long usable_startpfn; |
4621 | unsigned long kernelcore_node, kernelcore_remaining; | 4626 | unsigned long kernelcore_node, kernelcore_remaining; |
4622 | /* save the state before borrow the nodemask */ | 4627 | /* save the state before borrow the nodemask */ |
4623 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | 4628 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; |
4624 | unsigned long totalpages = early_calculate_totalpages(); | 4629 | unsigned long totalpages = early_calculate_totalpages(); |
4625 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4630 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
4626 | 4631 | ||
4627 | /* | 4632 | /* |
4628 | * If movablecore was specified, calculate what size of | 4633 | * If movablecore was specified, calculate what size of |
4629 | * kernelcore that corresponds so that memory usable for | 4634 | * kernelcore that corresponds so that memory usable for |
4630 | * any allocation type is evenly spread. If both kernelcore | 4635 | * any allocation type is evenly spread. If both kernelcore |
4631 | * and movablecore are specified, then the value of kernelcore | 4636 | * and movablecore are specified, then the value of kernelcore |
4632 | * will be used for required_kernelcore if it's greater than | 4637 | * will be used for required_kernelcore if it's greater than |
4633 | * what movablecore would have allowed. | 4638 | * what movablecore would have allowed. |
4634 | */ | 4639 | */ |
4635 | if (required_movablecore) { | 4640 | if (required_movablecore) { |
4636 | unsigned long corepages; | 4641 | unsigned long corepages; |
4637 | 4642 | ||
4638 | /* | 4643 | /* |
4639 | * Round-up so that ZONE_MOVABLE is at least as large as what | 4644 | * Round-up so that ZONE_MOVABLE is at least as large as what |
4640 | * was requested by the user | 4645 | * was requested by the user |
4641 | */ | 4646 | */ |
4642 | required_movablecore = | 4647 | required_movablecore = |
4643 | roundup(required_movablecore, MAX_ORDER_NR_PAGES); | 4648 | roundup(required_movablecore, MAX_ORDER_NR_PAGES); |
4644 | corepages = totalpages - required_movablecore; | 4649 | corepages = totalpages - required_movablecore; |
4645 | 4650 | ||
4646 | required_kernelcore = max(required_kernelcore, corepages); | 4651 | required_kernelcore = max(required_kernelcore, corepages); |
4647 | } | 4652 | } |
4648 | 4653 | ||
4649 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 4654 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ |
4650 | if (!required_kernelcore) | 4655 | if (!required_kernelcore) |
4651 | goto out; | 4656 | goto out; |
4652 | 4657 | ||
4653 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 4658 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
4654 | find_usable_zone_for_movable(); | 4659 | find_usable_zone_for_movable(); |
4655 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; | 4660 | usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone]; |
4656 | 4661 | ||
4657 | restart: | 4662 | restart: |
4658 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 4663 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
4659 | kernelcore_node = required_kernelcore / usable_nodes; | 4664 | kernelcore_node = required_kernelcore / usable_nodes; |
4660 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4665 | for_each_node_state(nid, N_HIGH_MEMORY) { |
4661 | unsigned long start_pfn, end_pfn; | 4666 | unsigned long start_pfn, end_pfn; |
4662 | 4667 | ||
4663 | /* | 4668 | /* |
4664 | * Recalculate kernelcore_node if the division per node | 4669 | * Recalculate kernelcore_node if the division per node |
4665 | * now exceeds what is necessary to satisfy the requested | 4670 | * now exceeds what is necessary to satisfy the requested |
4666 | * amount of memory for the kernel | 4671 | * amount of memory for the kernel |
4667 | */ | 4672 | */ |
4668 | if (required_kernelcore < kernelcore_node) | 4673 | if (required_kernelcore < kernelcore_node) |
4669 | kernelcore_node = required_kernelcore / usable_nodes; | 4674 | kernelcore_node = required_kernelcore / usable_nodes; |
4670 | 4675 | ||
4671 | /* | 4676 | /* |
4672 | * As the map is walked, we track how much memory is usable | 4677 | * As the map is walked, we track how much memory is usable |
4673 | * by the kernel using kernelcore_remaining. When it is | 4678 | * by the kernel using kernelcore_remaining. When it is |
4674 | * 0, the rest of the node is usable by ZONE_MOVABLE | 4679 | * 0, the rest of the node is usable by ZONE_MOVABLE |
4675 | */ | 4680 | */ |
4676 | kernelcore_remaining = kernelcore_node; | 4681 | kernelcore_remaining = kernelcore_node; |
4677 | 4682 | ||
4678 | /* Go through each range of PFNs within this node */ | 4683 | /* Go through each range of PFNs within this node */ |
4679 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { | 4684 | for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { |
4680 | unsigned long size_pages; | 4685 | unsigned long size_pages; |
4681 | 4686 | ||
4682 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); | 4687 | start_pfn = max(start_pfn, zone_movable_pfn[nid]); |
4683 | if (start_pfn >= end_pfn) | 4688 | if (start_pfn >= end_pfn) |
4684 | continue; | 4689 | continue; |
4685 | 4690 | ||
4686 | /* Account for what is only usable for kernelcore */ | 4691 | /* Account for what is only usable for kernelcore */ |
4687 | if (start_pfn < usable_startpfn) { | 4692 | if (start_pfn < usable_startpfn) { |
4688 | unsigned long kernel_pages; | 4693 | unsigned long kernel_pages; |
4689 | kernel_pages = min(end_pfn, usable_startpfn) | 4694 | kernel_pages = min(end_pfn, usable_startpfn) |
4690 | - start_pfn; | 4695 | - start_pfn; |
4691 | 4696 | ||
4692 | kernelcore_remaining -= min(kernel_pages, | 4697 | kernelcore_remaining -= min(kernel_pages, |
4693 | kernelcore_remaining); | 4698 | kernelcore_remaining); |
4694 | required_kernelcore -= min(kernel_pages, | 4699 | required_kernelcore -= min(kernel_pages, |
4695 | required_kernelcore); | 4700 | required_kernelcore); |
4696 | 4701 | ||
4697 | /* Continue if range is now fully accounted */ | 4702 | /* Continue if range is now fully accounted */ |
4698 | if (end_pfn <= usable_startpfn) { | 4703 | if (end_pfn <= usable_startpfn) { |
4699 | 4704 | ||
4700 | /* | 4705 | /* |
4701 | * Push zone_movable_pfn to the end so | 4706 | * Push zone_movable_pfn to the end so |
4702 | * that if we have to rebalance | 4707 | * that if we have to rebalance |
4703 | * kernelcore across nodes, we will | 4708 | * kernelcore across nodes, we will |
4704 | * not double account here | 4709 | * not double account here |
4705 | */ | 4710 | */ |
4706 | zone_movable_pfn[nid] = end_pfn; | 4711 | zone_movable_pfn[nid] = end_pfn; |
4707 | continue; | 4712 | continue; |
4708 | } | 4713 | } |
4709 | start_pfn = usable_startpfn; | 4714 | start_pfn = usable_startpfn; |
4710 | } | 4715 | } |
4711 | 4716 | ||
4712 | /* | 4717 | /* |
4713 | * The usable PFN range for ZONE_MOVABLE is from | 4718 | * The usable PFN range for ZONE_MOVABLE is from |
4714 | * start_pfn->end_pfn. Calculate size_pages as the | 4719 | * start_pfn->end_pfn. Calculate size_pages as the |
4715 | * number of pages used as kernelcore | 4720 | * number of pages used as kernelcore |
4716 | */ | 4721 | */ |
4717 | size_pages = end_pfn - start_pfn; | 4722 | size_pages = end_pfn - start_pfn; |
4718 | if (size_pages > kernelcore_remaining) | 4723 | if (size_pages > kernelcore_remaining) |
4719 | size_pages = kernelcore_remaining; | 4724 | size_pages = kernelcore_remaining; |
4720 | zone_movable_pfn[nid] = start_pfn + size_pages; | 4725 | zone_movable_pfn[nid] = start_pfn + size_pages; |
4721 | 4726 | ||
4722 | /* | 4727 | /* |
4723 | * Some kernelcore has been met, update counts and | 4728 | * Some kernelcore has been met, update counts and |
4724 | * break if the kernelcore for this node has been | 4729 | * break if the kernelcore for this node has been |
4725 | * satisfied | 4730 | * satisfied |
4726 | */ | 4731 | */ |
4727 | required_kernelcore -= min(required_kernelcore, | 4732 | required_kernelcore -= min(required_kernelcore, |
4728 | size_pages); | 4733 | size_pages); |
4729 | kernelcore_remaining -= size_pages; | 4734 | kernelcore_remaining -= size_pages; |
4730 | if (!kernelcore_remaining) | 4735 | if (!kernelcore_remaining) |
4731 | break; | 4736 | break; |
4732 | } | 4737 | } |
4733 | } | 4738 | } |
4734 | 4739 | ||
4735 | /* | 4740 | /* |
4736 | * If there is still required_kernelcore, we do another pass with one | 4741 | * If there is still required_kernelcore, we do another pass with one |
4737 | * less node in the count. This will push zone_movable_pfn[nid] further | 4742 | * less node in the count. This will push zone_movable_pfn[nid] further |
4738 | * along on the nodes that still have memory until kernelcore is | 4743 | * along on the nodes that still have memory until kernelcore is |
4739 | * satisfied | 4744 | * satisfied |
4740 | */ | 4745 | */ |
4741 | usable_nodes--; | 4746 | usable_nodes--; |
4742 | if (usable_nodes && required_kernelcore > usable_nodes) | 4747 | if (usable_nodes && required_kernelcore > usable_nodes) |
4743 | goto restart; | 4748 | goto restart; |
4744 | 4749 | ||
4745 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ | 4750 | /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */ |
4746 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 4751 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
4747 | zone_movable_pfn[nid] = | 4752 | zone_movable_pfn[nid] = |
4748 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 4753 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
4749 | 4754 | ||
4750 | out: | 4755 | out: |
4751 | /* restore the node_state */ | 4756 | /* restore the node_state */ |
4752 | node_states[N_HIGH_MEMORY] = saved_node_state; | 4757 | node_states[N_HIGH_MEMORY] = saved_node_state; |
4753 | } | 4758 | } |
4754 | 4759 | ||
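To make the spreading loop above concrete, here is a minimal userspace sketch of the even split and rebalancing that find_zone_movable_pfns_for_nodes() performs, ignoring the per-range pfn bookkeeping; the node count and page figures are invented for illustration and are not taken from the patch. The remaining kernelcore request is divided by the number of usable nodes, each node contributes what it can, and the pass repeats with one node fewer while pages are still owed.

#include <stdio.h>

#define NR_NODES 2	/* assumed node count for the example */

int main(void)
{
	/* Invented figures, in pages: the kernelcore request and per-node memory. */
	unsigned long required_kernelcore = 3000;
	unsigned long node_pages[NR_NODES] = { 1000, 4000 };
	unsigned long kernelcore_node;
	int usable_nodes = NR_NODES;
	int nid;

restart:
	/* Spread the remaining request as evenly as possible across usable nodes. */
	kernelcore_node = required_kernelcore / usable_nodes;
	for (nid = 0; nid < NR_NODES; nid++) {
		unsigned long got = node_pages[nid] < kernelcore_node ?
				    node_pages[nid] : kernelcore_node;

		node_pages[nid] -= got;
		required_kernelcore -= got;
		printf("node %d contributes %lu pages of kernelcore\n", nid, got);
	}
	/* Like the kernel loop: retry with one less node while pages are still owed. */
	usable_nodes--;
	if (usable_nodes && required_kernelcore > (unsigned long)usable_nodes)
		goto restart;

	return 0;
}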
4755 | /* Any regular memory on that node? */ | 4760 | /* Any regular memory on that node? */ |
4756 | static void check_for_regular_memory(pg_data_t *pgdat) | 4761 | static void check_for_regular_memory(pg_data_t *pgdat) |
4757 | { | 4762 | { |
4758 | #ifdef CONFIG_HIGHMEM | 4763 | #ifdef CONFIG_HIGHMEM |
4759 | enum zone_type zone_type; | 4764 | enum zone_type zone_type; |
4760 | 4765 | ||
4761 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 4766 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { |
4762 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4767 | struct zone *zone = &pgdat->node_zones[zone_type]; |
4763 | if (zone->present_pages) { | 4768 | if (zone->present_pages) { |
4764 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 4769 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); |
4765 | break; | 4770 | break; |
4766 | } | 4771 | } |
4767 | } | 4772 | } |
4768 | #endif | 4773 | #endif |
4769 | } | 4774 | } |
4770 | 4775 | ||
4771 | /** | 4776 | /** |
4772 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 4777 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
4773 | * @max_zone_pfn: an array of max PFNs for each zone | 4778 | * @max_zone_pfn: an array of max PFNs for each zone |
4774 | * | 4779 | * |
4775 | * This will call free_area_init_node() for each active node in the system. | 4780 | * This will call free_area_init_node() for each active node in the system. |
4776 | * Using the page ranges provided by add_active_range(), the size of each | 4781 | * Using the page ranges provided by add_active_range(), the size of each |
4777 | * zone in each node and their holes is calculated. If the maximum PFNs | 4782 | * zone in each node and their holes is calculated. If the maximum PFNs |
4778 | * of two adjacent zones match, it is assumed that the zone is empty. | 4783 | * of two adjacent zones match, it is assumed that the zone is empty. |
4779 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | 4784 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed |
4780 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone | 4785 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone |
4781 | * starts where the previous one ended. For example, ZONE_DMA32 starts | 4786 | * starts where the previous one ended. For example, ZONE_DMA32 starts |
4782 | * at arch_max_dma_pfn. | 4787 | * at arch_max_dma_pfn. |
4783 | */ | 4788 | */ |
4784 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | 4789 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
4785 | { | 4790 | { |
4786 | unsigned long start_pfn, end_pfn; | 4791 | unsigned long start_pfn, end_pfn; |
4787 | int i, nid; | 4792 | int i, nid; |
4788 | 4793 | ||
4789 | /* Record where the zone boundaries are */ | 4794 | /* Record where the zone boundaries are */ |
4790 | memset(arch_zone_lowest_possible_pfn, 0, | 4795 | memset(arch_zone_lowest_possible_pfn, 0, |
4791 | sizeof(arch_zone_lowest_possible_pfn)); | 4796 | sizeof(arch_zone_lowest_possible_pfn)); |
4792 | memset(arch_zone_highest_possible_pfn, 0, | 4797 | memset(arch_zone_highest_possible_pfn, 0, |
4793 | sizeof(arch_zone_highest_possible_pfn)); | 4798 | sizeof(arch_zone_highest_possible_pfn)); |
4794 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | 4799 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); |
4795 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | 4800 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; |
4796 | for (i = 1; i < MAX_NR_ZONES; i++) { | 4801 | for (i = 1; i < MAX_NR_ZONES; i++) { |
4797 | if (i == ZONE_MOVABLE) | 4802 | if (i == ZONE_MOVABLE) |
4798 | continue; | 4803 | continue; |
4799 | arch_zone_lowest_possible_pfn[i] = | 4804 | arch_zone_lowest_possible_pfn[i] = |
4800 | arch_zone_highest_possible_pfn[i-1]; | 4805 | arch_zone_highest_possible_pfn[i-1]; |
4801 | arch_zone_highest_possible_pfn[i] = | 4806 | arch_zone_highest_possible_pfn[i] = |
4802 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | 4807 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); |
4803 | } | 4808 | } |
4804 | arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; | 4809 | arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; |
4805 | arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; | 4810 | arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; |
4806 | 4811 | ||
4807 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ | 4812 | /* Find the PFNs that ZONE_MOVABLE begins at in each node */ |
4808 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); | 4813 | memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); |
4809 | find_zone_movable_pfns_for_nodes(); | 4814 | find_zone_movable_pfns_for_nodes(); |
4810 | 4815 | ||
4811 | /* Print out the zone ranges */ | 4816 | /* Print out the zone ranges */ |
4812 | printk("Zone ranges:\n"); | 4817 | printk("Zone ranges:\n"); |
4813 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4818 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4814 | if (i == ZONE_MOVABLE) | 4819 | if (i == ZONE_MOVABLE) |
4815 | continue; | 4820 | continue; |
4816 | printk(KERN_CONT " %-8s ", zone_names[i]); | 4821 | printk(KERN_CONT " %-8s ", zone_names[i]); |
4817 | if (arch_zone_lowest_possible_pfn[i] == | 4822 | if (arch_zone_lowest_possible_pfn[i] == |
4818 | arch_zone_highest_possible_pfn[i]) | 4823 | arch_zone_highest_possible_pfn[i]) |
4819 | printk(KERN_CONT "empty\n"); | 4824 | printk(KERN_CONT "empty\n"); |
4820 | else | 4825 | else |
4821 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", | 4826 | printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n", |
4822 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, | 4827 | arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, |
4823 | (arch_zone_highest_possible_pfn[i] | 4828 | (arch_zone_highest_possible_pfn[i] |
4824 | << PAGE_SHIFT) - 1); | 4829 | << PAGE_SHIFT) - 1); |
4825 | } | 4830 | } |
4826 | 4831 | ||
4827 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ | 4832 | /* Print out the PFNs ZONE_MOVABLE begins at in each node */ |
4828 | printk("Movable zone start for each node\n"); | 4833 | printk("Movable zone start for each node\n"); |
4829 | for (i = 0; i < MAX_NUMNODES; i++) { | 4834 | for (i = 0; i < MAX_NUMNODES; i++) { |
4830 | if (zone_movable_pfn[i]) | 4835 | if (zone_movable_pfn[i]) |
4831 | printk(" Node %d: %#010lx\n", i, | 4836 | printk(" Node %d: %#010lx\n", i, |
4832 | zone_movable_pfn[i] << PAGE_SHIFT); | 4837 | zone_movable_pfn[i] << PAGE_SHIFT); |
4833 | } | 4838 | } |
4834 | 4839 | ||
4835 | /* Print out the early_node_map[] */ | 4840 | /* Print out the early_node_map[] */ |
4836 | printk("Early memory node ranges\n"); | 4841 | printk("Early memory node ranges\n"); |
4837 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) | 4842 | for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) |
4838 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, | 4843 | printk(" node %3d: [mem %#010lx-%#010lx]\n", nid, |
4839 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); | 4844 | start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); |
4840 | 4845 | ||
4841 | /* Initialise every node */ | 4846 | /* Initialise every node */ |
4842 | mminit_verify_pageflags_layout(); | 4847 | mminit_verify_pageflags_layout(); |
4843 | setup_nr_node_ids(); | 4848 | setup_nr_node_ids(); |
4844 | for_each_online_node(nid) { | 4849 | for_each_online_node(nid) { |
4845 | pg_data_t *pgdat = NODE_DATA(nid); | 4850 | pg_data_t *pgdat = NODE_DATA(nid); |
4846 | free_area_init_node(nid, NULL, | 4851 | free_area_init_node(nid, NULL, |
4847 | find_min_pfn_for_node(nid), NULL); | 4852 | find_min_pfn_for_node(nid), NULL); |
4848 | 4853 | ||
4849 | /* Any memory on that node */ | 4854 | /* Any memory on that node */ |
4850 | if (pgdat->node_present_pages) | 4855 | if (pgdat->node_present_pages) |
4851 | node_set_state(nid, N_HIGH_MEMORY); | 4856 | node_set_state(nid, N_HIGH_MEMORY); |
4852 | check_for_regular_memory(pgdat); | 4857 | check_for_regular_memory(pgdat); |
4853 | } | 4858 | } |
4854 | } | 4859 | } |
4855 | 4860 | ||
4856 | static int __init cmdline_parse_core(char *p, unsigned long *core) | 4861 | static int __init cmdline_parse_core(char *p, unsigned long *core) |
4857 | { | 4862 | { |
4858 | unsigned long long coremem; | 4863 | unsigned long long coremem; |
4859 | if (!p) | 4864 | if (!p) |
4860 | return -EINVAL; | 4865 | return -EINVAL; |
4861 | 4866 | ||
4862 | coremem = memparse(p, &p); | 4867 | coremem = memparse(p, &p); |
4863 | *core = coremem >> PAGE_SHIFT; | 4868 | *core = coremem >> PAGE_SHIFT; |
4864 | 4869 | ||
4865 | /* Paranoid check that UL is enough for the coremem value */ | 4870 | /* Paranoid check that UL is enough for the coremem value */ |
4866 | WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); | 4871 | WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); |
4867 | 4872 | ||
4868 | return 0; | 4873 | return 0; |
4869 | } | 4874 | } |
4870 | 4875 | ||
4871 | /* | 4876 | /* |
4872 | * kernelcore=size sets the amount of memory for use by allocations that | 4877 | * kernelcore=size sets the amount of memory for use by allocations that |
4873 | * cannot be reclaimed or migrated. | 4878 | * cannot be reclaimed or migrated. |
4874 | */ | 4879 | */ |
4875 | static int __init cmdline_parse_kernelcore(char *p) | 4880 | static int __init cmdline_parse_kernelcore(char *p) |
4876 | { | 4881 | { |
4877 | return cmdline_parse_core(p, &required_kernelcore); | 4882 | return cmdline_parse_core(p, &required_kernelcore); |
4878 | } | 4883 | } |
4879 | 4884 | ||
4880 | /* | 4885 | /* |
4881 | * movablecore=size sets the amount of memory for use by allocations that | 4886 | * movablecore=size sets the amount of memory for use by allocations that |
4882 | * can be reclaimed or migrated. | 4887 | * can be reclaimed or migrated. |
4883 | */ | 4888 | */ |
4884 | static int __init cmdline_parse_movablecore(char *p) | 4889 | static int __init cmdline_parse_movablecore(char *p) |
4885 | { | 4890 | { |
4886 | return cmdline_parse_core(p, &required_movablecore); | 4891 | return cmdline_parse_core(p, &required_movablecore); |
4887 | } | 4892 | } |
4888 | 4893 | ||
4889 | early_param("kernelcore", cmdline_parse_kernelcore); | 4894 | early_param("kernelcore", cmdline_parse_kernelcore); |
4890 | early_param("movablecore", cmdline_parse_movablecore); | 4895 | early_param("movablecore", cmdline_parse_movablecore); |
4891 | 4896 | ||
4892 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ | 4897 | #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ |
4893 | 4898 | ||
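As a worked example of the parameter handling above (the 4 KiB page size is assumed, not stated in the patch): a boot option such as kernelcore=512M is turned into a page count by memparse() followed by a right shift of PAGE_SHIFT, which is exactly the arithmetic in cmdline_parse_core().

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */

int main(void)
{
	/* What memparse("512M", ...) would hand back, in bytes. */
	unsigned long long coremem = 512ULL << 20;
	unsigned long core_pages = coremem >> PAGE_SHIFT;

	printf("kernelcore=512M -> required_kernelcore = %lu pages\n", core_pages);
	return 0;	/* prints 131072 with 4 KiB pages */
}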
4894 | /** | 4899 | /** |
4895 | * set_dma_reserve - set the specified number of pages reserved in the first zone | 4900 | * set_dma_reserve - set the specified number of pages reserved in the first zone |
4896 | * @new_dma_reserve: The number of pages to mark reserved | 4901 | * @new_dma_reserve: The number of pages to mark reserved |
4897 | * | 4902 | * |
4898 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | 4903 | * The per-cpu batchsize and zone watermarks are determined by present_pages. |
4899 | * In the DMA zone, a significant percentage may be consumed by kernel image | 4904 | * In the DMA zone, a significant percentage may be consumed by kernel image |
4900 | * and other unfreeable allocations which can skew the watermarks badly. This | 4905 | * and other unfreeable allocations which can skew the watermarks badly. This |
4901 | * function may optionally be used to account for unfreeable pages in the | 4906 | * function may optionally be used to account for unfreeable pages in the |
4902 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and | 4907 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and |
4903 | * smaller per-cpu batchsize. | 4908 | * smaller per-cpu batchsize. |
4904 | */ | 4909 | */ |
4905 | void __init set_dma_reserve(unsigned long new_dma_reserve) | 4910 | void __init set_dma_reserve(unsigned long new_dma_reserve) |
4906 | { | 4911 | { |
4907 | dma_reserve = new_dma_reserve; | 4912 | dma_reserve = new_dma_reserve; |
4908 | } | 4913 | } |
4909 | 4914 | ||
4910 | void __init free_area_init(unsigned long *zones_size) | 4915 | void __init free_area_init(unsigned long *zones_size) |
4911 | { | 4916 | { |
4912 | free_area_init_node(0, zones_size, | 4917 | free_area_init_node(0, zones_size, |
4913 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 4918 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
4914 | } | 4919 | } |
4915 | 4920 | ||
4916 | static int page_alloc_cpu_notify(struct notifier_block *self, | 4921 | static int page_alloc_cpu_notify(struct notifier_block *self, |
4917 | unsigned long action, void *hcpu) | 4922 | unsigned long action, void *hcpu) |
4918 | { | 4923 | { |
4919 | int cpu = (unsigned long)hcpu; | 4924 | int cpu = (unsigned long)hcpu; |
4920 | 4925 | ||
4921 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 4926 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
4922 | lru_add_drain_cpu(cpu); | 4927 | lru_add_drain_cpu(cpu); |
4923 | drain_pages(cpu); | 4928 | drain_pages(cpu); |
4924 | 4929 | ||
4925 | /* | 4930 | /* |
4926 | * Spill the event counters of the dead processor | 4931 | * Spill the event counters of the dead processor |
4927 | * into the current processor's event counters. | 4932 | * into the current processor's event counters. |
4928 | * This artificially elevates the count of the current | 4933 | * This artificially elevates the count of the current |
4929 | * processor. | 4934 | * processor. |
4930 | */ | 4935 | */ |
4931 | vm_events_fold_cpu(cpu); | 4936 | vm_events_fold_cpu(cpu); |
4932 | 4937 | ||
4933 | /* | 4938 | /* |
4934 | * Zero the differential counters of the dead processor | 4939 | * Zero the differential counters of the dead processor |
4935 | * so that the vm statistics are consistent. | 4940 | * so that the vm statistics are consistent. |
4936 | * | 4941 | * |
4937 | * This is only okay since the processor is dead and cannot | 4942 | * This is only okay since the processor is dead and cannot |
4938 | * race with what we are doing. | 4943 | * race with what we are doing. |
4939 | */ | 4944 | */ |
4940 | refresh_cpu_vm_stats(cpu); | 4945 | refresh_cpu_vm_stats(cpu); |
4941 | } | 4946 | } |
4942 | return NOTIFY_OK; | 4947 | return NOTIFY_OK; |
4943 | } | 4948 | } |
4944 | 4949 | ||
4945 | void __init page_alloc_init(void) | 4950 | void __init page_alloc_init(void) |
4946 | { | 4951 | { |
4947 | hotcpu_notifier(page_alloc_cpu_notify, 0); | 4952 | hotcpu_notifier(page_alloc_cpu_notify, 0); |
4948 | } | 4953 | } |
4949 | 4954 | ||
4950 | /* | 4955 | /* |
4951 | * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio | 4956 | * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio |
4952 | * or min_free_kbytes changes. | 4957 | * or min_free_kbytes changes. |
4953 | */ | 4958 | */ |
4954 | static void calculate_totalreserve_pages(void) | 4959 | static void calculate_totalreserve_pages(void) |
4955 | { | 4960 | { |
4956 | struct pglist_data *pgdat; | 4961 | struct pglist_data *pgdat; |
4957 | unsigned long reserve_pages = 0; | 4962 | unsigned long reserve_pages = 0; |
4958 | enum zone_type i, j; | 4963 | enum zone_type i, j; |
4959 | 4964 | ||
4960 | for_each_online_pgdat(pgdat) { | 4965 | for_each_online_pgdat(pgdat) { |
4961 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4966 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4962 | struct zone *zone = pgdat->node_zones + i; | 4967 | struct zone *zone = pgdat->node_zones + i; |
4963 | unsigned long max = 0; | 4968 | unsigned long max = 0; |
4964 | 4969 | ||
4965 | /* Find valid and maximum lowmem_reserve in the zone */ | 4970 | /* Find valid and maximum lowmem_reserve in the zone */ |
4966 | for (j = i; j < MAX_NR_ZONES; j++) { | 4971 | for (j = i; j < MAX_NR_ZONES; j++) { |
4967 | if (zone->lowmem_reserve[j] > max) | 4972 | if (zone->lowmem_reserve[j] > max) |
4968 | max = zone->lowmem_reserve[j]; | 4973 | max = zone->lowmem_reserve[j]; |
4969 | } | 4974 | } |
4970 | 4975 | ||
4971 | /* we treat the high watermark as reserved pages. */ | 4976 | /* we treat the high watermark as reserved pages. */ |
4972 | max += high_wmark_pages(zone); | 4977 | max += high_wmark_pages(zone); |
4973 | 4978 | ||
4974 | if (max > zone->present_pages) | 4979 | if (max > zone->present_pages) |
4975 | max = zone->present_pages; | 4980 | max = zone->present_pages; |
4976 | reserve_pages += max; | 4981 | reserve_pages += max; |
4977 | /* | 4982 | /* |
4978 | * Lowmem reserves are not available to | 4983 | * Lowmem reserves are not available to |
4979 | * GFP_HIGHUSER page cache allocations and | 4984 | * GFP_HIGHUSER page cache allocations and |
4980 | * kswapd tries to balance zones to their high | 4985 | * kswapd tries to balance zones to their high |
4981 | * watermark. As a result, neither should be | 4986 | * watermark. As a result, neither should be |
4982 | * regarded as dirtyable memory, to prevent a | 4987 | * regarded as dirtyable memory, to prevent a |
4983 | * situation where reclaim has to clean pages | 4988 | * situation where reclaim has to clean pages |
4984 | * in order to balance the zones. | 4989 | * in order to balance the zones. |
4985 | */ | 4990 | */ |
4986 | zone->dirty_balance_reserve = max; | 4991 | zone->dirty_balance_reserve = max; |
4987 | } | 4992 | } |
4988 | } | 4993 | } |
4989 | dirty_balance_reserve = reserve_pages; | 4994 | dirty_balance_reserve = reserve_pages; |
4990 | totalreserve_pages = reserve_pages; | 4995 | totalreserve_pages = reserve_pages; |
4991 | } | 4996 | } |
4992 | 4997 | ||
4993 | /* | 4998 | /* |
4994 | * setup_per_zone_lowmem_reserve - called whenever | 4999 | * setup_per_zone_lowmem_reserve - called whenever |
4995 | * sysctl_lowmem_reserve_ratio changes. Ensures that each zone | 5000 | * sysctl_lowmem_reserve_ratio changes. Ensures that each zone |
4996 | * has a correct pages reserved value, so an adequate number of | 5001 | * has a correct pages reserved value, so an adequate number of |
4997 | * pages are left in the zone after a successful __alloc_pages(). | 5002 | * pages are left in the zone after a successful __alloc_pages(). |
4998 | */ | 5003 | */ |
4999 | static void setup_per_zone_lowmem_reserve(void) | 5004 | static void setup_per_zone_lowmem_reserve(void) |
5000 | { | 5005 | { |
5001 | struct pglist_data *pgdat; | 5006 | struct pglist_data *pgdat; |
5002 | enum zone_type j, idx; | 5007 | enum zone_type j, idx; |
5003 | 5008 | ||
5004 | for_each_online_pgdat(pgdat) { | 5009 | for_each_online_pgdat(pgdat) { |
5005 | for (j = 0; j < MAX_NR_ZONES; j++) { | 5010 | for (j = 0; j < MAX_NR_ZONES; j++) { |
5006 | struct zone *zone = pgdat->node_zones + j; | 5011 | struct zone *zone = pgdat->node_zones + j; |
5007 | unsigned long present_pages = zone->present_pages; | 5012 | unsigned long present_pages = zone->present_pages; |
5008 | 5013 | ||
5009 | zone->lowmem_reserve[j] = 0; | 5014 | zone->lowmem_reserve[j] = 0; |
5010 | 5015 | ||
5011 | idx = j; | 5016 | idx = j; |
5012 | while (idx) { | 5017 | while (idx) { |
5013 | struct zone *lower_zone; | 5018 | struct zone *lower_zone; |
5014 | 5019 | ||
5015 | idx--; | 5020 | idx--; |
5016 | 5021 | ||
5017 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | 5022 | if (sysctl_lowmem_reserve_ratio[idx] < 1) |
5018 | sysctl_lowmem_reserve_ratio[idx] = 1; | 5023 | sysctl_lowmem_reserve_ratio[idx] = 1; |
5019 | 5024 | ||
5020 | lower_zone = pgdat->node_zones + idx; | 5025 | lower_zone = pgdat->node_zones + idx; |
5021 | lower_zone->lowmem_reserve[j] = present_pages / | 5026 | lower_zone->lowmem_reserve[j] = present_pages / |
5022 | sysctl_lowmem_reserve_ratio[idx]; | 5027 | sysctl_lowmem_reserve_ratio[idx]; |
5023 | present_pages += lower_zone->present_pages; | 5028 | present_pages += lower_zone->present_pages; |
5024 | } | 5029 | } |
5025 | } | 5030 | } |
5026 | } | 5031 | } |
5027 | 5032 | ||
5028 | /* update totalreserve_pages */ | 5033 | /* update totalreserve_pages */ |
5029 | calculate_totalreserve_pages(); | 5034 | calculate_totalreserve_pages(); |
5030 | } | 5035 | } |
5031 | 5036 | ||
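A small standalone sketch of the reserve arithmetic above, using invented zone sizes and ratios (the real defaults are configuration dependent and are not taken from the patch): for every zone j, each lower zone sets aside the accumulated higher-zone page count divided by its ratio, so allocations that could have used a higher zone cannot exhaust the lower ones.

#include <stdio.h>

#define MAX_NR_ZONES 3	/* pretend DMA, NORMAL and HIGHMEM for the example */

int main(void)
{
	/* Invented zone sizes (pages) and reserve ratios, not taken from the patch. */
	unsigned long present[MAX_NR_ZONES] = { 4000, 200000, 300000 };
	int ratio[MAX_NR_ZONES] = { 256, 256, 32 };
	unsigned long reserve[MAX_NR_ZONES][MAX_NR_ZONES] = { { 0 } };
	int j, idx;

	for (j = 0; j < MAX_NR_ZONES; j++) {
		/* Running total of pages in the zones above the one getting a reserve. */
		unsigned long pages = present[j];

		idx = j;
		while (idx) {
			idx--;
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	for (idx = 0; idx < MAX_NR_ZONES; idx++)
		for (j = 0; j < MAX_NR_ZONES; j++)
			if (reserve[idx][j])
				printf("zone %d holds back %lu pages from zone-%d allocations\n",
				       idx, reserve[idx][j], j);
	return 0;
}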
5032 | static void __setup_per_zone_wmarks(void) | 5037 | static void __setup_per_zone_wmarks(void) |
5033 | { | 5038 | { |
5034 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 5039 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
5035 | unsigned long lowmem_pages = 0; | 5040 | unsigned long lowmem_pages = 0; |
5036 | struct zone *zone; | 5041 | struct zone *zone; |
5037 | unsigned long flags; | 5042 | unsigned long flags; |
5038 | 5043 | ||
5039 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 5044 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
5040 | for_each_zone(zone) { | 5045 | for_each_zone(zone) { |
5041 | if (!is_highmem(zone)) | 5046 | if (!is_highmem(zone)) |
5042 | lowmem_pages += zone->present_pages; | 5047 | lowmem_pages += zone->present_pages; |
5043 | } | 5048 | } |
5044 | 5049 | ||
5045 | for_each_zone(zone) { | 5050 | for_each_zone(zone) { |
5046 | u64 tmp; | 5051 | u64 tmp; |
5047 | 5052 | ||
5048 | spin_lock_irqsave(&zone->lock, flags); | 5053 | spin_lock_irqsave(&zone->lock, flags); |
5049 | tmp = (u64)pages_min * zone->present_pages; | 5054 | tmp = (u64)pages_min * zone->present_pages; |
5050 | do_div(tmp, lowmem_pages); | 5055 | do_div(tmp, lowmem_pages); |
5051 | if (is_highmem(zone)) { | 5056 | if (is_highmem(zone)) { |
5052 | /* | 5057 | /* |
5053 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't | 5058 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
5054 | * need highmem pages, so cap pages_min to a small | 5059 | * need highmem pages, so cap pages_min to a small |
5055 | * value here. | 5060 | * value here. |
5056 | * | 5061 | * |
5057 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) | 5062 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
5058 | * deltas control async page reclaim, and so should | 5063 | * deltas control async page reclaim, and so should |
5059 | * not be capped for highmem. | 5064 | * not be capped for highmem. |
5060 | */ | 5065 | */ |
5061 | int min_pages; | 5066 | int min_pages; |
5062 | 5067 | ||
5063 | min_pages = zone->present_pages / 1024; | 5068 | min_pages = zone->present_pages / 1024; |
5064 | if (min_pages < SWAP_CLUSTER_MAX) | 5069 | if (min_pages < SWAP_CLUSTER_MAX) |
5065 | min_pages = SWAP_CLUSTER_MAX; | 5070 | min_pages = SWAP_CLUSTER_MAX; |
5066 | if (min_pages > 128) | 5071 | if (min_pages > 128) |
5067 | min_pages = 128; | 5072 | min_pages = 128; |
5068 | zone->watermark[WMARK_MIN] = min_pages; | 5073 | zone->watermark[WMARK_MIN] = min_pages; |
5069 | } else { | 5074 | } else { |
5070 | /* | 5075 | /* |
5071 | * If it's a lowmem zone, reserve a number of pages | 5076 | * If it's a lowmem zone, reserve a number of pages |
5072 | * proportionate to the zone's size. | 5077 | * proportionate to the zone's size. |
5073 | */ | 5078 | */ |
5074 | zone->watermark[WMARK_MIN] = tmp; | 5079 | zone->watermark[WMARK_MIN] = tmp; |
5075 | } | 5080 | } |
5076 | 5081 | ||
5077 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); | 5082 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
5078 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); | 5083 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
5079 | 5084 | ||
5080 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); | 5085 | zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); |
5081 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); | 5086 | zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); |
5082 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); | 5087 | zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); |
5083 | 5088 | ||
5084 | setup_zone_migrate_reserve(zone); | 5089 | setup_zone_migrate_reserve(zone); |
5085 | spin_unlock_irqrestore(&zone->lock, flags); | 5090 | spin_unlock_irqrestore(&zone->lock, flags); |
5086 | } | 5091 | } |
5087 | 5092 | ||
5088 | /* update totalreserve_pages */ | 5093 | /* update totalreserve_pages */ |
5089 | calculate_totalreserve_pages(); | 5094 | calculate_totalreserve_pages(); |
5090 | } | 5095 | } |
5091 | 5096 | ||
5092 | /** | 5097 | /** |
5093 | * setup_per_zone_wmarks - called when min_free_kbytes changes | 5098 | * setup_per_zone_wmarks - called when min_free_kbytes changes |
5094 | * or when memory is hot-{added|removed} | 5099 | * or when memory is hot-{added|removed} |
5095 | * | 5100 | * |
5096 | * Ensures that the watermark[min,low,high] values for each zone are set | 5101 | * Ensures that the watermark[min,low,high] values for each zone are set |
5097 | * correctly with respect to min_free_kbytes. | 5102 | * correctly with respect to min_free_kbytes. |
5098 | */ | 5103 | */ |
5099 | void setup_per_zone_wmarks(void) | 5104 | void setup_per_zone_wmarks(void) |
5100 | { | 5105 | { |
5101 | mutex_lock(&zonelists_mutex); | 5106 | mutex_lock(&zonelists_mutex); |
5102 | __setup_per_zone_wmarks(); | 5107 | __setup_per_zone_wmarks(); |
5103 | mutex_unlock(&zonelists_mutex); | 5108 | mutex_unlock(&zonelists_mutex); |
5104 | } | 5109 | } |
5105 | 5110 | ||
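A rough worked example of the watermark arithmetic above, with the highmem special case omitted (min_free_kbytes, page size and zone sizes are assumed values, not from the patch): each lowmem zone receives a WMARK_MIN proportional to its share of lowmem, with WMARK_LOW and WMARK_HIGH sitting a quarter and a half of that amount higher.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */

int main(void)
{
	/* Invented figures: min_free_kbytes plus zone and lowmem sizes in pages. */
	unsigned long min_free_kbytes = 4096;
	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
	unsigned long zone_present = 200000;
	unsigned long lowmem_pages = 250000;

	unsigned long tmp = pages_min * zone_present / lowmem_pages;
	unsigned long wmark_min = tmp;
	unsigned long wmark_low = wmark_min + (tmp >> 2);
	unsigned long wmark_high = wmark_min + (tmp >> 1);

	printf("WMARK_MIN=%lu WMARK_LOW=%lu WMARK_HIGH=%lu (pages)\n",
	       wmark_min, wmark_low, wmark_high);
	return 0;
}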
5106 | /* | 5111 | /* |
5107 | * The inactive anon list should be small enough that the VM never has to | 5112 | * The inactive anon list should be small enough that the VM never has to |
5108 | * do too much work, but large enough that each inactive page has a chance | 5113 | * do too much work, but large enough that each inactive page has a chance |
5109 | * to be referenced again before it is swapped out. | 5114 | * to be referenced again before it is swapped out. |
5110 | * | 5115 | * |
5111 | * The inactive_anon ratio is the target ratio of ACTIVE_ANON to | 5116 | * The inactive_anon ratio is the target ratio of ACTIVE_ANON to |
5112 | * INACTIVE_ANON pages on this zone's LRU, maintained by the | 5117 | * INACTIVE_ANON pages on this zone's LRU, maintained by the |
5113 | * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of | 5118 | * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of |
5114 | * the anonymous pages are kept on the inactive list. | 5119 | * the anonymous pages are kept on the inactive list. |
5115 | * | 5120 | * |
5116 | * total target max | 5121 | * total target max |
5117 | * memory ratio inactive anon | 5122 | * memory ratio inactive anon |
5118 | * ------------------------------------- | 5123 | * ------------------------------------- |
5119 | * 10MB 1 5MB | 5124 | * 10MB 1 5MB |
5120 | * 100MB 1 50MB | 5125 | * 100MB 1 50MB |
5121 | * 1GB 3 250MB | 5126 | * 1GB 3 250MB |
5122 | * 10GB 10 0.9GB | 5127 | * 10GB 10 0.9GB |
5123 | * 100GB 31 3GB | 5128 | * 100GB 31 3GB |
5124 | * 1TB 101 10GB | 5129 | * 1TB 101 10GB |
5125 | * 10TB 320 32GB | 5130 | * 10TB 320 32GB |
5126 | */ | 5131 | */ |
5127 | static void __meminit calculate_zone_inactive_ratio(struct zone *zone) | 5132 | static void __meminit calculate_zone_inactive_ratio(struct zone *zone) |
5128 | { | 5133 | { |
5129 | unsigned int gb, ratio; | 5134 | unsigned int gb, ratio; |
5130 | 5135 | ||
5131 | /* Zone size in gigabytes */ | 5136 | /* Zone size in gigabytes */ |
5132 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | 5137 | gb = zone->present_pages >> (30 - PAGE_SHIFT); |
5133 | if (gb) | 5138 | if (gb) |
5134 | ratio = int_sqrt(10 * gb); | 5139 | ratio = int_sqrt(10 * gb); |
5135 | else | 5140 | else |
5136 | ratio = 1; | 5141 | ratio = 1; |
5137 | 5142 | ||
5138 | zone->inactive_ratio = ratio; | 5143 | zone->inactive_ratio = ratio; |
5139 | } | 5144 | } |
5140 | 5145 | ||
5141 | static void __meminit setup_per_zone_inactive_ratio(void) | 5146 | static void __meminit setup_per_zone_inactive_ratio(void) |
5142 | { | 5147 | { |
5143 | struct zone *zone; | 5148 | struct zone *zone; |
5144 | 5149 | ||
5145 | for_each_zone(zone) | 5150 | for_each_zone(zone) |
5146 | calculate_zone_inactive_ratio(zone); | 5151 | calculate_zone_inactive_ratio(zone); |
5147 | } | 5152 | } |
5148 | 5153 | ||
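The inactive_anon ratio table above follows directly from ratio = int_sqrt(10 * gb); a quick standalone check, where plain sqrt() truncated to an integer stands in for the kernel's int_sqrt() (link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* Zone sizes in gigabytes for the larger rows of the table. */
	unsigned long gbs[] = { 1, 10, 100, 1024, 10240 };
	unsigned int i;

	for (i = 0; i < sizeof(gbs) / sizeof(gbs[0]); i++) {
		unsigned int ratio = (unsigned int)sqrt((double)(10 * gbs[i]));

		if (!ratio)
			ratio = 1;	/* zones below 1GB fall back to a ratio of 1 */
		printf("%6luGB -> inactive_ratio %u\n", gbs[i], ratio);
	}
	return 0;	/* prints 3, 10, 31, 101 and 320, matching the table */
}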
5149 | /* | 5154 | /* |
5150 | * Initialise min_free_kbytes. | 5155 | * Initialise min_free_kbytes. |
5151 | * | 5156 | * |
5152 | * For small machines we want it small (128k min). For large machines | 5157 | * For small machines we want it small (128k min). For large machines |
5153 | * we want it large (64MB max). But it is not linear, because network | 5158 | * we want it large (64MB max). But it is not linear, because network |
5154 | * bandwidth does not increase linearly with machine size. We use | 5159 | * bandwidth does not increase linearly with machine size. We use |
5155 | * | 5160 | * |
5156 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: | 5161 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
5157 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) | 5162 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) |
5158 | * | 5163 | * |
5159 | * which yields | 5164 | * which yields |
5160 | * | 5165 | * |
5161 | * 16MB: 512k | 5166 | * 16MB: 512k |
5162 | * 32MB: 724k | 5167 | * 32MB: 724k |
5163 | * 64MB: 1024k | 5168 | * 64MB: 1024k |
5164 | * 128MB: 1448k | 5169 | * 128MB: 1448k |
5165 | * 256MB: 2048k | 5170 | * 256MB: 2048k |
5166 | * 512MB: 2896k | 5171 | * 512MB: 2896k |
5167 | * 1024MB: 4096k | 5172 | * 1024MB: 4096k |
5168 | * 2048MB: 5792k | 5173 | * 2048MB: 5792k |
5169 | * 4096MB: 8192k | 5174 | * 4096MB: 8192k |
5170 | * 8192MB: 11584k | 5175 | * 8192MB: 11584k |
5171 | * 16384MB: 16384k | 5176 | * 16384MB: 16384k |
5172 | */ | 5177 | */ |
5173 | int __meminit init_per_zone_wmark_min(void) | 5178 | int __meminit init_per_zone_wmark_min(void) |
5174 | { | 5179 | { |
5175 | unsigned long lowmem_kbytes; | 5180 | unsigned long lowmem_kbytes; |
5176 | 5181 | ||
5177 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); | 5182 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); |
5178 | 5183 | ||
5179 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | 5184 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); |
5180 | if (min_free_kbytes < 128) | 5185 | if (min_free_kbytes < 128) |
5181 | min_free_kbytes = 128; | 5186 | min_free_kbytes = 128; |
5182 | if (min_free_kbytes > 65536) | 5187 | if (min_free_kbytes > 65536) |
5183 | min_free_kbytes = 65536; | 5188 | min_free_kbytes = 65536; |
5184 | setup_per_zone_wmarks(); | 5189 | setup_per_zone_wmarks(); |
5185 | refresh_zone_stat_thresholds(); | 5190 | refresh_zone_stat_thresholds(); |
5186 | setup_per_zone_lowmem_reserve(); | 5191 | setup_per_zone_lowmem_reserve(); |
5187 | setup_per_zone_inactive_ratio(); | 5192 | setup_per_zone_inactive_ratio(); |
5188 | return 0; | 5193 | return 0; |
5189 | } | 5194 | } |
5190 | module_init(init_per_zone_wmark_min) | 5195 | module_init(init_per_zone_wmark_min) |
5191 | 5196 | ||
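Similarly, the min_free_kbytes table above is just int_sqrt(lowmem_kbytes * 16) clamped to the 128k..65536k range; a standalone sketch reproducing a few rows (truncated sqrt() again stands in for int_sqrt(), link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
	/* lowmem sizes in kbytes for the 16MB, 1024MB and 16384MB rows. */
	unsigned long lowmem_kbytes[] = { 16UL << 10, 1UL << 20, 16UL << 20 };
	unsigned int i;

	for (i = 0; i < sizeof(lowmem_kbytes) / sizeof(lowmem_kbytes[0]); i++) {
		unsigned long min_free =
			(unsigned long)sqrt((double)(lowmem_kbytes[i] * 16));

		if (min_free < 128)
			min_free = 128;
		if (min_free > 65536)
			min_free = 65536;
		printf("lowmem %8luk -> min_free_kbytes = %luk\n",
		       lowmem_kbytes[i], min_free);
	}
	return 0;	/* prints 512k, 4096k and 16384k, matching the table */
}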
5192 | /* | 5197 | /* |
5193 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 5198 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
5194 | * that we can call two helper functions whenever min_free_kbytes | 5199 | * that we can call two helper functions whenever min_free_kbytes |
5195 | * changes. | 5200 | * changes. |
5196 | */ | 5201 | */ |
5197 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 5202 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
5198 | void __user *buffer, size_t *length, loff_t *ppos) | 5203 | void __user *buffer, size_t *length, loff_t *ppos) |
5199 | { | 5204 | { |
5200 | proc_dointvec(table, write, buffer, length, ppos); | 5205 | proc_dointvec(table, write, buffer, length, ppos); |
5201 | if (write) | 5206 | if (write) |
5202 | setup_per_zone_wmarks(); | 5207 | setup_per_zone_wmarks(); |
5203 | return 0; | 5208 | return 0; |
5204 | } | 5209 | } |
5205 | 5210 | ||
5206 | #ifdef CONFIG_NUMA | 5211 | #ifdef CONFIG_NUMA |
5207 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | 5212 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, |
5208 | void __user *buffer, size_t *length, loff_t *ppos) | 5213 | void __user *buffer, size_t *length, loff_t *ppos) |
5209 | { | 5214 | { |
5210 | struct zone *zone; | 5215 | struct zone *zone; |
5211 | int rc; | 5216 | int rc; |
5212 | 5217 | ||
5213 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5218 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5214 | if (rc) | 5219 | if (rc) |
5215 | return rc; | 5220 | return rc; |
5216 | 5221 | ||
5217 | for_each_zone(zone) | 5222 | for_each_zone(zone) |
5218 | zone->min_unmapped_pages = (zone->present_pages * | 5223 | zone->min_unmapped_pages = (zone->present_pages * |
5219 | sysctl_min_unmapped_ratio) / 100; | 5224 | sysctl_min_unmapped_ratio) / 100; |
5220 | return 0; | 5225 | return 0; |
5221 | } | 5226 | } |
5222 | 5227 | ||
5223 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | 5228 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, |
5224 | void __user *buffer, size_t *length, loff_t *ppos) | 5229 | void __user *buffer, size_t *length, loff_t *ppos) |
5225 | { | 5230 | { |
5226 | struct zone *zone; | 5231 | struct zone *zone; |
5227 | int rc; | 5232 | int rc; |
5228 | 5233 | ||
5229 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5234 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5230 | if (rc) | 5235 | if (rc) |
5231 | return rc; | 5236 | return rc; |
5232 | 5237 | ||
5233 | for_each_zone(zone) | 5238 | for_each_zone(zone) |
5234 | zone->min_slab_pages = (zone->present_pages * | 5239 | zone->min_slab_pages = (zone->present_pages * |
5235 | sysctl_min_slab_ratio) / 100; | 5240 | sysctl_min_slab_ratio) / 100; |
5236 | return 0; | 5241 | return 0; |
5237 | } | 5242 | } |
5238 | #endif | 5243 | #endif |
5239 | 5244 | ||
5240 | /* | 5245 | /* |
5241 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around | 5246 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around |
5242 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() | 5247 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() |
5243 | * whenever sysctl_lowmem_reserve_ratio changes. | 5248 | * whenever sysctl_lowmem_reserve_ratio changes. |
5244 | * | 5249 | * |
5245 | * The reserve ratio obviously has absolutely no relation with the | 5250 | * The reserve ratio obviously has absolutely no relation with the |
5246 | * minimum watermarks. The lowmem reserve ratio only makes sense | 5251 | * minimum watermarks. The lowmem reserve ratio only makes sense |
5247 | * as a function of the boot-time zone sizes. | 5252 | * as a function of the boot-time zone sizes. |
5248 | */ | 5253 | */ |
5249 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 5254 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
5250 | void __user *buffer, size_t *length, loff_t *ppos) | 5255 | void __user *buffer, size_t *length, loff_t *ppos) |
5251 | { | 5256 | { |
5252 | proc_dointvec_minmax(table, write, buffer, length, ppos); | 5257 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
5253 | setup_per_zone_lowmem_reserve(); | 5258 | setup_per_zone_lowmem_reserve(); |
5254 | return 0; | 5259 | return 0; |
5255 | } | 5260 | } |
5256 | 5261 | ||
5257 | /* | 5262 | /* |
5258 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | 5263 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each |
5259 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | 5264 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist |
5260 | * can have before it gets flushed back to the buddy allocator. | 5265 | * can have before it gets flushed back to the buddy allocator. |
5261 | */ | 5266 | */ |
5262 | 5267 | ||
5263 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 5268 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
5264 | void __user *buffer, size_t *length, loff_t *ppos) | 5269 | void __user *buffer, size_t *length, loff_t *ppos) |
5265 | { | 5270 | { |
5266 | struct zone *zone; | 5271 | struct zone *zone; |
5267 | unsigned int cpu; | 5272 | unsigned int cpu; |
5268 | int ret; | 5273 | int ret; |
5269 | 5274 | ||
5270 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); | 5275 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
5271 | if (!write || (ret < 0)) | 5276 | if (!write || (ret < 0)) |
5272 | return ret; | 5277 | return ret; |
5273 | for_each_populated_zone(zone) { | 5278 | for_each_populated_zone(zone) { |
5274 | for_each_possible_cpu(cpu) { | 5279 | for_each_possible_cpu(cpu) { |
5275 | unsigned long high; | 5280 | unsigned long high; |
5276 | high = zone->present_pages / percpu_pagelist_fraction; | 5281 | high = zone->present_pages / percpu_pagelist_fraction; |
5277 | setup_pagelist_highmark( | 5282 | setup_pagelist_highmark( |
5278 | per_cpu_ptr(zone->pageset, cpu), high); | 5283 | per_cpu_ptr(zone->pageset, cpu), high); |
5279 | } | 5284 | } |
5280 | } | 5285 | } |
5281 | return 0; | 5286 | return 0; |
5282 | } | 5287 | } |
5283 | 5288 | ||
5284 | int hashdist = HASHDIST_DEFAULT; | 5289 | int hashdist = HASHDIST_DEFAULT; |
5285 | 5290 | ||
5286 | #ifdef CONFIG_NUMA | 5291 | #ifdef CONFIG_NUMA |
5287 | static int __init set_hashdist(char *str) | 5292 | static int __init set_hashdist(char *str) |
5288 | { | 5293 | { |
5289 | if (!str) | 5294 | if (!str) |
5290 | return 0; | 5295 | return 0; |
5291 | hashdist = simple_strtoul(str, &str, 0); | 5296 | hashdist = simple_strtoul(str, &str, 0); |
5292 | return 1; | 5297 | return 1; |
5293 | } | 5298 | } |
5294 | __setup("hashdist=", set_hashdist); | 5299 | __setup("hashdist=", set_hashdist); |
5295 | #endif | 5300 | #endif |
5296 | 5301 | ||
5297 | /* | 5302 | /* |
5298 | * allocate a large system hash table from bootmem | 5303 | * allocate a large system hash table from bootmem |
5299 | * - it is assumed that the hash table must contain an exact power-of-2 | 5304 | * - it is assumed that the hash table must contain an exact power-of-2 |
5300 | * quantity of entries | 5305 | * quantity of entries |
5301 | * - limit is the number of hash buckets, not the total allocation size | 5306 | * - limit is the number of hash buckets, not the total allocation size |
5302 | */ | 5307 | */ |
5303 | void *__init alloc_large_system_hash(const char *tablename, | 5308 | void *__init alloc_large_system_hash(const char *tablename, |
5304 | unsigned long bucketsize, | 5309 | unsigned long bucketsize, |
5305 | unsigned long numentries, | 5310 | unsigned long numentries, |
5306 | int scale, | 5311 | int scale, |
5307 | int flags, | 5312 | int flags, |
5308 | unsigned int *_hash_shift, | 5313 | unsigned int *_hash_shift, |
5309 | unsigned int *_hash_mask, | 5314 | unsigned int *_hash_mask, |
5310 | unsigned long low_limit, | 5315 | unsigned long low_limit, |
5311 | unsigned long high_limit) | 5316 | unsigned long high_limit) |
5312 | { | 5317 | { |
5313 | unsigned long long max = high_limit; | 5318 | unsigned long long max = high_limit; |
5314 | unsigned long log2qty, size; | 5319 | unsigned long log2qty, size; |
5315 | void *table = NULL; | 5320 | void *table = NULL; |
5316 | 5321 | ||
5317 | /* allow the kernel cmdline to have a say */ | 5322 | /* allow the kernel cmdline to have a say */ |
5318 | if (!numentries) { | 5323 | if (!numentries) { |
5319 | /* round applicable memory size up to nearest megabyte */ | 5324 | /* round applicable memory size up to nearest megabyte */ |
5320 | numentries = nr_kernel_pages; | 5325 | numentries = nr_kernel_pages; |
5321 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 5326 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
5322 | numentries >>= 20 - PAGE_SHIFT; | 5327 | numentries >>= 20 - PAGE_SHIFT; |
5323 | numentries <<= 20 - PAGE_SHIFT; | 5328 | numentries <<= 20 - PAGE_SHIFT; |
5324 | 5329 | ||
5325 | /* limit to 1 bucket per 2^scale bytes of low memory */ | 5330 | /* limit to 1 bucket per 2^scale bytes of low memory */ |
5326 | if (scale > PAGE_SHIFT) | 5331 | if (scale > PAGE_SHIFT) |
5327 | numentries >>= (scale - PAGE_SHIFT); | 5332 | numentries >>= (scale - PAGE_SHIFT); |
5328 | else | 5333 | else |
5329 | numentries <<= (PAGE_SHIFT - scale); | 5334 | numentries <<= (PAGE_SHIFT - scale); |
5330 | 5335 | ||
5331 | /* Make sure we've got at least a 0-order allocation.. */ | 5336 | /* Make sure we've got at least a 0-order allocation.. */ |
5332 | if (unlikely(flags & HASH_SMALL)) { | 5337 | if (unlikely(flags & HASH_SMALL)) { |
5333 | /* Makes no sense without HASH_EARLY */ | 5338 | /* Makes no sense without HASH_EARLY */ |
5334 | WARN_ON(!(flags & HASH_EARLY)); | 5339 | WARN_ON(!(flags & HASH_EARLY)); |
5335 | if (!(numentries >> *_hash_shift)) { | 5340 | if (!(numentries >> *_hash_shift)) { |
5336 | numentries = 1UL << *_hash_shift; | 5341 | numentries = 1UL << *_hash_shift; |
5337 | BUG_ON(!numentries); | 5342 | BUG_ON(!numentries); |
5338 | } | 5343 | } |
5339 | } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | 5344 | } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) |
5340 | numentries = PAGE_SIZE / bucketsize; | 5345 | numentries = PAGE_SIZE / bucketsize; |
5341 | } | 5346 | } |
5342 | numentries = roundup_pow_of_two(numentries); | 5347 | numentries = roundup_pow_of_two(numentries); |
5343 | 5348 | ||
5344 | /* limit allocation size to 1/16 total memory by default */ | 5349 | /* limit allocation size to 1/16 total memory by default */ |
5345 | if (max == 0) { | 5350 | if (max == 0) { |
5346 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; | 5351 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; |
5347 | do_div(max, bucketsize); | 5352 | do_div(max, bucketsize); |
5348 | } | 5353 | } |
5349 | max = min(max, 0x80000000ULL); | 5354 | max = min(max, 0x80000000ULL); |
5350 | 5355 | ||
5351 | if (numentries < low_limit) | 5356 | if (numentries < low_limit) |
5352 | numentries = low_limit; | 5357 | numentries = low_limit; |
5353 | if (numentries > max) | 5358 | if (numentries > max) |
5354 | numentries = max; | 5359 | numentries = max; |
5355 | 5360 | ||
5356 | log2qty = ilog2(numentries); | 5361 | log2qty = ilog2(numentries); |
5357 | 5362 | ||
5358 | do { | 5363 | do { |
5359 | size = bucketsize << log2qty; | 5364 | size = bucketsize << log2qty; |
5360 | if (flags & HASH_EARLY) | 5365 | if (flags & HASH_EARLY) |
5361 | table = alloc_bootmem_nopanic(size); | 5366 | table = alloc_bootmem_nopanic(size); |
5362 | else if (hashdist) | 5367 | else if (hashdist) |
5363 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 5368 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
5364 | else { | 5369 | else { |
5365 | /* | 5370 | /* |
5366 | * If bucketsize is not a power-of-two, we may free | 5371 | * If bucketsize is not a power-of-two, we may free |
5367 | * some pages at the end of the hash table, which | 5372 | * some pages at the end of the hash table, which |
5368 | * alloc_pages_exact() does automatically | 5373 | * alloc_pages_exact() does automatically |
5369 | */ | 5374 | */ |
5370 | if (get_order(size) < MAX_ORDER) { | 5375 | if (get_order(size) < MAX_ORDER) { |
5371 | table = alloc_pages_exact(size, GFP_ATOMIC); | 5376 | table = alloc_pages_exact(size, GFP_ATOMIC); |
5372 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); | 5377 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); |
5373 | } | 5378 | } |
5374 | } | 5379 | } |
5375 | } while (!table && size > PAGE_SIZE && --log2qty); | 5380 | } while (!table && size > PAGE_SIZE && --log2qty); |
5376 | 5381 | ||
5377 | if (!table) | 5382 | if (!table) |
5378 | panic("Failed to allocate %s hash table\n", tablename); | 5383 | panic("Failed to allocate %s hash table\n", tablename); |
5379 | 5384 | ||
5380 | printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", | 5385 | printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", |
5381 | tablename, | 5386 | tablename, |
5382 | (1UL << log2qty), | 5387 | (1UL << log2qty), |
5383 | ilog2(size) - PAGE_SHIFT, | 5388 | ilog2(size) - PAGE_SHIFT, |
5384 | size); | 5389 | size); |
5385 | 5390 | ||
5386 | if (_hash_shift) | 5391 | if (_hash_shift) |
5387 | *_hash_shift = log2qty; | 5392 | *_hash_shift = log2qty; |
5388 | if (_hash_mask) | 5393 | if (_hash_mask) |
5389 | *_hash_mask = (1 << log2qty) - 1; | 5394 | *_hash_mask = (1 << log2qty) - 1; |
5390 | 5395 | ||
5391 | return table; | 5396 | return table; |
5392 | } | 5397 | } |
5393 | 5398 | ||
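For orientation, a hypothetical early-boot caller of the function above might look roughly like this. The table name, bucket type, scale and flags are illustrative only and do not correspond to any call site in the patch, and the declarations are assumed to come from linux/bootmem.h, linux/list.h and linux/init.h in kernels of this vintage.

/* Hypothetical caller sketch; not taken from the patch. */
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/list.h>

static struct hlist_head *my_hashtable;
static unsigned int my_hash_shift;
static unsigned int my_hash_mask;

static void __init my_hash_init(void)
{
	my_hashtable = alloc_large_system_hash("my-cache",
					sizeof(struct hlist_head),	/* bucketsize */
					0,		/* 0: size it from nr_kernel_pages */
					14,		/* scale: 1 bucket per 16KB of low memory */
					HASH_EARLY,	/* allocate from bootmem */
					&my_hash_shift,
					&my_hash_mask,
					0,		/* low_limit */
					0);		/* high_limit: default cap of 1/16 of memory */
}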
5394 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | 5399 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
5395 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | 5400 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, |
5396 | unsigned long pfn) | 5401 | unsigned long pfn) |
5397 | { | 5402 | { |
5398 | #ifdef CONFIG_SPARSEMEM | 5403 | #ifdef CONFIG_SPARSEMEM |
5399 | return __pfn_to_section(pfn)->pageblock_flags; | 5404 | return __pfn_to_section(pfn)->pageblock_flags; |
5400 | #else | 5405 | #else |
5401 | return zone->pageblock_flags; | 5406 | return zone->pageblock_flags; |
5402 | #endif /* CONFIG_SPARSEMEM */ | 5407 | #endif /* CONFIG_SPARSEMEM */ |
5403 | } | 5408 | } |
5404 | 5409 | ||
5405 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | 5410 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) |
5406 | { | 5411 | { |
5407 | #ifdef CONFIG_SPARSEMEM | 5412 | #ifdef CONFIG_SPARSEMEM |
5408 | pfn &= (PAGES_PER_SECTION-1); | 5413 | pfn &= (PAGES_PER_SECTION-1); |
5409 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5414 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5410 | #else | 5415 | #else |
5411 | pfn = pfn - zone->zone_start_pfn; | 5416 | pfn = pfn - zone->zone_start_pfn; |
5412 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5417 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5413 | #endif /* CONFIG_SPARSEMEM */ | 5418 | #endif /* CONFIG_SPARSEMEM */ |
5414 | } | 5419 | } |
5415 | 5420 | ||
5416 | /** | 5421 | /** |
5417 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages | 5422 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages |
5418 | * @page: The page within the block of interest | 5423 | * @page: The page within the block of interest |
5419 | * @start_bitidx: The first bit of interest to retrieve | 5424 | * @start_bitidx: The first bit of interest to retrieve |
5420 | * @end_bitidx: The last bit of interest | 5425 | * @end_bitidx: The last bit of interest |
5421 | * returns pageblock_bits flags | 5426 | * returns pageblock_bits flags |
5422 | */ | 5427 | */ |
5423 | unsigned long get_pageblock_flags_group(struct page *page, | 5428 | unsigned long get_pageblock_flags_group(struct page *page, |
5424 | int start_bitidx, int end_bitidx) | 5429 | int start_bitidx, int end_bitidx) |
5425 | { | 5430 | { |
5426 | struct zone *zone; | 5431 | struct zone *zone; |
5427 | unsigned long *bitmap; | 5432 | unsigned long *bitmap; |
5428 | unsigned long pfn, bitidx; | 5433 | unsigned long pfn, bitidx; |
5429 | unsigned long flags = 0; | 5434 | unsigned long flags = 0; |
5430 | unsigned long value = 1; | 5435 | unsigned long value = 1; |
5431 | 5436 | ||
5432 | zone = page_zone(page); | 5437 | zone = page_zone(page); |
5433 | pfn = page_to_pfn(page); | 5438 | pfn = page_to_pfn(page); |
5434 | bitmap = get_pageblock_bitmap(zone, pfn); | 5439 | bitmap = get_pageblock_bitmap(zone, pfn); |
5435 | bitidx = pfn_to_bitidx(zone, pfn); | 5440 | bitidx = pfn_to_bitidx(zone, pfn); |
5436 | 5441 | ||
5437 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 5442 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
5438 | if (test_bit(bitidx + start_bitidx, bitmap)) | 5443 | if (test_bit(bitidx + start_bitidx, bitmap)) |
5439 | flags |= value; | 5444 | flags |= value; |
5440 | 5445 | ||
5441 | return flags; | 5446 | return flags; |
5442 | } | 5447 | } |
5443 | 5448 | ||
5444 | /** | 5449 | /** |
5445 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | 5450 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages |
5446 | * @page: The page within the block of interest | 5451 | * @page: The page within the block of interest |
5447 | * @start_bitidx: The first bit of interest | 5452 | * @start_bitidx: The first bit of interest |
5448 | * @end_bitidx: The last bit of interest | 5453 | * @end_bitidx: The last bit of interest |
5449 | * @flags: The flags to set | 5454 | * @flags: The flags to set |
5450 | */ | 5455 | */ |
5451 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | 5456 | void set_pageblock_flags_group(struct page *page, unsigned long flags, |
5452 | int start_bitidx, int end_bitidx) | 5457 | int start_bitidx, int end_bitidx) |
5453 | { | 5458 | { |
5454 | struct zone *zone; | 5459 | struct zone *zone; |
5455 | unsigned long *bitmap; | 5460 | unsigned long *bitmap; |
5456 | unsigned long pfn, bitidx; | 5461 | unsigned long pfn, bitidx; |
5457 | unsigned long value = 1; | 5462 | unsigned long value = 1; |
5458 | 5463 | ||
5459 | zone = page_zone(page); | 5464 | zone = page_zone(page); |
5460 | pfn = page_to_pfn(page); | 5465 | pfn = page_to_pfn(page); |
5461 | bitmap = get_pageblock_bitmap(zone, pfn); | 5466 | bitmap = get_pageblock_bitmap(zone, pfn); |
5462 | bitidx = pfn_to_bitidx(zone, pfn); | 5467 | bitidx = pfn_to_bitidx(zone, pfn); |
5463 | VM_BUG_ON(pfn < zone->zone_start_pfn); | 5468 | VM_BUG_ON(pfn < zone->zone_start_pfn); |
5464 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); | 5469 | VM_BUG_ON(pfn >= zone->zone_start_pfn + zone->spanned_pages); |
5465 | 5470 | ||
5466 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | 5471 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) |
5467 | if (flags & value) | 5472 | if (flags & value) |
5468 | __set_bit(bitidx + start_bitidx, bitmap); | 5473 | __set_bit(bitidx + start_bitidx, bitmap); |
5469 | else | 5474 | else |
5470 | __clear_bit(bitidx + start_bitidx, bitmap); | 5475 | __clear_bit(bitidx + start_bitidx, bitmap); |
5471 | } | 5476 | } |
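
The helpers above pack a small group of flag bits per pageblock into a single bitmap: pfn_to_bitidx() turns a PFN into a bit offset, and the get/set loops shift a value mask across [start_bitidx, end_bitidx]. Below is a minimal userspace sketch of that arithmetic; DEMO_PAGEBLOCK_ORDER, DEMO_NR_PAGEBLOCK_BITS and the flag value used in main() are illustrative assumptions, not the kernel's configured values.

    /*
     * Minimal userspace sketch mirroring pfn_to_bitidx() and
     * get/set_pageblock_flags_group() above.  The DEMO_* constants are
     * illustrative assumptions, not kernel configuration values.
     */
    #include <stdio.h>

    #define DEMO_PAGEBLOCK_ORDER    9       /* assumed: 512 pages per pageblock */
    #define DEMO_NR_PAGEBLOCK_BITS  4       /* assumed bits stored per pageblock */
    #define DEMO_BITS_PER_LONG      (8 * sizeof(unsigned long))

    /* Mirror of pfn_to_bitidx() for the !SPARSEMEM case with zone_start_pfn == 0. */
    static unsigned long demo_pfn_to_bitidx(unsigned long pfn)
    {
            return (pfn >> DEMO_PAGEBLOCK_ORDER) * DEMO_NR_PAGEBLOCK_BITS;
    }

    /* Same loop shape as get_pageblock_flags_group(). */
    static unsigned long demo_get(const unsigned long *bitmap, unsigned long bitidx,
                                  int start_bitidx, int end_bitidx)
    {
            unsigned long flags = 0, value = 1;

            for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
                    if (bitmap[(bitidx + start_bitidx) / DEMO_BITS_PER_LONG] &
                        (1UL << ((bitidx + start_bitidx) % DEMO_BITS_PER_LONG)))
                            flags |= value;
            return flags;
    }

    /* Same loop shape as set_pageblock_flags_group(). */
    static void demo_set(unsigned long *bitmap, unsigned long bitidx,
                         unsigned long flags, int start_bitidx, int end_bitidx)
    {
            unsigned long value = 1;

            for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) {
                    unsigned long *word = &bitmap[(bitidx + start_bitidx) / DEMO_BITS_PER_LONG];
                    unsigned long mask = 1UL << ((bitidx + start_bitidx) % DEMO_BITS_PER_LONG);

                    if (flags & value)
                            *word |= mask;
                    else
                            *word &= ~mask;
            }
    }

    int main(void)
    {
            unsigned long bitmap[4] = { 0 };
            unsigned long pfn = 3 * (1UL << DEMO_PAGEBLOCK_ORDER); /* a PFN in pageblock 3 */
            unsigned long bitidx = demo_pfn_to_bitidx(pfn);

            demo_set(bitmap, bitidx, 2 /* demo flag group value */, 0, 2);
            printf("bitidx=%lu flags=%lu\n", bitidx, demo_get(bitmap, bitidx, 0, 2));
            return 0;
    }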
5472 | 5477 | ||
5473 | /* | 5478 | /* |
5474 | * This function checks whether the pageblock includes unmovable pages or not. | 5479 | * This function checks whether the pageblock includes unmovable pages or not. |
5475 | * If @count is not zero, it is okay to include up to @count unmovable pages. | 5480 | * If @count is not zero, it is okay to include up to @count unmovable pages. |
5476 | * | 5481 | * |
5477 | * A PageLRU check without isolation or lru_lock could race, so a | 5482 | * A PageLRU check without isolation or lru_lock could race, so a |
5478 | * MIGRATE_MOVABLE block might include unmovable pages. This means you | 5483 | * MIGRATE_MOVABLE block might include unmovable pages. This means you |
5479 | * can't expect this function to be exact. | 5484 | * can't expect this function to be exact. |
5480 | */ | 5485 | */ |
5481 | static bool | 5486 | static bool |
5482 | __has_unmovable_pages(struct zone *zone, struct page *page, int count) | 5487 | __has_unmovable_pages(struct zone *zone, struct page *page, int count) |
5483 | { | 5488 | { |
5484 | unsigned long pfn, iter, found; | 5489 | unsigned long pfn, iter, found; |
5485 | int mt; | 5490 | int mt; |
5486 | 5491 | ||
5487 | /* | 5492 | /* |
5488 | * To avoid noisy data, lru_add_drain_all() should be called first. | 5493 | * To avoid noisy data, lru_add_drain_all() should be called first. |
5489 | * A ZONE_MOVABLE zone never contains unmovable pages. | 5494 | * A ZONE_MOVABLE zone never contains unmovable pages. |
5490 | */ | 5495 | */ |
5491 | if (zone_idx(zone) == ZONE_MOVABLE) | 5496 | if (zone_idx(zone) == ZONE_MOVABLE) |
5492 | return false; | 5497 | return false; |
5493 | mt = get_pageblock_migratetype(page); | 5498 | mt = get_pageblock_migratetype(page); |
5494 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) | 5499 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5495 | return false; | 5500 | return false; |
5496 | 5501 | ||
5497 | pfn = page_to_pfn(page); | 5502 | pfn = page_to_pfn(page); |
5498 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | 5503 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
5499 | unsigned long check = pfn + iter; | 5504 | unsigned long check = pfn + iter; |
5500 | 5505 | ||
5501 | if (!pfn_valid_within(check)) | 5506 | if (!pfn_valid_within(check)) |
5502 | continue; | 5507 | continue; |
5503 | 5508 | ||
5504 | page = pfn_to_page(check); | 5509 | page = pfn_to_page(check); |
5505 | /* | 5510 | /* |
5506 | * We can't use page_count without pinning the page | 5511 | * We can't use page_count without pinning the page |
5507 | * because another CPU can free the compound page. | 5512 | * because another CPU can free the compound page. |
5508 | * This check already skips the compound tails of THP | 5513 | * This check already skips the compound tails of THP |
5509 | * because their page->_count is zero at all times. | 5514 | * because their page->_count is zero at all times. |
5510 | */ | 5515 | */ |
5511 | if (!atomic_read(&page->_count)) { | 5516 | if (!atomic_read(&page->_count)) { |
5512 | if (PageBuddy(page)) | 5517 | if (PageBuddy(page)) |
5513 | iter += (1 << page_order(page)) - 1; | 5518 | iter += (1 << page_order(page)) - 1; |
5514 | continue; | 5519 | continue; |
5515 | } | 5520 | } |
5516 | 5521 | ||
5517 | if (!PageLRU(page)) | 5522 | if (!PageLRU(page)) |
5518 | found++; | 5523 | found++; |
5519 | /* | 5524 | /* |
5520 | * If there are RECLAIMABLE pages, we need to check them. | 5525 | * If there are RECLAIMABLE pages, we need to check them. |
5521 | * But for now, memory offline itself doesn't call shrink_slab(), | 5526 | * But for now, memory offline itself doesn't call shrink_slab(), |
5522 | * and this still needs to be fixed. | 5527 | * and this still needs to be fixed. |
5523 | */ | 5528 | */ |
5524 | /* | 5529 | /* |
5525 | * If the page is not RAM, page_count() should be 0, so | 5530 | * If the page is not RAM, page_count() should be 0, so |
5526 | * we don't need any more checks. This is a _used_, non-movable page. | 5531 | * we don't need any more checks. This is a _used_, non-movable page. |
5527 | * | 5532 | * |
5528 | * The problematic thing here is PG_reserved pages. PG_reserved | 5533 | * The problematic thing here is PG_reserved pages. PG_reserved |
5529 | * is set to both of a memory hole page and a _used_ kernel | 5534 | * is set to both of a memory hole page and a _used_ kernel |
5530 | * page at boot. | 5535 | * page at boot. |
5531 | */ | 5536 | */ |
5532 | if (found > count) | 5537 | if (found > count) |
5533 | return true; | 5538 | return true; |
5534 | } | 5539 | } |
5535 | return false; | 5540 | return false; |
5536 | } | 5541 | } |
5537 | 5542 | ||
5538 | bool is_pageblock_removable_nolock(struct page *page) | 5543 | bool is_pageblock_removable_nolock(struct page *page) |
5539 | { | 5544 | { |
5540 | struct zone *zone; | 5545 | struct zone *zone; |
5541 | unsigned long pfn; | 5546 | unsigned long pfn; |
5542 | 5547 | ||
5543 | /* | 5548 | /* |
5544 | * We have to be careful here because we are iterating over memory | 5549 | * We have to be careful here because we are iterating over memory |
5545 | * sections which are not zone aware so we might end up outside of | 5550 | * sections which are not zone aware so we might end up outside of |
5546 | * the zone but still within the section. | 5551 | * the zone but still within the section. |
5547 | * We also have to take care of the node. If the node is offline, | 5552 | * We also have to take care of the node. If the node is offline, |
5548 | * its NODE_DATA will be NULL - see page_zone. | 5553 | * its NODE_DATA will be NULL - see page_zone. |
5549 | */ | 5554 | */ |
5550 | if (!node_online(page_to_nid(page))) | 5555 | if (!node_online(page_to_nid(page))) |
5551 | return false; | 5556 | return false; |
5552 | 5557 | ||
5553 | zone = page_zone(page); | 5558 | zone = page_zone(page); |
5554 | pfn = page_to_pfn(page); | 5559 | pfn = page_to_pfn(page); |
5555 | if (zone->zone_start_pfn > pfn || | 5560 | if (zone->zone_start_pfn > pfn || |
5556 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5561 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5557 | return false; | 5562 | return false; |
5558 | 5563 | ||
5559 | return !__has_unmovable_pages(zone, page, 0); | 5564 | return !__has_unmovable_pages(zone, page, 0); |
5560 | } | 5565 | } |
5561 | 5566 | ||
5562 | int set_migratetype_isolate(struct page *page) | 5567 | int set_migratetype_isolate(struct page *page) |
5563 | { | 5568 | { |
5564 | struct zone *zone; | 5569 | struct zone *zone; |
5565 | unsigned long flags, pfn; | 5570 | unsigned long flags, pfn; |
5566 | struct memory_isolate_notify arg; | 5571 | struct memory_isolate_notify arg; |
5567 | int notifier_ret; | 5572 | int notifier_ret; |
5568 | int ret = -EBUSY; | 5573 | int ret = -EBUSY; |
5569 | 5574 | ||
5570 | zone = page_zone(page); | 5575 | zone = page_zone(page); |
5571 | 5576 | ||
5572 | spin_lock_irqsave(&zone->lock, flags); | 5577 | spin_lock_irqsave(&zone->lock, flags); |
5573 | 5578 | ||
5574 | pfn = page_to_pfn(page); | 5579 | pfn = page_to_pfn(page); |
5575 | arg.start_pfn = pfn; | 5580 | arg.start_pfn = pfn; |
5576 | arg.nr_pages = pageblock_nr_pages; | 5581 | arg.nr_pages = pageblock_nr_pages; |
5577 | arg.pages_found = 0; | 5582 | arg.pages_found = 0; |
5578 | 5583 | ||
5579 | /* | 5584 | /* |
5580 | * It may be possible to isolate a pageblock even if the | 5585 | * It may be possible to isolate a pageblock even if the |
5581 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | 5586 | * migratetype is not MIGRATE_MOVABLE. The memory isolation |
5582 | * notifier chain is used by balloon drivers to return the | 5587 | * notifier chain is used by balloon drivers to return the |
5583 | * number of pages in a range that are held by the balloon | 5588 | * number of pages in a range that are held by the balloon |
5584 | * driver to shrink memory. If all the pages are accounted for | 5589 | * driver to shrink memory. If all the pages are accounted for |
5585 | * by balloons, are free, or on the LRU, isolation can continue. | 5590 | * by balloons, are free, or on the LRU, isolation can continue. |
5586 | * Later, for example, when memory hotplug notifier runs, these | 5591 | * Later, for example, when memory hotplug notifier runs, these |
5587 | * pages reported as "can be isolated" should be isolated (freed) | 5592 | * pages reported as "can be isolated" should be isolated (freed) |
5588 | * by the balloon driver through the memory notifier chain. | 5593 | * by the balloon driver through the memory notifier chain. |
5589 | */ | 5594 | */ |
5590 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | 5595 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); |
5591 | notifier_ret = notifier_to_errno(notifier_ret); | 5596 | notifier_ret = notifier_to_errno(notifier_ret); |
5592 | if (notifier_ret) | 5597 | if (notifier_ret) |
5593 | goto out; | 5598 | goto out; |
5594 | /* | 5599 | /* |
5595 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | 5600 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. |
5596 | * We just check MOVABLE pages. | 5601 | * We just check MOVABLE pages. |
5597 | */ | 5602 | */ |
5598 | if (!__has_unmovable_pages(zone, page, arg.pages_found)) | 5603 | if (!__has_unmovable_pages(zone, page, arg.pages_found)) |
5599 | ret = 0; | 5604 | ret = 0; |
5600 | /* | 5605 | /* |
5601 | * Unmovable means "not-on-LRU" pages. If there are more unmovable | 5606 | * Unmovable means "not-on-LRU" pages. If there are more unmovable |
5602 | * pages than removable-by-driver pages reported by the notifier, | 5607 | * pages than removable-by-driver pages reported by the notifier, |
5603 | * we'll fail. | 5608 | * we'll fail. |
5604 | */ | 5609 | */ |
5605 | 5610 | ||
5606 | out: | 5611 | out: |
5607 | if (!ret) { | 5612 | if (!ret) { |
5608 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5613 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); |
5609 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 5614 | move_freepages_block(zone, page, MIGRATE_ISOLATE); |
5610 | } | 5615 | } |
5611 | 5616 | ||
5612 | spin_unlock_irqrestore(&zone->lock, flags); | 5617 | spin_unlock_irqrestore(&zone->lock, flags); |
5613 | if (!ret) | 5618 | if (!ret) |
5614 | drain_all_pages(); | 5619 | drain_all_pages(); |
5615 | return ret; | 5620 | return ret; |
5616 | } | 5621 | } |
5617 | 5622 | ||
5618 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | 5623 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) |
5619 | { | 5624 | { |
5620 | struct zone *zone; | 5625 | struct zone *zone; |
5621 | unsigned long flags; | 5626 | unsigned long flags; |
5622 | zone = page_zone(page); | 5627 | zone = page_zone(page); |
5623 | spin_lock_irqsave(&zone->lock, flags); | 5628 | spin_lock_irqsave(&zone->lock, flags); |
5624 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | 5629 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) |
5625 | goto out; | 5630 | goto out; |
5626 | set_pageblock_migratetype(page, migratetype); | 5631 | set_pageblock_migratetype(page, migratetype); |
5627 | move_freepages_block(zone, page, migratetype); | 5632 | move_freepages_block(zone, page, migratetype); |
5628 | out: | 5633 | out: |
5629 | spin_unlock_irqrestore(&zone->lock, flags); | 5634 | spin_unlock_irqrestore(&zone->lock, flags); |
5630 | } | 5635 | } |
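
For context, a hedged sketch (not a function from this patch) of how set_migratetype_isolate() and unset_migratetype_isolate() pair up around work on a single pageblock; real callers normally reach them through start_isolate_page_range()/undo_isolate_page_range(), as alloc_contig_range() does below. The helper name and the restored migratetype are illustrative assumptions.

    /* Hedged, illustrative sketch only -- not a function from this patch. */
    static int demo_with_isolated_pageblock(struct page *page)
    {
            int ret = set_migratetype_isolate(page);

            if (ret)
                    return ret;             /* typically -EBUSY */

            /*
             * The pageblock is now MIGRATE_ISOLATE, so the allocator will not
             * hand out free pages from it while we operate on it here.
             */

            unset_migratetype_isolate(page, MIGRATE_MOVABLE);
            return 0;
    }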
5631 | 5636 | ||
5632 | #ifdef CONFIG_CMA | 5637 | #ifdef CONFIG_CMA |
5633 | 5638 | ||
5634 | static unsigned long pfn_max_align_down(unsigned long pfn) | 5639 | static unsigned long pfn_max_align_down(unsigned long pfn) |
5635 | { | 5640 | { |
5636 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, | 5641 | return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, |
5637 | pageblock_nr_pages) - 1); | 5642 | pageblock_nr_pages) - 1); |
5638 | } | 5643 | } |
5639 | 5644 | ||
5640 | static unsigned long pfn_max_align_up(unsigned long pfn) | 5645 | static unsigned long pfn_max_align_up(unsigned long pfn) |
5641 | { | 5646 | { |
5642 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, | 5647 | return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES, |
5643 | pageblock_nr_pages)); | 5648 | pageblock_nr_pages)); |
5644 | } | 5649 | } |
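
pfn_max_align_down()/pfn_max_align_up() widen a PFN range to the larger of MAX_ORDER_NR_PAGES and pageblock_nr_pages, so isolation always covers whole pageblocks and whole buddy blocks. A standalone sketch of the same arithmetic, using assumed demo constants (the real values depend on the kernel configuration):

    #include <stdio.h>

    #define DEMO_MAX_ORDER_NR_PAGES 1024UL  /* assumed MAX_ORDER_NR_PAGES */
    #define DEMO_PAGEBLOCK_NR_PAGES 512UL   /* assumed pageblock_nr_pages */
    #define DEMO_MAX(a, b)          ((a) > (b) ? (a) : (b))
    #define DEMO_ALIGN(x, a)        (((x) + (a) - 1) & ~((a) - 1))

    static unsigned long demo_pfn_max_align_down(unsigned long pfn)
    {
            return pfn & ~(DEMO_MAX(DEMO_MAX_ORDER_NR_PAGES, DEMO_PAGEBLOCK_NR_PAGES) - 1);
    }

    static unsigned long demo_pfn_max_align_up(unsigned long pfn)
    {
            return DEMO_ALIGN(pfn, DEMO_MAX(DEMO_MAX_ORDER_NR_PAGES, DEMO_PAGEBLOCK_NR_PAGES));
    }

    int main(void)
    {
            unsigned long pfn = 123456;

            /* With a 1024-page step: 123456 rounds down to 122880 and up to 123904. */
            printf("%lu -> [%lu, %lu)\n", pfn,
                   demo_pfn_max_align_down(pfn), demo_pfn_max_align_up(pfn));
            return 0;
    }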
5645 | 5650 | ||
5646 | static struct page * | 5651 | static struct page * |
5647 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | 5652 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, |
5648 | int **resultp) | 5653 | int **resultp) |
5649 | { | 5654 | { |
5650 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; | 5655 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; |
5651 | 5656 | ||
5652 | if (PageHighMem(page)) | 5657 | if (PageHighMem(page)) |
5653 | gfp_mask |= __GFP_HIGHMEM; | 5658 | gfp_mask |= __GFP_HIGHMEM; |
5654 | 5659 | ||
5655 | return alloc_page(gfp_mask); | 5660 | return alloc_page(gfp_mask); |
5656 | } | 5661 | } |
5657 | 5662 | ||
5658 | /* [start, end) must belong to a single zone. */ | 5663 | /* [start, end) must belong to a single zone. */ |
5659 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) | 5664 | static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) |
5660 | { | 5665 | { |
5661 | /* This function is based on compact_zone() from compaction.c. */ | 5666 | /* This function is based on compact_zone() from compaction.c. */ |
5662 | 5667 | ||
5663 | unsigned long pfn = start; | 5668 | unsigned long pfn = start; |
5664 | unsigned int tries = 0; | 5669 | unsigned int tries = 0; |
5665 | int ret = 0; | 5670 | int ret = 0; |
5666 | 5671 | ||
5667 | struct compact_control cc = { | 5672 | struct compact_control cc = { |
5668 | .nr_migratepages = 0, | 5673 | .nr_migratepages = 0, |
5669 | .order = -1, | 5674 | .order = -1, |
5670 | .zone = page_zone(pfn_to_page(start)), | 5675 | .zone = page_zone(pfn_to_page(start)), |
5671 | .sync = true, | 5676 | .sync = true, |
5672 | }; | 5677 | }; |
5673 | INIT_LIST_HEAD(&cc.migratepages); | 5678 | INIT_LIST_HEAD(&cc.migratepages); |
5674 | 5679 | ||
5675 | migrate_prep_local(); | 5680 | migrate_prep_local(); |
5676 | 5681 | ||
5677 | while (pfn < end || !list_empty(&cc.migratepages)) { | 5682 | while (pfn < end || !list_empty(&cc.migratepages)) { |
5678 | if (fatal_signal_pending(current)) { | 5683 | if (fatal_signal_pending(current)) { |
5679 | ret = -EINTR; | 5684 | ret = -EINTR; |
5680 | break; | 5685 | break; |
5681 | } | 5686 | } |
5682 | 5687 | ||
5683 | if (list_empty(&cc.migratepages)) { | 5688 | if (list_empty(&cc.migratepages)) { |
5684 | cc.nr_migratepages = 0; | 5689 | cc.nr_migratepages = 0; |
5685 | pfn = isolate_migratepages_range(cc.zone, &cc, | 5690 | pfn = isolate_migratepages_range(cc.zone, &cc, |
5686 | pfn, end); | 5691 | pfn, end); |
5687 | if (!pfn) { | 5692 | if (!pfn) { |
5688 | ret = -EINTR; | 5693 | ret = -EINTR; |
5689 | break; | 5694 | break; |
5690 | } | 5695 | } |
5691 | tries = 0; | 5696 | tries = 0; |
5692 | } else if (++tries == 5) { | 5697 | } else if (++tries == 5) { |
5693 | ret = ret < 0 ? ret : -EBUSY; | 5698 | ret = ret < 0 ? ret : -EBUSY; |
5694 | break; | 5699 | break; |
5695 | } | 5700 | } |
5696 | 5701 | ||
5697 | ret = migrate_pages(&cc.migratepages, | 5702 | ret = migrate_pages(&cc.migratepages, |
5698 | __alloc_contig_migrate_alloc, | 5703 | __alloc_contig_migrate_alloc, |
5699 | 0, false, MIGRATE_SYNC); | 5704 | 0, false, MIGRATE_SYNC); |
5700 | } | 5705 | } |
5701 | 5706 | ||
5702 | putback_lru_pages(&cc.migratepages); | 5707 | putback_lru_pages(&cc.migratepages); |
5703 | return ret > 0 ? 0 : ret; | 5708 | return ret > 0 ? 0 : ret; |
5704 | } | 5709 | } |
5705 | 5710 | ||
5706 | /* | 5711 | /* |
5707 | * Update the zone's CMA pages counter used for watermark level calculation. | 5712 | * Update the zone's CMA pages counter used for watermark level calculation. |
5708 | */ | 5713 | */ |
5709 | static inline void __update_cma_watermarks(struct zone *zone, int count) | 5714 | static inline void __update_cma_watermarks(struct zone *zone, int count) |
5710 | { | 5715 | { |
5711 | unsigned long flags; | 5716 | unsigned long flags; |
5712 | spin_lock_irqsave(&zone->lock, flags); | 5717 | spin_lock_irqsave(&zone->lock, flags); |
5713 | zone->min_cma_pages += count; | 5718 | zone->min_cma_pages += count; |
5714 | spin_unlock_irqrestore(&zone->lock, flags); | 5719 | spin_unlock_irqrestore(&zone->lock, flags); |
5715 | setup_per_zone_wmarks(); | 5720 | setup_per_zone_wmarks(); |
5716 | } | 5721 | } |
5717 | 5722 | ||
5718 | /* | 5723 | /* |
5719 | * Trigger a memory pressure bump to reclaim some pages in order to be able to | 5724 | * Trigger a memory pressure bump to reclaim some pages in order to be able to |
5720 | * allocate 'count' pages in single page units. Does similar work to the | 5725 | * allocate 'count' pages in single page units. Does similar work to the |
5721 | * __alloc_pages_slowpath() function. | 5726 | * __alloc_pages_slowpath() function. |
5722 | */ | 5727 | */ |
5723 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) | 5728 | static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) |
5724 | { | 5729 | { |
5725 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 5730 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
5726 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); | 5731 | struct zonelist *zonelist = node_zonelist(0, gfp_mask); |
5727 | int did_some_progress = 0; | 5732 | int did_some_progress = 0; |
5728 | int order = 1; | 5733 | int order = 1; |
5729 | 5734 | ||
5730 | /* | 5735 | /* |
5731 | * Increase the watermark levels to force kswapd to do its job | 5736 | * Increase the watermark levels to force kswapd to do its job |
5732 | * and stabilise at the new watermark level. | 5737 | * and stabilise at the new watermark level. |
5733 | */ | 5738 | */ |
5734 | __update_cma_watermarks(zone, count); | 5739 | __update_cma_watermarks(zone, count); |
5735 | 5740 | ||
5736 | /* Obey watermarks as if the page was being allocated */ | 5741 | /* Obey watermarks as if the page was being allocated */ |
5737 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { | 5742 | while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { |
5738 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); | 5743 | wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); |
5739 | 5744 | ||
5740 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, | 5745 | did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, |
5741 | NULL); | 5746 | NULL); |
5742 | if (!did_some_progress) { | 5747 | if (!did_some_progress) { |
5743 | /* Exhausted what can be done so it's blamo time */ | 5748 | /* Exhausted what can be done so it's blamo time */ |
5744 | out_of_memory(zonelist, gfp_mask, order, NULL, false); | 5749 | out_of_memory(zonelist, gfp_mask, order, NULL, false); |
5745 | } | 5750 | } |
5746 | } | 5751 | } |
5747 | 5752 | ||
5748 | /* Restore original watermark levels. */ | 5753 | /* Restore original watermark levels. */ |
5749 | __update_cma_watermarks(zone, -count); | 5754 | __update_cma_watermarks(zone, -count); |
5750 | 5755 | ||
5751 | return count; | 5756 | return count; |
5752 | } | 5757 | } |
5753 | 5758 | ||
5754 | /** | 5759 | /** |
5755 | * alloc_contig_range() -- tries to allocate given range of pages | 5760 | * alloc_contig_range() -- tries to allocate given range of pages |
5756 | * @start: start PFN to allocate | 5761 | * @start: start PFN to allocate |
5757 | * @end: one-past-the-last PFN to allocate | 5762 | * @end: one-past-the-last PFN to allocate |
5758 | * @migratetype: migratetype of the underlying pageblocks (either | 5763 | * @migratetype: migratetype of the underlying pageblocks (either |
5759 | * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks | 5764 | * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks |
5760 | * in range must have the same migratetype and it must | 5765 | * in range must have the same migratetype and it must |
5761 | * be either of the two. | 5766 | * be either of the two. |
5762 | * | 5767 | * |
5763 | * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES | 5768 | * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES |
5764 | * aligned; however, it's the caller's responsibility to guarantee that | 5769 | * aligned; however, it's the caller's responsibility to guarantee that |
5765 | * we are the only thread that changes migrate type of pageblocks the | 5770 | * we are the only thread that changes migrate type of pageblocks the |
5766 | * pages fall in. | 5771 | * pages fall in. |
5767 | * | 5772 | * |
5768 | * The PFN range must belong to a single zone. | 5773 | * The PFN range must belong to a single zone. |
5769 | * | 5774 | * |
5770 | * Returns zero on success or negative error code. On success all | 5775 | * Returns zero on success or negative error code. On success all |
5771 | * pages whose PFN is in [start, end) are allocated for the caller and | 5776 | * pages whose PFN is in [start, end) are allocated for the caller and |
5772 | * need to be freed with free_contig_range(). | 5777 | * need to be freed with free_contig_range(). |
5773 | */ | 5778 | */ |
5774 | int alloc_contig_range(unsigned long start, unsigned long end, | 5779 | int alloc_contig_range(unsigned long start, unsigned long end, |
5775 | unsigned migratetype) | 5780 | unsigned migratetype) |
5776 | { | 5781 | { |
5777 | struct zone *zone = page_zone(pfn_to_page(start)); | 5782 | struct zone *zone = page_zone(pfn_to_page(start)); |
5778 | unsigned long outer_start, outer_end; | 5783 | unsigned long outer_start, outer_end; |
5779 | int ret = 0, order; | 5784 | int ret = 0, order; |
5780 | 5785 | ||
5781 | /* | 5786 | /* |
5782 | * What we do here is we mark all pageblocks in range as | 5787 | * What we do here is we mark all pageblocks in range as |
5783 | * MIGRATE_ISOLATE. Because pageblock and max order pages may | 5788 | * MIGRATE_ISOLATE. Because pageblock and max order pages may |
5784 | * have different sizes, and due to the way the page allocator | 5789 | * have different sizes, and due to the way the page allocator |
5785 | * works, we align the range to the bigger of the two so | 5790 | * works, we align the range to the bigger of the two so |
5786 | * that the page allocator won't try to merge buddies from | 5791 | * that the page allocator won't try to merge buddies from |
5787 | * different pageblocks and change MIGRATE_ISOLATE to some | 5792 | * different pageblocks and change MIGRATE_ISOLATE to some |
5788 | * other migration type. | 5793 | * other migration type. |
5789 | * | 5794 | * |
5790 | * Once the pageblocks are marked as MIGRATE_ISOLATE, we | 5795 | * Once the pageblocks are marked as MIGRATE_ISOLATE, we |
5791 | * migrate the pages from an unaligned range (i.e. pages that | 5796 | * migrate the pages from an unaligned range (i.e. pages that |
5792 | * we are interested in). This will put all the pages in | 5797 | * we are interested in). This will put all the pages in |
5793 | * range back to page allocator as MIGRATE_ISOLATE. | 5798 | * range back to page allocator as MIGRATE_ISOLATE. |
5794 | * | 5799 | * |
5795 | * When this is done, we take the pages in range from page | 5800 | * When this is done, we take the pages in range from page |
5796 | * allocator removing them from the buddy system. This way | 5801 | * allocator removing them from the buddy system. This way |
5797 | * page allocator will never consider using them. | 5802 | * page allocator will never consider using them. |
5798 | * | 5803 | * |
5799 | * This lets us mark the pageblocks back as | 5804 | * This lets us mark the pageblocks back as |
5800 | * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the | 5805 | * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the |
5801 | * aligned range but not in the unaligned, original range are | 5806 | * aligned range but not in the unaligned, original range are |
5802 | * put back to page allocator so that buddy can use them. | 5807 | * put back to page allocator so that buddy can use them. |
5803 | */ | 5808 | */ |
5804 | 5809 | ||
5805 | ret = start_isolate_page_range(pfn_max_align_down(start), | 5810 | ret = start_isolate_page_range(pfn_max_align_down(start), |
5806 | pfn_max_align_up(end), migratetype); | 5811 | pfn_max_align_up(end), migratetype); |
5807 | if (ret) | 5812 | if (ret) |
5808 | goto done; | 5813 | goto done; |
5809 | 5814 | ||
5810 | ret = __alloc_contig_migrate_range(start, end); | 5815 | ret = __alloc_contig_migrate_range(start, end); |
5811 | if (ret) | 5816 | if (ret) |
5812 | goto done; | 5817 | goto done; |
5813 | 5818 | ||
5814 | /* | 5819 | /* |
5815 | * Pages from [start, end) are within a MAX_ORDER_NR_PAGES | 5820 | * Pages from [start, end) are within a MAX_ORDER_NR_PAGES |
5816 | * aligned blocks that are marked as MIGRATE_ISOLATE. What's | 5821 | * aligned blocks that are marked as MIGRATE_ISOLATE. What's |
5817 | * more, all pages in [start, end) are free in page allocator. | 5822 | * more, all pages in [start, end) are free in page allocator. |
5818 | * What we are going to do is to allocate all pages from | 5823 | * What we are going to do is to allocate all pages from |
5819 | * [start, end) (that is, remove them from the page allocator). | 5824 | * [start, end) (that is, remove them from the page allocator). |
5820 | * | 5825 | * |
5821 | * The only problem is that pages at the beginning and at the | 5826 | * The only problem is that pages at the beginning and at the |
5822 | * end of the interesting range may not be aligned with pages that | 5827 | * end of the interesting range may not be aligned with pages that |
5823 | * the page allocator holds, i.e. they can be part of higher-order | 5828 | * the page allocator holds, i.e. they can be part of higher-order |
5824 | * pages. Because of this, we reserve the bigger range and | 5829 | * pages. Because of this, we reserve the bigger range and |
5825 | * once this is done, free the pages we are not interested in. | 5830 | * once this is done, free the pages we are not interested in. |
5826 | * | 5831 | * |
5827 | * We don't have to hold zone->lock here because the pages are | 5832 | * We don't have to hold zone->lock here because the pages are |
5828 | * isolated, and thus they won't get removed from the buddy allocator. | 5833 | * isolated, and thus they won't get removed from the buddy allocator. |
5829 | */ | 5834 | */ |
5830 | 5835 | ||
5831 | lru_add_drain_all(); | 5836 | lru_add_drain_all(); |
5832 | drain_all_pages(); | 5837 | drain_all_pages(); |
5833 | 5838 | ||
5834 | order = 0; | 5839 | order = 0; |
5835 | outer_start = start; | 5840 | outer_start = start; |
5836 | while (!PageBuddy(pfn_to_page(outer_start))) { | 5841 | while (!PageBuddy(pfn_to_page(outer_start))) { |
5837 | if (++order >= MAX_ORDER) { | 5842 | if (++order >= MAX_ORDER) { |
5838 | ret = -EBUSY; | 5843 | ret = -EBUSY; |
5839 | goto done; | 5844 | goto done; |
5840 | } | 5845 | } |
5841 | outer_start &= ~0UL << order; | 5846 | outer_start &= ~0UL << order; |
5842 | } | 5847 | } |
5843 | 5848 | ||
5844 | /* Make sure the range is really isolated. */ | 5849 | /* Make sure the range is really isolated. */ |
5845 | if (test_pages_isolated(outer_start, end)) { | 5850 | if (test_pages_isolated(outer_start, end)) { |
5846 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", | 5851 | pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", |
5847 | outer_start, end); | 5852 | outer_start, end); |
5848 | ret = -EBUSY; | 5853 | ret = -EBUSY; |
5849 | goto done; | 5854 | goto done; |
5850 | } | 5855 | } |
5851 | 5856 | ||
5852 | /* | 5857 | /* |
5853 | * Reclaim enough pages to make sure that contiguous allocation | 5858 | * Reclaim enough pages to make sure that contiguous allocation |
5854 | * will not starve the system. | 5859 | * will not starve the system. |
5855 | */ | 5860 | */ |
5856 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); | 5861 | __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); |
5857 | 5862 | ||
5858 | /* Grab isolated pages from freelists. */ | 5863 | /* Grab isolated pages from freelists. */ |
5859 | outer_end = isolate_freepages_range(outer_start, end); | 5864 | outer_end = isolate_freepages_range(outer_start, end); |
5860 | if (!outer_end) { | 5865 | if (!outer_end) { |
5861 | ret = -EBUSY; | 5866 | ret = -EBUSY; |
5862 | goto done; | 5867 | goto done; |
5863 | } | 5868 | } |
5864 | 5869 | ||
5865 | /* Free head and tail (if any) */ | 5870 | /* Free head and tail (if any) */ |
5866 | if (start != outer_start) | 5871 | if (start != outer_start) |
5867 | free_contig_range(outer_start, start - outer_start); | 5872 | free_contig_range(outer_start, start - outer_start); |
5868 | if (end != outer_end) | 5873 | if (end != outer_end) |
5869 | free_contig_range(end, outer_end - end); | 5874 | free_contig_range(end, outer_end - end); |
5870 | 5875 | ||
5871 | done: | 5876 | done: |
5872 | undo_isolate_page_range(pfn_max_align_down(start), | 5877 | undo_isolate_page_range(pfn_max_align_down(start), |
5873 | pfn_max_align_up(end), migratetype); | 5878 | pfn_max_align_up(end), migratetype); |
5874 | return ret; | 5879 | return ret; |
5875 | } | 5880 | } |
5876 | 5881 | ||
5877 | void free_contig_range(unsigned long pfn, unsigned nr_pages) | 5882 | void free_contig_range(unsigned long pfn, unsigned nr_pages) |
5878 | { | 5883 | { |
5879 | for (; nr_pages--; ++pfn) | 5884 | for (; nr_pages--; ++pfn) |
5880 | __free_page(pfn_to_page(pfn)); | 5885 | __free_page(pfn_to_page(pfn)); |
5881 | } | 5886 | } |
5882 | #endif | 5887 | #endif |
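
alloc_contig_range() and free_contig_range() above are the public pair of this CMA section. Here is a hedged usage sketch of how a CMA-style caller might drive them; the helper names and the chosen migratetype are illustrative assumptions, and the range must already satisfy the preconditions in the kernel-doc above (single zone, pageblocks set to MIGRATE_CMA or MIGRATE_MOVABLE).

    /* Hedged, illustrative sketch only -- not a function from this patch. */
    static struct page *demo_grab_contig(unsigned long start_pfn, unsigned long nr_pages)
    {
            int ret;

            ret = alloc_contig_range(start_pfn, start_pfn + nr_pages, MIGRATE_CMA);
            if (ret)
                    return NULL;            /* e.g. -EBUSY or -EINTR */

            /* The caller now owns every page in [start_pfn, start_pfn + nr_pages). */
            return pfn_to_page(start_pfn);
    }

    static void demo_release_contig(unsigned long start_pfn, unsigned long nr_pages)
    {
            free_contig_range(start_pfn, nr_pages);
    }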
5883 | 5888 | ||
5884 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5889 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5885 | /* | 5890 | /* |
5886 | * All pages in the range must be isolated before calling this. | 5891 | * All pages in the range must be isolated before calling this. |
5887 | */ | 5892 | */ |
5888 | void | 5893 | void |
5889 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | 5894 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
5890 | { | 5895 | { |
5891 | struct page *page; | 5896 | struct page *page; |
5892 | struct zone *zone; | 5897 | struct zone *zone; |
5893 | int order, i; | 5898 | int order, i; |
5894 | unsigned long pfn; | 5899 | unsigned long pfn; |
5895 | unsigned long flags; | 5900 | unsigned long flags; |
5896 | /* find the first valid pfn */ | 5901 | /* find the first valid pfn */ |
5897 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | 5902 | for (pfn = start_pfn; pfn < end_pfn; pfn++) |
5898 | if (pfn_valid(pfn)) | 5903 | if (pfn_valid(pfn)) |
5899 | break; | 5904 | break; |
5900 | if (pfn == end_pfn) | 5905 | if (pfn == end_pfn) |
5901 | return; | 5906 | return; |
5902 | zone = page_zone(pfn_to_page(pfn)); | 5907 | zone = page_zone(pfn_to_page(pfn)); |
5903 | spin_lock_irqsave(&zone->lock, flags); | 5908 | spin_lock_irqsave(&zone->lock, flags); |
5904 | pfn = start_pfn; | 5909 | pfn = start_pfn; |
5905 | while (pfn < end_pfn) { | 5910 | while (pfn < end_pfn) { |
5906 | if (!pfn_valid(pfn)) { | 5911 | if (!pfn_valid(pfn)) { |
5907 | pfn++; | 5912 | pfn++; |
5908 | continue; | 5913 | continue; |
5909 | } | 5914 | } |
5910 | page = pfn_to_page(pfn); | 5915 | page = pfn_to_page(pfn); |
5911 | BUG_ON(page_count(page)); | 5916 | BUG_ON(page_count(page)); |
5912 | BUG_ON(!PageBuddy(page)); | 5917 | BUG_ON(!PageBuddy(page)); |
5913 | order = page_order(page); | 5918 | order = page_order(page); |
5914 | #ifdef CONFIG_DEBUG_VM | 5919 | #ifdef CONFIG_DEBUG_VM |
5915 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | 5920 | printk(KERN_INFO "remove from free list %lx %d %lx\n", |
5916 | pfn, 1 << order, end_pfn); | 5921 | pfn, 1 << order, end_pfn); |
5917 | #endif | 5922 | #endif |
5918 | list_del(&page->lru); | 5923 | list_del(&page->lru); |
5919 | rmv_page_order(page); | 5924 | rmv_page_order(page); |
5920 | zone->free_area[order].nr_free--; | 5925 | zone->free_area[order].nr_free--; |
5921 | __mod_zone_page_state(zone, NR_FREE_PAGES, | 5926 | __mod_zone_page_state(zone, NR_FREE_PAGES, |
5922 | - (1UL << order)); | 5927 | - (1UL << order)); |
5923 | for (i = 0; i < (1 << order); i++) | 5928 | for (i = 0; i < (1 << order); i++) |
5924 | SetPageReserved((page+i)); | 5929 | SetPageReserved((page+i)); |
5925 | pfn += (1 << order); | 5930 | pfn += (1 << order); |
5926 | } | 5931 | } |
5927 | spin_unlock_irqrestore(&zone->lock, flags); | 5932 | spin_unlock_irqrestore(&zone->lock, flags); |
5928 | } | 5933 | } |
5929 | #endif | 5934 | #endif |
5930 | 5935 | ||
5931 | #ifdef CONFIG_MEMORY_FAILURE | 5936 | #ifdef CONFIG_MEMORY_FAILURE |
5932 | bool is_free_buddy_page(struct page *page) | 5937 | bool is_free_buddy_page(struct page *page) |
5933 | { | 5938 | { |
5934 | struct zone *zone = page_zone(page); | 5939 | struct zone *zone = page_zone(page); |
5935 | unsigned long pfn = page_to_pfn(page); | 5940 | unsigned long pfn = page_to_pfn(page); |
5936 | unsigned long flags; | 5941 | unsigned long flags; |
5937 | int order; | 5942 | int order; |
5938 | 5943 | ||
5939 | spin_lock_irqsave(&zone->lock, flags); | 5944 | spin_lock_irqsave(&zone->lock, flags); |
5940 | for (order = 0; order < MAX_ORDER; order++) { | 5945 | for (order = 0; order < MAX_ORDER; order++) { |
5941 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | 5946 | struct page *page_head = page - (pfn & ((1 << order) - 1)); |
5942 | 5947 | ||
5943 | if (PageBuddy(page_head) && page_order(page_head) >= order) | 5948 | if (PageBuddy(page_head) && page_order(page_head) >= order) |
5944 | break; | 5949 | break; |
5945 | } | 5950 | } |
5946 | spin_unlock_irqrestore(&zone->lock, flags); | 5951 | spin_unlock_irqrestore(&zone->lock, flags); |
5947 | 5952 | ||
5948 | return order < MAX_ORDER; | 5953 | return order < MAX_ORDER; |
5949 | } | 5954 | } |
5950 | #endif | 5955 | #endif |
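
is_free_buddy_page() above walks the orders and, for each one, computes the head PFN of the 2^order-aligned block that would contain the page, then checks whether that head is a free buddy of at least that order. A standalone sketch of just the head-PFN arithmetic; MAX_ORDER is assumed to be 11, the default when CONFIG_FORCE_MAX_ZONEORDER is not set.

    #include <stdio.h>

    #define DEMO_MAX_ORDER 11       /* assumed default MAX_ORDER */

    int main(void)
    {
            unsigned long pfn = 1234567;
            int order;

            /* Candidate buddy-head PFN for each order, as in is_free_buddy_page(). */
            for (order = 0; order < DEMO_MAX_ORDER; order++)
                    printf("order %2d: head pfn %lu\n",
                           order, pfn - (pfn & ((1UL << order) - 1)));
            return 0;
    }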
5951 | 5956 | ||
5952 | static const struct trace_print_flags pageflag_names[] = { | 5957 | static const struct trace_print_flags pageflag_names[] = { |
5953 | {1UL << PG_locked, "locked" }, | 5958 | {1UL << PG_locked, "locked" }, |
5954 | {1UL << PG_error, "error" }, | 5959 | {1UL << PG_error, "error" }, |
5955 | {1UL << PG_referenced, "referenced" }, | 5960 | {1UL << PG_referenced, "referenced" }, |
5956 | {1UL << PG_uptodate, "uptodate" }, | 5961 | {1UL << PG_uptodate, "uptodate" }, |
5957 | {1UL << PG_dirty, "dirty" }, | 5962 | {1UL << PG_dirty, "dirty" }, |
5958 | {1UL << PG_lru, "lru" }, | 5963 | {1UL << PG_lru, "lru" }, |
5959 | {1UL << PG_active, "active" }, | 5964 | {1UL << PG_active, "active" }, |
5960 | {1UL << PG_slab, "slab" }, | 5965 | {1UL << PG_slab, "slab" }, |
5961 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | 5966 | {1UL << PG_owner_priv_1, "owner_priv_1" }, |
5962 | {1UL << PG_arch_1, "arch_1" }, | 5967 | {1UL << PG_arch_1, "arch_1" }, |
5963 | {1UL << PG_reserved, "reserved" }, | 5968 | {1UL << PG_reserved, "reserved" }, |
5964 | {1UL << PG_private, "private" }, | 5969 | {1UL << PG_private, "private" }, |
5965 | {1UL << PG_private_2, "private_2" }, | 5970 | {1UL << PG_private_2, "private_2" }, |
5966 | {1UL << PG_writeback, "writeback" }, | 5971 | {1UL << PG_writeback, "writeback" }, |
5967 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | 5972 | #ifdef CONFIG_PAGEFLAGS_EXTENDED |
5968 | {1UL << PG_head, "head" }, | 5973 | {1UL << PG_head, "head" }, |
5969 | {1UL << PG_tail, "tail" }, | 5974 | {1UL << PG_tail, "tail" }, |
5970 | #else | 5975 | #else |
5971 | {1UL << PG_compound, "compound" }, | 5976 | {1UL << PG_compound, "compound" }, |
5972 | #endif | 5977 | #endif |
5973 | {1UL << PG_swapcache, "swapcache" }, | 5978 | {1UL << PG_swapcache, "swapcache" }, |
5974 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | 5979 | {1UL << PG_mappedtodisk, "mappedtodisk" }, |
5975 | {1UL << PG_reclaim, "reclaim" }, | 5980 | {1UL << PG_reclaim, "reclaim" }, |
5976 | {1UL << PG_swapbacked, "swapbacked" }, | 5981 | {1UL << PG_swapbacked, "swapbacked" }, |
5977 | {1UL << PG_unevictable, "unevictable" }, | 5982 | {1UL << PG_unevictable, "unevictable" }, |
5978 | #ifdef CONFIG_MMU | 5983 | #ifdef CONFIG_MMU |
5979 | {1UL << PG_mlocked, "mlocked" }, | 5984 | {1UL << PG_mlocked, "mlocked" }, |
5980 | #endif | 5985 | #endif |
5981 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | 5986 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED |
5982 | {1UL << PG_uncached, "uncached" }, | 5987 | {1UL << PG_uncached, "uncached" }, |
5983 | #endif | 5988 | #endif |
5984 | #ifdef CONFIG_MEMORY_FAILURE | 5989 | #ifdef CONFIG_MEMORY_FAILURE |
5985 | {1UL << PG_hwpoison, "hwpoison" }, | 5990 | {1UL << PG_hwpoison, "hwpoison" }, |
5986 | #endif | 5991 | #endif |
5987 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 5992 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
5988 | {1UL << PG_compound_lock, "compound_lock" }, | 5993 | {1UL << PG_compound_lock, "compound_lock" }, |
5989 | #endif | 5994 | #endif |
5990 | }; | 5995 | }; |
5991 | 5996 | ||
5992 | static void dump_page_flags(unsigned long flags) | 5997 | static void dump_page_flags(unsigned long flags) |
5993 | { | 5998 | { |
5994 | const char *delim = ""; | 5999 | const char *delim = ""; |
5995 | unsigned long mask; | 6000 | unsigned long mask; |
5996 | int i; | 6001 | int i; |
5997 | 6002 | ||
5998 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | 6003 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); |
5999 | 6004 | ||
6000 | printk(KERN_ALERT "page flags: %#lx(", flags); | 6005 | printk(KERN_ALERT "page flags: %#lx(", flags); |
6001 | 6006 | ||
6002 | /* remove zone id */ | 6007 | /* remove zone id */ |
6003 | flags &= (1UL << NR_PAGEFLAGS) - 1; | 6008 | flags &= (1UL << NR_PAGEFLAGS) - 1; |
6004 | 6009 | ||
6005 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { | 6010 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { |
6006 | 6011 | ||
6007 | mask = pageflag_names[i].mask; | 6012 | mask = pageflag_names[i].mask; |
6008 | if ((flags & mask) != mask) | 6013 | if ((flags & mask) != mask) |
6009 | continue; | 6014 | continue; |
6010 | 6015 | ||
6011 | flags &= ~mask; | 6016 | flags &= ~mask; |
6012 | printk("%s%s", delim, pageflag_names[i].name); | 6017 | printk("%s%s", delim, pageflag_names[i].name); |
6013 | delim = "|"; | 6018 | delim = "|"; |
6014 | } | 6019 | } |
6015 | 6020 | ||
6016 | /* check for left over flags */ | 6021 | /* check for left over flags */ |
6017 | if (flags) | 6022 | if (flags) |
6018 | printk("%s%#lx", delim, flags); | 6023 | printk("%s%#lx", delim, flags); |
6019 | 6024 | ||
6020 | printk(")\n"); | 6025 | printk(")\n"); |
6021 | } | 6026 | } |
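
dump_page_flags() masks off the high (zone id) bits, then walks pageflag_names[], printing the name of every fully-set mask and finally whatever bits are left over. A minimal userspace sketch of the same decoding loop; the table entries and the flags value fed to it are made up for the demo.

    #include <stdio.h>

    struct demo_flag {
            unsigned long mask;
            const char *name;
    };

    /* Demo name table -- the masks/names are illustrative, not the kernel's. */
    static const struct demo_flag demo_flags[] = {
            { 1UL << 0, "locked"     },
            { 1UL << 1, "error"      },
            { 1UL << 2, "referenced" },
            { 1UL << 3, "uptodate"   },
    };

    static void demo_dump_flags(unsigned long flags)
    {
            const char *delim = "";
            size_t i;

            printf("page flags: %#lx(", flags);
            for (i = 0; i < sizeof(demo_flags) / sizeof(demo_flags[0]) && flags; i++) {
                    unsigned long mask = demo_flags[i].mask;

                    if ((flags & mask) != mask)
                            continue;
                    flags &= ~mask;                 /* consume the named bits */
                    printf("%s%s", delim, demo_flags[i].name);
                    delim = "|";
            }
            if (flags)                              /* leftover, unnamed bits */
                    printf("%s%#lx", delim, flags);
            printf(")\n");
    }

    int main(void)
    {
            demo_dump_flags((1UL << 0) | (1UL << 3) | (1UL << 7));
            return 0;
    }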
6022 | 6027 | ||
6023 | void dump_page(struct page *page) | 6028 | void dump_page(struct page *page) |
6024 | { | 6029 | { |
6025 | printk(KERN_ALERT | 6030 | printk(KERN_ALERT |
6026 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 6031 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", |
6027 | page, atomic_read(&page->_count), page_mapcount(page), | 6032 | page, atomic_read(&page->_count), page_mapcount(page), |
6028 | page->mapping, page->index); | 6033 | page->mapping, page->index); |
6029 | dump_page_flags(page->flags); | 6034 | dump_page_flags(page->flags); |
6030 | mem_cgroup_print_bad_page(page); | 6035 | mem_cgroup_print_bad_page(page); |
6031 | } | 6036 | } |
6032 | 6037 |